In [1]:
import pandas as pd
from transformers import pipeline
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,confusion_matrix

In [2]:
!cat /proc/meminfo | grep Mem

MemTotal:        3856324 kB
MemFree:         2261396 kB
MemAvailable:    2944660 kB


In [3]:
df = pd.read_csv('/datasets/toxic_comments.csv',index_col=0,encoding='utf-8')

In [41]:
%%time
import re
from nltk.stem import LancasterStemmer
lst = LancasterStemmer()
stemmer = lst.stem

def lemmatize(text):
    text = ''.join( re.sub(r"([^a-z\'])+",' ',text.lower()) )
    return ''.join( stemmer(text)).strip('\n')[:512] )               

# lemmatize(df.text[2])                    
df['lemmas'] = df.text.apply(lemmatize)

CPU times: user 4.01 s, sys: 19.5 ms, total: 4.03 s
Wall time: 4.03 s


In [87]:
df.loc[ 116444,'text' ]

"Accept my apology! \n\nAccept my apology THIS MINUTE you big, fat noob (this is doughnuthead talking,and if you don't think so, tough.)78.144.87.71"

In [4]:
%%time
sentiment_analysis = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")

CPU times: user 17.6 s, sys: 13.8 s, total: 31.4 s
Wall time: 22.6 s


In [5]:
!cat /proc/meminfo | grep Mem

MemTotal:        3856324 kB
MemFree:         1408928 kB
MemAvailable:    1822964 kB


In [67]:
sample = df.sample(1000)

In [68]:
%%time
pr0 = sentiment_analysis(list( sample.lemmas ) )

CPU times: user 27min 17s, sys: 6.61 s, total: 27min 24s
Wall time: 6min 51s


In [52]:
pd.Series(  pr0)

0     {'label': 'NEGATIVE', 'score': 0.9957147240638...
1     {'label': 'POSITIVE', 'score': 0.9915220737457...
2     {'label': 'NEGATIVE', 'score': 0.9994970560073...
3     {'label': 'NEGATIVE', 'score': 0.9980974793434...
4     {'label': 'POSITIVE', 'score': 0.9973760843276...
                            ...                        
95    {'label': 'NEGATIVE', 'score': 0.9985480904579...
96    {'label': 'POSITIVE', 'score': 0.998691737651825}
97    {'label': 'NEGATIVE', 'score': 0.9975747466087...
98    {'label': 'NEGATIVE', 'score': 0.9976218342781...
99    {'label': 'NEGATIVE', 'score': 0.9944867491722...
Length: 100, dtype: object

In [71]:
pr1 = pd.DataFrame( [ [ int(x['label']=='NEGATIVE'),x['score'] ] for x in pr0],
                   columns=['label','score'] ,index=sample.index)
res = pd.concat([ sample,pr1 ]  , axis=1)

In [84]:
res[(res.toxic==1) & (res.label== 0)]

Unnamed: 0,text,toxic,lemmas,label,score
116444,Accept my apology! \n\nAccept my apology THIS ...,1,accept my apology accept my apology this minut...,0,0.991083
114687,Epbr123 is a bigger faggot then wikipedia and ...,1,epbr is a bigger faggot then wikipedia and lov...,0,0.996384
155051,"""\n\n Yoshendi \n\nHello! I acknowledge why yo...",1,yoshendi hello i acknowledge why you have del...,0,0.996584
442,Hi \n\nIm a fucking bitch.\n\n50.180.208.181,1,hi im a fucking bitch,0,0.985596
153296,"""\n\nBarnstar\n\n The Working Donkey's Barnst...",1,barnstar the working donkey's barnstar for wo...,0,0.998497
148072,""" (UTC)\nThat is very rich coming from you. Do...",1,utc that is very rich coming from you don't f...,0,0.993505
149157,For sanity's sake I hope santa gets you a new ...,1,for sanity's sake i hope santa gets you a new ...,0,0.97937
36903,"love to suck fat, black cock. I",1,love to suck fat black cock i,0,0.996386
72412,"Hmm, I had never heard of him so perhaps not, ...",1,hmm i had never heard of him so perhaps not be...,0,0.982621


In [72]:
!cat /proc/meminfo | grep Mem

MemTotal:        3856324 kB
MemFree:          330284 kB
MemAvailable:    1180268 kB


In [83]:
for metric in [f1_score,precision_score,recall_score,accuracy_score]:
    print( f"{metric.__name__}\t {metric(sample.toxic,pr1.label)} " )

f1_score	 0.23192019950124687 
precision_score	 0.13285714285714287 
recall_score	 0.9117647058823529 
accuracy_score	 0.384 


In [44]:
pr1

[{'label': 'NEGATIVE', 'score': 0.9957147240638733},
 {'label': 'POSITIVE', 'score': 0.9915220737457275},
 {'label': 'NEGATIVE', 'score': 0.9994970560073853},
 {'label': 'NEGATIVE', 'score': 0.9980974793434143},
 {'label': 'POSITIVE', 'score': 0.9973760843276978},
 {'label': 'POSITIVE', 'score': 0.9987456798553467},
 {'label': 'NEGATIVE', 'score': 0.9992076754570007},
 {'label': 'NEGATIVE', 'score': 0.9969756603240967},
 {'label': 'NEGATIVE', 'score': 0.9994739890098572},
 {'label': 'POSITIVE', 'score': 0.9938358664512634},
 {'label': 'NEGATIVE', 'score': 0.9767663478851318},
 {'label': 'POSITIVE', 'score': 0.9965537786483765},
 {'label': 'NEGATIVE', 'score': 0.9989187717437744},
 {'label': 'NEGATIVE', 'score': 0.9993783235549927},
 {'label': 'POSITIVE', 'score': 0.9957348704338074},
 {'label': 'POSITIVE', 'score': 0.9928533434867859},
 {'label': 'NEGATIVE', 'score': 0.9978758096694946},
 {'label': 'NEGATIVE', 'score': 0.9925879836082458},
 {'label': 'NEGATIVE', 'score': 0.999473512172