In [1]:
!pip install transformers
!pip install catboost

from google.colab import drive
drive.mount("/content/drive")
data_path='/content/drive/MyDrive/datasets/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!cat /proc/meminfo | grep Mem

MemTotal:       13294208 kB
MemFree:         9757048 kB
MemAvailable:   11942784 kB


In [3]:
import pandas as pd
from transformers import pipeline
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,confusion_matrix

import re
from nltk.stem import LancasterStemmer

In [4]:
sample_size = 30_000

In [5]:
# from transformers import RobertaTokenizer, RobertaForSequenceClassification

# # load tokenizer and model weights
# tokenizer = RobertaTokenizer.from_pretrained('SkolkovoInstitute/roberta_toxicity_classifier')
# model = RobertaForSequenceClassification.from_pretrained('SkolkovoInstitute/roberta_toxicity_classifier')

# # prepare the input
# batch = tokenizer.encode('you are amazing', return_tensors='pt')

# # inference
# model(batch)

In [6]:
pipe = pipeline(model='SkolkovoInstitute/roberta_toxicity_classifier')

Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
df = pd.read_csv(f"{data_path}toxic_comments.csv",index_col=0,encoding='utf-8')

In [8]:
sample = df.sample(sample_size)

In [9]:
%%time
lst = LancasterStemmer()
stemmer = lst.stem

def lemmatize(text):
    text = ''.join( re.sub(r"([^a-z\'])+",' ',text.lower()) )
    return ''.join( stemmer(text).strip('\n')[:512] )

# lemmatize(df.text[2])
sample['lemmas'] = sample.text.apply(lemmatize)

CPU times: user 3.25 s, sys: 21.1 ms, total: 3.27 s
Wall time: 4.81 s


In [10]:
!cat /proc/meminfo | grep Mem

MemTotal:       13294208 kB
MemFree:         8835400 kB
MemAvailable:   11085964 kB


In [None]:
%%time
pred = pipe(list( sample.lemmas ) )
pd.Series(  pred ).head()

CPU times: user 2h 28min 6s, sys: 13.1 s, total: 2h 28min 19s
Wall time: 2h 29min 45s


0    {'label': 'neutral', 'score': 0.9999631643295288}
1    {'label': 'neutral', 'score': 0.9998753070831299}
2    {'label': 'neutral', 'score': 0.9999372959136963}
3     {'label': 'neutral', 'score': 0.999842643737793}
4    {'label': 'neutral', 'score': 0.9970797896385193}
dtype: object

In [None]:
transformed_pred = pd.DataFrame( [ [ int(x['label']=='toxic'),x['score'] ] for x in pred],
                   columns=['label','score'] ,index=sample.index)
to_compare = pd.concat([ sample,transformed_pred ]  , axis=1)
to_compare

Unnamed: 0,text,toxic,lemmas,label,score
87377,|I apologise for making that remark to Sidaway...,0,i apologise for making that remark to sidaway...,0,0.999963
108232,"Warning?\nYou should have given me a warning, ...",0,warning you should have given me a warning rea...,0,0.999875
48711,I did not remove or add any languages or other...,0,i did not remove or add any languages or other...,0,0.999937
77557,"""\n\nProblems with the new paragraphs\n\nIn ge...",0,problems with the new paragraphs in general t...,0,0.999843
149186,I have posted this edit war to AN/I. Please st...,0,i have posted this edit war to an i please sto...,0,0.997080
...,...,...,...,...,...
128125,"""\n\n Good progress \nYour group is making ver...",0,good progress your group is making very good ...,0,0.999956
143274,"""\n\n Is the British National Front running th...",0,is the british national front running things ...,0,0.999713
86131,"I swear, we need some admins that are actually...",0,i swear we need some admins that are actually ...,0,0.999794
120376,nikko smells nikko smells nikko smells nikk...,0,nikko smells nikko smells nikko smells nikko s...,0,0.998606


In [None]:
to_compare[~(to_compare.toxic==to_compare.label)]

Unnamed: 0,text,toxic,lemmas,label,score
43939,Wizzy - you are the problem of your own creati...,0,wizzy you are the problem of your own creation...,1,0.572054
57789,It doesn't matter what I think. I'm a big ret...,1,it doesn't matter what i think i'm a big retar...,0,0.776359
22272,you deleted my page \nand i am extrememly mad ...,1,you deleted my page and i am extrememly mad an...,0,0.985366
52689,fok yu 2 \n\nYOU OLSOW SUK AS MUZIKANIMUL FOK ...,1,fok yu you olsow suk as muzikanimul fok u what...,0,0.876597
87926,You are all both nuts. Mariah Carey is the hig...,0,you are all both nuts mariah carey is the high...,1,0.985766
...,...,...,...,...,...
138918,"""\nHa ha Ha! Using our logic Purrum, tells us ...",0,ha ha ha using our logic purrum tells us clea...,1,0.991141
50070,"You just can't get over it \n\nSeriously, I ne...",1,you just can't get over it seriously i never e...,0,0.965204
25272,8. Wikipedia is not about the spread of knowle...,0,wikipedia is not about the spread of knowledg...,1,0.750414
55171,Tedious and dishonest. I'm having nothing more...,0,tedious and dishonest i'm having nothing more ...,1,0.710588


In [None]:
!cat /proc/meminfo | grep Mem

MemTotal:       13294208 kB
MemFree:         8884480 kB
MemAvailable:   11159584 kB


In [None]:
for metric in [f1_score,precision_score,recall_score,accuracy_score]:
    print( f"{metric.__name__}\t {metric(to_compare.toxic,to_compare.label)} " )

f1_score	 0.8381972025556897 
precision_score	 0.8643162393162394 
recall_score	 0.8136104592691921 
accuracy_score	 0.9687666666666667 
