# Toxic Span Detection - Bayes
Project for Text Data Mining course at EITI-WUT

Authors: Julia Kłos, Patrycja Cieplicka

Date: 12.01.2020

### Import library

In [1]:
import nltk.stem
import re
import numpy as np
import os
import nltk
import pandas as pd
from src.SemEvalData import SemEvalData
from src.JigsawData import JigsawData
from src.preprocessing import preprocess_bayes, getSpansByToxicWords, getToxicWordsBayes
import joblib
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets
import sklearn.feature_extraction.text
import sklearn.naive_bayes
import sklearn.metrics
import sklearn.model_selection
import sklearn.pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from test_sentence import test_bayes




### Load Lemmatizer

nltk.download('wordnet')
lemmer = nltk.stem.WordNetLemmatizer()

### Load train data SemEval 2021 Toxic Span Detection

In [3]:
train_data_semeval = SemEvalData()
train_data_semeval.load_data("data/tsd_train.csv")
train_df_preprocessed = train_data_semeval.preprocess()

In [4]:
train_df_preprocessed.head(25)

Unnamed: 0,spans,text,toxicity,toxic_words,original_text,sentences,diff,toxicity_sentence
0,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...",another violent and aggressive immigrant killi...,1,[violent and aggressive immigrant],Another violent and aggressive immigrant killi...,[another violent and aggressive immigrant kill...,[],[1.0]
1,"[33, 34, 35, 36, 37, 38, 39]","i am 56 years old, i am not your fucking junio...",1,[fucking],"I am 56 years old, I am not your fucking junio...","[i am 56 years old, i am not your fucking juni...","[94, 575, 588, 647]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
2,"[0, 1, 2, 3]","damn, a whole family. sad indeed.",1,[damn],"Damn, a whole family. Sad indeed.","[damn, a whole family., sad indeed.]",[],"[1.0, 0.0]"
3,"[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]",what a knucklehead. how can anyone not know th...,1,[knucklehead],What a knucklehead. How can anyone not know th...,"[what a knucklehead., how can anyone not know ...",[],"[1.0, 0.0, 0.0]"
4,"[32, 33, 34, 35, 36, 37, 38]",who do you think should do the killing? anyon...,1,[killing],"""who do you think should do the killing?""\n\nA...","[who do you think should do the killing?, anyo...","[0, 40, 442, 622, 933]","[1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
5,[],"but, but, but, is not a defense. its not even...",0,[],"But, but, but, is NOT a defense. It's not eve...","[but, but, but, is not a defense., its not eve...","[36, 250, 276]","[0.0, 0.0, 0.0, 0.0, 0.0]"
6,"[39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 5...","please people, stop using these silly, stupid ...",1,"[stupid, emoticons]","Please people, stop using these silly, stupid ...","[please people, stop using these silly, stupid...",[],[1.0]
7,"[0, 1, 2, 3]",dumb.,1,[dumb],Dumb.,[dumb.],[],[1.0]
8,"[49, 50, 51, 52, 53, 54, 147, 148, 149, 150, 1...",obamacare is on its last gasping breaths. yo...,1,"[idiots, stupid]",Obamacare is on it's last gasping breaths. Y...,"[obamacare is on its last gasping breaths., yo...","[18, 63, 97, 227]","[0.0, 1.0, 0.0, 0.0]"
9,"[32, 33, 34, 35, 36, 37, 38, 39]",crooked trump = guilty as hell. pathetic,1,[pathetic],CROOKED Trump = GUILTY as hell.\npathetic,"[crooked trump = guilty as hell., pathetic]",[],"[0.0, 1.0]"


### Preprocess data

Divide data into sentences

In [5]:
train_data = {
    'sentence':  train_df_preprocessed.sentences.sum(),
    'toxicity_sentence': train_df_preprocessed.toxicity_sentence.sum()
        }

train_df = pd.DataFrame (train_data, columns = ['sentence','toxicity_sentence'])

In [6]:
train_df['sentence'] = train_df.apply(lambda row: preprocess_bayes(row.sentence), axis=1)
print(train_df.head(5))

                                            sentence  toxicity_sentence
0  another violent and aggressive immigrant killi...                1.0
1   i am  years old i am not your fucking junior pal                1.0
2                 what you are saying makes no sense                0.0
3            i dont know what you are basing this on                0.0
4  the cheap black market crap is still coming up...                0.0


In [7]:
len(train_df[train_df["toxicity_sentence"] == 0])

13264

In [8]:
len(train_df[train_df["toxicity_sentence"] == 1])

9500

Lemmatization

In [9]:
def lemmatization(text):
    data = ' '.join([lemmer.lemmatize(word) for word in text.split()])
    return data

In [10]:
train_df['sentence'] = train_df.apply(lambda row: lemmatization(row.sentence), axis=1)

In [11]:
print(train_df.head())

                                            sentence  toxicity_sentence
0  another violent and aggressive immigrant killi...                1.0
1     i am year old i am not your fucking junior pal                1.0
2                  what you are saying make no sense                0.0
3            i dont know what you are basing this on                0.0
4  the cheap black market crap is still coming up...                0.0


Perform a grid search to find the best hyperparameters

In [18]:
grid_search(train_df)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   22.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:  1.6min
[Parallel(n_jobs=2)]: Done 240 out of 240 | elapsed:  2.0min finished


---- Results ----
Best score: 0.7661224546514649
c__alpha: 1.0
t__use_idf: False
v__lowercase: True
v__ngram_range: (1, 1)


### Train and evaluate a model

In [12]:
def train_and_evaluate(train):
    
    # Convert to bag of words
    count_vect = CountVectorizer(strip_accents='ascii', stop_words='english', lowercase=True, ngram_range=(1,1))
    X = count_vect.fit_transform(train['sentence'])
    # Convert from occurrences to frequencies
    # Occurrence count is a good start but there is an issue: longer documents will have higher average count values than shorter documents, even though they might talk about the same topics.
    # To avoid these potential discrepancies it suffices to divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called tf for Term Frequencies.
    transformer = TfidfTransformer()
    X = transformer.fit_transform(X)
    # Create a model
    # get the first vector out (for the first document) 
    first_vector_tfidfvectorizer=X[0] 
 
    # place tf-idf values in a pandas data frame 
    df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=count_vect.get_feature_names(), columns=["tfidf"]) 
    df.sort_values(by=["tfidf"],ascending=True)
    print(df.head)
    model = sklearn.naive_bayes.MultinomialNB(alpha=0.3, fit_prior=True, class_prior=None)
    # Train the model
    model.fit(X, train['toxicity_sentence'])
    # Save models
    joblib.dump(count_vect, 'vectorizer_bayes.jbl')
    joblib.dump(transformer, 'transformer_bayes.jbl')
    joblib.dump(model, 'model_bayes.jbl')
    # Evaluate on training data
    print('-- Training data --')
    predictions = model.predict(X)
    print('here',model.coef_, len(count_vect.get_feature_names()), predictions)
    print(predictions,X, train['sentence'])
    accuracy = sklearn.metrics.accuracy_score(train['toxicity_sentence'], predictions)
    print('Accuracy: {0:.2f}'.format(accuracy * 100.0))
    print('Classification Report:')
    print(sklearn.metrics.classification_report(train['toxicity_sentence'], predictions))
    print('')
#     Evaluate with 10-fold CV
    print('-- 10-fold CV --')
    predictions = sklearn.model_selection.cross_val_predict(model, X, train['toxicity_sentence'], cv=10)
    accuracy = sklearn.metrics.accuracy_score(train['toxicity_sentence'], predictions)
    print('Accuracy: {0:.2f}'.format(accuracy * 100.0))
    print('Classification Report:')
    print(sklearn.metrics.classification_report(train['toxicity_sentence'], predictions))
    return model


In [15]:
train_and_evaluate(train_df)

<bound method NDFrame.head of          tfidf
aa         0.0
aaa        0.0
aahhh      0.0
aapoor     0.0
abandon    0.0
...        ...
zuptoid    0.0
zuri       0.0
zwei       0.0
zz         0.0
zzzzzz     0.0

[17812 rows x 1 columns]>
-- Training data --
here [[ -8.99365183 -10.46043916 -10.3795395  ... -11.42007242 -11.42007242
  -11.42007242]] 17812 [1. 1. 0. ... 0. 0. 1.]
[1. 1. 0. ... 0. 0. 1.]   (0, 17025)	0.3612465882102366
  (0, 13766)	0.4250020107598058
  (0, 8722)	0.2989512824534744
  (0, 8171)	0.349476259432824
  (0, 8079)	0.3323369916456108
  (0, 7760)	0.3248117045584413
  (0, 2604)	0.3030879853045209
  (0, 320)	0.41177156518958624
  (1, 17689)	0.31672999210173375
  (1, 11354)	0.4934807414263961
  (1, 10986)	0.3800480289834355
  (1, 8568)	0.5424937095001565
  (1, 6174)	0.4662863291616598
  (2, 14090)	0.658914520992524
  (2, 13808)	0.5930647728437727
  (2, 9514)	0.46271571103319686
  (3, 8797)	0.3879256621830136
  (3, 4524)	0.35660724746419836
  (3, 1290)	0.8499087902097956

MultinomialNB(alpha=0.3)

## Evaluation

Load models from file

In [10]:
count_vect = joblib.load('vectorizer_bayes.jbl')
transformer = joblib.load('transformer_bayes.jbl')
model = joblib.load('model_bayes.jbl')

Load test data

In [11]:
test_data_semeval = SemEvalData()
test_data_semeval.load_data("data/tsd_trial.csv")
test_df_preprocessed = test_data_semeval.preprocess()

Predict test data and print classification report

In [12]:
def transform_and_predict(sentence):
    x = count_vect.transform([sentence])
    return model.predict(x)[0]

In [13]:
train_data = {
    'sentence':  test_df_preprocessed.sentences.sum(),
    'toxicity_sentence': test_df_preprocessed.toxicity_sentence.sum()
        }

test = pd.DataFrame (train_data, columns = ['sentence','toxicity_sentence'])
test['sentence'] = test.apply(lambda row: preprocess_bayes(row.sentence), axis=1)
test['sentence'] = test.apply(lambda row: lemmatization(row.sentence), axis=1)
test['prediction'] = test.apply(lambda row: transform_and_predict(row.sentence), axis=1)
accuracy = sklearn.metrics.accuracy_score(test['toxicity_sentence'], test['prediction'])
print('Accuracy: {0:.2f}'.format(accuracy * 100.0))
print('Classification Report:')
print(sklearn.metrics.classification_report(test['toxicity_sentence'], test['prediction']))

Accuracy: 77.16
Classification Report:
              precision    recall  f1-score   support

         0.0       0.81      0.81      0.81      1188
         1.0       0.72      0.72      0.72       804

    accuracy                           0.77      1992
   macro avg       0.76      0.76      0.76      1992
weighted avg       0.77      0.77      0.77      1992



Extract toxic words

In [14]:
test_df_preprocessed["predicted_span"]=[test_bayes(sentences,0)for sentences in test_df_preprocessed["original_text"] ]

In [15]:
test_df_preprocessed.head(50)

Unnamed: 0,spans,text,toxicity,toxic_words,original_text,sentences,diff,toxicity_sentence,predicted_span
0,"[15, 16, 17, 18, 19, 27, 28, 29, 30, 31]",because hes a moron and a bigot. its not any m...,1,"[moron, bigot]",Because he's a moron and a bigot. It's not any...,"[because hes a moron and a bigot., its not any...","[10, 36]","[1.0, 0.0]","[27, 28, 29, 30, 31, 32, 15, 16, 17, 18, 19]"
1,"[29, 30, 31, 32, 33, 34]",how about we stop protecting idiots and let na...,1,[idiots],How about we stop protecting idiots and let na...,[how about we stop protecting idiots and let n...,[],"[1.0, 0.0]","[51, 52, 53, 60, 61, 62, 63, 64, 65, 74, 75, 7..."
2,"[166, 167, 168, 169, 170, 171]","if people were smart, they would boycott th...",1,[idiots],"If people were smart, they would Boycott th...","[if people were smart, they would boycott t...",[],[1.0],[]
3,"[87, 88, 89, 90, 91, 92]",trump claimed that russia will never invade th...,1,[stupid],Trump Claimed that Russia will never invade th...,[trump claimed that russia will never invade t...,[],[1.0],[]
4,[],as long as your willing to pay a lot more for ...,0,[],As long as your willing to pay a lot more for ...,[as long as your willing to pay a lot more for...,[148],"[0.0, 0.0, 0.0]","[188, 189, 190, 191, 192, 193, 194, 195, 196, ..."
5,"[8, 9, 10, 11, 12]",only an idiot would use and believe anything t...,1,[idiot],Only an idiot would use and believe anything t...,[only an idiot would use and believe anything ...,[],[1.0],"[28, 29, 30, 31, 32, 33, 34, 8, 9, 10, 11, 12,..."
6,"[265, 266, 267, 268, 269, 270, 271, 272, 273, ...",thanks a lot douchebag. youre the reason the p...,1,[o try to turn salem into some kind of new-stu...,Thanks a lot douchebag. You're the reason the ...,"[thanks a lot douchebag., youre the reason the...","[27, 467]","[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0]","[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 9, 10..."
7,[],kick all the non human criminal illegals out o...,0,[],kick all the non human criminal illegals out o...,[kick all the non human criminal illegals out ...,[],[0.0],"[23, 24, 25, 26, 27, 28, 29, 30, 73, 74, 75, 7..."
8,"[38, 39, 40, 41, 42, 43]",because driving under ontario laws is stupid e...,1,[stupid],Because driving under Ontario laws is stupid e...,[because driving under ontario laws is stupid ...,[],[1.0],"[8, 9, 10, 11, 12, 13, 14, 30, 31, 32, 22, 23,..."
9,"[277, 278, 279, 280, 281, 282, 283, 284, 285, ...",youre wrong. the delay between retirement and...,1,[dont make ignorant statements],You're wrong. The delay between retirement an...,"[youre wrong., the delay between retirement an...","[3, 264, 280]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[277, 278, 279, 280, 281, 288, 289, 290, 291, ..."


Calculate precission, recall and F for extracted words

In [16]:
test_df_preprocessed["Pscore"] = [ 1 if (len(s) == 0 and len(ps) == 0) 
                             else 0 if len(ps) == 0 
                             else len( set(s).intersection(set(ps) ))/ len(set(ps))  for s, ps in zip(test_df_preprocessed["spans"],test_df_preprocessed["predicted_span"]) ]

In [17]:
test_df_preprocessed["Rscore"] = [ 1 if (len(s) == 0 and len(ps) == 0) 
                             else 0 if len(s) == 0 
                             else len( set(s).intersection(set(ps) ))/ len(set(s))  for s, ps in zip(test_df_preprocessed["spans"],test_df_preprocessed["predicted_span"]) ]

In [18]:
test_df_preprocessed["Fscore"] = [ 0 if (p == 0 and r == 0)
    else 2 * p *r /(p + r) for p, r in zip(test_df_preprocessed["Pscore"], test_df_preprocessed["Rscore"] )]

In [19]:
F_score= np.mean(test_df_preprocessed["Fscore"])
F_score

0.31541734570505897

In [20]:
P_score = np.mean(test_df_preprocessed["Pscore"])
P_score

0.2691079867711252

In [21]:
R_score = np.mean(test_df_preprocessed["Rscore"])
R_score

0.6064137850976644

In [22]:
test_df_preprocessed

Unnamed: 0,spans,text,toxicity,toxic_words,original_text,sentences,diff,toxicity_sentence,predicted_span,Pscore,Rscore,Fscore
0,"[15, 16, 17, 18, 19, 27, 28, 29, 30, 31]",because hes a moron and a bigot. its not any m...,1,"[moron, bigot]",Because he's a moron and a bigot. It's not any...,"[because hes a moron and a bigot., its not any...","[10, 36]","[1.0, 0.0]","[27, 28, 29, 30, 31, 32, 15, 16, 17, 18, 19]",0.909091,1.000000,0.952381
1,"[29, 30, 31, 32, 33, 34]",how about we stop protecting idiots and let na...,1,[idiots],How about we stop protecting idiots and let na...,[how about we stop protecting idiots and let n...,[],"[1.0, 0.0]","[51, 52, 53, 60, 61, 62, 63, 64, 65, 74, 75, 7...",0.108696,0.833333,0.192308
2,"[166, 167, 168, 169, 170, 171]","if people were smart, they would boycott th...",1,[idiots],"If people were smart, they would Boycott th...","[if people were smart, they would boycott t...",[],[1.0],[],0.000000,0.000000,0.000000
3,"[87, 88, 89, 90, 91, 92]",trump claimed that russia will never invade th...,1,[stupid],Trump Claimed that Russia will never invade th...,[trump claimed that russia will never invade t...,[],[1.0],[],0.000000,0.000000,0.000000
4,[],as long as your willing to pay a lot more for ...,0,[],As long as your willing to pay a lot more for ...,[as long as your willing to pay a lot more for...,[148],"[0.0, 0.0, 0.0]","[188, 189, 190, 191, 192, 193, 194, 195, 196, ...",0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
685,"[129, 130, 131, 132, 133, 134]",but ... trumps not bluffing. hes prepared to g...,1,[stupid],But ... Trump's not bluffing. He's prepared to...,"[but ... trumps not bluffing., hes prepared to...","[13, 32, 151, 159, 166]","[0.0, 0.0, 0.0, 1.0, 0.0]","[129, 130, 131, 132, 133, 134, 135, 113, 114, ...",0.545455,1.000000,0.705882
686,"[126, 127, 128, 129, 130, 131]",cant believe the limited knowledge of this art...,1,[stupid],Can't believe the limited knowledge of this Ar...,[cant believe the limited knowledge of this ar...,[3],"[0.0, 1.0]","[44, 45, 46, 47, 48, 49, 50, 6, 7, 8, 9, 10, 1...",0.081081,1.000000,0.150000
687,"[24, 25, 26, 27, 28, 29]",i think it conservative idiots who cannot reac...,1,[idiots],I think it conservative idiots who cannot reac...,[i think it conservative idiots who cannot rea...,[],[1.0],"[11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 2...",0.131579,0.833333,0.227273
688,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",youre an id*ot...go away.,1,[youre an id*ot],You're an id*ot...Go away.,[youre an id*ot...go away.],[3],[1.0],[],0.000000,0.000000,0.000000
