Trigger warning: This notebook contains words or language that are considered profane, vulgar, or offensive by some.

In [None]:
!pip install tweet-preprocessor -q

# Installing Gensim and PyLDAvis
!pip install -qq -U gensim
!pip install -qq pyLDAvis

In [None]:
# explainability (why did the model say it's hate speech)
!pip install eli5

In [None]:
import pandas as pd
import numpy as np
import preprocessor as prepro # twitter prepro
import tqdm #progress bar

import spacy #spacy for quick language prepro
nlp = spacy.load('en_core_web_sm') #instantiating English module

# sampling, splitting
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split


# loading ML libraries
from sklearn.pipeline import make_pipeline #pipeline creation
from sklearn.feature_extraction.text import TfidfVectorizer #transforms text to sparse matrix
from sklearn.linear_model import LogisticRegression #Logit model
from sklearn.metrics import classification_report #that's self explanatory
from sklearn.decomposition import TruncatedSVD #dimensionality reduction
from xgboost import XGBClassifier

import altair as alt #viz

#explainability
import eli5
from eli5.lime import TextExplainer

# topic modeling

from gensim.corpora.dictionary import Dictionary # Import the dictionary builder
from gensim.models import LdaMulticore # we'll use the faster multicore version of LDA

# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

%matplotlib inline
pyLDAvis.enable_notebook()

In [None]:
# prepro settings: the tweet element types that the preprocessor should act on
prepro.set_options(
    prepro.OPT.URL,
    prepro.OPT.NUMBER,
    prepro.OPT.RESERVED,
    prepro.OPT.MENTION,
    prepro.OPT.SMILEY,
)

In [None]:
# load the labelled tweets; read_csv is pointed directly at the zipped file on GitHub
data = pd.read_csv('https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/twitter_hate.zip')

In [None]:
# run the twitter preprocessor over every raw tweet, then drop the '#' sign
# so that hashtag words stay in the text as ordinary words
data['text_clean'] = (
    data['tweet']
    .map(prepro.clean)
    .str.replace('#','')
)

In [None]:
# clean up using spacy (without some heavy parts of the pipeline) while showing a progress bar

clean_text = []

docs = nlp.pipe(data['text_clean'], disable=["tagger", "parser", "ner"])

# tqdm wraps the iterator directly: it advances once per document and closes the bar
# when the iterator is exhausted. A manually updated bar that is never closed can be
# left hanging just short of 100%.
for text in tqdm.tqdm(docs, total=len(data['text_clean']), position=0, leave=True):

  # keep the lowercased lemma of every alphabetic token that is neither a stop word nor punctuation
  txt = [token.lemma_.lower() for token in text
         if token.is_alpha
         and not token.is_stop
         and not token.is_punct]

  clean_text.append(" ".join(txt))

 99%|█████████▉| 24577/24783 [00:32<00:00, 718.48it/s]

In [None]:
# write everything into one function that can be re-used later
def text_prepro(texts):
  """
  Clean and normalize raw tweets.

  Steps, in order:
    1. run the twitter preprocessor (prepro.clean) over every text
    2. drop '#' signs so hashtag words remain as plain words
    3. run the texts through spacy and keep the lowercased lemma of every
       alphabetic token that is neither a stop word nor punctuation

  Parameters
  ----------
  texts : pandas.Series or other iterable of str
      Raw texts. Anything that is not already a Series is wrapped in one,
      so a plain list of strings works as well.

  Returns
  -------
  list of str
      One cleaned string per input text (tokens joined by a single space),
      in the same order as the input.
  """
  # accept plain lists/tuples too; Series input is passed through untouched
  if not isinstance(texts, pd.Series):
    texts = pd.Series(texts, dtype=object)

  texts_clean = texts.map(prepro.clean)
  texts_clean = texts_clean.str.replace('#','')

  docs = nlp.pipe(texts_clean, disable=["tagger", "parser", "ner"])

  clean_container = []

  # letting tqdm drive the loop guarantees the bar is advanced once per text and
  # closed when the iterator is exhausted (a hand-updated bar was never closed)
  for text in tqdm.tqdm(docs, total=len(texts_clean), position=0, leave=True):

    txt = [token.lemma_.lower() for token in text
          if token.is_alpha
          and not token.is_stop
          and not token.is_punct]

    clean_container.append(" ".join(txt))

  return clean_container

In [None]:
# apply the whole prepro-pipeline to the raw tweets
# (this replaces the text_clean column that was filled step by step further up)
data['text_clean'] = text_prepro(data['tweet'])

100%|██████████| 24783/24783 [00:23<00:00, 1075.64it/s]


In [None]:
data

Unnamed: 0.1,Unnamed: 0,class,tweet,text_clean
0,0,2,!!! RT @mayasolovely: As a woman you shouldn't...,rt woman complain cleaning house amp man trash
1,1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,rt boy dats cold tyga dwn bad cuffin dat hoe s...
2,2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,rt dawg rt fuck bitch start cry confused shit
3,3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,rt look like tranny
4,4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,rt shit hear true faker bitch told ya
...,...,...,...,...
24778,24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,lie right tl trash bible scriptures
24779,24779,2,"you've gone and broke the wrong heart baby, an...",gone broke wrong heart baby drove redneck crazy
24780,24780,1,young buck wanna eat!!.. dat nigguh like I ain...,young buck wanna eat dat nigguh like ai nt fuc...
24781,24781,1,youu got wild bitches tellin you lies,youu got wild bitches tellin lies


In [None]:
# keep only the target and the cleaned text, under shorter column names

data_df = (
    data[['class', 'text_clean']]
    .rename(columns={'class': 'label', 'text_clean': 'text'})
)

In [None]:

data_df.label.value_counts().reset_index()

Unnamed: 0,index,label
0,1,19190
1,2,4163
2,0,1430


In [None]:
# Class sizes as a tidy frame with explicitly named columns. Naming them here keeps the
# chart independent of the pandas version: value_counts().reset_index() produces
# 'index'/'label' columns in pandas 1.x but 'label'/'count' in pandas 2.x.
label_counts = (
    data_df['label']
    .value_counts()
    .rename_axis('category')
    .reset_index(name='n_tweets')
)

alt.Chart(label_counts).mark_bar(filled=True).encode(
    alt.X('n_tweets:Q', title='N Tweets'),
    alt.Y('category:N', title='Category')
)

In [None]:
# fixing sample imbalance by random undersampling; the label column doubles as the
# resampling target, and random_state fixes which rows are drawn
rus = RandomUnderSampler(random_state=42)
data_df_res, y_res = rus.fit_resample(data_df, data_df['label'])

In [None]:
data_df_res['label'].value_counts()

0    1430
1    1430
2    1430
Name: label, dtype: int64

In [None]:
# Split the balanced sample: 60% of the texts for training, 40% held out for testing.
# The fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data_df_res['text'],
    y_res,
    test_size=0.4,
    random_state=42,
)

In [None]:
#instantiate models and "bundle up as pipeline"

# text -> sparse TF-IDF matrix
tfidf = TfidfVectorizer()
# linear classifier; its per-word weights are inspected with eli5 further down
cls = LogisticRegression()

# the pipeline chains both steps, so it can be fitted on and applied to cleaned strings directly
pipe = make_pipeline(tfidf, cls)

In [None]:
pipe.fit(X_train,y_train) # fit model

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('logisticregression', LogisticRegression())])

In [None]:
# evaluate model performance on training set
# (in-sample scores only: the predictions are made for the same texts the model was fitted on)

y_eval = pipe.predict(X_train)
report = classification_report(y_train, y_eval)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.87      0.90       840
           1       0.91      0.91      0.91       842
           2       0.94      0.99      0.97       892

    accuracy                           0.93      2574
   macro avg       0.93      0.93      0.93      2574
weighted avg       0.93      0.93      0.93      2574



In [None]:
# run single prediction

t1 = ['you stupid fag bitch']

In [None]:
# preprocess

t1_p = text_prepro(pd.Series(t1)) # note, we need to pack text up as pd.Series 

100%|██████████| 1/1 [00:00<00:00, 141.49it/s]


In [None]:
# predict

pipe.predict(t1_p)

array([0])

In [None]:
# overall weights (works only for linear models)
eli5.show_weights(pipe, top=10, target_names=['hate','offensive','nothing'])



Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+3.612,white,
+3.435,faggot,
+3.297,nigger,
+3.006,nigga,
+2.613,fag,
+2.494,faggots,
+2.392,fags,
+2.328,niggas,
+2.227,ass,
+2.225,niggers,

Weight?,Feature
+3.612,white
+3.435,faggot
+3.297,nigger
+3.006,nigga
+2.613,fag
+2.494,faggots
+2.392,fags
+2.328,niggas
+2.227,ass
+2.225,niggers

Weight?,Feature
+6.354,bitch
+3.785,pussy
+3.763,bitches
+3.662,hoes
+2.842,hoe
+1.791,shit
+1.596,ai
… 1789 more positive …,… 1789 more positive …
… 3847 more negative …,… 3847 more negative …
-1.262,charlie

Weight?,Feature
+2.695,charlie
+2.517,bird
… 2790 more positive …,… 2790 more positive …
… 2846 more negative …,… 2846 more negative …
-2.238,white
-2.246,nigger
-2.648,hoe
-2.808,faggot
-2.890,pussy
-3.242,hoes


In [None]:
# explain one prediction
eli5.show_prediction(pipe[1], t1_p[0], vec=pipe[0],
                     target_names=['hate','offensive','nothing'])

Contribution?,Feature
2.18,Highlighted in text (sum)
-0.175,<BIAS>

Contribution?,Feature
1.661,Highlighted in text (sum)
-0.392,<BIAS>

Contribution?,Feature
0.567,<BIAS>
-3.841,Highlighted in text (sum)


In [None]:
data['tweet'][100]

'"@ClicquotSuave: LMAOOOOOOOOOOO this nigga @Krillz_Nuh_Care http://t.co/AAnpSUjmYI" &lt;bitch want likes for some depressing shit..foh'

In [None]:
data['class'][100]

1

In [None]:
eli5.show_prediction(pipe[1], data['text_clean'][100], vec=pipe[0],
                     target_names=['hate','offensive','nothing'])



Contribution?,Feature
1.254,Highlighted in text (sum)
-0.175,<BIAS>

Contribution?,Feature
0.367,Highlighted in text (sum)
-0.392,<BIAS>

Contribution?,Feature
0.567,<BIAS>
-1.62,Highlighted in text (sum)


## Let's try a complex (black-box) model

In [None]:
#instantiate models and "bundle up as pipeline"

tfidf = TfidfVectorizer()
# reduce the sparse TF-IDF matrix to 100 dense components; seeded so the
# randomized decomposition gives the same components on every run
svd = TruncatedSVD(n_components = 100, random_state = 42)
cls_xg = XGBClassifier(random_state = 42)

# The pipeline must end in cls_xg. Passing `cls` here would reuse (and refit) the
# LogisticRegression that already sits inside `pipe`, so no XGBoost model would be
# trained and the earlier pipeline's classifier would be overwritten.
pipe_xg = make_pipeline(tfidf, svd, cls_xg)

In [None]:
pipe_xg.fit(X_train,y_train) # fit model

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('truncatedsvd', TruncatedSVD(n_components=100)),
                ('logisticregression', LogisticRegression())])

In [None]:
# evaluate model performance on training set

y_eval = pipe_xg.predict(X_train)
report = classification_report(y_train, y_eval)
print(report)

              precision    recall  f1-score   support

           0       0.82      0.65      0.72       840
           1       0.82      0.79      0.80       842
           2       0.78      0.97      0.87       892

    accuracy                           0.80      2574
   macro avg       0.81      0.80      0.80      2574
weighted avg       0.81      0.80      0.80      2574



In [None]:
# evaluate model performance on test set

y_pred = pipe_xg.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.79      0.62      0.70       590
           1       0.79      0.78      0.79       588
           2       0.74      0.94      0.83       538

    accuracy                           0.77      1716
   macro avg       0.78      0.78      0.77      1716
weighted avg       0.78      0.77      0.77      1716



In [None]:
# explain single prediction
# TextExplainer is given one cleaned text plus the pipeline's predict_proba function,
# i.e. it works from the model's predicted probabilities only
te = TextExplainer(random_state=42)
te.fit(data['text_clean'][100], pipe_xg.predict_proba)
te.show_prediction(target_names=['hate','offensive','nothing'])

Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


Contribution?,Feature
0.539,Highlighted in text (sum)
-0.385,<BIAS>

Contribution?,Feature
-0.282,Highlighted in text (sum)
-0.573,<BIAS>

Contribution?,Feature
-0.209,<BIAS>
-1.617,Highlighted in text (sum)


[K     |████████████████████████████████| 24.1 MB 1.6 MB/s 
[K     |████████████████████████████████| 1.7 MB 5.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone


In [None]:
# preprocess texts (we need tokens): one list per tweet holding the lowercased lemmas
# of its nouns, proper nouns, adjectives and adverbs, minus stop words and punctuation
content_pos = ['NOUN', 'PROPN', 'ADJ', 'ADV']

tokens = [
    [tok.lemma_.lower() for tok in doc
     if tok.pos_ in content_pos
     and not tok.is_stop
     and not tok.is_punct]
    for doc in nlp.pipe(data['text_clean'], disable=["ner"])
]

In [None]:
data['tokens'] = tokens

In [None]:
data

Unnamed: 0.1,Unnamed: 0,class,tweet,text_clean,tokens
0,0,2,!!! RT @mayasolovely: As a woman you shouldn't...,rt woman complain cleaning house amp man trash,"[rt, woman, house, amp, man, trash]"
1,1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,rt boy dats cold tyga dwn bad cuffin dat hoe s...,"[rt, boy, cold, tyga, dwn, bad, cuffin, hoe, s..."
2,2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,rt dawg rt fuck bitch start cry confused shit,"[rt, dawg, rt, fuck, bitch, confused, shit]"
3,3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,rt look like tranny,[tranny]
4,4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,rt shit hear true faker bitch told ya,"[rt, shit, true, faker, bitch, ya]"
...,...,...,...,...,...
24778,24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,lie right tl trash bible scriptures,"[right, tl, trash, bible, scripture]"
24779,24779,2,"you've gone and broke the wrong heart baby, an...",gone broke wrong heart baby drove redneck crazy,"[wrong, heart, baby, redneck, crazy]"
24780,24780,1,young buck wanna eat!!.. dat nigguh like I ain...,young buck wanna eat dat nigguh like ai nt fuc...,"[young, buck, wanna, dat, nigguh, fuckin, di]"
24781,24781,1,youu got wild bitches tellin you lies,youu got wild bitches tellin lies,"[youu, wild, bitch, tellin]"


In [None]:
# the topic model is built on the rows of class 0 only
data_hate = data.loc[data['class'].eq(0)]

In [None]:
# Create a gensim Dictionary from the token lists of the selected tweets: dictionary
dictionary = Dictionary(data_hate['tokens'])
# filter out low-frequency / high-frequency stuff (tokens in fewer than 5 tweets or in more than 50% of them), also limit the vocabulary to max 1000 words
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
# construct corpus using this dictionary: one bag-of-words representation per tweet
corpus = [dictionary.doc2bow(doc) for doc in data_hate['tokens']]

In [None]:
# Training the model
# random_state seeds LDA's random initialisation so the topics are as reproducible
# across re-runs as multi-worker training allows (without it every run starts differently)
lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=10, workers=4, passes=10, random_state=42)

In [None]:
# Prepare the visualization data for the fitted topic model (via the gensimvis alias from the import cell)
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)

In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only


In [None]:
 # Let's Visualize
pyLDAvis.display(lda_display)