## 1. DATA IMPORT
Hate speech dataset from a white supremacist forum:  https://github.com/Vicomtech/hate-speech-dataset

In [1]:
import pandas as pd
import numpy as np
import glob
import os
from tqdm.notebook import tqdm
import ntpath
import matplotlib.pyplot as plt
import nltk
import nltk.sentiment.vader as vd
import string
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import download
from langdetect import detect
import spacy
from wordcloud import WordCloud
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
    plot_confusion_matrix,
    plot_precision_recall_curve,
    plot_roc_curve,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import sklearn as sk
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import preprocessor as pproc
from cleantext import clean
from wordcloud import WordCloud

In [2]:
tqdm.pandas()

  from pandas import Panel


In [3]:
download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\marti\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
nlp = spacy.load("en_core_web_sm")

In [5]:
all_files_paths = glob.glob("hate-speech-dataset/all_files/*.txt")

In [6]:
all_files_paths = [f for f in all_files_paths if os.path.isfile(f)]

In [7]:
all_files_names = [str(ntpath.basename(f)).replace(".txt", "") for f in all_files_paths]

## 1.1 Error removal

Some sentences cannot be correctly read in Windows due to error in encoding. Since they are few, we will just remove them.

In [8]:
txt_content = {}
errors = []
for name, path in tqdm(list(zip(all_files_names, all_files_paths))):
    with open(path, "r") as txt:
        try:
            txt_content[name] = txt.readline().replace(",", "")
        except Exception as ex:
            errors.append((name, str(ex)))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10944.0), HTML(value='')))




In [9]:
errors_list = [err[0] for err in errors]

In [10]:
df = pd.DataFrame.from_dict(txt_content, orient='index').reset_index()
df.columns = ["file_id","text"]

In [11]:
ann = pd.read_csv('hate-speech-dataset/annotations_metadata.csv')

In [12]:
ann = ann[~ann['file_id'].isin(errors_list)]

In [13]:
data = pd.merge(left=ann, right=df, left_on='file_id', right_on='file_id')

In [14]:
data['label'].unique()

array(['noHate', 'hate', 'idk/skip', 'relation'], dtype=object)

In [15]:
data = data.loc[(data["label"] != "relation") & (data["label"] != "idk/skip")]

In [16]:
data.drop(columns=["file_id", "user_id", "subforum_id", "num_contexts"], inplace=True)

In [17]:
data['label'] = data.apply(lambda x: 0 if x['label'] == "noHate" else 1, axis=1)

In [18]:
data.head()

Unnamed: 0,label,text
0,0,As of March 13th 2014 the booklet had been d...
1,0,In order to help increase the booklets downloa...
2,0,( Simply copy and paste the following text int...
3,1,Click below for a FREE download of a colorfull...
4,0,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...


## 2. CLEANING

### 2.1 Removing links, tags, numbers and bias

In [19]:
def expand_contractions(text):
    cList = {
        "n't": " not",
        "/TD": " ",
        " PM ": " personal message ",
        " pm ": " personal message ",
        "PM ": "personal message ",
        " Donot ": " do not ",
        " MB ": " megabytes ",
    }
    
    c_re = re.compile("(%s)" % "|".join(cList.keys()))

    return c_re.sub(lambda match: cList[match.group(0)], text)

def full_text_clean(text):
    aa = expand_contractions(text)
    
    bb = pproc.clean(
        clean(pproc.clean(aa),
              fix_unicode=True,               # fix various unicode errors
              to_ascii=True,                  # transliterate to closest ASCII representation
              lower=True,                     # lowercase text
              no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
              no_urls=True,                  # replace all URLs with a special token
              no_emails=True,                # replace all email addresses with a special token
              no_phone_numbers=False,         # replace all phone numbers with a special token
              no_numbers=False,               # replace all numbers with a special token
              no_digits=False,                # replace all digits with a special token
              no_currency_symbols=False,      # replace all currency symbols with a special token
              no_punct=True,                 # remove punctuations
              replace_with_url=" ",
              replace_with_email=" ",
        )
    )
    
    swords = stopwords.words("english")
    swords.extend(string.punctuation)

    cc = (
        bb.lower()
        .replace(r"(@[a-z0-9]+)\w+", " ")
        .replace(r"http\S+", " ")
        .replace(r"www\S+", " ")
        .replace(r"com/watch", " ")
        .replace(r"\S*[.,:;!?-]\S*[^\s\.,:;!?-]", " ")
        .replace(r" th ", " ")
        .replace(r"\w*\d\w*", " ")
        .replace(r"rlm", " ")
        .replace(r"pttm", " ")
        .replace(r"ghlight", " ")
        .replace(r"[0-9]+(?:st| st|nd| nd|rd| rd|th| th)", " ")
        .replace(r"([^a-z \t])", " ")
        .replace(r" +", " ")
        )
    
    cc = " ".join([i for i in cc.split() if not i in swords])
    
    return cc

In [20]:
data['text_clean'] = data["text"].progress_apply(lambda x: full_text_clean(x))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10694.0), HTML(value='')))




In [21]:
data.head()

Unnamed: 0,label,text,text_clean
0,0,As of March 13th 2014 the booklet had been d...,march booklet downloaded times counting
1,0,In order to help increase the booklets downloa...,order help increase booklets downloads would g...
2,0,( Simply copy and paste the following text int...,simply copy paste following text youtube video...
3,1,Click below for a FREE download of a colorfull...,click free download colorfully illustrated pag...
4,0,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,click download megabytes green banner link


In [22]:
data['word_count_before'] = data['text'].apply(lambda x: len(x.split()))

In [23]:
data['word_count'] = data['text_clean'].apply(lambda x: len(x.split()))

In [24]:
data['word_cleaning'] = data['word_count_before'] - data['word_count']

In [25]:
data[['word_count_before','word_count','word_cleaning']].describe()

Unnamed: 0,word_count_before,word_count,word_cleaning
count,10694.0,10694.0,10694.0
mean,17.636993,8.184215,9.452777
std,13.349767,6.690266,7.413768
min,1.0,0.0,0.0
25%,9.0,4.0,5.0
50%,15.0,7.0,8.0
75%,24.0,11.0,13.0
max,343.0,141.0,202.0


In [26]:
data = data.loc[data['word_count'] > 0, ]

### 2.2 Lemmatizer and Tokenization

In [27]:
pipe = nlp.pipe(data['text_clean'], n_process=2, batch_size=2000)

In [28]:
docs = [x for x in tqdm(pipe)]

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [29]:
data['spacy_doc'] = docs

In [30]:
data['POS_spacy'] = data['spacy_doc'].progress_apply(lambda x: [(y.text, y.pos_) for y in x])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10547.0), HTML(value='')))




In [31]:
data['lemmatized'] = data['spacy_doc'].progress_apply(lambda x: " ".join([y.lemma_ for y in x]))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10547.0), HTML(value='')))




In [32]:
data['tokens'] = data['spacy_doc'].progress_apply(lambda x: [y.text for y in x])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10547.0), HTML(value='')))




In [33]:
data['language'] = data['spacy_doc'].progress_apply(lambda x: set([y.lang_ for y in x]))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10547.0), HTML(value='')))




In [34]:
length = {}
for line in data['language']:
    if len(line) in length:
        length[len(line)] += 1
    else:
        length[len(line)] = 1
length

{1: 10547}

In [35]:
data['language'] = data['language'].progress_apply(lambda x: list(x)[0])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10547.0), HTML(value='')))




In [36]:
data['language'].unique()

array(['en'], dtype=object)

## 3. POS

In [37]:
def filter_text_pos(x):
    final_pos_text = []
    for elem in x:
        for pos in pos_list:
            if elem[1] == pos:
                final_pos_text.append(elem[0])
    
    return " ".join(final_pos_text)

In [38]:
pos_list = ["NOUN"]
data["NOUN"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
data['NOUN_count'] = data['NOUN'].apply(lambda x: len(x.split()))

In [39]:
pos_list = ["PROPN"]
data["PROPN"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
data['PROPN_count'] = data['PROPN'].apply(lambda x: len(x.split()))

In [40]:
pos_list = ["VERB"]
data["VERB"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
data['VERB_count'] = data['VERB'].apply(lambda x: len(x.split()))

In [41]:
pos_list = ["ADJ"]
data["ADJ"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
data['ADJ_count'] = data['ADJ'].apply(lambda x: len(x.split()))

In [42]:
pos_list = ["ADV"]
data["ADV"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
data['ADV_count'] = data['ADV'].apply(lambda x: len(x.split()))

In [43]:
pos_list = ["PRON"]
data["PRON"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
data['PRON_count'] = data['PRON'].apply(lambda x: len(x.split()))

In [44]:
pos_list = ["SCONJ"]
data["SCONJ"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
data['SCONJ_count'] = data['SCONJ'].apply(lambda x: len(x.split()))

In [45]:
pos_list = ["INTJ"]
data["INTJ"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
data['INTJ_count'] = data['SCONJ'].apply(lambda x: len(x.split()))

In [46]:
data.head()

Unnamed: 0,label,text,text_clean,word_count_before,word_count,word_cleaning,spacy_doc,POS_spacy,lemmatized,tokens,...,ADJ,ADJ_count,ADV,ADV_count,PRON,PRON_count,SCONJ,SCONJ_count,INTJ,INTJ_count
0,0,As of March 13th 2014 the booklet had been d...,march booklet downloaded times counting,16,5,11,"(march, booklet, downloaded, times, counting)","[(march, PROPN), (booklet, PROPN), (downloaded...",march booklet download time count,"[march, booklet, downloaded, times, counting]",...,,0,,0,,0,,0,,0
1,0,In order to help increase the booklets downloa...,order help increase booklets downloads would g...,34,19,15,"(order, help, increase, booklets, downloads, w...","[(order, NOUN), (help, VERB), (increase, VERB)...",order help increase booklet download would gre...,"[order, help, increase, booklets, downloads, w...",...,great youtube,2,,0,,0,,0,,0
2,0,( Simply copy and paste the following text int...,simply copy paste following text youtube video...,15,9,6,"(simply, copy, paste, following, text, youtube...","[(simply, ADV), (copy, VERB), (paste, NOUN), (...",simply copy paste follow text youtube videos d...,"[simply, copy, paste, following, text, youtube...",...,,0,simply,1,,0,,0,,0
3,1,Click below for a FREE download of a colorfull...,click free download colorfully illustrated pag...,22,12,10,"(click, free, download, colorfully, illustrate...","[(click, VERB), (free, ADJ), (download, NOUN),...",click free download colorfully illustrate page...,"[click, free, download, colorfully, illustrate...",...,free intentional western,3,colorfully,1,,0,,0,,0
4,0,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,click download megabytes green banner link,14,6,8,"(click, download, megabytes, green, banner, link)","[(click, VERB), (download, PROPN), (megabytes,...",click download megabyte green banner link,"[click, download, megabytes, green, banner, link]",...,green,1,,0,,0,,0,,0


In [47]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10547 entries, 0 to 10925
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   label              10547 non-null  int64 
 1   text               10547 non-null  object
 2   text_clean         10547 non-null  object
 3   word_count_before  10547 non-null  int64 
 4   word_count         10547 non-null  int64 
 5   word_cleaning      10547 non-null  int64 
 6   spacy_doc          10547 non-null  object
 7   POS_spacy          10547 non-null  object
 8   lemmatized         10547 non-null  object
 9   tokens             10547 non-null  object
 10  language           10547 non-null  object
 11  NOUN               10547 non-null  object
 12  NOUN_count         10547 non-null  int64 
 13  PROPN              10547 non-null  object
 14  PROPN_count        10547 non-null  int64 
 15  VERB               10547 non-null  object
 16  VERB_count         10547 non-null  int64

In [48]:
data.drop(columns=["spacy_doc"], inplace=True)

In [49]:
data = data[["label", "text", "text_clean", "POS_spacy", "lemmatized", "tokens", "language", "word_count_before","word_count", "word_cleaning","NOUN", "NOUN_count", "PROPN", "PROPN_count", "VERB", "VERB_count", "ADJ", "ADJ_count", "ADV", "ADV_count", "PRON", "PRON_count", "SCONJ", "SCONJ_count", "INTJ", "INTJ_count"]]

In [50]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10547 entries, 0 to 10925
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   label              10547 non-null  int64 
 1   text               10547 non-null  object
 2   text_clean         10547 non-null  object
 3   POS_spacy          10547 non-null  object
 4   lemmatized         10547 non-null  object
 5   tokens             10547 non-null  object
 6   language           10547 non-null  object
 7   word_count_before  10547 non-null  int64 
 8   word_count         10547 non-null  int64 
 9   word_cleaning      10547 non-null  int64 
 10  NOUN               10547 non-null  object
 11  NOUN_count         10547 non-null  int64 
 12  PROPN              10547 non-null  object
 13  PROPN_count        10547 non-null  int64 
 14  VERB               10547 non-null  object
 15  VERB_count         10547 non-null  int64 
 16  ADJ                10547 non-null  objec

In [51]:
with open("data.pkl", "wb") as f:
    pickle.dump(data, f)