## 1. DATA IMPORT
Hate speech dataset from a white supremacist forum:  https://github.com/Vicomtech/hate-speech-dataset

In [1]:
import pandas as pd
import numpy as np
import glob
import os
from tqdm.notebook import tqdm
import ntpath
import matplotlib.pyplot as plt
import nltk
import nltk.sentiment.vader as vd
import string
import re
import pickle
from nltk import download
import spacy
from collections import Counter
import preprocessor as pproc
from cleantext import clean
import dataframe_image as dfi

In [2]:
tqdm.pandas()

  from pandas import Panel


In [3]:
download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\marti\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
nlp = spacy.load("en_core_web_sm")

In [5]:
all_files_paths = glob.glob("hate-speech-dataset/all_files/*.txt")

In [6]:
all_files_paths = [f for f in all_files_paths if os.path.isfile(f)]

In [7]:
all_files_names = [str(ntpath.basename(f)).replace(".txt", "") for f in all_files_paths]

## 1.1 Error removal

Some sentences cannot be correctly read in Windows due to error in encoding. Since they are few, we will just remove them.

In [8]:
txt_content = {}
errors = []
for name, path in tqdm(list(zip(all_files_names, all_files_paths))):
    with open(path, "r") as txt:
        try:
            txt_content[name] = txt.readline().replace(",", "")
        except Exception as ex:
            errors.append((name, str(ex)))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10944.0), HTML(value='')))




In [9]:
errors_list = [err[0] for err in errors]

In [10]:
df = pd.DataFrame.from_dict(txt_content, orient='index').reset_index()
df.columns = ["file_id","text"]

In [11]:
ann = pd.read_csv('hate-speech-dataset/annotations_metadata.csv')

In [12]:
ann = ann[~ann['file_id'].isin(errors_list)]

In [13]:
data = pd.merge(left=ann, right=df, left_on='file_id', right_on='file_id')

In [14]:
data['label'].unique()

array(['noHate', 'hate', 'idk/skip', 'relation'], dtype=object)

In [15]:
data = data.loc[(data["label"] != "relation") & (data["label"] != "idk/skip")]

In [16]:
data.drop(columns=["file_id", "user_id", "subforum_id", "num_contexts"], inplace=True)

In [17]:
data['label'] = data.apply(lambda x: 0 if x['label'] == "noHate" else 1, axis=1)

In [18]:
data.head()

Unnamed: 0,label,text
0,0,As of March 13th 2014 the booklet had been d...
1,0,In order to help increase the booklets downloa...
2,0,( Simply copy and paste the following text int...
3,1,Click below for a FREE download of a colorfull...
4,0,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...


## 2. CLEANING

### 2.1 Removing links, tags, numbers and bias

In [19]:
def expand_contractions(text):
    cList = {
        "n't": " not",
        "/TD": " ",
        " PM ": " personal message ",
        " pm ": " personal message ",
        "PM ": "personal message ",
        " Donot ": " do not ",
        " MB ": " megabytes ",
        "I'm" : "I am",
        " 've " : " have ",
        " 're " : " are ",
        " 'll " : " will "
    }
    
    c_re = re.compile("(%s)" % "|".join(cList.keys()))

    return c_re.sub(lambda match: cList[match.group(0)], text)

def full_text_clean(text):
    aa = expand_contractions(text)
    
    bb = pproc.clean(
        clean(pproc.clean(aa),
              fix_unicode=True,               # fix various unicode errors
              to_ascii=True,                  # transliterate to closest ASCII representation
              lower=True,                     # lowercase text
              no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
              no_urls=True,                  # replace all URLs with a special token
              no_emails=True,                # replace all email addresses with a special token
              no_phone_numbers=False,         # replace all phone numbers with a special token
              no_numbers=False,               # replace all numbers with a special token
              no_digits=False,                # replace all digits with a special token
              no_currency_symbols=False,      # replace all currency symbols with a special token
              no_punct=True,                 # remove punctuations
              replace_with_url=" ",
              replace_with_email=" ",
        )
    )
    
    swords = string.punctuation

    cc = (
        bb.lower()
        .replace(r"(@[a-z0-9]+)\w+", " ")
        .replace(r"www\S+", " ")
        .replace(r"com/watch", " ")
        .replace(r"\S*[.,:;!?-]\S*[^\s\.,:;!?-]", " ")
        .replace(r" th ", " ")
        .replace(r"\w*\d\w*", " ")
        .replace(r"rlm", " ")
        .replace(r"pttm", " ")
        .replace(r"ghlight", " ")
        .replace(r"[0-9]+(?:st| st|nd| nd|rd| rd|th| th)", " ")
        .replace(r"([^a-z \t])", " ")
        .replace(r" +", " ")
        .replace(r"http", " ")
        )
    
    cc = " ".join([i for i in cc.split() if not i in swords and len(i) >1 ])
    
    return cc

In [20]:
data['text_clean'] = data["text"].progress_apply(lambda x: full_text_clean(x))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10694.0), HTML(value='')))




In [21]:
data.head()

Unnamed: 0,label,text,text_clean
0,0,As of March 13th 2014 the booklet had been d...,as of march the booklet had been downloaded ov...
1,0,In order to help increase the booklets downloa...,in order to help increase the booklets downloa...
2,0,( Simply copy and paste the following text int...,simply copy and paste the following text into ...
3,1,Click below for a FREE download of a colorfull...,click below for free download of colorfully il...
4,0,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,click on the download megabytes green banner link


In [22]:
data['word_count_before'] = data['text'].apply(lambda x: len(x.split()))

In [23]:
data['word_count'] = data['text_clean'].apply(lambda x: len(x.split()))

In [24]:
data['word_cleaning'] = data['word_count_before'] - data['word_count']

In [25]:
word_count_df = data[['word_count_before','word_count','word_cleaning']].describe()
word_count_df

Unnamed: 0,word_count_before,word_count,word_cleaning
count,10694.0,10694.0,10694.0
mean,17.636993,14.966336,2.670656
std,13.349767,11.492576,3.587754
min,1.0,0.0,-1.0
25%,9.0,7.0,1.0
50%,15.0,13.0,2.0
75%,24.0,21.0,3.0
max,343.0,286.0,108.0


In [26]:
df_styled = word_count_df
dfi.export(df_styled,"images_d\word_count_df.png")

In [27]:
data = data.loc[data['word_count'] > 0, ]

### 2.2 Lemmatizer and Tokenization

In [28]:
pipe = nlp.pipe(data['text_clean'], n_process=2, batch_size=2000)

In [29]:
docs = [x for x in tqdm(pipe)]

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [30]:
data['spacy_doc'] = docs

In [31]:
data['POS_spacy'] = data['spacy_doc'].progress_apply(lambda x: [(y.text, y.pos_) for y in x])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10570.0), HTML(value='')))




In [32]:
data['lemmatized'] = data['spacy_doc'].progress_apply(lambda x: " ".join([y.lemma_ for y in x if len(x)>1]))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10570.0), HTML(value='')))




In [33]:
data['tokens'] = data['spacy_doc'].progress_apply(lambda x: [y.text for y in x])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10570.0), HTML(value='')))




In [34]:
data['language'] = data['spacy_doc'].progress_apply(lambda x: set([y.lang_ for y in x]))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10570.0), HTML(value='')))




In [35]:
length = {}
for line in data['language']:
    if len(line) in length:
        length[len(line)] += 1
    else:
        length[len(line)] = 1
length

{1: 10570}

In [36]:
data['language'] = data['language'].progress_apply(lambda x: list(x)[0])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10570.0), HTML(value='')))




In [37]:
data['language'].unique()

array(['en'], dtype=object)

## 3. POS

In [38]:
def filter_text_pos(x):
    final_pos_text = []
    for elem in x:
        for pos in pos_list:
            if elem[1] == pos:
                final_pos_text.append(elem[0])
    
    return " ".join(final_pos_text)

In [39]:
pos_list = ["NOUN"]
data["NOUN"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
data['NOUN_count'] = data['NOUN'].apply(lambda x: len(x.split()))

In [40]:
pos_list = ["PROPN"]
data["PROPN"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
data['PROPN_count'] = data['PROPN'].apply(lambda x: len(x.split()))

In [41]:
pos_list = ["VERB"]
data["VERB"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
data['VERB_count'] = data['VERB'].apply(lambda x: len(x.split()))

In [42]:
pos_list = ["ADJ"]
data["ADJ"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
data['ADJ_count'] = data['ADJ'].apply(lambda x: len(x.split()))

In [43]:
pos_list = ["ADV"]
data["ADV"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
data['ADV_count'] = data['ADV'].apply(lambda x: len(x.split()))

In [44]:
pos_list = ["PRON"]
data["PRON"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
data['PRON_count'] = data['PRON'].apply(lambda x: len(x.split()))

In [45]:
pos_list = ["SCONJ"]
data["SCONJ"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
data['SCONJ_count'] = data['SCONJ'].apply(lambda x: len(x.split()))

In [46]:
pos_list = ["INTJ"]
data["INTJ"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
data['INTJ_count'] = data['SCONJ'].apply(lambda x: len(x.split()))

In [47]:
data.head()

Unnamed: 0,label,text,text_clean,word_count_before,word_count,word_cleaning,spacy_doc,POS_spacy,lemmatized,tokens,...,ADJ,ADJ_count,ADV,ADV_count,PRON,PRON_count,SCONJ,SCONJ_count,INTJ,INTJ_count
0,0,As of March 13th 2014 the booklet had been d...,as of march the booklet had been downloaded ov...,16,12,4,"(as, of, march, the, booklet, had, been, downl...","[(as, SCONJ), (of, ADP), (march, PROPN), (the,...",as of march the booklet have be download over ...,"[as, of, march, the, booklet, had, been, downl...",...,,0,,0,,0,as,1,,1
1,0,In order to help increase the booklets downloa...,in order to help increase the booklets downloa...,34,33,1,"(in, order, to, help, increase, the, booklets,...","[(in, ADP), (order, NOUN), (to, PART), (help, ...",in order to help increase the booklet download...,"[in, order, to, help, increase, the, booklets,...",...,great uploaded,2,,0,it who,2,if,1,,1
2,0,( Simply copy and paste the following text int...,simply copy and paste the following text into ...,15,13,2,"(simply, copy, and, paste, the, following, tex...","[(simply, ADV), (copy, VERB), (and, CCONJ), (p...",simply copy and paste the follow text into -PR...,"[simply, copy, and, paste, the, following, tex...",...,youtube,1,simply,1,,0,,0,,0
3,1,Click below for a FREE download of a colorfull...,click below for free download of colorfully il...,22,18,4,"(click, below, for, free, download, of, colorf...","[(click, VERB), (below, ADV), (for, ADP), (fre...",click below for free download of colorfully il...,"[click, below, for, free, download, of, colorf...",...,free zionistengineered intentional western,4,below colorfully,2,,0,,0,,0
4,0,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,click on the download megabytes green banner link,14,8,6,"(click, on, the, download, megabytes, green, b...","[(click, VERB), (on, ADP), (the, DET), (downlo...",click on the download megabyte green banner link,"[click, on, the, download, megabytes, green, b...",...,green,1,,0,,0,,0,,0


In [48]:
data.drop(columns=["spacy_doc"], inplace=True)

In [49]:
data = data[["label", "text", "text_clean", "POS_spacy", "lemmatized", "tokens", "language", "word_count_before","word_count", "word_cleaning","NOUN", "NOUN_count", "PROPN", "PROPN_count", "VERB", "VERB_count", "ADJ", "ADJ_count", "ADV", "ADV_count", "PRON", "PRON_count", "SCONJ", "SCONJ_count", "INTJ", "INTJ_count"]]

In [50]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10570 entries, 0 to 10925
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   label              10570 non-null  int64 
 1   text               10570 non-null  object
 2   text_clean         10570 non-null  object
 3   POS_spacy          10570 non-null  object
 4   lemmatized         10570 non-null  object
 5   tokens             10570 non-null  object
 6   language           10570 non-null  object
 7   word_count_before  10570 non-null  int64 
 8   word_count         10570 non-null  int64 
 9   word_cleaning      10570 non-null  int64 
 10  NOUN               10570 non-null  object
 11  NOUN_count         10570 non-null  int64 
 12  PROPN              10570 non-null  object
 13  PROPN_count        10570 non-null  int64 
 14  VERB               10570 non-null  object
 15  VERB_count         10570 non-null  int64 
 16  ADJ                10570 non-null  objec

In [51]:
with open("serialized/data.pkl", "wb") as f:
    pickle.dump(data, f)

In [52]:
pd.set_option('display.max_colwidth', None)
data[data['text'].str.contains('I hate to see such beautiful white ')]

Unnamed: 0,label,text,text_clean,POS_spacy,lemmatized,tokens,language,word_count_before,word_count,word_cleaning,...,ADJ,ADJ_count,ADV,ADV_count,PRON,PRON_count,SCONJ,SCONJ_count,INTJ,INTJ_count
187,1,I hate to see such beautiful white females become victims of these sick violent merciless animals .,hate to see such beautiful white females become victims of these sick violent merciless animals,"[(hate, VERB), (to, PART), (see, VERB), (such, ADJ), (beautiful, ADJ), (white, ADJ), (females, NOUN), (become, VERB), (victims, NOUN), (of, ADP), (these, DET), (sick, ADJ), (violent, ADJ), (merciless, NOUN), (animals, NOUN)]",hate to see such beautiful white female become victim of these sick violent merciless animal,"[hate, to, see, such, beautiful, white, females, become, victims, of, these, sick, violent, merciless, animals]",en,17,15,2,...,such beautiful white sick violent,5,,0,,0,,0,,0
