## 1. DATA IMPORT
Source: the hate speech dataset collected from a white supremacist forum, available at https://github.com/Vicomtech/hate-speech-dataset

In [1]:
import pandas as pd
import numpy as np
import glob
import os
from tqdm.notebook import tqdm
import ntpath
import matplotlib.pyplot as plt
import nltk
import nltk.sentiment.vader as vd
import string
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import download
from langdetect import detect
import spacy
from wordcloud import WordCloud
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
    plot_confusion_matrix,
    plot_precision_recall_curve,
    plot_roc_curve,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import sklearn as sk
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import preprocessor as pproc
from cleantext import clean
from wordcloud import WordCloud

In [2]:
# Register tqdm with pandas; the `progress_apply` calls in later cells rely on this.
tqdm.pandas()

  from pandas import Panel


In [3]:
# Fetch the VADER lexicon through nltk's downloader (no-op when already present).
download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\marti\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
# spaCy pipeline used below (via `nlp.pipe`) to tokenize, tag and lemmatize the cleaned posts.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Every .txt entry in the dataset's all_files folder (path is relative to the working directory).
all_files_paths = glob.glob("hate-speech-dataset/all_files/*.txt")

In [6]:
# Discard anything the glob matched that is not a regular file.
all_files_paths = list(filter(os.path.isfile, all_files_paths))

In [7]:
# File ids are the base names with the ".txt" text removed.
all_files_names = [
    ntpath.basename(file_path).replace(".txt", "")
    for file_path in all_files_paths
]

## 1.1 Error removal

In [None]:
# Read the first line of every post file into `txt_content`, keyed by file id,
# with commas stripped from the text. Files that cannot be opened or decoded
# are recorded in `errors` as (file id, message) instead of aborting the loop.
txt_content = {}
errors = []
for name, path in tqdm(list(zip(all_files_names, all_files_paths))):
    try:
        # Explicit encoding: without it open() uses the platform default
        # (cp1252 on Windows), which either fails on multi-byte characters or
        # silently decodes them to mojibake.
        # NOTE(review): assumes the dataset files are UTF-8 -- confirm.
        with open(path, "r", encoding="utf-8") as txt:
            txt_content[name] = txt.readline().replace(",", "")
    except (OSError, UnicodeError) as ex:
        errors.append((name, str(ex)))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10944.0), HTML(value='')))

In [None]:
# Ids of the files that could not be read (first field of each error record).
errors_list = [failed_name for failed_name, _message in errors]

In [None]:
# Turn the {file id: text} mapping into a two-column frame: the dict keys
# become the index, reset_index() moves them into a regular column, and the
# two resulting columns are then renamed.
df = pd.DataFrame.from_dict(txt_content, orient='index').reset_index()
df.columns = ["file_id","text"]

In [None]:
# Annotation metadata shipped with the dataset; later cells use its
# file_id, user_id, subforum_id, num_contexts and label columns.
ann = pd.read_csv('hate-speech-dataset/annotations_metadata.csv')

In [None]:
# Drop the annotation rows of files that failed to load.
ann = ann[~ann['file_id'].isin(errors_list)]

In [None]:
# Inner-join the annotations with the post texts on their shared file id.
data = ann.merge(df, on="file_id")
data.head()

In [None]:
# Inspect which label values occur before filtering them.
data['label'].unique()

In [None]:
# Keep only the rows whose label is neither "relation" nor "idk/skip".
# .copy() makes the result an independent frame: the following cells drop
# columns in place and assign to 'label', which on a filtered selection
# triggers pandas' SettingWithCopyWarning.
data = data.loc[(data["label"] != "relation") & (data["label"] != "idk/skip")].copy()

In [None]:
# Remove the identifier/metadata columns in place; they are not used in the cells that follow.
data.drop(columns=["file_id", "user_id", "subforum_id", "num_contexts"], inplace=True)

In [None]:
# Binary target: 0 for "noHate", 1 for every other remaining label.
data['label'] = data['label'].map(lambda tag: 0 if tag == "noHate" else 1)

## 2. CLEANING

### 2.1 Removing links, tags, numbers and bias

In [None]:
# Literal snippets rewritten before the library cleaners run.
_CONTRACTIONS = {
    "n't": " not",
    "/TD": " ",
    " PM ": " personal message ",
    " pm ": " personal message ",
    "PM ": "personal message ",
    " Donot ": " do not ",
    " MB ": " megabytes ",
}

# Keys are escaped so they always match literally; alternation order follows
# the dict order, so " PM " is tried before the shorter "PM ".
_CONTRACTIONS_RE = re.compile("|".join(re.escape(key) for key in _CONTRACTIONS))

# Ordered (compiled pattern, replacement) clean-up steps, applied one after
# the other to the lower-cased text.
_CLEANUP_STEPS = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"(@[a-z0-9]+)\w+", " "),
        (r"http\S+", ""),
        (r"www\S+", " "),
        (r"com/watch", " "),
        (r"\S*[.,:;!?-]\S*[^\s\.,:;!?-]", " "),
        # Applied twice on purpose: adjacent " th th " share a space, so one
        # pass leaves the second occurrence behind.
        (r" th ", " "),
        (r" th ", " "),
        (r"\w*\d\w*", " "),
        (r"rlm", " "),
        (r"pttm", " "),
        (r"ghlight", " "),
        (r"[0-9]+(?:st| st|nd| nd|rd| rd|th| th)", ""),
        (r"([^a-z \t])", " "),
        (r" +", " "),
    )
]

# Filled on first use so the stop-word corpus is read once, not once per text.
_STOPWORDS = None


def _stopword_set():
    """Return the cached set of English stop words plus punctuation characters."""
    global _STOPWORDS
    if _STOPWORDS is None:
        _STOPWORDS = set().union(stopwords.words("english"), string.punctuation)
    return _STOPWORDS


def _expand_contractions(text):
    """Replace every key of ``_CONTRACTIONS`` found in *text* by its expansion."""
    return _CONTRACTIONS_RE.sub(lambda match: _CONTRACTIONS[match.group(0)], text)


def full_text_clean(text):
    """Normalize one raw post into a space-separated string of kept words.

    Steps, in order:

    1. expand the snippets listed in ``_CONTRACTIONS``;
    2. run ``pproc.clean`` -> ``cleantext.clean`` -> ``pproc.clean`` with the
       options below (lower-casing, ASCII transliteration, URL / e-mail /
       phone / number / digit / currency / punctuation handling);
    3. apply the regular-expression steps of ``_CLEANUP_STEPS``;
    4. drop English stop words and punctuation tokens.

    The patterns in step 3 are applied with ``re.sub``. They were previously
    passed to ``str.replace``, which matches its argument literally, so every
    pattern containing regex syntax never matched anything.

    Args:
        text: the raw post text.

    Returns:
        The cleaned text; an empty string when no word survives.
    """
    expanded = _expand_contractions(text)

    library_cleaned = pproc.clean(
        clean(
            pproc.clean(expanded),
            fix_unicode=True,  # fix various unicode errors
            to_ascii=True,  # transliterate to closest ASCII representation
            lower=True,  # lowercase text
            no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
            no_urls=True,  # replace all URLs with a special token
            no_emails=True,  # replace all email addresses with a special token
            no_phone_numbers=True,  # replace all phone numbers with a special token
            no_numbers=True,  # replace all numbers with a special token
            no_digits=True,  # replace all digits with a special token
            no_currency_symbols=True,  # replace all currency symbols with a special token
            no_punct=True,
        )
    )

    scrubbed = library_cleaned.lower()
    for pattern, replacement in _CLEANUP_STEPS:
        scrubbed = pattern.sub(replacement, scrubbed)

    swords = _stopword_set()
    return " ".join(word for word in scrubbed.split() if word not in swords)

In [None]:
# Clean every raw post; the result stays aligned with the frame's rows.
data['text_clean'] = data['text'].map(full_text_clean)

In [None]:
# Preview the raw text next to its cleaned version.
data.head()

In [None]:
# Whitespace-separated word count of the raw text.
data['word_count_before'] = data['text'].map(lambda raw: len(raw.split()))

In [None]:
# Whitespace-separated word count of the cleaned text.
data['word_count'] = data['text_clean'].map(lambda cleaned: len(cleaned.split()))

In [None]:
# Number of words removed by the cleaning step.
data['word_cleaning'] = data['word_count_before'].sub(data['word_count'])

In [None]:
# Summary statistics of the word counts before/after cleaning and of their difference.
data[['word_count_before','word_count','word_cleaning']].describe()

In [None]:
# Drop posts that are empty after cleaning. .copy() gives an independent
# frame, so the column assignments in the following cells do not raise
# pandas' SettingWithCopyWarning on a filtered selection.
data = data.loc[data['word_count'] > 0].copy()

### 2.2 Lemmatization and Tokenization

In [None]:
# Run the spaCy pipeline over the cleaned texts in batches of 2000 using two
# worker processes; the resulting docs are collected in the next cell.
docs = nlp.pipe(data['text_clean'], n_process=2, batch_size=2000)

In [None]:
# Materialize the processed docs (with a progress bar) into a column.
data['spacy_doc'] = list(tqdm(docs))

In [None]:
# One (token text, coarse POS tag) pair per token of each doc.
data['POS_spacy'] = data['spacy_doc'].progress_apply(
    lambda doc: [(token.text, token.pos_) for token in doc]
)

In [None]:
# Space-joined lemmas of each doc.
data['lemmatized'] = data['spacy_doc'].progress_apply(
    lambda doc: " ".join(token.lemma_ for token in doc)
)

In [None]:
# Plain token texts of each doc, as a list.
data['tokens'] = data['spacy_doc'].progress_apply(
    lambda doc: [token.text for token in doc]
)

In [None]:
# Set of `lang_` values found among each doc's tokens.
# NOTE(review): Token.lang_ presumably reports the language of the loaded
# pipeline rather than a per-text detection -- confirm before relying on it.
data['language'] = data['spacy_doc'].progress_apply(
    lambda doc: {token.lang_ for token in doc}
)

In [None]:
# Histogram: how many posts have 1, 2, ... distinct language codes.
length = {}
for langs in data['language']:
    n_langs = len(langs)
    length[n_langs] = length.get(n_langs, 0) + 1
length

In [None]:
# Collapse each language set to a single code (its first element when iterated).
data['language'] = data['language'].progress_apply(lambda langs: tuple(langs)[0])

In [None]:
# List the distinct language codes left after collapsing.
data['language'].unique()

## 3. Part-of-speech (POS) features

In [None]:
def filter_text_pos(x, pos_tags=None):
    """Join the words of *x* whose POS tag is one of the wanted tags.

    Args:
        x: iterable of ``(text, pos)`` pairs, e.g. one cell of ``POS_spacy``.
        pos_tags: iterable of POS tags to keep. When omitted, the
            module-level ``pos_list`` is used, which is how the existing
            cells call this function.

    Returns:
        The kept words in their original order, separated by single spaces;
        an empty string when nothing matches. A word is emitted once even if
        a tag is listed more than once.
    """
    # Passing the tags explicitly avoids depending on whichever value the
    # global `pos_list` last received; the global remains the default.
    wanted = frozenset(pos_list if pos_tags is None else pos_tags)
    return " ".join(elem[0] for elem in x if elem[1] in wanted)

In [None]:
# Words tagged NOUN in each post, and how many there are.
pos_list = ["NOUN"]
data["NOUN"] = data["POS_spacy"].apply(filter_text_pos)
data['NOUN_count'] = data['NOUN'].map(lambda kept: len(kept.split()))

In [None]:
# Words tagged PROPN in each post, and how many there are.
pos_list = ["PROPN"]
data["PROPN"] = data["POS_spacy"].apply(filter_text_pos)
data['PROPN_count'] = data['PROPN'].map(lambda kept: len(kept.split()))

In [None]:
# Words tagged VERB in each post, and how many there are.
pos_list = ["VERB"]
data["VERB"] = data["POS_spacy"].apply(filter_text_pos)
data['VERB_count'] = data['VERB'].map(lambda kept: len(kept.split()))

In [None]:
# Words tagged ADJ in each post, and how many there are.
pos_list = ["ADJ"]
data["ADJ"] = data["POS_spacy"].apply(filter_text_pos)
data['ADJ_count'] = data['ADJ'].map(lambda kept: len(kept.split()))

In [None]:
# Words tagged ADV in each post, and how many there are.
pos_list = ["ADV"]
data["ADV"] = data["POS_spacy"].apply(filter_text_pos)
data['ADV_count'] = data['ADV'].map(lambda kept: len(kept.split()))

In [None]:
# Words tagged PRON in each post, and how many there are.
pos_list = ["PRON"]
data["PRON"] = data["POS_spacy"].apply(filter_text_pos)
data['PRON_count'] = data['PRON'].map(lambda kept: len(kept.split()))

In [None]:
# Words tagged SCONJ in each post, and how many there are.
pos_list = ["SCONJ"]
data["SCONJ"] = data["POS_spacy"].apply(filter_text_pos)
data['SCONJ_count'] = data['SCONJ'].map(lambda kept: len(kept.split()))

In [None]:
# Words tagged INTJ in each post, and how many there are.
pos_list = ["INTJ"]
data["INTJ"] = data.apply(lambda x: filter_text_pos(x["POS_spacy"]), axis=1)
# Count the INTJ column itself; this previously counted the SCONJ column
# (copy-paste slip), so INTJ_count merely duplicated SCONJ_count.
data['INTJ_count'] = data['INTJ'].apply(lambda x: len(x.split()))

In [None]:
# Preview the frame with the per-POS text and count columns added.
data.head()

In [None]:
# Persist the full frame for the later stages of the analysis.
# NOTE(review): the frame still contains the `spacy_doc` column of spaCy Doc
# objects at this point, which presumably makes the pickle large -- consider
# dropping it before dumping.
with open("data.pkl", "wb") as f:
    pickle.dump(data, f)