In [583]:
import pandas as pd
import os

First, build a data table linking file ids to their label.

In [584]:
annotations = pd.read_csv("annotations_metadata.csv").set_index("file_id")
annotations

Unnamed: 0_level_0,user_id,subforum_id,num_contexts,label
file_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12834217_1,572066,1346,0,noHate
12834217_2,572066,1346,0,noHate
12834217_3,572066,1346,0,noHate
12834217_4,572066,1346,0,hate
12834217_5,572066,1346,0,noHate
...,...,...,...,...
33676864_5,734541,1388,0,noHate
33677019_1,735154,1388,0,noHate
33677019_2,735154,1388,0,noHate
33677053_1,572266,1388,0,hate


In [585]:
df1 = {"file_id": [], "text": []}
for path, name, filenames in os.walk("all_files"):
    for filename in filenames:
        full_name = path + "/" + filename
        file_id = filename.split(".")[0]
        df1["file_id"].append(file_id)
        with open(full_name) as f:
            text = f.read()
            df1["text"].append(text)
id_text = pd.DataFrame(df1).set_index("file_id")

In [586]:
Xy_pre = id_text.join(annotations).drop(columns=["user_id", "subforum_id", "num_contexts"])

In [587]:
tweets = pd.read_csv("labeled_data.csv")
tweets["label"] = tweets.apply(lambda row: "hate" if row["hate_speech"] > row["count"] // 2 else "noHate", axis=1)
tweets["text"] = tweets["tweet"]
tweets_dropped = tweets.drop(columns=["Unnamed: 0", "count", "hate_speech", "offensive_language", "neither", "class", "tweet"])

Xy = tweets_dropped.append(Xy_pre)

Some code borrowed from: https://thecleverprogrammer.com/2020/08/19/hate-speech-detection-model/

In [588]:
import re
def  clean_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    return df
cleaned = clean_text(Xy, "text")
cleaned = cleaned[(cleaned.label == "hate") | (cleaned.label == "noHate")]
cleaned

Unnamed: 0,label,text
0,noHate,rt as a woman you shouldnt complain about cl...
1,noHate,rt boy dats coldtyga dwn bad for cuffin dat ...
2,noHate,rt dawg rt you ever fuck a bitch and she st...
3,noHate,rt ganderson based she look like a tranny
4,noHate,rt the shit you hear about me might be true ...
...,...,...
13472256_1,noHate,also it s so sad to see so much pre made crap ...
14417873_2,noHate,cf
30597853_3,noHate,sorry that you were unaware
30626265_1,noHate,my grandmother has red hair


In [589]:
from sklearn.model_selection import train_test_split
Xy_train, Xy_test = train_test_split(cleaned, random_state = 0)
Xy_train['label'].value_counts()

noHate    24695
hate       1919
Name: label, dtype: int64

In [590]:
from sklearn.utils import resample
upsampling = True
if upsampling:
    train_hate = Xy_train[Xy_train.label=="hate"]
    train_nohate = Xy_train[Xy_train.label=="noHate"]
    train_hate_upsampled = resample(train_hate, 
                                     replace=True,    
                                     n_samples=len(train_nohate),   
                                     random_state=0)
    train_upsampled = pd.concat([train_hate_upsampled, train_nohate])
else:
    train_upsampled = Xy_train

In [591]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SGDClassifier())])

In [592]:
from sklearn.model_selection import train_test_split
X_train, y_train = train_upsampled["text"], train_upsampled["label"]
X_test, y_test = Xy_test["text"], Xy_test["label"]

In [593]:
model = pipeline_sgd.fit(X_train, y_train)
y_predict = model.predict(X_test)
print(model.score(X_test, y_test))
from sklearn.metrics import f1_score, confusion_matrix
print("f1 score", f1_score(y_test, y_predict, pos_label="noHate"))

matrix = confusion_matrix(y_test, y_predict, labels=["hate", "noHate"])
tn, fp, fn, tp = matrix.ravel()
print("False positive rate =", fp / (fp + tn))
print("False negative rate =", fn / (fn + tp))
print("True positive rate (sensitivity)=", tp / (tp + fn))
print("True negative rate (sensitivity)=", tn / (tn + fp))
# print("False negative rate =", fn / (fn + tn))
print(matrix)
# print(y_test[y_test == "hate"])

0.8586564472497745
f1 score 0.9197491360552925
False positive rate = 0.38197424892703863
False negative rate = 0.12076348953872507
True positive rate (sensitivity)= 0.8792365104612749
True negative rate (sensitivity)= 0.6180257510729614
[[ 432  267]
 [ 987 7186]]


In [594]:
def getClassification(text):
    text = text.lower()
    text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
    return model.predict(pd.Series([text]))[0]

We went tiptoeing along a path amongst the trees back towards the end of the widow’s garden, stooping down so as the branches wouldn’t scrape our heads. When we was passing by the kitchen I fell over a root and made a noise. We scrouched down and laid still. Miss Watson’s big nigger, named Jim, was setting in the kitchen door; we could see him pretty clear, because there was a light behind him. He got up and stretched his neck out about a minute, listening.

In [595]:
huckfinn = "We went tiptoeing along a path amongst the trees back towards the end of the widow’s garden, stooping down so as the branches wouldn’t scrape our heads. When we was passing by the kitchen I fell over a root and made a noise. We scrouched down and laid still. Miss Watson’s big nigger, named Jim, was setting in the kitchen door; we could see him pretty clear, because there was a light behind him. He got up and stretched his neck out about a minute, listening."
for text in huckfinn.split("."):
    val = getClassification(text)
    print(val, text)
    print()

noHate We went tiptoeing along a path amongst the trees back towards the end of the widow’s garden, stooping down so as the branches wouldn’t scrape our heads

noHate  When we was passing by the kitchen I fell over a root and made a noise

noHate  We scrouched down and laid still

noHate  Miss Watson’s big nigger, named Jim, was setting in the kitchen door; we could see him pretty clear, because there was a light behind him

noHate  He got up and stretched his neck out about a minute, listening

noHate 



In [596]:
getClassification("Niggers would come miles to hear Jim tell about it, and he was more looked up to than any nigger in that country.")

'hate'

Now, open up Huck Finn and start parsing

In [597]:
from bs4 import BeautifulSoup
import os

html = open('huckfinn.html')
soup = BeautifulSoup(html, 'html.parser')



In [598]:
def getSentenceClass(text):
    sentences = tokenize.sent_tokenize(text)
    newSentences = []
    for sentence in sentences:
        label = getClassification(sentence)
        newSentences.append((sentence.strip(), label))
    return newSentences

start = False
starttext = "You don’t know about me without you have read a book by the name of"
ctr = 0
for p in soup.find_all('p'):
    text = p.text
    if starttext in text:
        start = True
    if start:
        new_p = soup.new_tag("p")
        sentenceLabels = getSentenceClass(text)
        for sentence, label in sentenceLabels:
            span = soup.new_tag("span")
            span.string = sentence + " "
            span["class"] = label
            new_p.append(span)
        p.replace_with(new_p)
        ctr += 1

with open("modified_huck.html", "wb") as f_output:
    f_output.write(soup.prettify("utf-8"))  