# **Preprocessing**

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Load the labelled corpus and pull out the raw texts and their target labels.
data = pd.read_csv('updated_hate_speech2.csv', engine='python')
X, y = data['Content'].values, data['Label'].values

# Hold out 20% of the rows as the test set. The remaining 80% is split again
# into training and validation sets in the neural-network sections below.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# **Naive Bayes**

We chose Multinomial Naive Bayes because it is well suited to discrete text features such as word counts or term frequencies; here it is fed the TF-IDF weights computed below.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [None]:
# Represent each document as a TF-IDF weighted vector, ignoring English stop
# words. The vectorizer is fitted on the training texts only; the test texts
# are merely transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer(stop_words='english')
nb_X_train = vectorizer.fit_transform(raw_documents=train_X)
nb_X_test = vectorizer.transform(raw_documents=test_X)

In [None]:
# Train a multinomial Naive Bayes classifier (default hyper-parameters) on the
# TF-IDF training matrix.
naive = MultinomialNB()
naive.fit(X=nb_X_train, y=train_y)

In [None]:
# Evaluate the Naive Bayes model on the held-out test split.
# MultinomialNB.predict() already returns class labels (not probabilities), so
# the predictions are scored as-is; the 0.5 threshold used for the sigmoid
# outputs of the neural networks does not apply here.
y_pred = naive.predict(nb_X_test)

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % metric(test_y, y_pred))

Accuracy: 0.800
Precision: 0.771
Recall: 0.780
F1: 0.776


# **CNN**

In [None]:
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, Flatten
from keras.utils import pad_sequences
from keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, Dropout, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from keras.callbacks import EarlyStopping

In [None]:
# Carve the validation set out of the 80% training portion (60/20/20 overall).
# The training portion is rebuilt from the full X / y with the same seed as the
# initial 80/20 split, so this cell gives the same result no matter how often
# it is run, instead of re-splitting (and shrinking) an already-split train_X.
full_split = train_test_split(X, y, test_size=0.2, random_state=42)
full_train_X, full_train_y = full_split[0], full_split[2]
train_X, val_X, train_y, val_y = train_test_split(
    full_train_X, full_train_y, test_size=0.25, random_state=42
)

In [None]:
# Parse the pre-trained 100-dimensional GloVe file into a {word: vector}
# lookup. Each line holds a token followed by its whitespace-separated
# coefficients.
word_embeddings = {}
with open('glove.6B.100d.txt', 'r', encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        word_embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [None]:
# Build the vocabulary (word -> integer index) from the training texts only.
# No vocabulary cap is applied; num_words=500 is one option that could be tried.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_X)

# Encode every split as sequences of integer word indices with that vocabulary.
cnn_X_train, cnn_X_val, cnn_X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (train_X, val_X, test_X)
)

In [None]:
# Bring every sequence to exactly `maxlen` indices so the batches have a fixed
# shape; shorter sequences are padded at the end ('post').
maxlen = 100
cnn_X_train, cnn_X_val, cnn_X_test = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (cnn_X_train, cnn_X_val, cnn_X_test)
)

In [None]:
# Build the weight matrix for the embedding layer: row i holds the vector of
# the word whose tokenizer index is i. The "+ 1" leaves room for index 0, which
# the Keras Tokenizer does not assign to any word.
word_index = tokenizer.word_index
embedding_dim = 100
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

# Seeded generator so the random vectors given to words missing from GloVe are
# identical on every run (the global NumPy random state is never seeded here).
oov_rng = np.random.default_rng(42)

for word, i in word_index.items():
    embedding_vector = word_embeddings.get(word)
    if embedding_vector is None:
        # Word is not in the pre-trained embeddings: use a random vector
        embedding_vector = oov_rng.normal(scale=0.6, size=(embedding_dim,))
    embedding_matrix[i] = embedding_vector

In [None]:
# 1-D CNN text classifier: frozen pre-trained embeddings -> one convolution +
# max-pooling stage -> dense hidden layer with dropout -> sigmoid output.
cnn = Sequential([
    Embedding(
        input_dim=len(word_index) + 1,
        output_dim=100,
        input_length=maxlen,
        weights=[embedding_matrix],
        trainable=False,  # keep the pre-trained vectors fixed during training
    ),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(units=250, activation='relu'),
    Dropout(rate=0.2),
    Dense(units=1, activation='sigmoid'),
])

In [None]:
cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Stop once the validation loss has not improved for 3 epochs and roll the
# model back to the weights of its best epoch; without restore_best_weights the
# model would keep the weights from the end of the patience window.
cnn.fit(
    cnn_X_train,
    train_y,
    epochs=10,
    batch_size=64,
    validation_data=(cnn_X_val, val_y),
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<keras.callbacks.History at 0x7f449f2f2940>

In [None]:
# Score the CNN on the held-out test split: the sigmoid outputs are turned
# into 0/1 labels with a 0.5 cut-off before computing the metrics.
cnn_scores = cnn.predict(cnn_X_test)
y_pred = (cnn_scores > 0.5).astype('int32')

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % metric(test_y, y_pred))

Accuracy: 0.805
Precision: 0.790
Recall: 0.762
F1: 0.776


# **RNN**

In [None]:
import numpy as np
from keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU, Bidirectional
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.callbacks import EarlyStopping

In [None]:
# Train/validation split for the RNN. The 80% training portion is rebuilt from
# the full X / y (same seed as the initial 80/20 split) before the validation
# set is carved out, so the RNN gets the same 60/20 train/validation split as
# the CNN section instead of a further split of the CNN's training subset,
# which would leave it with less training data. The cell is also idempotent.
full_split = train_test_split(X, y, test_size=0.2, random_state=42)
full_train_X, full_train_y = full_split[0], full_split[2]
train_X, val_X, train_y, val_y = train_test_split(
    full_train_X, full_train_y, test_size=0.25, random_state=42
)

In [None]:
# The GloVe lookup is the same for every model and parsing the file is slow,
# so the file is only read when no non-empty `word_embeddings` dict is already
# present in the notebook namespace (e.g. from the CNN section).
if not globals().get('word_embeddings'):
    word_embeddings = {}
    with open('glove.6B.100d.txt', 'r', encoding='utf-8') as glove_file:
        for row in glove_file:
            fields = row.split()
            # First field is the token, the rest are its coefficients
            word_embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [None]:
# Build the vocabulary (word -> integer index) from the current training texts.
# No vocabulary cap is applied; num_words=500 is one option that could be tried.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_X)

# Encode every split as sequences of integer word indices with that vocabulary.
rnn_X_train, rnn_X_val, rnn_X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (train_X, val_X, test_X)
)

In [None]:
# Bring every sequence to exactly `maxlen` indices so the batches have a fixed
# shape; shorter sequences are padded at the end ('post').
maxlen = 100
rnn_X_train, rnn_X_val, rnn_X_test = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (rnn_X_train, rnn_X_val, rnn_X_test)
)

In [None]:
# Build the weight matrix for the embedding layer: row i holds the vector of
# the word whose tokenizer index is i. The "+ 1" leaves room for index 0, which
# the Keras Tokenizer does not assign to any word.
word_index = tokenizer.word_index
embedding_dim = 100
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

# Seeded generator so the random vectors given to words missing from GloVe are
# identical on every run (the global NumPy random state is never seeded here).
oov_rng = np.random.default_rng(42)

for word, i in word_index.items():
    embedding_vector = word_embeddings.get(word)
    if embedding_vector is None:
        # Word is not in the pre-trained embeddings: use a random vector
        embedding_vector = oov_rng.normal(scale=0.6, size=(embedding_dim,))
    embedding_matrix[i] = embedding_vector

In [None]:
# LSTM text classifier: frozen pre-trained embeddings -> single 64-unit LSTM
# -> sigmoid output.
rnn = Sequential([
    Embedding(
        len(word_index) + 1,
        embedding_dim,
        input_length=maxlen,
        weights=[embedding_matrix],
        trainable=False,  # keep the pre-trained vectors fixed during training
    ),
    LSTM(64),
    Dense(1, activation='sigmoid'),
])

In [None]:
rnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once the validation loss has not improved for 3 epochs and roll the
# model back to the weights of its best epoch; without restore_best_weights the
# model would keep the weights from the end of the patience window.
rnn.fit(
    rnn_X_train,
    train_y,
    epochs=10,
    batch_size=64,
    validation_data=(rnn_X_val, val_y),
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f449f2cd520>

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Score the RNN on the held-out test split: the sigmoid outputs are turned
# into 0/1 labels with a 0.5 cut-off before computing the metrics.
rnn_scores = rnn.predict(rnn_X_test)
y_pred = (rnn_scores > 0.5).astype('int32')

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % metric(test_y, y_pred))

Accuracy: 0.822
Precision: 0.762
Recall: 0.868
F1: 0.812


# **Combined CNN-LSTM**

We combined the CNN and the RNN because we expect the convolutional layer to capture short-distance (local, n-gram-like) patterns and the LSTM to capture long-distance dependencies across the sequence.

In [None]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, MaxPooling1D, Bidirectional
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
import pandas as pd
from keras.utils import pad_sequences
from sklearn.metrics import classification_report, accuracy_score
from keras.models import Sequential
from keras.layers import Flatten

In [None]:
# Train/validation split for the CNN-LSTM. The 80% training portion is rebuilt
# from the full X / y (same seed as the initial 80/20 split) before the
# validation set is carved out, so this model gets the same 60/20
# train/validation split as the CNN section instead of yet another split of an
# already-split train_X, which would leave it with far less training data.
# The cell is also idempotent.
full_split = train_test_split(X, y, test_size=0.2, random_state=42)
full_train_X, full_train_y = full_split[0], full_split[2]
train_X, val_X, train_y, val_y = train_test_split(
    full_train_X, full_train_y, test_size=0.25, random_state=42
)

In [None]:
# The GloVe lookup is the same for every model and parsing the file is slow,
# so the file is only read when no non-empty `word_embeddings` dict is already
# present in the notebook namespace (e.g. from an earlier section).
if not globals().get('word_embeddings'):
    word_embeddings = {}
    with open('glove.6B.100d.txt', 'r', encoding='utf-8') as glove_file:
        for row in glove_file:
            fields = row.split()
            # First field is the token, the rest are its coefficients
            word_embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [None]:
# Build the vocabulary (word -> integer index) from the current training texts.
# No vocabulary cap is applied; num_words=500 is one option that could be tried.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_X)

# Number of rows the embedding layer needs: one per word plus one for index 0.
vocab_size = len(tokenizer.word_index) + 1

# Encode every split as sequences of integer word indices with that vocabulary.
combined_X_train, combined_X_val, combined_X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (train_X, val_X, test_X)
)

In [None]:
# Bring every sequence to exactly `maxlen` indices so the batches have a fixed
# shape; shorter sequences are padded at the end ('post').
maxlen = 100
combined_X_train, combined_X_val, combined_X_test = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (combined_X_train, combined_X_val, combined_X_test)
)

In [None]:
# Build the weight matrix for the embedding layer: row i holds the vector of
# the word whose tokenizer index is i. The "+ 1" leaves room for index 0, which
# the Keras Tokenizer does not assign to any word.
word_index = tokenizer.word_index
embedding_dim = 100
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

# Seeded generator so the random vectors given to words missing from GloVe are
# identical on every run (the global NumPy random state is never seeded here).
oov_rng = np.random.default_rng(42)

for word, i in word_index.items():
    embedding_vector = word_embeddings.get(word)
    if embedding_vector is None:
        # Word is not in the pre-trained embeddings: use a random vector
        embedding_vector = oov_rng.normal(scale=0.6, size=(embedding_dim,))
    embedding_matrix[i] = embedding_vector

In [None]:
# CNN-LSTM hybrid built with the functional API: frozen pre-trained embeddings
# -> convolution + max-pooling -> bidirectional LSTM -> dropout -> sigmoid.
inputs = Input(shape=(maxlen,))

embedded = Embedding(
    vocab_size,
    embedding_dim,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,  # keep the pre-trained vectors fixed during training
)(inputs)
convolved = Conv1D(filters=64, kernel_size=3, padding='valid', activation='relu')(embedded)
pooled = MaxPooling1D(pool_size=2)(convolved)

# The recurrent layer reads the pooled convolutional features in both directions.
encoded = Bidirectional(LSTM(64))(pooled)
regularised = Dropout(0.5)(encoded)

outputs = Dense(1, activation='sigmoid')(regularised)
CNNLSTM = Model(inputs=inputs, outputs=outputs)

In [None]:
CNNLSTM.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping is used because this model is prone to overfitting: training
# stops once the validation loss has not improved for 3 epochs, and
# restore_best_weights rolls the model back to its best epoch instead of
# keeping the weights from the end of the patience window.
CNNLSTM.fit(
    combined_X_train,
    train_y,
    epochs=10,
    batch_size=128,
    validation_data=(combined_X_val, val_y),
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


<keras.callbacks.History at 0x7f449ed9f850>

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Score the CNN-LSTM on the held-out test split: the sigmoid outputs are
# turned into 0/1 labels with a 0.5 cut-off before computing the metrics.
combined_scores = CNNLSTM.predict(combined_X_test)
y_pred = (combined_scores > 0.5).astype('int32')

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % metric(test_y, y_pred))

Accuracy: 0.815
Precision: 0.783
Recall: 0.804
F1: 0.794


# **Outside Testing**

## HSD Dataset

In [None]:
import random 
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the external HSD corpus.
# Caution: this rebinds `test_X`, so from this cell onward `test_X` holds the
# external texts rather than the held-out split of the original dataset. The
# external labels are kept under a separate name, `new_test_y`.
outside_data = pd.read_csv('merged_hate.csv', engine='python')
test_X, new_test_y = outside_data['contents'].values, outside_data['label'].values

**Naive Bayes**

In [None]:
# Vectorize the external texts with the TF-IDF vectorizer that was fitted on
# the training data. This only transforms; nothing is re-fitted on test data.
nb_new_test_X = vectorizer.transform(raw_documents=test_X)

In [None]:
# Evaluate the Naive Bayes model on the external dataset.
# MultinomialNB.predict() already returns class labels (not probabilities), so
# the predictions are scored as-is; the 0.5 threshold used for the sigmoid
# outputs of the neural networks does not apply here.
y_pred = naive.predict(nb_new_test_X)

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % metric(new_test_y, y_pred))

Accuracy: 0.763
Precision: 0.752
Recall: 0.783
F1: 0.767


**CNN**

In [None]:
# Encode and pad the external texts for the CNN.
# NOTE(review): `tokenizer` is whichever Tokenizer was fitted most recently in
# the notebook (the Combined CNN-LSTM section), while the CNN was trained on
# sequences produced by the Tokenizer fitted in the CNN section on a different
# `train_X`. If the two word indices differ, the indices fed to the CNN here do
# not line up with its embedding rows. Confirm the same vocabulary is used.
cnn_X_new_test = pad_sequences(
    tokenizer.texts_to_sequences(test_X), padding='post', maxlen=maxlen
)

# Predict on the external data; sigmoid outputs become 0/1 labels at 0.5.
cnn_scores = cnn.predict(cnn_X_new_test)
y_pred = (cnn_scores > 0.5).astype('int32')

# Report the model's performance on the external labels.
for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % metric(new_test_y, y_pred))


Accuracy: 0.531
Precision: 0.554
Recall: 0.311
F1: 0.399


**RNN**

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Encode and pad the external texts for the RNN.
# NOTE(review): `tokenizer` is whichever Tokenizer was fitted most recently in
# the notebook (the Combined CNN-LSTM section), while the RNN was trained on
# sequences produced by the Tokenizer fitted in the RNN section on a different
# `train_X`. If the two word indices differ, the indices fed to the RNN here do
# not line up with its embedding rows. Confirm the same vocabulary is used.
rnn_X_new_test = pad_sequences(
    tokenizer.texts_to_sequences(test_X), padding='post', maxlen=maxlen
)

# Predict on the external data; sigmoid outputs become 0/1 labels at 0.5.
rnn_scores = rnn.predict(rnn_X_new_test)
y_pred = (rnn_scores > 0.5).astype('int32')

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % metric(new_test_y, y_pred))

Accuracy: 0.535
Precision: 0.545
Recall: 0.417
F1: 0.473


**Combined CNN-LSTM**

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Encode and pad the external texts for the CNN-LSTM. This overwrites
# `combined_X_test`, which previously held the original held-out split.
combined_X_test = pad_sequences(
    tokenizer.texts_to_sequences(test_X), padding='post', maxlen=maxlen
)

# Predict on the external data; sigmoid outputs become 0/1 labels at 0.5.
combined_scores = CNNLSTM.predict(combined_X_test)
y_pred = (combined_scores > 0.5).astype('int32')

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % metric(new_test_y, y_pred))

Accuracy: 0.745
Precision: 0.762
Recall: 0.712
F1: 0.736


## Davidson Hate Speech Dataset

In [None]:
# Load the Davidson dataset; `test_X` and `new_test_y` are rebound to its
# texts and labels, replacing the HSD data loaded earlier.
# NOTE(review): the metrics below use scikit-learn's default binary scoring, so
# the 'class' column is presumably already 0/1 in this CSV. Confirm.
davidson_data = pd.read_csv('davidson_data.csv', engine='python')
test_X, new_test_y = davidson_data['tweet'].values, davidson_data['class'].values

**Naive Bayes**

In [None]:
# Vectorize the external texts with the TF-IDF vectorizer that was fitted on
# the training data. This only transforms; nothing is re-fitted on test data.
nb_new_test_X = vectorizer.transform(test_X)

# Evaluate the Naive Bayes model on the external dataset.
# MultinomialNB.predict() already returns class labels (not probabilities), so
# the predictions are scored as-is; the 0.5 threshold used for the sigmoid
# outputs of the neural networks does not apply here.
y_pred = naive.predict(nb_new_test_X)

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % metric(new_test_y, y_pred))

Accuracy: 0.801
Precision: 0.753
Recall: 0.897
F1: 0.819


**CNN**

In [None]:
# Encode and pad the external texts for the CNN.
# NOTE(review): `tokenizer` is whichever Tokenizer was fitted most recently in
# the notebook (the Combined CNN-LSTM section), while the CNN was trained on
# sequences produced by the Tokenizer fitted in the CNN section on a different
# `train_X`. If the two word indices differ, the indices fed to the CNN here do
# not line up with its embedding rows. Confirm the same vocabulary is used.
cnn_X_new_test = pad_sequences(
    tokenizer.texts_to_sequences(test_X), padding='post', maxlen=maxlen
)

# Predict on the external data; sigmoid outputs become 0/1 labels at 0.5.
cnn_scores = cnn.predict(cnn_X_new_test)
y_pred = (cnn_scores > 0.5).astype('int32')

# Report the model's performance on the external labels.
for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % metric(new_test_y, y_pred))

Accuracy: 0.513
Precision: 0.522
Recall: 0.319
F1: 0.396


**RNN**

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Encode and pad the external texts for the RNN.
# NOTE(review): `tokenizer` is whichever Tokenizer was fitted most recently in
# the notebook (the Combined CNN-LSTM section), while the RNN was trained on
# sequences produced by the Tokenizer fitted in the RNN section on a different
# `train_X`. If the two word indices differ, the indices fed to the RNN here do
# not line up with its embedding rows. Confirm the same vocabulary is used.
rnn_X_new_test = pad_sequences(
    tokenizer.texts_to_sequences(test_X), padding='post', maxlen=maxlen
)

# Predict on the external data; sigmoid outputs become 0/1 labels at 0.5.
rnn_scores = rnn.predict(rnn_X_new_test)
y_pred = (rnn_scores > 0.5).astype('int32')

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % metric(new_test_y, y_pred))

Accuracy: 0.477
Precision: 0.481
Recall: 0.576
F1: 0.524


**Combined CNN-LSTM**

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Encode and pad the external texts for the CNN-LSTM. This overwrites
# whatever `combined_X_test` held before this cell.
combined_X_test = pad_sequences(
    tokenizer.texts_to_sequences(test_X), padding='post', maxlen=maxlen
)

# Predict on the external data; sigmoid outputs become 0/1 labels at 0.5.
combined_scores = CNNLSTM.predict(combined_X_test)
y_pred = (combined_scores > 0.5).astype('int32')

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(fmt % metric(new_test_y, y_pred))

Accuracy: 0.731
Precision: 0.702
Recall: 0.806
F1: 0.750


Here, we decided to bring in other datasets, unrelated to our training set, to see how each of our models performs on data it was not trained on:

HSD: The Hate Speech Dataset: https://github.com/aitor-garcia-p/hate-speech-dataset

Stormfront Corpus: https://paperswithcode.com/dataset/hate-speech

Wikipedia Talk Pages: https://figshare.com/articles/dataset/Wikipedia_Talk_Labels_Toxicity/4563973


The Davidson Hate Speech Dataset: https://github.com/t-davidson/hate-speech-and-offensive-language

Suggested next step: load the HSD and the Stormfront corpus and test the 4 trained models on each of them.