# Deep Neural Network for Hate Speech Detection

### Import common libraries

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
import keras
from keras.layers import Input
from keras.layers import Dense, Dropout
from keras.models import Model
#from keras.models import Sequential

from evaluate_classification import EvaluateBinaryClassification
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_curve, auc, roc_auc_score

C:\Users\User\anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\User\anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
  stacklevel=1)


### Initialise Random variables and Tensor Board

In [2]:
SEED = 123
np.random.seed(SEED)
tf.random.set_seed(SEED)

### Loading Data

In [3]:
BASE = 'D:\\ResearchDataGtx1060\\SentimentData\\Hate\\'
fins_train = ['random_hate_train.csv']
fins_test = ['eastasian_hate_test.csv']
track = 0

In [4]:
# We apply only this preprocessing because our data is already preprocessed
def cleanNonAscii(text):
    '''
    Remove Non ASCII characters from the dataset.
    Arguments:
        text: str
    returns: 
        text: str
    '''
    return ''.join(i for i in text if ord(i) < 128)

In [5]:
df_train = pd.read_csv(BASE+fins_train[track])
df_train.head()

Unnamed: 0,label,text
0,1,<user> if you are one of the <number> mil <has...
1,0,best <hashtag> law of attraction </hashtag> <h...
2,1,<hashtag> michelle obama </hashtag> is the mos...
3,0,smiling because life is good rite now ! <repea...
4,0,ã ¢ â  â ¤ ã ¯ â ¸ â  ã ¢ â  â ¤ ã ¯ â ¸ â ...


In [6]:
df_train.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,2242
1,2242


In [7]:
df_train['text'] = df_train['text'].apply(cleanNonAscii)
df_train.head(5)

Unnamed: 0,label,text
0,1,<user> if you are one of the <number> mil <has...
1,0,best <hashtag> law of attraction </hashtag> <h...
2,1,<hashtag> michelle obama </hashtag> is the mos...
3,0,smiling because life is good rite now ! <repea...
4,0,<hashtag> ...


In [8]:
X_train, y_train = df_train['text'].values, df_train['label'].values

In [9]:
df_test = pd.read_csv(BASE+fins_test[track])
df_test.head()

Unnamed: 0,label,text
0,1,<user> <user> the chinese are probably sprayin...
1,0,rt <user> : unpatriotic losers are tweeting ou...
2,1,<user> thus <hashtag> 2019 n co v </hashtag> i...
3,0,north korea closes borders to avoid coronaviru...
4,1,<user> this is a declaration of war . it prove...


In [10]:
df_test.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,3898
1,3898


In [11]:
df_test['text'] = df_test['text'].apply(cleanNonAscii)
df_test.head(5)

Unnamed: 0,label,text
0,1,<user> <user> the chinese are probably sprayin...
1,0,rt <user> : unpatriotic losers are tweeting ou...
2,1,<user> thus <hashtag> 2019 n co v </hashtag> i...
3,0,north korea closes borders to avoid coronaviru...
4,1,<user> this is a declaration of war . it prove...


In [12]:
X_test, y_test = df_test['text'].values, df_test['label'].values

### Transforming data suitable for model format

In [13]:
X_train[:2]

array(['<user> if you are one of the <number> mil <hashtag> americans </hashtag> who rejected , <hashtag> bigotry </hashtag> & <hashtag> hatred </hashtag> fly your flag half mast on jan .      ',
       'best <hashtag> law of attraction </hashtag> <hashtag> resources </hashtag> for <hashtag> healing </hashtag> ! <hashtag> alt ways to heal </hashtag> <hashtag> healthy </hashtag> is ! <hashtag> i dwp </hashtag> !'],
      dtype=object)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(binary=True)
count_vectorizer.fit(X_train)
#count_vectorizer.vocabulary_

CountVectorizer(binary=True)

In [15]:
train_vectors = count_vectorizer.transform(X_train) # it transforms to a sparse matrix
train_vectors

<4484x9447 sparse matrix of type '<class 'numpy.int64'>'
	with 60825 stored elements in Compressed Sparse Row format>

In [16]:
X_train_vectors = train_vectors.toarray() # convert sparse matrix to matrix
X_train_vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
test_vectors = count_vectorizer.transform(X_test)
X_test_vectors = test_vectors.toarray()

In [18]:
X_train_vectors[:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
# Coverting y_train to make it suitable for two nodes in the output layer
# You can actually make this coversion much simplier using numpy
categories = list([0,1])
y_train_vectors = []
for e in y_train:
    output_empty = [0] * len(categories)
    output_row = list(output_empty)
    output_row[categories.index(e)] = 1
    y_train_vectors.append(output_row)

In [20]:
y_train_vectors = np.array(y_train_vectors)

In [21]:
y_train_vectors[:5]

array([[0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0]])

In [22]:
y_test_vectors = []
for e in y_test:
    output_empty = [0] * len(categories)
    output_row = list(output_empty)
    output_row[categories.index(e)] = 1
    y_test_vectors.append(output_row)

### Creating DNN model and training it for 10 epoc

In [23]:
def create_dnn_model():
    inputSize = len(X_train_vectors[0])
    outputSize = len(y_train_vectors[0])
    inputs = Input(shape=(inputSize,))
    
    dense = Dense(8, activation="relu") # input layer
    net = dense(inputs) # need this because keras needs to start from a lalyer
    net = Dropout(0.5)(net)
    
    net = Dense(8, activation='relu')(net) # hidden layer 1
    net = Dropout(0.5)(net)
    
    net = Dense(8, activation='relu')(net) # hdden layer 2
    net = Dropout(0.5)(net)
    
    net = Dense(8, activation='relu')(net) # hidden layer 3
    net = Dropout(0.5)(net)
    
    net = Dense(8, activation='relu')(net) # hidden layer 4
    net = Dropout(0.5)(net)
    
    net = Dense(8, activation='relu')(net) # hidden layer 5
    net = Dropout(0.5)(net)
    
    output = Dense(2, activation='sigmoid')(net) # output layer
    
    model = Model(inputs=[inputs], outputs=[output])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    #model.summary()
    return model

dnn_model = create_dnn_model()
dnn_model.fit(X_train_vectors, y_train_vectors, epochs=10, batch_size=32, verbose=1)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1dc01fb2e08>

### Evaluating the model with test dataset

In [25]:
p = dnn_model.predict(X_test_vectors, verbose=1)
predicted = np.argmax(p, axis=1)
predicted = np.array(predicted)
actual = y_test

ebc = EvaluateBinaryClassification(gnd_truths = actual, predictions = predicted)
print(ebc.get_full_report())

EvaluateBinaryClassification Object Created

Total Samples	7796
Positive Samples	3898
Negative Samples	3898
True Positive	2316
True Negative	1948
False Positive	1950
False Negative	1582
Accuracy	0.5469471523858389
Precision	0.5428973277074542
Recall	0.5941508465879939
F1 Measure	0.5673689367956883
Cohen Kappa Score	0.09389430477167782
Area Under Curve	0.5469471523858389

              precision    recall  f1-score   support

           0       0.55      0.50      0.52      3898
           1       0.54      0.59      0.57      3898

    accuracy                           0.55      7796
   macro avg       0.55      0.55      0.55      7796
weighted avg       0.55      0.55      0.55      7796



In [26]:
ebc.save_full_report(model_name='DNN', path='C:\\Users\\User\\JupyterPythonPredator\\COVID19\\domain_adaptation_rerun_randomhate_eastasianhate_')