# PyTorch CNN for Hate Speech Detection

PyTorch Implementation of the following Paper 

Misogynistic Tweet Detection: Modelling CNN with Small Datasets

Cite:

@inproceedings{bashar2018cnn, title={Misogynistic Tweet Detection: Modelling CNN with Small Datasets}, author={Bashar, Md Abul and Nayak, Richi and Suzor, Nicolas and Weir, Bridget}, booktitle={The 16th Australasian Data Mining Conference}, year={2018} }

Keras Implementation is available at the following link

https://github.com/mdabashar/CNN_for_Misogynistic_Tweet_Detection

#### 0.0 Import required libraries

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch import optim
from torch.nn.functional import binary_cross_entropy

import numpy as np
import pandas as pd

# from softmax import Softmax

#### 0.1 Initialise Random Variables

In [2]:
SEED = 123
np.random.seed(SEED)

### 1. Loading and Preparing Data

#### 1.0. Load and check data

In [3]:
BASE = ''
fins_train = ['eastasian_hate_sub_train.csv']
fins_test = ['eastasian_hate_sub_test.csv']
track = 0

In [4]:
# We apply only this preprocessing because our data is already preprocessed
def cleanNonAscii(text):
    '''
    Remove Non ASCII characters from the dataset.
    Arguments:
        text: str
    returns: 
        text: str
    '''
    return ''.join(i for i in text if ord(i) < 128)

In [5]:
df_train = pd.read_csv(BASE+fins_train[track])
df_train.head()

Unnamed: 0,label,text
0,0,<user> <user> i can ’ t say what i want to say...
1,1,"<user> <user> trust me . as hong konger , i ca..."
2,0,i ’ ve been living in china during the corona ...
3,1,those who like blaming <hashtag> chinese virus...
4,1,<hashtag> china virus </hashtag> china should ...


In [6]:
df_train.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,3120
1,3116


In [7]:
df_train['text'] = df_train['text'].apply(cleanNonAscii)

In [8]:
X_train, y_train = df_train['text'].values, df_train['label'].values

In [9]:
df_test = pd.read_csv(BASE+fins_test[track])
df_test.head()

Unnamed: 0,label,text
0,0,"with such damning evidence , the silence of <u..."
1,0,<user> <user> does <allcaps> cdc </allcaps> ha...
2,1,<url> <hashtag> chinazi virus </hashtag> keeps...
3,0,<user> 2 4 hrs later . unimpressed . <hashtag>...
4,0,rt <user> make sure you are prepared with the ...


In [10]:
df_test['text'] = df_test['text'].apply(cleanNonAscii)

In [11]:
X_test, y_test = df_test['text'].values, df_test['label'].values

In [12]:
X_train[0], X_test[0]

('<user> <user> i can  t say what i want to say about scaramucci but i will say how stupid he sounds , especially when he prefaces every ridiculous statement with ,  at the end of the day .  dumb . <hashtag> wuhan virus </hashtag> <hashtag> chinese coronavirus </hashtag>',
 'with such damning evidence , the silence of <user> and the international community to call out <hashtag> china </hashtag>  s irresponsible actions that are threatening the world is not only baffling but points to something nefarious altogether . <hashtag> wuhan coronavirus </hashtag> <hashtag> china virus </hashtag> <hashtag> covid19 </hashtag> <url>')

#### 1.1. Transforming data suitable for model format

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
num_words = 100000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)
xtrain = tokenizer.texts_to_sequences(X_train)
maxlen = max(map(lambda x: len(x),xtrain))
xtrain = pad_sequences(xtrain, maxlen=maxlen)

xtest = tokenizer.texts_to_sequences(X_test)
xtest = pad_sequences(xtest, maxlen=maxlen)

In [14]:
xtrain.shape

(6236, 95)

####  1.2. Dataset mapper and Data Loader

In [15]:

class DatasetMapper(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [16]:
#Create dataset mapper objects for train and test
train = DatasetMapper(xtrain, y_train)
test = DatasetMapper(xtest, y_test)

In [17]:
#Initialise dataloaders for train and test
loader_train = DataLoader(train, batch_size=32)
loader_test = DataLoader(test, batch_size=32)

### 2. Define CNN model network

In [18]:
class CnnModel(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim=64):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.conv1 = nn.Conv1d(in_channels=95, out_channels=128, kernel_size=3)
        self.conv2 = nn.Conv1d(in_channels=95, out_channels=256, kernel_size=4)
        self.conv3 = nn.Conv1d(in_channels=95, out_channels=512, kernel_size=5)
        
        # flatten and concat conv1, conv2, conv3
        self.fc1 = nn.Linear(in_features=(128+256+512), out_features = 256)
        self.out = nn.Linear(in_features=256, out_features=1)
        
        # Define proportion or neurons to dropout
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, t):
        # (0) input layer
        t = t
        
        # (1) embedding layer (this layer assign a vector to each word index in t)
        t = self.embeddings(t)
        
        # (2) hidden conv layer
        tri_gram = self.conv1(t)
        tri_gram = F.relu(tri_gram)
        tri_gram = F.max_pool1d(tri_gram, kernel_size=tri_gram.shape[2], stride=1) # GlobalMaxPool1d using MaxPool1d
        tri_gram = self.dropout(tri_gram)
        
        # (3) hidden conv layer
        four_gram = self.conv2(t)
        four_gram = F.relu(four_gram)
        four_gram = F.max_pool1d(four_gram, kernel_size=four_gram.shape[2], stride=1) # GlobalMaxPool1d using MaxPool1d
        four_gram = self.dropout(four_gram)
        
        # (4) hidden conv layer
        five_gram = self.conv3(t)
        five_gram = F.relu(five_gram)
        five_gram = F.max_pool1d(five_gram, kernel_size=five_gram.shape[2], stride=1) # GlobalMaxPool1d using MaxPool1d
        five_gram = self.dropout(five_gram)
        
        # flatten and concat conv1, conv2, conv3
        t = torch.cat((tri_gram.squeeze(dim=2), four_gram.squeeze(dim=2), five_gram.squeeze(dim=2)), dim=1)
        
        # (6) hidden linear layer
        t = self.fc1(t)
        t = F.relu(t)
        t = self.dropout(t)
        
        # (7) output layer
        t = self.out(t)
        t = torch.sigmoid(t)
        
        return t.squeeze()

#### 2.0. Cheak CNN model

In [19]:
# cnn_model = CnnModel(vocab_size=1000)
# cnn_model

In [20]:
# Creat random word indices in the vocabulary
# data = torch.randint(0, 1000, (1, 16)) # arguments: start_range=0, end_range=1000, (batch_size, words_in_instance=16)
# data.shape

In [21]:
# Forward pass
# out = cnn_model(data)
# out.shape

In [22]:
# out[0][127]

### 3. Train Model 

#### 3.1. Create CNN model and Define optimizer

In [23]:
cnn_model = CnnModel(vocab_size=100000)
optimizer = optim.Adam(cnn_model.parameters(), lr=0.001)

#### 3.2. Define helper functions

In [24]:
def calculate_accuray(grand_truth, predictions):
    true_positives = 0
    true_negatives = 0
   
    # Gets frequency  of true positives and true negatives
    # The threshold is 0.5
    for true, pred in zip(grand_truth, predictions):
        if (pred >= 0.5) and (true == 1):
            true_positives += 1
        elif (pred < 0.5) and (true == 0):
            true_negatives += 1
        else:
            pass
    # Return accuracy
    return (true_positives+true_negatives) / len(grand_truth)

#### 3.3. Training Loop

In [25]:
for epoch in range(10):
    # set the model in training mode
    cnn_model.train()
    predictions = []
    print('<----epoch', epoch)
    
    # start training the batches
    for x_batch, y_batch in loader_train:
        y_batch = y_batch.type(torch.FloatTensor) # changing datatype to float tensor
        x_batch = x_batch.type(torch.LongTensor)
        y_pred = cnn_model(x_batch) # feed to the model
        loss = binary_cross_entropy(y_pred, y_batch) # calcualte loss
        optimizer.zero_grad() # clearn gradiants
        loss.backward() # calculate gradients
        optimizer.step() # update parameters based on gradients
        predictions += list(y_pred.detach().numpy())
    
    # evaluate for one epoch
    accuracy = calculate_accuray(y_train, predictions)
    print('accuracy', accuracy, '-------->')

<----epoch 0


  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


accuracy 0.5375240538806928 -------->
<----epoch 1
accuracy 0.6130532392559332 -------->
<----epoch 2
accuracy 0.6619627966645285 -------->
<----epoch 3
accuracy 0.6937139191789609 -------->
<----epoch 4
accuracy 0.7554522129570237 -------->
<----epoch 5
accuracy 0.7867222578576011 -------->
<----epoch 6
accuracy 0.818313021167415 -------->
<----epoch 7
accuracy 0.8412443874278384 -------->
<----epoch 8
accuracy 0.8632135984605517 -------->
<----epoch 9
accuracy 0.8948043617703656 -------->


### 4. Evaluate Model

In [26]:
def evaluation(model, loader_test):

    # Set the model in evaluation mode
    model.eval()
    predictions = []

    # Starst evaluation phase
    with torch.no_grad():
        for x_batch, y_batch in loader_test:
            x_batch = x_batch.type(torch.LongTensor)
            y_pred = model(x_batch)
            predictions += list(y_pred.detach().numpy())
    return predictions

In [27]:
test_predictions = evaluation(cnn_model, loader_test)
test_accuracy = calculate_accuray(y_test, test_predictions)
print(test_accuracy)

0.7205128205128205


In [28]:
from evaluate_classification import EvaluateBinaryClassification

ebc = EvaluateBinaryClassification(gnd_truths = y_test, predictions = [int(round(y)) for y in test_predictions])
print(ebc.get_full_report())

EvaluateBinaryClassification Object Created

Total Samples	780
Positive Samples	380
Negative Samples	400
True Positive	240
True Negative	322
False Positive	78
False Negative	140
Accuracy	0.7205128205128205
Precision	0.7547169811320755
Recall	0.631578947368421
F1 Measure	0.6876790830945558
Cohen Kappa Score	0.4383670233848592
Area Under Curve	0.7182894736842105

              precision    recall  f1-score   support

           0       0.70      0.81      0.75       400
           1       0.75      0.63      0.69       380

    accuracy                           0.72       780
   macro avg       0.73      0.72      0.72       780
weighted avg       0.73      0.72      0.72       780

