### Import common libraries

In [1]:
# General Libraries
import numpy as np
import math

# Libraries for Data Loading and Processing
import pandas as pd
from torch.utils.data import Dataset, DataLoader

# Pytorch Libraries for CNN Classification
import torch.nn as nn
from torch.nn.functional import binary_cross_entropy
from torch import optim
import torch 

# Custom Libraries for Performance Evaluation
from evaluate_classification import EvaluateBinaryClassification

C:\Users\User\anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\User\anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
  stacklevel=1)


### Initialise Random Variables

In [2]:
SEED = 123
np.random.seed(SEED)

### Loading and Preparing Data

In [3]:
BASE = 'D:\\ResearchDataGtx1060\\SentimentData\\Hate\\'
fins_train = ['eastasian_hate_sub_train.csv']
fins_test = ['eastasian_hate_sub_test.csv']
track = 0

In [4]:
# We apply only this preprocessing because our data is already preprocessed
def cleanNonAscii(text):
    '''
    Remove Non ASCII characters from the dataset.
    Arguments:
        text: str
    returns: 
        text: str
    '''
    return ''.join(i for i in text if ord(i) < 128)

In [5]:
df_train = pd.read_csv(BASE+fins_train[track])
df_train.head()

Unnamed: 0,label,text
0,0,<user> <user> i can ’ t say what i want to say...
1,1,"<user> <user> trust me . as hong konger , i ca..."
2,0,i ’ ve been living in china during the corona ...
3,1,those who like blaming <hashtag> chinese virus...
4,1,<hashtag> china virus </hashtag> china should ...


In [6]:
df_train.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,3120
1,3116


In [7]:
df_train['text'] = df_train['text'].apply(cleanNonAscii)

In [8]:
X_train, y_train = df_train['text'].values, df_train['label'].values

In [9]:
df_test = pd.read_csv(BASE+fins_test[track])
df_test.head()

Unnamed: 0,label,text
0,0,"with such damning evidence , the silence of <u..."
1,0,<user> <user> does <allcaps> cdc </allcaps> ha...
2,1,<url> <hashtag> chinazi virus </hashtag> keeps...
3,0,<user> 2 4 hrs later . unimpressed . <hashtag>...
4,0,rt <user> make sure you are prepared with the ...


In [10]:
df_test['text'] = df_test['text'].apply(cleanNonAscii)

In [11]:
X_test, y_test = df_test['text'].values, df_test['label'].values

In [12]:
X_train[0], X_test[0]

('<user> <user> i can  t say what i want to say about scaramucci but i will say how stupid he sounds , especially when he prefaces every ridiculous statement with ,  at the end of the day .  dumb . <hashtag> wuhan virus </hashtag> <hashtag> chinese coronavirus </hashtag>',
 'with such damning evidence , the silence of <user> and the international community to call out <hashtag> china </hashtag>  s irresponsible actions that are threatening the world is not only baffling but points to something nefarious altogether . <hashtag> wuhan coronavirus </hashtag> <hashtag> china virus </hashtag> <hashtag> covid19 </hashtag> <url>')

### Transforming data suitable for model format

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
num_words = 100000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)
xtrain = tokenizer.texts_to_sequences(X_train)
maxlen = max(map(lambda x: len(x),xtrain))
xtrain = pad_sequences(xtrain, maxlen=maxlen)

xtest = tokenizer.texts_to_sequences(X_test)
xtest = pad_sequences(xtest, maxlen=maxlen)

In [14]:
xtrain

array([[  0,   0,   0, ...,  16,   7,   1],
       [  0,   0,   0, ...,  88,   1,  10],
       [  0,   0,   0, ...,   6,   4,   1],
       ...,
       [  0,   0,   0, ...,   2,   1,  10],
       [  0,   0,   0, ...,  13, 176,  10],
       [  0,   0,   0, ...,   4,   1,   5]])

# Define CNN Model

In [15]:
class Params:
    # Preprocessing parameeters
    seq_len: int = xtrain.shape[1]
    num_words: int = 100000
   
    # Model parameters
    embedding_size: int = 64
    out_size: int = 32
    stride: int = 2
   
    # Training parameters
    epochs: int = 13
    batch_size: int = 32
    learning_rate: float = 0.001
    
params=Params()

In [16]:
'''
nn.ModuleList Holds submodules in a list.
ModuleList can be indexed like a regular Python list, 
but modules it contains are properly registered, 
and will be visible by all Module methods.

'''
class CnnTextClassifier(nn.ModuleList):
    

    def __init__(self, params):
        super().__init__()
        
        # set paremeters related to text preparation
        self.seq_len = params.seq_len
        self.num_words = params.num_words
        self.emb_size = params.embedding_size
        self.stride = params.stride
        
        # define dropouts
        self.dropout50 = nn.Dropout(0.5)
        self.dropout20 = nn.Dropout(0.2)
        
        ### now define parameters for CNN
        
        # define kernel sizes
        self.kernel_1 = 2
        self.kernel_2 = 3
        self.kernel_3 = 4
        self.kernel_4 = 5
        
        # define output size of each kernel
        self.kout_size = params.out_size
        
        # define number of stride for each kernel
        self.stride = params.stride
        
        # define embedding layer
        self.embedding = nn.Embedding(self.num_words+1, self.emb_size, padding_idx=0)
        
        # define convolution layers
        self.conv_1 = nn.Conv1d(self.seq_len, self.kout_size, self.kernel_1, self.stride)
        self.conv_2 = nn.Conv1d(self.seq_len, self.kout_size, self.kernel_2, self.stride)
        self.conv_3 = nn.Conv1d(self.seq_len, self.kout_size, self.kernel_3, self.stride)
        self.conv_4 = nn.Conv1d(self.seq_len, self.kout_size, self.kernel_4, self.stride)
        
        # define pooling layers
        self.pool_1 = nn.MaxPool1d(self.kernel_1, self.stride)
        self.pool_2 = nn.MaxPool1d(self.kernel_2, self.stride)
        self.pool_3 = nn.MaxPool1d(self.kernel_3, self.stride)
        self.pool_4 = nn.MaxPool1d(self.kernel_4, self.stride)
        
        # define fully connected layer
        self.fc = nn.Linear(self.in_size_features_fc(), 1)
        
        
    def forward(self, x):
        # feed the numerical representation x of a text to embedding layer
        x = self.embedding(x)
        
        # apply convolution layer 1
        x1 = self.conv_1(x) # batch size, number of channels, height, width
        x1 = torch.relu(x1)
        x1 = self.pool_1(x1)
        
        # apply convolution layer 2
        x2 = self.conv_2(x)
        x2 = torch.relu(x2)
        x2 = self.pool_2(x2)
        
        # apply convolution layer 3
        x3 = self.conv_3(x)
        x3 = torch.relu(x3)
        x3 = self.pool_3(x3)
        
        # apply convolution layer 4
        x4 = self.conv_4(x)
        x4 = torch.relu(x4)
        x4 = self.pool_4(x4)
        
        # now we concatenate the output of each convolution layer
        union = torch.cat((x1, x2, x3, x4), 2)
        union = union.reshape(union.size(0), -1)
        
        # now we pass the flattened vector 'union' to the fully connected layer
        out = self.fc(union)
        
        # apply drouptou
        out = self.dropout20(out)
        
        # apply sigmoid activation to get a probability distribution
        out = torch.sigmoid(out)
        
        return out.squeeze()
        
        
    def in_size_features_fc(self):
        '''
        Calculates the number of output features after Convolution + Max pooling 
        Convolved_Features = ((embedding_size + (2 * padding) - dilation * (kernel - 1) - 1) / stride) + 1
        Pooled_Features = ((embedding_size + (2 * padding) - dilation * (kernel - 1) - 1) / stride) + 1
        source: https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
        '''
        # Calcualte size of convolved/pooled features for convolution_1/max_pooling_1 features
        out_conv_1 = ((self.emb_size - 1 * (self.kernel_1 - 1) - 1) / self.stride) + 1
        out_conv_1 = math.floor(out_conv_1)
        out_pool_1 = ((out_conv_1 - 1 * (self.kernel_1 - 1) - 1) / self.stride) + 1
        out_pool_1 = math.floor(out_pool_1)
      
        # Calcualte size of convolved/pooled features for convolution_2/max_pooling_2 features
        out_conv_2 = ((self.emb_size - 1 * (self.kernel_2 - 1) - 1) / self.stride) + 1
        out_conv_2 = math.floor(out_conv_2)
        out_pool_2 = ((out_conv_2 - 1 * (self.kernel_2 - 1) - 1) / self.stride) + 1
        out_pool_2 = math.floor(out_pool_2)
      
        # Calcualte size of convolved/pooled features for convolution_3/max_pooling_3 features
        out_conv_3 = ((self.emb_size - 1 * (self.kernel_3 - 1) - 1) / self.stride) + 1
        out_conv_3 = math.floor(out_conv_3)
        out_pool_3 = ((out_conv_3 - 1 * (self.kernel_3 - 1) - 1) / self.stride) + 1
        out_pool_3 = math.floor(out_pool_3)
      
        # Calcualte size of convolved/pooled features for convolution_4/max_pooling_4 features
        out_conv_4 = ((self.emb_size - 1 * (self.kernel_4 - 1) - 1) / self.stride) + 1
        out_conv_4 = math.floor(out_conv_4)
        out_pool_4 = ((out_conv_4 - 1 * (self.kernel_4 - 1) - 1) / self.stride) + 1
        out_pool_4 = math.floor(out_pool_4)
      
        # Returns "flattened" vector (input for fully connected layer)
        return (out_pool_1 + out_pool_2 + out_pool_3 + out_pool_4) * self.kout_size

### Define dataset mapper

In [17]:
class DatasetMapper(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

### Create dataset mapper objects for train and test

In [18]:
train = DatasetMapper(xtrain, y_train)
test = DatasetMapper(xtest, y_test)

### Initialise dataloaders for train and test

In [19]:
loader_train = DataLoader(train, batch_size=params.batch_size)
loader_test = DataLoader(test, batch_size=params.batch_size)

### Define optimizer

In [20]:
cnn_model = CnnTextClassifier(params)
optimizer = optim.RMSprop(cnn_model.parameters(), lr=params.learning_rate)

### Train model

In [21]:
def calculate_accuray(grand_truth, predictions):
    true_positives = 0
    true_negatives = 0
   
    # Gets frequency  of true positives and true negatives
    # The threshold is 0.5
    for true, pred in zip(grand_truth, predictions):
        if (pred >= 0.5) and (true == 1):
            true_positives += 1
        elif (pred < 0.5) and (true == 0):
            true_negatives += 1
        else:
            pass
    # Return accuracy
    return (true_positives+true_negatives) / len(grand_truth)

In [22]:
for epoch in range(params.epochs):
    # set the model in training mode
    cnn_model.train()
    predictions = []
    print('<----epoch', epoch)
    
    # start training the batches
    for x_batch, y_batch in loader_train:
        y_batch = y_batch.type(torch.FloatTensor) # changing datatype to float tensor
        x_batch = x_batch.type(torch.LongTensor)
        y_pred = cnn_model(x_batch) # feed to the model
        loss = binary_cross_entropy(y_pred, y_batch) # calcualte loss
        optimizer.zero_grad() # clearn gradiants
        loss.backward() # calculate gradients
        optimizer.step() # update parameters based on gradients
        predictions += list(y_pred.detach().numpy())
    
    # evaluate for one epoch
    accuracy = calculate_accuray(y_train, predictions)
    print('accuracy', accuracy, '-------->')

<----epoch 0
accuracy 0.5829057087876844 -------->
<----epoch 1
accuracy 0.7171263630532393 -------->
<----epoch 2
accuracy 0.7830339961513791 -------->
<----epoch 3
accuracy 0.8337075048107762 -------->
<----epoch 4
accuracy 0.8521488133418859 -------->
<----epoch 5
accuracy 0.8781270044900578 -------->
<----epoch 6
accuracy 0.8904746632456703 -------->
<----epoch 7
accuracy 0.8968890314304041 -------->
<----epoch 8
accuracy 0.8960872354073124 -------->
<----epoch 9
accuracy 0.8920782552918538 -------->
<----epoch 10
accuracy 0.895445798588839 -------->
<----epoch 11
accuracy 0.9021808851828095 -------->
<----epoch 12
accuracy 0.8938422065426556 -------->


### Evaluate Model

In [25]:
def evaluation(model, loader_test):

    # Set the model in evaluation mode
    model.eval()
    predictions = []

    # Starst evaluation phase
    with torch.no_grad():
        for x_batch, y_batch in loader_test:
            x_batch = x_batch.type(torch.LongTensor)
            y_pred = model(x_batch)
            predictions += list(y_pred.detach().numpy())
    return predictions

In [24]:
test_predictions = evaluation(cnn_model, loader_test)
test_accuracy = calculate_accuray(y_test, test_predictions)
print(test_accuracy)

0.7115384615384616


In [33]:
test_predictions
ebc = EvaluateBinaryClassification(gnd_truths = y_test, predictions = [int(round(y)) for y in test_predictions])
print(ebc.get_full_report())

EvaluateBinaryClassification Object Created

Total Samples	780
Positive Samples	380
Negative Samples	400
True Positive	265
True Negative	290
False Positive	110
False Negative	115
Accuracy	0.7115384615384616
Precision	0.7066666666666667
Recall	0.6973684210526315
F1 Measure	0.7019867549668874
Cohen Kappa Score	0.42250740375123397
Area Under Curve	0.7111842105263156

              precision    recall  f1-score   support

           0       0.72      0.72      0.72       400
           1       0.71      0.70      0.70       380

    accuracy                           0.71       780
   macro avg       0.71      0.71      0.71       780
weighted avg       0.71      0.71      0.71       780

