In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import pickle

In [2]:
#import text data
#english - subtask a
dtf = pd.read_csv(r"en_train_normalised.csv", delimiter=",", na_filter=False, encoding="utf-8")             #training data
dtf_test = pd.read_csv(r"en_test_normalised.csv", delimiter=",", na_filter=False, encoding="utf-8")         #testing data
#dtf_test = pd.read_csv(r"en_dev_normalised.csv", delimiter=",", na_filter=False, encoding="utf-8")         #development 

#spanish text
#dtf = pd.read_csv(r"es_train_normalised.csv", delimiter=",", na_filter=False, encoding="utf-8")
#dtf_test = pd.read_csv(r"es_test_normalised.csv", delimiter=",", na_filter=False, encoding="utf-8")

# embedding

In [4]:
#import the glove embeddings
path  =r"glove.twitter.27B.200d.txt"

glove = pd.read_csv(path, sep=" ", quoting=3, header=None, index_col=0)
glove_embedding = {key: val.values for key, val in glove.T.items()}

In [5]:
#create embedding matrix - look-table for embeddings based on index
def create_embedding_matrix(word_index,embedding_dict,dimension):
  embedding_matrix=np.zeros((len(word_index)+1,dimension))      #maybe better to use rand??
 
  for word,index in word_index.items():
    if word in embedding_dict:
      embedding_matrix[index]=embedding_dict[word]
  return embedding_matrix

In [7]:
#fit the embedding matrix on the testing vocabulary
text = dtf["text"].tolist()
 
tokenizer=tf.keras.preprocessing.text.Tokenizer(split=" ")
tokenizer.fit_on_texts(text)
 
text_token=tokenizer.texts_to_sequences(text)
 
embedding_matrix = create_embedding_matrix(tokenizer.word_index,embedding_dict=glove_embedding,dimension=200)

In [8]:
vocab_size = embedding_matrix.shape[0]              #no. words in dataset --> 11400
vector_size = embedding_matrix.shape[1]             #dimension of vectors --> 200

embedding = nn.Embedding(num_embeddings=vocab_size,embedding_dim=vector_size)

In [9]:
#initialise embedding layer using pre-trained weights
embedding.weight=nn.Parameter(torch.tensor(embedding_matrix,dtype=torch.float32))

# Model structure

In [10]:
class convoNN(nn.ModuleList):
    #def __init__(self, weights_matrix, hidden_size, number_feature):
    def __init__(self, weights_matrix, hidden_size, number_feature):
        super(convoNN, self).__init__()
        # intialize embedding layer with required size
        self.vocab_size = weights_matrix.shape[0]              #no. words in dataset
        self.vector_size = weights_matrix.shape[1]             #dimension of vectors

        #embedding
        self.embedding_layer = nn.Embedding(self.vocab_size + 1, self.vector_size)
        #initialise embedding layer using pre-trained weights
        self.embedding_layer.weight=nn.Parameter(torch.tensor(weights_matrix,dtype=torch.float32))
        #disable learning bc pre-trained
        #self.embedding_layer.weight.requires_grad=False

        self.stride = 2
        self.dropout = nn.Dropout(0.5)
        self.seq_len = 13
        self.out_size = 32

        #kernels
        self.kernel_1 = 2
        self.kernel_2 = 3
        self.kernel_3 = 4
        self.kernel_4 = 5

        #convolution layers
        self.conv_1 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_1, self.stride)     
        self.conv_2 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_2, self.stride)
        self.conv_3 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_3, self.stride)
        self.conv_4 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_4, self.stride)

        #pooling layers
        self.pooling_1 = nn.MaxPool1d(self.kernel_1, self.stride)
        self.pooling_2 = nn.MaxPool1d(self.kernel_2, self.stride)
        self.pooling_3 = nn.MaxPool1d(self.kernel_3, self.stride)
        self.pooling_4 = nn.MaxPool1d(self.kernel_4, self.stride)

        #feature layer
        self.feature_layer = nn.Linear(number_feature, number_feature).float()

        # combination layer
        self.combined_layer = nn.Linear(self.in_features_fc()+number_feature, hidden_size).float()        
        
    	#fully connected layer
        #self.fc = nn.Linear(self.in_features_fc(), 1)              #without features
        self.fc = nn.Linear(hidden_size, 1)                         #with features
    
    def in_features_fc(self):

      self.embedding_size = 200
      # Calculate size of convolved/pooled features for convolution_1/max_pooling_1 features
      out_conv_1 = ((self.embedding_size - 1 * (self.kernel_1 - 1) - 1) / self.stride) + 1
      out_conv_1 = math.floor(out_conv_1)
      out_pool_1 = ((out_conv_1 - 1 * (self.kernel_1 - 1) - 1) / self.stride) + 1
      out_pool_1 = math.floor(out_pool_1)
      
      # Calculate size of convolved/pooled features for convolution_2/max_pooling_2 features
      out_conv_2 = ((self.embedding_size - 1 * (self.kernel_2 - 1) - 1) / self.stride) + 1
      out_conv_2 = math.floor(out_conv_2)
      out_pool_2 = ((out_conv_2 - 1 * (self.kernel_2 - 1) - 1) / self.stride) + 1
      out_pool_2 = math.floor(out_pool_2)
      
      # Calculate size of convolved/pooled features for convolution_3/max_pooling_3 features
      out_conv_3 = ((self.embedding_size - 1 * (self.kernel_3 - 1) - 1) / self.stride) + 1
      out_conv_3 = math.floor(out_conv_3)
      out_pool_3 = ((out_conv_3 - 1 * (self.kernel_3 - 1) - 1) / self.stride) + 1
      out_pool_3 = math.floor(out_pool_3)
      
      # Calculate size of convolved/pooled features for convolution_4/max_pooling_4 features
      out_conv_4 = ((self.embedding_size - 1 * (self.kernel_4 - 1) - 1) / self.stride) + 1
      out_conv_4 = math.floor(out_conv_4)
      out_pool_4 = ((out_conv_4 - 1 * (self.kernel_4 - 1) - 1) / self.stride) + 1
      out_pool_4 = math.floor(out_pool_4)
      
      # Returns "flattened" vector (input for fully connected layer)
      return (out_pool_1 + out_pool_2 + out_pool_3 + out_pool_4) * self.out_size
        
    
    def forward(self, embedding_input, feature_input):
        # embedding_layer
        x = self.embedding_layer(embedding_input)

        #apply convolution layers
        x1 = self.conv_1(x)
        x1 = torch.relu(x1)
        x1 = self.pooling_1(x1)

        x2 = self.conv_2(x)
        x2 = torch.relu(x2)
        x2 = self.pooling_2(x2)

        x3 = self.conv_3(x)
        x3 = torch.relu(x3)
        x3 = self.pooling_3(x3)

        x4 = self.conv_4(x)
        x4 = torch.relu(x4)
        x4 = self.pooling_4(x4)
        
        # feature layer
        feature_layer = self.feature_layer(feature_input)

        #concatenate outputs from convolutional layers
        result = torch.cat((x1, x2, x3, x4),2)
        result = result.reshape(result.size(0),-1)

        #combine output from convolutional layers and number features
        combined = torch.cat((result, feature_layer), 1)
        combined_layer = self.combined_layer(combined)

        #pass through fully connected layer
        out = self.fc(combined_layer)
        #out = self.fc(result)  	            #without number features
        
        out = self.dropout(out)

        #apply activation
        out = torch.sigmoid(out)
        
        # output layer
        return out.squeeze()

In [11]:
#initialise model
convoNN(embedding_matrix, 24, 3)
#model = convoNN(embedding_matrix)

convoNN(
  (embedding_layer): Embedding(11401, 200)
  (dropout): Dropout(p=0.5, inplace=False)
  (conv_1): Conv1d(13, 32, kernel_size=(2,), stride=(2,))
  (conv_2): Conv1d(13, 32, kernel_size=(3,), stride=(2,))
  (conv_3): Conv1d(13, 32, kernel_size=(4,), stride=(2,))
  (conv_4): Conv1d(13, 32, kernel_size=(5,), stride=(2,))
  (pooling_1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (pooling_2): MaxPool1d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (pooling_3): MaxPool1d(kernel_size=4, stride=2, padding=0, dilation=1, ceil_mode=False)
  (pooling_4): MaxPool1d(kernel_size=5, stride=2, padding=0, dilation=1, ceil_mode=False)
  (feature_layer): Linear(in_features=3, out_features=3, bias=True)
  (combined_layer): Linear(in_features=6211, out_features=24, bias=True)
  (fc): Linear(in_features=24, out_features=1, bias=True)
)

# Padding
sentences that are shorter than 13 words are padded with zeroes, sentences that are longer are truncated to length 13

In [12]:
def pad(tweet):
    temp_tweet = list(tweet.split(" "))
    if len(temp_tweet) == 1:
        for x in range(14):
            temp_tweet.append("0")
        tweet = " ".join(tweet)
    if len(temp_tweet) < 13:
        x = 13 - len(temp_tweet)
        for x in range(x):
            temp_tweet.append("0")
        tweet = " ".join(temp_tweet)
        return tweet
    if len(temp_tweet) > 13:
        y = len(temp_tweet) - 13
        tweet = temp_tweet[y:]
        tweet = " ".join(tweet)
        return tweet
    else:
        return tweet

# Load data from training data

In [13]:
#load text
train_samples_prepad = dtf["text"].to_list()
print(len(train_samples_prepad))

#apply padding to training samples
x_train = []
for x in train_samples_prepad:
    new = pad(x)
    x_train.append(new)

len(x_train)

9000


9000

In [14]:
#load features
train_caps = dtf["caps"].to_list()
train_sentiment = dtf["sntmt"].to_list()
train_nohs = dtf["nohs"].to_list()

#check if everything same length
print(len(train_caps) == len(train_sentiment) == len(train_nohs))

#make lists of features into array
train_features = pd.DataFrame(train_caps)
train_features.columns = ["caps"]
train_features["sentiment"] = train_sentiment
train_features["nohs"] = train_nohs
train_features_arr = train_features.to_numpy()
train_features_arr = torch.FloatTensor(train_features_arr)

True


In [15]:
#load labels
#HS TR AG
train_label = dtf["HS"].to_list()    #hate speech
train_label_tr = dtf["TR"].to_list()    #targeted
train_label_ag = dtf["AG"].to_list()    #aggressive

#for subtask B: select required label
#train_label = train_label_tr
#train_label = train_label_ag

# Load data from testing data

In [16]:
#load text from trainibg
test_samples_prepad = dtf_test["text"].to_list()

#pad test sentences
x_test = []
for x in test_samples_prepad:
    new = pad(x)
    x_test.append(new)

len(x_test)

3000

In [17]:
#load features
test_caps = dtf_test["caps"].to_list()
test_sentiment = dtf_test["sntmt"].to_list()
test_nohs = dtf_test["nohs"].to_list()

#check if everything same length
print(len(test_caps) == len(test_sentiment) == len(test_nohs))

#make lists of features into dataframe
test_features = pd.DataFrame(test_caps)
test_features.columns = ["caps"]
test_features["sentiment"] = test_sentiment
test_features["nohs"] = test_nohs
test_features_arr = test_features.to_numpy()
test_features_arr = torch.FloatTensor(test_features_arr)

True


In [18]:
test_label = dtf_test["HS"].to_list()
test_label_tr = dtf_test["TR"].to_list()
test_label_ag = dtf_test["AG"].to_list()

#for subtask B: select required label
#test_label = test_label_tr
#test_label = test_label_ag

# Get IDs

In [19]:
#get IDs per word - input for embedding layer of model
def get_ids_from_words(samples, assignment_dict):
    final_ids = []
    for x in samples:
        temp = x.split()
        ids = []
        for sample in temp:
            if sample in assignment_dict.keys():
                ids.append(assignment_dict[sample])
            else:
                ids.append(0)
        final_ids.append(ids)
    return torch.tensor(final_ids, dtype=torch.long)

In [20]:
train_ids = get_ids_from_words(x_train, tokenizer.word_index)
test_ids = get_ids_from_words(x_test, tokenizer.word_index)

# Training

In [21]:
class DatasetMapper(Dataset):
    def __init__(self, tweet, features, label):
        self.tweet = tweet            
        self.features = features            
        self.label = label
    
    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, idx):
        return  self.tweet[idx], self.features[idx], self.label[idx]

In [22]:
train = DatasetMapper(train_ids, train_features_arr, train_label)
test = DatasetMapper(test_ids, test_features_arr, test_label)
loader_train = DataLoader(train, batch_size=32)
loader_test = DataLoader(test, batch_size=32)

In [23]:
def train(model, train_ids, train_features_arr, train_label, test_ids, test_features_arr, test_label):
    
    train = DatasetMapper(train_ids, train_features_arr, train_label)
    test = DatasetMapper(test_ids, test_features_arr, test_label)   
   
    # Initialize loaders
    loader_train = DataLoader(train, batch_size=32)
    loader_test = DataLoader(test, batch_size=32)
   
   #set learning rate and select optimiser
    learning_rate = 0.001
    optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
   
    # Starts training phase
    for epoch in range(100):
        # Set model in training model
        model.train()
        predictions = []
        # Starts batch training
        for tweet_batch, features_batch, label_batch in loader_train:
      
            label_batch = label_batch.type(torch.FloatTensor)
         
            # Feed the model
            label_pred = model(tweet_batch, features_batch)
         
            # Loss calculation
            loss = F.binary_cross_entropy(label_pred, label_batch)
         
            optimizer.zero_grad()
         
            # backwards pass
            loss.backward()
         
            # Gradients update
            optimizer.step()
         
            # Save predictions
            predictions += list(label_pred.detach().numpy())
      
        # Metrics calculation
        train_accuary = accuracy_score(train_label, np.around(predictions))

        if (epoch+1) % 10 == 0:
            print("Epoch: %d, loss: %.5f, Train accuracy: %.5f" % (epoch+1, loss.item(), train_accuary))

In [24]:
#create model instance
model = convoNN(embedding_matrix, 24, 3)

In [None]:
#train model
train(model, train_ids, train_features_arr, train_label, test_ids, test_features_arr, test_label)

# Evaluation

In [None]:
#get predicted labels for the test set
pred_labels = model(test_ids, test_features_arr)
predictions_test_binary = list()

#round values: =<0.5 --> 0, > 0.5 --> 1
for x in pred_labels:
    y = torch.round(x)
    y = y.item()
    predictions_test_binary.append(y)

In [None]:
#evaluate performance of model on subtask A
def evaluate_a(predictions_test,gold_data):
    levels = ["HS"]
    ground_truth = gold_data

    predicted = predictions_test
    ground_truth["predicted"] = predicted

    # Check length files
    if (len(ground_truth) != len(predicted)):
        sys.exit('Prediction and gold data have different number of lines.')

    # Check predicted classes
    for c in levels:
        gt_class = list(ground_truth[c].value_counts().keys())
        for value in predicted:
            if not value in gt_class:
                sys.exit("Wrong value in " + c + " prediction column.")

    # Compute Performance Measures HS
    acc_hs = accuracy_score(ground_truth["HS"], ground_truth["predicted"])
    [p_nohs, p_hs], [r_nohs, r_hs], [f1_nohs, f1_hs], support = precision_recall_fscore_support(ground_truth["HS"], ground_truth["predicted"], pos_label = 1)
    p_macro, r_macro, f1_macro, support = precision_recall_fscore_support(ground_truth["HS"], ground_truth["predicted"], average = "macro")

    return acc_hs, p_hs, p_nohs, r_hs, r_nohs, f1_hs, f1_nohs, p_macro, r_macro, f1_macro

In [None]:
#evaluate performance of model on subtask b
def evaluate_b(pred,gold):
    levels = ["HS", "TR", "AG"]

    ground_truth = gold
    predicted = pred

    # Check length files
    if (len(ground_truth) != len(predicted)):
        sys.exit('Prediction and gold data have different number of lines.')

    # Check predicted classes
    for c in levels:
        gt_class = list(ground_truth[c].value_counts().keys())
        if not (predicted[c].isin(gt_class).all()):
            sys.exit("Wrong value in " + c + " prediction column.")

    data = pd.merge(ground_truth, predicted, on="id")

    if (len(ground_truth) != len(data)):
        sys.exit('Invalid tweet IDs in prediction.')

    # Compute Performance Measures
    acc_levels = dict.fromkeys(levels)
    p_levels = dict.fromkeys(levels)
    r_levels = dict.fromkeys(levels)
    f1_levels = dict.fromkeys(levels)
    for l in levels:
        acc_levels[l] = accuracy_score(data[l + "_x"], data[l + "_y"])
        p_levels[l], r_levels[l], f1_levels[l], _ = precision_recall_fscore_support(data[l + "_x"], data[l + "_y"], average="macro")
    macro_f1 = np.mean(list(f1_levels.values()))

    # Compute Exact Match Ratio
    check_emr = np.ones(len(data), dtype=bool)
    for l in levels:
        check_label = data[l + "_x"] == data[l + "_y"]
        check_emr = check_emr & check_label
    emr = sum(check_emr) / len(data)

    return macro_f1, emr, acc_levels, p_levels, r_levels, f1_levels

In [None]:
#print evaluation A
acc_hs, p_hs, p_nohs, r_hs, r_nohs, f1_hs, f1_nohs, p_macro, r_macro, f1_macro = evaluate_a(predictions_test_binary, dtf_test)

print("\t".join(["{}".format(x) for x in ["acc.", "P (1)", "P (0)", "R (1)", "R (0)", "F1 (1)", "F1 (0)", "P (avg)", "R (avg)", "F1 (avg)"]]))
print("\t".join(["{0:.3f}".format(x) for x in [acc_hs, p_hs, p_nohs, r_hs, r_nohs, f1_hs, f1_nohs, p_macro, r_macro, f1_macro]]))

# save model

In [None]:
#filename = r"es_model.sav"
# pickle.dump(model, open(filename, "wb"))

# unpickle model and test

In [None]:
loaded_model = pickle.load(open(r"es_model.sav", "rb"))

In [None]:
predicted_labels = loaded_model(test_ids, test_features_arr)
predictions_test_binary_x = list()

for x in predicted_labels:
    y = torch.round(x)
    y = y.item()
    predictions_test_binary_x.append(y)

# for subtask B: all dimensions must be saved to be evaluated using evaluation B - for individual evaluation of dimensions use evaluation A
#tr = predictions_test_binary_x
#ag = predictions_test_binary_x

In [None]:
pred_dtf = dtf_test
pred_dtf = pred_dtf.drop(columns=["TR", "AG"], axis = 1)

pred_dtf["TR"] = tr
pred_dtf["AG"] = ag

In [None]:
#SUBTASK A EVAL

acc_hs, p_hs, p_nohs, r_hs, r_nohs, f1_hs, f1_nohs, p_macro, r_macro, f1_macro = evaluate_a(predictions_test_binary_x, dtf_test)

print("\t".join(["{}".format(x) for x in ["acc.", "P (1)", "P (0)", "R (1)", "R (0)", "F1 (1)", "F1 (0)", "P (avg)", "R (avg)", "F1 (avg)"]]))
print("\t".join(["{0:.3f}".format(x) for x in [acc_hs, p_hs, p_nohs, r_hs, r_nohs, f1_hs, f1_nohs, p_macro, r_macro, f1_macro]]))

acc.	P (1)	P (0)	R (1)	R (0)	F1 (1)	F1 (0)	P (avg)	R (avg)	F1 (avg)
0.555	0.478	0.657	0.649	0.487	0.551	0.560	0.568	0.568	0.555


In [None]:
#SUBTASK B EVAL
macro_f1, emr, acc_levels, p_levels, r_levels, f1_levels = evaluate_b(pred_dtf, dtf_test)

print("\t".join(["{}".format(x) for x in ["acc_HS", "acc_TR", "acc_AG", "p_HS", "p_TR", "p_AG", "r_HS", "r_TR", "r_AG", "f1_HS", "f1_TR", "f1_AG", "emr", "macro_f1"]]))
print("\t".join(["{0:.3f}".format(x) for x in [acc_levels["HS"], acc_levels["TR"], acc_levels["AG"], p_levels["HS"], p_levels["TR"], p_levels["AG"], r_levels["HS"], r_levels["TR"], r_levels["AG"], f1_levels["HS"], f1_levels["TR"], f1_levels["AG"], emr, macro_f1]]))