In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install the packages this notebook needs that may be missing from the image.
# NOTE(review): pandas is already imported in the first cell, so the pandas
# install and the re-import below look redundant - confirm before removing.
# NOTE(review): versions are not pinned; consider `%pip install -q pkg==x.y.z`.
!pip install pandas
!pip install emoji
import pandas as pd

In [None]:
#PRE PROCESSING

import re
import emoji
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus that pre_process reads through
# stopwords.words('english').
nltk.download('stopwords')

# Cache for the stop-word set: stopwords.words() is called once instead of on
# every pre_process call.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


# Ordered (pattern, replacement) pairs. Order matters: e-mail addresses must be
# replaced before bare @mentions, and percentages/money/dates/times before the
# phone pattern sees the remaining digits.
_PLACEHOLDER_RULES = (
    (r'https?://\S+|www\.\S+', 'url'),        # URLs
    (r'\S+@\S+', 'email'),                    # e-mail addresses
    (r'@\S+', 'user'),                        # @mentions
    (r'\d+%', 'percentage'),                  # percentages
    (r'\$\d+(\.\d+)?', 'money'),              # dollar amounts
    (r'\d+/\d+/\d+', 'date'),                 # dates such as 1/2/2020
    (r'\d+:\d+', 'time'),                     # clock times
    (r'\d{3}-\d{3}-\d{4}', 'phone'),          # phone numbers
)


def pre_process(text):
    """Normalise a raw tweet into a lower-cased, stop-word-free string.

    Steps, in order:
      1. collapse every run of whitespace (newlines, tabs, spaces) to one space;
      2. replace URLs, e-mails, @mentions, percentages, money, dates, times and
         phone numbers with placeholder words;
      3. split hashtags into their lower-case/CamelCase word parts;
      4. shrink words made of one unit repeated three or more times ("hahaha" -> "ha");
      5. turn emojis into their textual names, lower-case everything;
      6. drop English stop words.

    Args:
        text: the raw tweet as a str.

    Returns:
        The cleaned tweet, with tokens separated by single spaces.
    """
    # Whitespace is replaced by a space rather than deleted: deleting newlines
    # glued neighbouring words together and let the URL pattern swallow the
    # first word of the following line. The former replace('\s', '') only
    # matched a literal backslash followed by "s" and is dropped.
    text = re.sub(r'\s+', ' ', text).strip()

    for pattern, placeholder in _PLACEHOLDER_RULES:
        text = re.sub(pattern, placeholder, text)

    # Hashtags: keep only the alphabetic word pieces, separated by spaces.
    text = re.sub(r'#(\S+)', lambda x: ' '.join(re.findall(r'[A-Z]?[a-z]+', x.group())), text)
    # Elongated words: a whole word that is one unit repeated 3+ times.
    text = re.sub(r'\b(\w+?)\1{2,}\b', r'\1', text)
    text = emoji.demojize(text)  # emojis -> ":name:" text
    text = text.lower()

    stop_words = _english_stop_words()
    # split() without an argument drops the empty tokens that split(" ")
    # produced for consecutive spaces.
    final_words = [word for word in text.split() if word not in stop_words]
    return ' '.join(final_words)

In [None]:
# Column names for the header-less Twitter entity-sentiment CSV.
DATASET_COLUMNS = ["id", "entity", "sentiment", "tweet"]

# HatEval training data: the train and trial splits are stacked into one frame
# with a fresh 0..n-1 index.
hateEvalP_train = pd.read_csv('/kaggle/input/mydataset/train_en.tsv', sep='\t')
hateTrail_train = pd.read_csv('/kaggle/input/mydataset/trial_en.tsv', sep='\t')
hateEval_train = pd.concat((hateEvalP_train, hateTrail_train), ignore_index=True)

# HatEval development split.
hateEval_dev = pd.read_csv('/kaggle/input/mydataset/dev_en.tsv', sep='\t')

# Polarity (sentiment) data for the auxiliary task.
polarity_train = pd.read_csv(
    '/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv',
    names=DATASET_COLUMNS,
)

In [None]:
# Row counts: hate-speech training frame first, then the polarity frame.
for frame in (hateEval_train, polarity_train):
    print(len(frame))

In [None]:
# Column index of each frame: hate-speech training frame first, then polarity.
for frame in (hateEval_train, polarity_train):
    print(frame.columns)

In [None]:
# Store the cleaned version of every training tweet in a new 'pre_process' column.
raw_train_tweets = hateEval_train['text']
hateEval_train['pre_process'] = raw_train_tweets.apply(pre_process)

In [None]:
# Split the polarity tweets by sentiment. Each list now holds the class its
# name says: previously `neutral_polar` was filled from 'Positive' rows and
# `postive_polar` from 'Neutral' rows, so the names and the printed counts were
# swapped.
# (The `postive_polar` spelling is kept because later cells use that name.)
neutral_polar = polarity_train.loc[polarity_train['sentiment'] == 'Neutral', 'tweet'].tolist()
postive_polar = polarity_train.loc[polarity_train['sentiment'] == 'Positive', 'tweet'].tolist()
negative_polar = polarity_train.loc[polarity_train['sentiment'] == 'Negative', 'tweet'].tolist()

# Class sizes, in the order neutral / positive / negative.
print(len(neutral_polar))
print(len(postive_polar))
print(len(negative_polar))

In [None]:
import random

# Build a class-balanced list of [cleaned_tweet, class_id] pairs for the
# polarity task: ids are 0 for neutral_polar, 1 for postive_polar and
# 2 for negative_polar.
polarity_data = []
no = 1900  # maximum number of tweets taken from each class
SHUFFLE_SEED = 42  # fixed seed so the shuffled order is the same on every run

# Never index past the end of the shortest class list (the fixed range(no)
# raised IndexError whenever a class had fewer than `no` tweets).
per_class = min(no, len(neutral_polar), len(postive_polar), len(negative_polar))
class_sources = ((neutral_polar, 0), (postive_polar, 1), (negative_polar, 2))

for i in range(per_class):
    for tweets, class_id in class_sources:
        # Non-string entries (e.g. missing values) are skipped.
        if isinstance(tweets[i], str):
            polarity_data.append([pre_process(tweets[i]), class_id])

# A dedicated Random instance gives a reproducible shuffle without touching
# the global random state.
random.Random(SHUFFLE_SEED).shuffle(polarity_data)
print(len(polarity_data))

In [None]:
# Separate the [text, label] pairs into two parallel lists.
polarityData = [text for text, _class_id in polarity_data]
polarity_label = [class_id for _text, class_id in polarity_data]

# Both lengths are printed so they can be checked against each other.
for column in (polarityData, polarity_label):
    print(len(column))

In [None]:
# Peek at the first five cleaned tweets and the first twenty labels.
preview_texts = polarityData[:5]
preview_labels = polarity_label[:20]
print(preview_texts, preview_labels)

In [None]:
from transformers import BertTokenizer, BertModel
# Load the uncased BERT-base tokenizer (used by the dataset class) and a plain
# BertModel whose architecture is printed below for inspection.
# NOTE(review): `model` is rebound to a MultiTaskModel in a later cell, which
# loads its own BERT weights - this copy appears to be used only for the print.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

for loaded in (model, tokenizer):
    print(loaded)

In [None]:
import torch.nn as nn
import torch
import torch.optim as optim

In [None]:
from torch.utils.data import Dataset, DataLoader

BatchSize = 8


class MultiTaskData(Dataset):
    """Tokenised tweets paired with one-hot labels for one of the tasks.

    Args:
        tweets: sequence of tweet strings (list or pandas Series).
        labels: sequence of labels aligned with `tweets`. Integer class ids for
            the polarity and hate tasks; keys of `dic` for the sentiment task.
        no: task id - 1 = polarity (3 classes), 2 = sentiment (7 classes),
            anything else = hate speech (2 classes).
        dic: mapping from sentiment label to class id; only read when no == 2.

    Each item is a tuple (input_ids, attention_mask, one_hot_label); the first
    two come from the module-level `tokenizer`, padded/truncated to 80 tokens.
    """

    # Number of classes for the task ids that are not the hate-speech default.
    _NUM_CLASSES = {1: 3, 2: 7}
    _HATE_NUM_CLASSES = 2

    def __init__(self, tweets, labels, no, dic):
        # Materialise as lists so that `ind` is always a position. Indexing a
        # pandas Series with an int is a label lookup and fails (or returns the
        # wrong row) when the Series does not have a default 0..n-1 index.
        self.tweets = list(tweets)
        self.labels = list(labels)
        self.no = no
        self.sentiToLabel = dic

    def __len__(self):
        """Return the number of tweets in the dataset."""
        return len(self.tweets)

    def _one_hot(self, ind):
        """Return the float one-hot target vector for the item at position `ind`."""
        class_id = self.labels[ind]
        if self.no == 2:  # sentiment labels are mapped to ids through the dict
            class_id = self.sentiToLabel[class_id]
        target = torch.zeros(self._NUM_CLASSES.get(self.no, self._HATE_NUM_CLASSES))
        target[class_id] = 1
        return target

    def __getitem__(self, ind):
        """Tokenise tweet `ind` and return (input_ids, attention_mask, one_hot_label)."""
        output = tokenizer(self.tweets[ind], max_length=80, padding='max_length', truncation=True, return_tensors='pt')
        # return_tensors='pt' adds a leading batch dimension of size 1; drop it.
        input_ids = output['input_ids'][0]
        attention_mask = output['attention_mask'][0]

        return input_ids, attention_mask, self._one_hot(ind)
            
            
# Wrap both training sets in the dataset class. Task id 3 selects the 2-class
# hate-speech targets and task id 1 the 3-class polarity targets; neither task
# needs a label dictionary.
hateEval_data = MultiTaskData(
    tweets=hateEval_train['pre_process'],
    labels=hateEval_train['HS'],
    no=3,
    dic=None,
)
polarity_data = MultiTaskData(tweets=polarityData, labels=polarity_label, no=1, dic=None)

In [None]:
# Batch both training datasets with the shared batch size (no shuffling is
# requested, so batches come in dataset order).
hateEval_load = DataLoader(hateEval_data, batch_size=BatchSize)
polarity_load = DataLoader(polarity_data, batch_size=BatchSize)

# Number of batches per loader.
for loader in (hateEval_load, polarity_load):
    print(len(loader))

In [None]:
# Show the first item of each dataset; every item is a tuple of
# (input_ids, attention_mask, one-hot label).
for task_dataset in (polarity_data, hateEval_data):
    print(task_dataset[0])

In [None]:
class MultiTaskModel(nn.Module):
    """Shared BERT encoder feeding two separate feed-forward classification heads.

    Head "1x" (linear11/12/13) serves the polarity task and head "3x"
    (linear31/32/33) the hate-speech task. Each head is
    Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout -> Linear and
    returns raw logits (no softmax), which is what nn.CrossEntropyLoss expects.

    Args:
        hidden_dim: width of the two hidden layers in every head.
        d_model: size of the BERT hidden state fed into the heads.
        output1: number of polarity classes.
        output2: number of sentiment classes (accepted but no head is built for it).
        output3: number of hate-speech classes.
        drop_prob: dropout probability used inside the heads.
    """

    def __init__(self, hidden_dim, d_model, output1, output2, output3, drop_prob = 0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

        # Task numbering: 1 - polarity, 2 - sentiment, 3 - hate.
        # Polarity head.
        self.linear11 = nn.Linear(d_model, hidden_dim)
        self.dropout11 = nn.Dropout(drop_prob)
        self.linear12 = nn.Linear(hidden_dim, hidden_dim)
        self.dropout12 = nn.Dropout(drop_prob)
        self.linear13 = nn.Linear(hidden_dim, output1)

        # Hate-speech head.
        self.linear31 = nn.Linear(d_model, hidden_dim)
        self.dropout31 = nn.Dropout(drop_prob)
        self.linear32 = nn.Linear(hidden_dim, hidden_dim)
        self.dropout32 = nn.Dropout(drop_prob)
        self.linear33 = nn.Linear(hidden_dim, output3)

    def _run_head(self, features, first, first_drop, second, second_drop, final):
        """Push `features` through one three-layer head and return its logits."""
        hidden = first_drop(self.relu(first(features)))
        hidden = second_drop(self.relu(second(hidden)))
        return final(hidden)

    def forward(self, input_ids, masks, task_no):
        """Encode the batch with BERT and classify it with the selected head.

        Args:
            input_ids: token ids passed to BERT.
            masks: attention mask passed to BERT.
            task_no: selector tensor; the polarity head is used when
                task_no[0] == 1, otherwise the hate-speech head.

        Returns:
            Logits from the chosen head, one row per input sequence.
        """
        encoded = self.bert(input_ids, masks)
        token_states = encoded['last_hidden_state']  # (batch, seq_len, d_model)
        cls = token_states[:, 0, :]                  # first-token ([CLS]) vector per sequence

        if task_no[0] == 1:
            head = (self.linear11, self.dropout11, self.linear12, self.dropout12, self.linear13)
        else:
            head = (self.linear31, self.dropout31, self.linear32, self.dropout32, self.linear33)

        return self._run_head(cls, *head)

 

# Run on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# hidden_dim=200, d_model=768, and 3 / 7 / 2 output classes for the
# polarity / sentiment / hate heads.
model = MultiTaskModel(200, 768, 3, 7, 2)
model = model.to(device)

cross_criterion = nn.CrossEntropyLoss()
bce_criterion = nn.BCELoss()


def _task_optimizer(head_layers):
    """Return an AdamW optimizer over the shared BERT encoder plus one task head.

    One parameter group is created for the encoder and one per linear layer in
    `head_layers`. The Dropout layers are left out: they have no parameters, so
    their groups were empty and had no effect on training.
    """
    groups = [{'params': model.bert.parameters()}]
    groups.extend({'params': layer.parameters()} for layer in head_layers)
    return optim.AdamW(groups, lr=1e-4, weight_decay=0.01)


# Each task gets its own optimizer (and therefore its own AdamW state for the
# shared encoder).
optimizer_polarity = _task_optimizer((model.linear11, model.linear12, model.linear13))
optimizer_hate = _task_optimizer((model.linear31, model.linear32, model.linear33))


In [None]:
def getData(data, i):
    """Return batch number `i` of `data` as (input_ids, mask, labels).

    Args:
        data: a DataLoader (or any sized iterable) yielding 3-element batches.
        i: zero-based batch index.

    Returns:
        The three elements of batch `i`, or (None, None, None) when `i` is
        outside 0..len(data)-1.

    For a DataLoader that reads its dataset sequentially with a fixed batch
    size, the batch is built directly from the dataset rows it covers.
    Iterating the loader from the start on every call re-created (and
    re-tokenised) all preceding batches, which made an epoch quadratic in the
    number of batches. Any other iterable falls back to that plain scan.
    """
    from torch.utils.data import SequentialSampler

    if i < 0 or i >= len(data):
        return None, None, None

    dataset = getattr(data, 'dataset', None)
    batch_size = getattr(data, 'batch_size', None)
    sequential = isinstance(getattr(data, 'sampler', None), SequentialSampler)

    if dataset is not None and batch_size and sequential:
        start = i * batch_size
        stop = min(start + batch_size, len(dataset))  # the last batch may be short
        input_ids, mask, labels = data.collate_fn([dataset[k] for k in range(start, stop)])
        return input_ids, mask, labels

    # Fallback: shuffled / custom-sampled loaders or plain iterables.
    for j, (input_ids, mask, labels) in enumerate(data):
        if i == j:
            return input_ids, mask, labels

    return None, None, None

In [None]:
# Prepare the evaluation dataset: a small prefix of the dev split, cleaned with
# pre_process and served as one single batch.
data = list(hateEval_dev['text'])
label = list(hateEval_dev['HS'])
dev_data, dev_label = [], []

for raw_text, gold in zip(data, label):
    # NOTE(review): the `> 32` test stops only once 33 tweets have been
    # collected - confirm whether 32 was intended.
    if len(dev_data) > 32:
        break
    if isinstance(raw_text, str):  # skip non-string (e.g. missing) rows
        dev_data.append(pre_process(raw_text))
        dev_label.append(gold)

hateEval_dev_dataset = MultiTaskData(dev_data, dev_label, 3, None)
# Batch size equals the subset size, so the loader yields exactly one batch.
hateEval_dev_load = DataLoader(hateEval_dev_dataset, len(dev_data))

In [None]:
# Install torcheval (nightly build), which provides the metric classes used in
# the training and test cells.
# NOTE(review): a nightly build is unpinned and may change between runs -
# consider pinning a released version.
!pip install torcheval-nightly

In [None]:
import torcheval
from torcheval.metrics.functional import multiclass_confusion_matrix

def _metric_trio(num_classes):
    """Return (accuracy, F1, confusion-matrix) torcheval metrics on `device`."""
    return (
        torcheval.metrics.MulticlassAccuracy(num_classes = num_classes, device = device),
        torcheval.metrics.MulticlassF1Score(num_classes = num_classes, device = device),
        torcheval.metrics.MulticlassConfusionMatrix(num_classes = num_classes, device = device),
    )


# One trio per tracked stream: polarity (3 classes), hate-speech training and
# hate-speech dev (2 classes each).
polarity_accuracy, polarity_f1Score, polarity_confusion_matrix = _metric_trio(3)
hate_accuracy, hate_f1Score, hate_confusion_matrix = _metric_trio(2)
hate_accuracy_dev, hate_f1Score_dev, hate_confusion_matrix_dev = _metric_trio(2)

# Training configuration.
upper_limit = 256  # batches drawn from each loader per epoch (original note: "175 -- 32 size")
EPOCHS = 3
threshold = 0.5

# Mean hate-speech training loss of every finished epoch.
lossPerEpoch = []


def _task_flag(slot):
    """Return a length-3 selector tensor on `device` with a 1 at position `slot`."""
    flag = torch.zeros(3)
    flag[slot] = 1
    return flag.to(device)


# The model uses the polarity head when selector[0] == 1 and the hate head
# otherwise; both selectors are constant, so they are built once.
polarity_task = _task_flag(0)
hate_task = _task_flag(2)

for i in range(EPOCHS):

    lossList = []

    # BERT stays frozen for the first two epochs and is fine-tuned from the
    # third one on. Set once per epoch instead of on every step.
    train_bert = i >= 2
    for param in model.bert.parameters():
        param.requires_grad = train_bert

    model.train()
    for j in range(upper_limit):
        loss = 0

        # ---- polarity step ------------------------------------------------
        input_ids, masks, label = getData(polarity_load, j)
        if input_ids is not None:
            input_ids, masks, label = input_ids.to(device), masks.to(device), label.to(device)
            output = model(input_ids, masks, polarity_task)

            # One-hot float targets are passed as class probabilities.
            loss = cross_criterion(output, label)
            print(loss, 0)
            optimizer_polarity.zero_grad()
            loss.backward()
            optimizer_polarity.step()

        # ---- hate-speech step ---------------------------------------------
        input_ids, masks, label = getData(hateEval_load, j)
        if input_ids is not None:
            input_ids, masks, label = input_ids.to(device), masks.to(device), label.to(device)
            output = model(input_ids, masks, hate_task)
            predictions = torch.argmax(output, dim=1)
            label_ground = torch.argmax(label, dim=1)
            print(predictions, label_ground)

            hate_accuracy.update(predictions, label_ground)
            hate_f1Score.update(predictions, label_ground)
            hate_confusion_matrix.update(predictions, label_ground)

            loss = cross_criterion(output, label)
            # Store a detached copy so the list does not keep autograd
            # references alive for the whole epoch.
            lossList.append(loss.detach())
            print(loss, 2)
            optimizer_hate.zero_grad()
            loss.backward()
            optimizer_hate.step()

    accuracy = hate_accuracy.compute()
    f1_score = hate_f1Score.compute()
    confusion_matrix = hate_confusion_matrix.compute()

    # ---- evaluation on the dev subset -------------------------------------
    model.eval()
    eval_loss = 0

    # No gradients are needed here; skipping them avoids building a graph for
    # the whole dev batch.
    with torch.no_grad():
        for input_ids, masks, labels in hateEval_dev_load:
            input_ids, masks, labels = input_ids.to(device), masks.to(device), labels.to(device)
            output = model(input_ids, masks, hate_task)
            predictions = torch.argmax(output, dim=1)
            label_ground = torch.argmax(labels, dim=1)
            print(predictions, label_ground)

            eval_loss += cross_criterion(output, labels)
            hate_accuracy_dev.update(predictions, label_ground)
            hate_f1Score_dev.update(predictions, label_ground)
            hate_confusion_matrix_dev.update(predictions, label_ground)

    accuracy_dev = hate_accuracy_dev.compute()
    f1_score_dev = hate_f1Score_dev.compute()
    confusion_matrix_dev = hate_confusion_matrix_dev.compute()

    # Metrics are per epoch, so clear their state before the next one.
    hate_accuracy.reset()
    hate_f1Score.reset()
    hate_confusion_matrix.reset()

    hate_accuracy_dev.reset()
    hate_f1Score_dev.reset()
    hate_confusion_matrix_dev.reset()

    # Mean hate-speech training loss of this epoch, kept as a tensor because
    # the plotting cell calls .item() on it. NaN marks an epoch with no
    # hate-speech batch instead of dividing by zero.
    if lossList:
        lossPerEpoch.append(torch.stack(lossList).mean())
    else:
        lossPerEpoch.append(torch.tensor(float('nan')))

    print(i, eval_loss, loss)
    print(accuracy, f1_score, confusion_matrix)
    print(accuracy_dev, f1_score_dev, confusion_matrix_dev)
        
        


In [None]:
# Test tweets and their gold labels live in two separate files; the label file
# has no header row.
hateEval_test = pd.read_csv('/kaggle/input/mydataset/test_en.tsv', sep='\t')
hateEval_labels = pd.read_csv('/kaggle/input/mydataset/en_a.tsv', sep='\t', header=None)

# Clean the test tweets the same way as the training tweets.
hateEval_test['pre_process'] = hateEval_test['text'].apply(pre_process)

# NOTE(review): tweets and labels are paired purely by row position (column 1
# of the label file) - confirm both files are in the same order.
hateEval_data = MultiTaskData(
    tweets=hateEval_test['pre_process'],
    labels=hateEval_labels[1],
    no=3,
    dic=None,
)
hateEval_test_load = DataLoader(hateEval_data, batch_size=8)

In [None]:
# Accuracy, F1 and confusion matrix for the 2-class hate-speech test set.
hate_accuracy_test, hate_f1Score_test, hate_confusion_matrix_test = (
    metric_cls(num_classes = 2, device = device)
    for metric_cls in (
        torcheval.metrics.MulticlassAccuracy,
        torcheval.metrics.MulticlassF1Score,
        torcheval.metrics.MulticlassConfusionMatrix,
    )
)

In [None]:
# Selector tensor for the hate-speech head (any value other than 1 in slot 0).
task_no = torch.zeros(3)
task_no[2] = 1
task_no = task_no.to(device)

# Put the model in eval mode explicitly (dropout off) rather than relying on
# the state the training cell happened to leave behind, and skip gradient
# tracking since nothing is optimised here.
model.eval()
with torch.no_grad():
    for input_ids, masks, labels in hateEval_test_load:
        input_ids, masks, labels = input_ids.to(device), masks.to(device), labels.to(device)
        output = model(input_ids, masks, task_no)
        predictions = torch.argmax(output, dim=1)
        label_ground = torch.argmax(labels, dim=1)  # one-hot -> class id
        print(predictions, label_ground)

        hate_accuracy_test.update(predictions, label_ground)
        hate_f1Score_test.update(predictions, label_ground)
        hate_confusion_matrix_test.update(predictions, label_ground)

accuracy_test = hate_accuracy_test.compute()
f1_score_test = hate_f1Score_test.compute()
confusion_matrix_test = hate_confusion_matrix_test.compute()

# Clear the metric state so re-running this cell does not double count.
hate_accuracy_test.reset()
hate_f1Score_test.reset()
hate_confusion_matrix_test.reset()

print(accuracy_test, f1_score_test, confusion_matrix_test)

In [None]:
import torch

# Persist the whole trained model object to the Kaggle working directory.
# NOTE(review): this pickles the full module, so loading it needs the
# MultiTaskModel class definition available; saving model.state_dict() is the
# more portable alternative.
filepath = '/kaggle/working/hateSpeechPolarity.pth'
torch.save(obj=model, f=filepath)

In [None]:
import matplotlib.pyplot as plt
# Plot the mean hate-speech training loss of every epoch.
# lossPerEpoch holds 0-dim tensors, so each is converted with .item().
lossItems = torch.tensor([epoch_loss.item() for epoch_loss in lossPerEpoch])
epochs = torch.arange(len(lossItems)) + 1  # 1-based epoch numbers
print(epochs)

plt.plot(epochs, lossItems)
plt.xlabel("epochs")
plt.ylabel("loss")
plt.title('Epochs vs Loss')
# One tick per epoch (this call was previously issued twice).
plt.xticks(range(1, len(lossItems) + 1))
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Loss values typed in by hand rather than computed in this session.
# NOTE(review): this rebinds `lossItems` and `epochs` from the previous cell -
# confirm the numbers still match the latest training run.
lossItems = [0.605, 0.508, 0.497]
epochs = [1, 2, 3]

# Same figure as before, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(epochs, lossItems)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_xticks(range(1, len(lossItems) + 1))
ax.set_title('Epochs vs Loss')
plt.show()
