In [17]:
!pip install transformers



In [18]:
import torch

if torch.cuda.is_available():    
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [19]:
import pandas as pd
import math
from sklearn import preprocessing





task = 'task_2'

#2019 datasets also

df_test = pd.read_csv("hasoc2019_en_test-2919.tsv",sep='\t')
df_train = pd.read_csv("english_dataset.tsv",sep="\t")
df_train = df_train.dropna()



print(len(df_train))
print(df_train.head())

total_sentences = list(df_train['text'].values)
total_labels = list(df_train[task].values)



test_sentences = list(df_test['text'].values)
test_labels = list(df_test[task].values)

def clean_text(sentences):
    for index,line in enumerate(sentences):
        if "\n" in line:
            sentences[index] = line.replace("\n","")
    return sentences
        
total_sentences = clean_text(total_sentences)
test_sentences = clean_text(test_sentences)

def clean_labels(labels):
    new_list= []
    for value in labels:
        new_list.append(value.strip())
    return new_list

total_labels = clean_labels(total_labels)
test_labels = clean_labels(test_labels)

le = preprocessing.LabelEncoder()
le.fit(total_labels)
encoded_labels = le.transform(total_labels)
encoded_test_labels = le.transform(test_labels)
print(set(encoded_labels))

print(len(total_sentences),len(encoded_labels),len(test_sentences),len(encoded_test_labels))

print(df_test)

5852
      text_id                                               text  ... task_2 task_3
0  hasoc_en_1  #DhoniKeepsTheGlove | WATCH: Sports Minister K...  ...   NONE   NONE
1  hasoc_en_2  @politico No. We should remember very clearly ...  ...   HATE    TIN
2  hasoc_en_3  @cricketworldcup Guess who would be the winner...  ...   NONE   NONE
3  hasoc_en_4  Corbyn is too politically intellectual for #Bo...  ...   NONE   NONE
4  hasoc_en_5  All the best to #TeamIndia for another swimmin...  ...   NONE   NONE

[5 rows x 5 columns]
{0, 1, 2, 3}
5852 5852 1153 1153
             text_id  ... task_3
0       hasoc_en_902  ...   NONE
1       hasoc_en_416  ...   NONE
2       hasoc_en_207  ...   NONE
3       hasoc_en_595  ...   NONE
4       hasoc_en_568  ...    UNT
...              ...  ...    ...
1148  hasoc_en1_3958  ...   NONE
1149  hasoc_en1_4648  ...   NONE
1150  hasoc_en1_4832  ...   NONE
1151  hasoc_en1_3721  ...   NONE
1152   hasoc_en1_991  ...   NONE

[1153 rows x 5 columns]


In [20]:
from transformers import BertTokenizer


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

max_length = 0
for sentence in total_sentences:
    #print(sentence)
    length = len(tokenizer.tokenize(sentence))
    if length > max_length:
        max_length  = length
print("max token length is: ",max_length)
# max token length obtained is 50
# bert tokens are limited to 514 bytes.

max token length is:  399


In [21]:
def encoder_generator(sentences,labels):
    
    sent_index = []
    input_ids = []
    attention_masks =[]

    for index,sent in enumerate(sentences):
        
        sent_index.append(index)
        
        encoded_dict = tokenizer.encode_plus(sent,
                                             add_special_tokens=True,
                                             max_length=128,
                                             pad_to_max_length=True,
                                             truncation = True,
                                             return_attention_mask=True,
                                             return_tensors='pt')
        input_ids.append(encoded_dict['input_ids'])

        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids,dim=0)
    attention_masks = torch.cat(attention_masks,dim=0)
    labels = torch.tensor(labels)
    sent_index = torch.tensor(sent_index)

    return sent_index,input_ids,attention_masks,labels

sent_index,input_ids,attention_masks,encoded_label_tensors = encoder_generator(total_sentences,encoded_labels)
test_sent_index,test_input_ids,test_attention_masks,encoded_test_label_tensors = encoder_generator(test_sentences,encoded_test_labels)
print('Original: ', total_sentences[0])
print('Token IDs:', input_ids[0])



Original:  #DhoniKeepsTheGlove | WATCH: Sports Minister Kiren Rijiju issues statement backing MS Dhoni over 'Balidaan Badge', tells BCCI to take up the matter with ICC and keep government in the know as nation's pride is involved    https://t.co/zuo5335Rjr
Token IDs: tensor([  101,  1001, 28144, 10698, 20553,  4523, 10760, 23296, 21818,  1064,
         3422,  1024,  2998,  2704, 11382,  7389, 15544,  4478,  9103,  3314,
         4861,  5150,  5796, 28144, 10698,  2058,  1005, 20222,  2850,  2319,
        10780,  1005,  1010,  4136,  4647,  6895,  2000,  2202,  2039,  1996,
         3043,  2007, 16461,  1998,  2562,  2231,  1999,  1996,  2113,  2004,
         3842,  1005,  1055,  6620,  2003,  2920, 16770,  1024,  1013,  1013,
         1056,  1012,  2522,  1013, 16950,  2080, 22275, 19481,  2099,  3501,
         2099,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,  

In [22]:
from torch.utils.data import TensorDataset,random_split

dataset = TensorDataset(input_ids,attention_masks,encoded_label_tensors)
test_dataset = TensorDataset(test_sent_index,test_input_ids,test_attention_masks,encoded_test_label_tensors)

train_size = int(0.75*len(dataset))

val_size = len(dataset)-train_size

train_dataset,val_dataset = random_split(dataset,[train_size,val_size])

print('train data samples is {}'.format(len(train_dataset)))
print("valid data samples is {}".format(len(val_dataset)))
print("test data samples is {}".format(len(test_dataset)))

train data samples is 4389
valid data samples is 1463
test data samples is 1153


In [23]:
from torch.utils.data import DataLoader,RandomSampler,SequentialSampler

bs=8

train_data_loader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=bs)
valid_data_loader = DataLoader(val_dataset,
                              sampler=SequentialSampler(val_dataset),
                              batch_size=bs)
test_data_loader = DataLoader(test_dataset,
                            sampler=SequentialSampler(test_dataset),
                            batch_size=bs)

In [24]:
from transformers import BertForSequenceClassification, AdamW

model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                     num_labels=len(le.classes_),
                                                     output_attentions=False,
                                                     output_hidden_states=False,
                                                     )
#model.cpu()
device = "cuda:0"
model = model.to(device)
model.cuda()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [25]:
optimizer = AdamW(model.parameters(),lr=2e-5,eps=1e-8)

from transformers import get_linear_schedule_with_warmup

epochs=10
total_steps = len(train_data_loader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps=0,
                                           num_training_steps=total_steps)

In [26]:
import numpy as np

def predictions_labels(preds,labels):
    #print(preds.device,labels.device)
    pred = torch.argmax(preds,axis=1).flatten()
    label = labels.flatten()
    return pred,label

In [27]:
import random
import numpy as np
import time
from sklearn.metrics import classification_report,accuracy_score,f1_score

total_t0 = time.time()

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [28]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    #print(correct.device)
    return correct.sum() / torch.FloatTensor([y.shape[0]]).to(device)

def predictions_labels(preds,labels):
    #print(preds.device,labels.device)
    pred = torch.argmax(preds,axis=1).flatten()
    label = labels.flatten()
    return pred,label

In [29]:
def train():
  total_train_loss = 0
  total_train_acc = 0
    
  model.train() # set model in train mode for batchnorm and dropout layers in bert model
    
  for step,batch in enumerate(train_data_loader):
    #print("**************************************************************************")
    #print("Step : ",step,"  batch",len(batch))
    #print("**************************************************************************")
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    model.zero_grad()
    #loss,logits = model(b_input_ids,attention_mask=b_input_mask,labels=b_labels.long())
    outputs = model(b_input_ids,attention_mask=b_input_mask,labels=b_labels.long())
    loss = outputs.loss
    logits = outputs.logits
    #total_train_loss+=loss.detach().numpy()
    total_train_loss+=loss.detach()
    total_train_acc+=categorical_accuracy(logits,b_labels).item()
            
    loss.backward()
            
    torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
            
    optimizer.step()
            
    scheduler.step() #go ahead and update the learning rate
    #print(total_train_loss,total_train_acc)
            
  avg_train_loss = total_train_loss/len(train_data_loader)
  avg_train_acc = total_train_acc/len(train_data_loader)
    
  return avg_train_loss,avg_train_acc

In [30]:
def evaluate():
    model.eval()
        
    total_eval_accuracy = 0
    total_eval_loss = 0
    number_of_eval_steps= 0
    
    all_true_labels = []
    all_pred_labels = []

    for batch in valid_data_loader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():

        #loss, logits = model(b_input_ids,attention_mask= b_input_mask,labels = b_labels.long())
          outputs = model(b_input_ids,attention_mask=b_input_mask,labels=b_labels.long())
        loss = outputs.loss
        logits = outputs.logits

        #total_eval_loss+=loss.detach().numpy()

        #logits = logits.detach().cpu().numpy()
        #label_ids = b_labels.to('cpu').numpy()

        total_eval_loss+=loss.detach()        
        logits = logits.detach()
        label_ids = b_labels.to(device)

        pred,true = predictions_labels(logits,label_ids)
        
        all_pred_labels.extend(pred.detach().cpu().numpy())
        all_true_labels.extend(true.detach().cpu().numpy())
    
    #print(np.shape(np.array(all_pred_labels).reshape(-1,1)),np.shape(np.array(all_true_labels).reshape(-1,1)))

    print(classification_report(all_pred_labels,all_true_labels))
    avg_val_accuracy = accuracy_score(all_pred_labels,all_true_labels)
    macro_f1_score = f1_score(all_pred_labels,all_true_labels,average='macro')
    
    avg_val_loss = total_eval_loss/len(valid_data_loader)

    print("accuracy = {0:.2f}".format(avg_val_accuracy))
    
    return avg_val_loss,avg_val_accuracy,macro_f1_score

In [31]:
import time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
epochs = 100
train_loss = 0
train_acc = 0
valid_loss = 0
valid_acc = 0
macro_f1  = 0
best_macro_f1 = float('0')
for epoch in range(epochs):
  start_time = time.time()
  train_loss,train_acc = train()
  valid_loss,valid_acc,macro_f1 = evaluate()
    
  end_time = time.time()
        
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
  if macro_f1 > best_macro_f1:
    best_macro_f1 = macro_f1
    torch.save(model,'model_english_task_a.pt')
  
  print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
  #print(f'\t macro_f1: {macro_f1:.3f} |  c: {valid_acc*100:.2f}%')

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.99      0.67      0.80      1353
           2       0.13      0.54      0.20        26
           3       0.40      0.79      0.53        84

    accuracy                           0.68      1463
   macro avg       0.38      0.50      0.38      1463
weighted avg       0.94      0.68      0.78      1463

accuracy = 0.68
Epoch: 01 | Epoch Time: 2m 8s
	Train Loss: 0.975 | Train Acc: 62.82%
	 Val. Loss: 0.841 |  Val. Acc: 67.81%
              precision    recall  f1-score   support

           0       0.11      0.37      0.17        78
           1       0.85      0.73      0.79      1073
           2       0.38      0.30      0.33       142
           3       0.56      0.54      0.55       170

    accuracy                           0.65      1463
   macro avg       0.47      0.48      0.46      1463
weighted avg       0.73      0.65      0.68      1463

accura

In [None]:
del model
import gc
gc.collect()
  
model = torch.load('model_english_task_b.pt')
model = model.to(device)

In [None]:
def evaluate_test():
    model.eval()
        
    total_eval_accuracy = 0
    total_eval_loss = 0
    number_of_eval_steps= 0
    
    all_true_labels = []
    all_pred_labels = []
    
    all_sentence_id=[]

    for batch in test_data_loader:
        b_sentence_id = batch[0].to(device)
        b_input_ids = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)

        sent_ids = b_sentence_id.to('cpu').numpy()
        all_sentence_id.extend(sent_ids)
        
        with torch.no_grad():

            outputs = model(b_input_ids,
                                attention_mask= b_input_mask,
                                labels = b_labels.long())
        
        loss = outputs.loss
        logits = outputs.logits

        total_eval_loss+=loss.item()

        logits = logits.detach().cpu()

        label_ids = b_labels.to('cpu')
        

        pred,true = predictions_labels(logits,label_ids)
        
        all_pred_labels.extend(pred)
        
        all_true_labels.extend(true)

    print(classification_report(all_pred_labels,all_true_labels))
    avg_val_accuracy = accuracy_score(all_pred_labels,all_true_labels)
    
    avg_val_loss = total_eval_loss/len(valid_data_loader)

    print("accuracy = {0:.2f}".format(avg_val_accuracy))
    
    return avg_val_loss,avg_val_accuracy,all_sentence_id,all_pred_labels

valid_loss,valid_acc,all_sentence_id,all_pred_labels = evaluate_test()
