In [436]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DistilBertForSequenceClassification

from copy import deepcopy
from torch.utils.data import Dataset
import tqdm
import csv
from torch.utils.data import DataLoader

import pandas as pd
from sklearn.metrics import accuracy_score


In [437]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

In [39]:
# with open("./data/splitted.csv", "r") as read_obj:
#     reader = csv.reader(read_obj)
#     for ind,sample in enumerate(reader):
#         print(sample)
#         if ind==1:
#             break

In [438]:
class TwitterDataset(Dataset):
    """
    Implement TwitterDataset in Pytorch
    """
    def __init__(self, path, tokenizer):
        
        self.label_to_id = {"no_answer": 0, "answer": 1}
        self.id_to_label = {0: "no_answer", 1: "answer"}
        
        self.tokenizer = tokenizer
        self.pad_id=tokenizer.encode(tokenizer.pad_token,add_special_tokens=False)[0]
        self.text_samples = []
        self.labels = []
        self.samples=[]
        
        print("Building Twitter Dataset...")
        
        with open(path, "r") as read_obj:
            reader = csv.reader(read_obj)
            for ind,sample in enumerate(reader):
                if ind!=0:
                    # print(sample[4])
                
                    self.text_samples.append(sample[4])
                    # tokens=tokenizer(sample[4],return_tensors="pt")
                    # tokens['input_ids']=tokens['input_ids'].squeeze(0)
                    # tokens['attention_mask']=tokens['attention_mask'].squeeze(0)
                    label=torch.tensor(int(sample[8]))
                    self.samples.append({"input": sample[4], "label": int(sample[8])})

    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, index):
        return deepcopy(self.samples[index])
    
    
    
    def collate_fn(self, batch):
        
        batch_text = [sample['input'] for sample in batch]
        batch_ids=self.tokenizer(batch_text,return_tensors="pt",padding=True) #for sample in batch_padded]
        
        tensor_batch_ids = batch_ids #torch.tensor(batch_ids)
        
        batch_labels = [sample['label'] for sample in batch]
        tensor_labels = torch.tensor(batch_labels).long()
        
        return tensor_batch_ids, tensor_labels

In [439]:
train_data=TwitterDataset("./data/splitted.csv",tokenizer)
test_data=TwitterDataset("./data/splitted_test.csv",tokenizer)

Building Twitter Dataset...
Building Twitter Dataset...


In [440]:
train_data_loader=DataLoader(train_data,batch_size=10,shuffle=True,collate_fn=train_data.collate_fn)
test_data_loader=DataLoader(test_data,batch_size=10,shuffle=False)

In [441]:
def train(model, train_data, optimizer, num_epoch, device):
    
    model.train()
        
    epochs=[]
    predictions=[]
    targets=[]
    # Training loop
    for epoch in range(num_epoch):

        print(f"================ Epoch {epoch+1} ================")

        epoch_loss = 0
        i=0
        for data in tqdm.tqdm(train_data,desc="Training"):
            
            loss=0
            optimizer.zero_grad()

            # for idx in range(len(data["input_ids"])):
            inputs,labels=data
        
            # input=data["input"].to(device)
            # target=data["label"].to(device)

            out=model(**inputs.to(device), labels=labels.to(device))
            model_loss=out.loss
            targets.extend(labels.tolist())
            predictions.extend(out.logits.argmax(dim=-1).tolist())
            loss+=model_loss

            epoch_loss+=loss.item()/len(data)

            loss.backward()
            optimizer.step()
            i+=1
            if i%50==0:
                print(f"Accuracy after {i} iterations: {accuracy_score(targets,predictions)}")
        print(f"Epoch {epoch+1} training loss ={epoch_loss/len(train_data)}")
        epochs.append(epoch_loss/len(train_data))

    return epochs

In [442]:
def eval(model, eval_data,device):
    # this function should be called in the train loop to monitor the performance in validation set while training.
    
    with torch.no_grad():
        epoch_loss=0
        predictions=[]
        targets=[]
        for data in eval_data:
            # YOUR CODE HERE
            loss=0

            input=data["input"].to(device)
            target=data["label"].to(device)

            out=model(**input, labels=target)
            model_loss=out.loss
            targets.append(target.item())
            predictions.append(out.logits.argmax().item())
            loss+=model_loss
                
            epoch_loss+=loss.item()

        print(f"Test model loss={epoch_loss / len(eval_data)}")
        print(f"Accuracy: {accuracy_score(targets, predictions)}")
    return predictions,targets

In [443]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=1e-6)

In [444]:
if torch.cuda.is_available():
    device='cuda'
else:
    device='cpu'

In [445]:
model=model.to(device)

In [433]:
train(model,train_data_loader,optimizer,5,'cuda')



Training:  25%|██▌       | 50/200 [00:18<01:04,  2.34it/s]

Accuracy after 50 iterations: 0.498


Training:  50%|█████     | 100/200 [00:38<00:41,  2.42it/s]

Accuracy after 100 iterations: 0.516


Training:  75%|███████▌  | 150/200 [00:56<00:19,  2.58it/s]

Accuracy after 150 iterations: 0.5166666666666667


Training: 100%|██████████| 200/200 [01:15<00:00,  2.66it/s]

Accuracy after 200 iterations: 0.5215
Epoch 1 training loss per sample=0.35605897560715677





[0.35605897560715677]

In [245]:
precition,target=eval(model,test_data_loader,device)

Test model loss=0.6947665936946869
Accuracy: 0.518


In [236]:
model(**inputs.to(device),labels=torch.tensor([1],device='cuda'))

SequenceClassifierOutput(loss=tensor(0.7210, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0024, -0.0573]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [247]:
precition

[1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,


In [248]:
target

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
