<a href="https://colab.research.google.com/github/paraggarg37/Assignment/blob/master/sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
#Install the libraries this notebook imports into the runtime
#NOTE(review): torch is pinned to 1.1.0 while transformers is left unpinned — confirm the two
#versions resolve to a compatible pair and consider pinning transformers too for reproducibility
!pip install transformers
!pip install torch==1.1.0



In [20]:
import torch

#Device every model and tensor of this notebook is moved to: the first CUDA GPU when one is
#available, otherwise the CPU so that the notebook still runs on a runtime without a GPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [48]:
import torch
import torch.nn as nn
from transformers import BertModel

class SentimentClassifier(nn.Module):
    '''
    Binary sentiment classifier: a pretrained BERT encoder followed by one linear layer
    that turns the representation of the first ([CLS]) token into a single logit.
    '''

    def __init__(self, freeze_bert = True):
        '''
        Inputs:
            -freeze_bert : when True, requires_grad is switched off for every BERT parameter,
                           leaving the classification layer as the only trainable part
        '''
        super(SentimentClassifier, self).__init__()
        #Instantiating BERT model object
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')

        #Freeze bert layers
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        #Classification layer. Its input width is read from the encoder's config rather than
        #hard-coded, and it is NOT moved to a device here: the caller moves the whole module
        #(encoder + head) at once, which keeps both parts on the same device
        self.cls_layer = nn.Linear(self.bert_layer.config.hidden_size, 1)

    def forward(self, seq, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contribution of PAD tokens

        Returns:
            Tensor of shape [B, 1] containing one raw (pre-sigmoid) logit per sequence
        '''

        #Feeding the input to BERT model to obtain contextualized representations
        #The first element of the output holds the hidden state of every token
        out = self.bert_layer(seq, attention_mask = attn_masks)

        #Obtaining the representation of the [CLS] token, i.e. the first position of every sequence
        cls_rep = out[0][:, 0]

        #Feeding cls_rep to the classifier layer
        logits = self.cls_layer(cls_rep)

        return logits

In [49]:
#Build the classifier with the BERT encoder frozen, then move the whole module to the target device
net = SentimentClassifier(freeze_bert = True)
net = net.to(device)

In [50]:
import torch.nn as nn
import torch.optim as optim

#Loss applied to the raw logits (the sigmoid is part of the loss), placed on the target device
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

#Adam optimizer over the parameters of the network
opti = optim.Adam(params = net.parameters(), lr = 2e-5)

In [51]:
def train(net, criterion, opti, train_loader, val_loader, args):
    '''
    Trains net for args.max_eps epochs and evaluates it on val_loader after every epoch.

    Inputs:
        -net : model called as net(seq, attn_masks) and returning logits
        -criterion : loss called as criterion(logits.squeeze(-1), labels.float())
        -opti : optimizer whose zero_grad() / step() drive the parameter updates
        -train_loader : iterable yielding (seq, attn_masks, labels) training batches
        -val_loader : iterable of validation batches handed to evaluate()
        -args : object providing max_eps (number of epochs), gpu (device the batches are
                moved to) and print_every (number of iterations between two progress logs)
    '''
    for ep in range(args.max_eps):
        print ("Running ep : {}".format(ep))

        #evaluate() puts the network in eval mode, so training mode is (re)enabled at the
        #start of every epoch; otherwise only the first epoch would train with dropout active
        net.train()

        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()
            #Moving the batch to the target device
            seq, attn_masks, labels = seq.to(args.gpu), attn_masks.to(args.gpu), labels.to(args.gpu)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()

            #Periodic progress log; the accuracy is the one of the current batch only
            if (it + 1) % args.print_every == 0:
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss : {} Accuracy : {}".format(it+1, ep+1, loss.item(), acc))

        val_acc, val_loss = evaluate(net, criterion, val_loader, args)
        print("Epoch {} complete! Validation Accuracy : {}, Validation Loss : {}".format(ep, val_acc, val_loss))


In [43]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
import pandas as pd

class SSTDataset(Dataset):
    '''
    Dataset over a tab-separated file whose first column holds the sentence and whose
    second column holds the label. Every item is a sequence of exactly maxlen BERT token
    ids, the matching attention mask and the label.
    '''

    def __init__(self, filename, maxlen):
        '''
        Inputs:
            -filename : path of the tab-separated file to read
            -maxlen : length every token sequence is padded or truncated to
        '''

        #Store the contents of the file in a pandas dataframe
        self.df = pd.read_csv(filename, delimiter = '\t')

        #Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        '''Number of rows in the underlying dataframe.'''
        return len(self.df)

    def __getitem__(self, index):
        '''
        Inputs:
            -index : position of the row to fetch, in [0, len(self))

        Returns:
            (token ids tensor of length maxlen, attention mask tensor of length maxlen, label)
        '''
        #Selecting the sentence and label at the specified position in the data frame.
        #The lookup is positional (iloc) because a Dataset is indexed by position; a
        #label-based lookup would fail whenever the dataframe index is not 0..N-1
        sentence = self.df.iloc[index, 0]
        label = self.df.iloc[index, 1]

        #Preprocessing the text to be suitable for BERT
        tokens = self.tokenizer.tokenize(sentence) #Tokenize the sentence
        tokens = ['[CLS]'] + tokens + ['[SEP]'] #Inserting the CLS and SEP token in the beginning and end of the sentence
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]'] * (self.maxlen - len(tokens)) #Padding sentences
        else:
            tokens = tokens[:self.maxlen-1] + ['[SEP]'] #Pruning the list to be of specified max length

        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens) #Obtaining the indices of the tokens in the BERT Vocabulary
        tokens_ids_tensor = torch.tensor(tokens_ids) #Converting the list to a pytorch tensor

        #Obtaining the attention mask i.e a tensor containing 1s for non padded tokens and 0s for padded ones.
        #It is derived from the tokens themselves so it does not depend on the numeric id of [PAD]
        attn_mask = torch.tensor([int(token != '[PAD]') for token in tokens])

        return tokens_ids_tensor, attn_mask, label

In [52]:
from torch.utils.data import DataLoader

#Creating instances of training and validation set
train_set = SSTDataset(filename = 'train.tsv', maxlen = 30)
val_set = SSTDataset(filename = 'dev.tsv', maxlen = 30)

#Creating instances of training and validation dataloaders
#The training loader reshuffles its samples at every epoch so that batches are not always
#built from the same neighbouring rows of the file; the validation loader keeps the file order
train_loader = DataLoader(train_set, batch_size = 64, shuffle = True, num_workers = 5)
val_loader = DataLoader(val_set, batch_size = 64, num_workers = 5)

In [53]:
#Display the repr of the training loader as a quick check that it was created
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7f542e13c890>

In [54]:
class Args:
    '''Run settings read by train(): target device, number of epochs and logging interval.'''

    def __init__(self):
        #Device the batches are moved to
        self.gpu = device
        #Number of epochs to train for
        self.max_eps = 50
        #Number of iterations between two progress logs
        self.print_every = 30

In [39]:
#Instantiate the run settings that are handed to train()
args = Args()

In [None]:
#Launch the training run.
#NOTE: train() calls evaluate() and get_accuracy_from_logits(), which are defined in cells
#located further down in this notebook — those cells must have been executed before this one
train(net, criterion, opti, train_loader, val_loader, args)

Running ep : 0
Iteration 30 of epoch 1 complete. Loss : 0.6787721514701843 Accuracy : 0.5625
Iteration 60 of epoch 1 complete. Loss : 0.6796607971191406 Accuracy : 0.5625
Iteration 90 of epoch 1 complete. Loss : 0.6950100660324097 Accuracy : 0.453125
Epoch 0 complete! Validation Accuracy : 0.5466517806053162, Validation Loss : 0.6803450754710606
Running ep : 1
Iteration 30 of epoch 2 complete. Loss : 0.6707484722137451 Accuracy : 0.609375
Iteration 60 of epoch 2 complete. Loss : 0.6688879728317261 Accuracy : 0.609375
Iteration 90 of epoch 2 complete. Loss : 0.6851829290390015 Accuracy : 0.5
Epoch 1 complete! Validation Accuracy : 0.5986607670783997, Validation Loss : 0.6710948135171618
Running ep : 2
Iteration 30 of epoch 3 complete. Loss : 0.6619623899459839 Accuracy : 0.6875
Iteration 60 of epoch 3 complete. Loss : 0.6589871644973755 Accuracy : 0.65625
Iteration 90 of epoch 3 complete. Loss : 0.6763014793395996 Accuracy : 0.53125
Epoch 2 complete! Validation Accuracy : 0.631026864051

In [41]:
def evaluate(net, criterion, dataloader, args):
    '''
    Computes the accuracy and the loss of net over dataloader, each averaged over the batches.

    Inputs:
        -net : model called as net(seq, attn_masks) and returning logits
        -criterion : loss called as criterion(logits.squeeze(-1), labels.float())
        -dataloader : iterable yielding (seq, attn_masks, labels) batches
        -args : object whose gpu attribute is the device the batches are moved to

    Returns:
        (mean accuracy, mean loss), both averaged over the number of batches

    Raises:
        ZeroDivisionError when dataloader yields no batch
    '''
    #Remember the mode the network was in so that it can be restored once evaluation is over
    was_training = net.training
    net.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    try:
        #No gradient is needed for evaluation
        with torch.no_grad():
            for seq, attn_masks, labels in dataloader:
                #Moving the batch to the same device train() uses
                seq, attn_masks, labels = seq.to(args.gpu), attn_masks.to(args.gpu), labels.to(args.gpu)
                logits = net(seq, attn_masks)
                mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
                mean_acc += get_accuracy_from_logits(logits, labels)
                count += 1
    finally:
        #Hand the network back in the mode it was received in
        net.train(was_training)

    return mean_acc / count, mean_loss / count

In [31]:
def get_accuracy_from_logits(logits, labels):
    '''
    Fraction of predictions that match the labels.

    Inputs:
        -logits : Tensor of raw (pre-sigmoid) scores; size-1 dimensions are squeezed away
        -labels : Tensor of labels compared element-wise with the 0/1 predictions

    Returns:
        0-dim float Tensor holding the mean of the element-wise matches
    '''
    #A prediction is 1 when the sigmoid of the logit is strictly above 0.5, 0 otherwise
    predictions = (torch.sigmoid(logits) > 0.5).long().squeeze()
    matches = predictions == labels
    return matches.float().mean()

In [None]:
#Download the SST2 test.tsv file into the working directory
#NOTE(review): no visible cell downloads the train.tsv / dev.tsv files read elsewhere in this
#notebook — confirm how those two files are provisioned
!wget https://raw.githubusercontent.com/clairett/pytorch-sentiment-classification/master/data/SST2/test.tsv 

In [None]:
#Load the tab-separated training file into a dataframe for inspection
df = pd.read_csv('train.tsv', sep = '\t')

In [None]:
#Print the value stored in the second column of the row labelled 0
print (df.loc[0,df.columns[1]])

In [None]:
#Reference: https://medium.com/swlh/painless-fine-tuning-of-bert-in-pytorch-b91c14912caa#:~:text=The%20from_pretrained%20method%20creates%20an,like%20any%20other%20Pytorch%20module.