<a href="https://colab.research.google.com/github/paraggarg37/Assignment/blob/master/idcma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://raw.githubusercontent.com/avanigoel/icdm_pqa/master/data/Tools_and_Home_Improvement.txt

--2021-04-27 12:48:18--  https://raw.githubusercontent.com/avanigoel/icdm_pqa/master/data/Tools_and_Home_Improvement.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 91790274 (88M) [text/plain]
Saving to: ‘Tools_and_Home_Improvement.txt.3’


2021-04-27 12:48:22 (152 MB/s) - ‘Tools_and_Home_Improvement.txt.3’ saved [91790274/91790274]



In [2]:
device = "cuda:0"

In [3]:
!pip install transformers
!pip install torch==1.1.0



In [4]:
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset
import pandas as pd

In [5]:
class SSTDataset(Dataset):

    def __init__(self, df, maxlen, is_train=True):
       

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.maxlen = maxlen
        self.df = df 

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        #print("calinng get item")
        #print(self.df.loc)
        #Selecting the sentence and label at the specified index in the data frame
      
        question = self.df.iloc[index]['question']
        #question = self.df['question'][index]
        
        answer = self.df.iloc[index]['answer']
        #answer = self.df['answer'][index]
        
        #print("calinng get item done")
        label = self.df.iloc[index]['label']
        #label = self.df['label'][index] 

        #print ("question type ", type(question), type(self.df))

        #print('label is {}, question {}, ans {}'.format(label, question, answer))

        #print(question, answer, str(answer))
        #print(label)

        encoded_pair = self.tokenizer.encode_plus(
                            question, answer,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = self.maxlen, 
                            padding='max_length',
                            truncation=True,         # Truncate all sentences.
                            return_token_type_ids=True,
                            return_attention_mask = True
                          
        )

        tokens_ids_tensor = torch.tensor(encoded_pair['input_ids'])
        attn_mask = (tokens_ids_tensor != 0).long()
       
        tokens_type_ids_tensor = torch.tensor(encoded_pair['token_type_ids'])

        #encoded_pair['token_type_ids'],
        return tokens_ids_tensor, attn_mask, tokens_type_ids_tensor, label

In [6]:
import torch
import torch.nn as nn
from transformers import BertModel

class SentimentClassifier(nn.Module):

    def __init__(self, freeze_bert = True):
        super(SentimentClassifier, self).__init__()
        #Instantiating BERT model object 
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        
        #Freeze bert layers
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False
        
        #Classification layer
        self.cls_layer = nn.Linear(768, 1).to(device)

    def forward(self, seq, attn_masks, tokens_type_ids):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
        '''

        #Feeding the input to BERT model to obtain contextualized representations
        #token_type_ids=tokens_type_ids
        out = self.bert_layer(seq, attention_mask = attn_masks)

        logits = self.cls_layer(out[0][:,0])

        return logits

In [7]:
def train(net, criterion, opti, train_loader, val_loader, args):
    for ep in range(args.max_eps):
        print ("Running ep : {}".format(ep))
        
        for it, (seq, attn_masks, tokens_type_ids, labels) in enumerate(train_loader):
            #print ("Running batch : {}".format(it))
            #Clear gradients
            opti.zero_grad()  
            #Converting these to cuda tensors
            seq, attn_masks, tokens_type_ids, labels = seq.to(device), attn_masks.to(device), tokens_type_ids.to(device), labels.to(device)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks, tokens_type_ids)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()

            if (it + 1) % args.print_every == 0:
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} Accuracy : {} complete. Loss : {}".format(it+1, ep+1, acc, loss.item()))
        
        val_acc, val_loss = evaluate(net, criterion, val_loader, args)
        print("Epoch {} complete! Validation Accuracy : {}, Validation Loss : {}".format(ep, val_acc, val_loss))


In [8]:
def evaluate(net, criterion, dataloader, args):
    net.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        for seq, attn_masks, tokens_type_ids, labels in dataloader:
            seq, attn_masks, tokens_type_ids, labels = seq.to(device), attn_masks.to(device), tokens_type_ids.to(device), labels.to(device)
            logits = net(seq, attn_masks, tokens_type_ids)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            mean_acc += get_accuracy_from_logits(logits, labels)
            count += 1

    return mean_acc / count, mean_loss / count

In [9]:
def get_accuracy_from_logits(logits, labels):
    probs = torch.sigmoid(logits.unsqueeze(-1))
    soft_probs = (probs > 0.5).long()
    acc = (soft_probs.squeeze() == labels).float().mean()
    return acc

In [10]:
import pandas as pd
df = pd.read_csv("Tools_and_Home_Improvement.txt", delimiter = '\t')
qa = df[['question','answer']]
from sklearn.utils import shuffle
list_of_answers = list(df['answer'])
list_of_answers = shuffle(list_of_answers)
nqa =  pd.DataFrame({'question': df['question'].tolist(),'answer':list_of_answers})
qa['label']=1
nqa['label']=0
data = pd.DataFrame({'question': qa['question'].tolist() + nqa['question'].tolist(),'answer':qa['answer'].tolist() + nqa['answer'].tolist(), 'label':qa['label'].tolist() + nqa['label'].tolist()})
data=shuffle(data)
split = int(len(data)*0.9)
train_data = data[0:split]
test_data = data[split:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
net = SentimentClassifier(freeze_bert = False).to(device)
import torch.nn as nn
import torch.optim as optim

criterion = nn.BCEWithLogitsLoss().to(device)
opti = optim.Adam(net.parameters(), lr = 2e-5)

In [12]:
from torch.utils.data import DataLoader

#Creating instances of training and validation set
train_set = SSTDataset(df = train_data, maxlen = 64)
val_set = SSTDataset(df = test_data, maxlen = 64)

#Creating intsances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size = 64, num_workers = 5)
val_loader = DataLoader(val_set, batch_size = 64, num_workers = 5)

In [13]:
class Args():
    def __init__(self):
        self.max_eps = 50
        self.gpu = device
        self.print_every = 30


In [None]:
args = Args()
train(net, criterion, opti, train_loader, val_loader, args)

Running ep : 0
Iteration 30 of epoch 1 Accuracy : 0.625 complete. Loss : 0.6718505620956421
Epoch 0 complete! Validation Accuracy : 0.7515625357627869, Validation Loss : 0.5390904396772385
Running ep : 1
Iteration 30 of epoch 2 Accuracy : 0.8125 complete. Loss : 0.4632303714752197
Epoch 1 complete! Validation Accuracy : 0.7989583015441895, Validation Loss : 0.4681692322095235
Running ep : 2
Iteration 30 of epoch 3 Accuracy : 0.890625 complete. Loss : 0.24782612919807434
Epoch 2 complete! Validation Accuracy : 0.8036458492279053, Validation Loss : 0.5873926083246866
Running ep : 3
