In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pickle
import os
import random
from sklearn.utils import resample

In [None]:
# Read 'qnli.tsv' as tab-separated text. header=None means the first line is
# treated as data, so pandas labels the columns 0, 1, 2, ...
data=pd.read_csv('qnli.tsv',sep='\t',header=None)

In [None]:
# Name the four positional columns. Later cells read 'premise', 'entailment'
# and 'target' by name and drop 'id'.
data=data.rename(columns={0: "id", 1: "premise", 2: "entailment", 3: "target"})

In [None]:
# Rows per label; the printed counts below show label 0 heavily outnumbering
# label 1, which is why the next cell downsamples.
value = data['target'].value_counts()
print(value)

0    27040
1     5127
Name: target, dtype: int64


In [None]:
# Split the rows by label so the two classes can be balanced.
df_majority = data[data['target']==0]
df_minority = data[data['target']==1]

# Downsample majority class.
# - n_samples is taken from the frames themselves instead of the `value`
#   Series of an earlier cell, and is capped by the majority size so that
#   sampling without replacement can never ask for more rows than exist.
# - random_state pins the draw so the balanced set (and therefore every
#   metric printed further down) is reproducible across kernel restarts.
df_majority_downsampled = resample(df_majority,
                                 replace=False,
                                 n_samples=min(len(df_majority), len(df_minority)),
                                 random_state=1)
data = pd.concat([df_majority_downsampled,df_minority])

In [None]:
# NOTE(review): `target` is not read again by any cell visible here.
target=data['target']
# Drop the 'id' column; load_data only reads 'premise', 'entailment' and 'target'.
data = data.drop(['id'],axis=1)

In [None]:
import torch

# Confirm that the GPU is detected
# The notebook later calls model.cuda() and moves every batch to `device`,
# so it stops here when CUDA is unavailable.

assert torch.cuda.is_available()

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
# Global device handle used by train() for the .to(device) transfers.
device = torch.device("cuda")

Found device: Tesla P100-PCIE-16GB, n_gpu: 1


In [None]:
# Colab-only setup: install dependencies, authenticate against Google Drive
# and download a helper module.
# NOTE(review): both installs are unpinned, so the transformers API this
# notebook relies on may differ between runs.
!pip install transformers
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
print('success!')

import os
import zipfile

# Download helper functions file
# The Drive file id is hard-coded; the content is written to ./helpers.py.
# NOTE(review): helpers.py is not imported by any cell visible here.
helper_file = drive.CreateFile({'id': '16HW-z9Y1tM3gZ_vFpJAuwUDohz91Aac-'})
helper_file.GetContentFile('helpers.py')
print('helper file downloaded! (helpers.py)')


success!
helper file downloaded! (helpers.py)


In [None]:
from transformers import BertTokenizer
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DebertaTokenizer, DebertaForSequenceClassification
import torch
import numpy as np
# NOTE(review): only referenced by the commented-out AutoTokenizer line inside
# load_data; the tokenizer and model actually used are 'microsoft/deberta-base'.
model_name = 'cross-encoder/qnli-electra-base'
def load_data(df, tokenizer=None, max_len=512):
  """Encode premise/hypothesis pairs from ``df`` into a padded TensorDataset.

  Every row is laid out as ``[CLS] premise [SEP] hypothesis [SEP]``.

  Args:
    df: DataFrame with string columns 'premise' and 'entailment' and a
      'target' column.
    tokenizer: object exposing ``encode(text, add_special_tokens=False)``,
      ``cls_token_id`` and ``sep_token_id``. When None, the
      'microsoft/deberta-base' DebertaTokenizer is loaded.
    max_len: upper bound on the encoded pair length in tokens, special
      tokens included. The same number is also used for the character-level
      pre-cut described below.

  Returns:
    TensorDataset of four tensors with one row per example:
      token ids, attention mask (1 = real token, 0 = padding),
      segment ids (0 for ``[CLS] premise [SEP]``, 1 for ``hypothesis [SEP]``),
      and float labels of shape (N, 2) where target 0 is encoded as
      ``[0., 1.]`` and any other target as ``[1., 0.]``.

  Raises:
    ValueError: if ``max_len`` leaves no room for the three special tokens.
  """
  if max_len < 3:
    raise ValueError("max_len must be at least 3 to hold [CLS] and two [SEP] tokens")
  if tokenizer is None:
    tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

  # Tokens available to premise + hypothesis once the specials are reserved.
  budget = max_len - 3

  token_ids = []
  mask_ids = []
  seg_ids = []
  labels = []

  rows = zip(df['premise'].to_list(), df['entailment'].to_list(), df['target'].to_list())
  for premise, hypothesis, label in rows:
    # Character-level pre-cut kept from the original code. It limits the
    # text handed to the tokenizer but is NOT a token limit on its own.
    premise_id = list(tokenizer.encode(premise[:max_len], add_special_tokens=False))
    hypothesis_id = list(tokenizer.encode(hypothesis[:max_len], add_special_tokens=False))

    # Token-level cap: trim the longer sequence one token at a time until the
    # pair fits, so the encoded length can never exceed max_len.
    while len(premise_id) + len(hypothesis_id) > budget:
      longer = premise_id if len(premise_id) >= len(hypothesis_id) else hypothesis_id
      longer.pop()

    pair_token_ids = ([tokenizer.cls_token_id] + premise_id + [tokenizer.sep_token_id]
                      + hypothesis_id + [tokenizer.sep_token_id])
    premise_len = len(premise_id)
    hypothesis_len = len(hypothesis_id)

    token_ids.append(torch.tensor(pair_token_ids))
    # sentence 0 and sentence 1
    seg_ids.append(torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1)))
    # every real token is attended to; padding added below stays 0
    mask_ids.append(torch.ones(premise_len + hypothesis_len + 3, dtype=torch.long))
    labels.append([0., 1.] if label == 0 else [1., 0.])

  # pad_sequence right-pads with 0 up to the longest example.
  token_ids = pad_sequence(token_ids, batch_first=True)
  mask_ids = pad_sequence(mask_ids, batch_first=True)
  seg_ids = pad_sequence(seg_ids, batch_first=True)
  # Built from plain Python lists: avoids the slow list-of-ndarray conversion.
  y = torch.tensor(labels, dtype=torch.float)
  return TensorDataset(token_ids, mask_ids, seg_ids, y)


In [None]:
def get_data_loaders(batch_size=32, shuffle=True, train_dataset=None, val_dataset=None):
  """Build the training and validation DataLoaders.

  Args:
    batch_size: number of examples per batch for both loaders.
    shuffle: whether each loader reshuffles its dataset every epoch; the same
      flag is applied to both loaders.
    train_dataset: dataset for the training loader. When None, the
      notebook-level ``train_data`` is used.
    val_dataset: dataset for the validation loader. When None, the
      notebook-level ``val_data`` is used.

  Returns:
    (train_loader, val_loader)
  """
  # Fall back to the notebook globals only when no dataset is passed in, so
  # the function also works before those globals exist.
  if train_dataset is None:
    train_dataset = train_data
  if val_dataset is None:
    val_dataset = val_data

  train_loader = DataLoader(train_dataset, shuffle=shuffle, batch_size=batch_size)
  val_loader = DataLoader(val_dataset, shuffle=shuffle, batch_size=batch_size)

  return train_loader, val_loader

In [None]:
 from sklearn.model_selection import train_test_split

 # Hold out 20% of the balanced rows for validation; random_state fixes the split.
 # Both halves are full DataFrames (text columns and 'target'), not feature matrices.
 X_train, X_val = train_test_split(data,test_size=0.2, random_state=1)

In [None]:
# Tokenize each split into a TensorDataset of (token ids, mask, segment ids, labels).
train_data = load_data(X_train)
val_data = load_data(X_val)



In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# DeBERTa encoder with a 2-logit classification head. The labels built by
# load_data are 2-element float rows, matching num_labels=2.
# NOTE(review): problem_type="multi_label_classification" presumably selects a
# per-logit binary cross-entropy loss — confirm against the transformers docs.
model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=2,problem_type="multi_label_classification")


# Tell pytorch to run this model on the GPU.
model.cuda()

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'config', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['pooler.dense.bias',

DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0): DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (intermed

In [None]:
# NOTE(review): `batch_size` and `epochs` are not read by the cells visible
# here — the DataLoaders are built with batch_size=16 and train() loops over
# the separate EPOCHS constant.
batch_size = 32
# AdamW over all model parameters with a fixed learning rate (no scheduler is
# created in the visible cells).
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 2e-8 
                )
epochs = 5

In [None]:
# Loaders actually passed to train(); both use batches of 16 and both are
# reshuffled each epoch (validation included).
train_loader = DataLoader(train_data,shuffle=True,batch_size=16)
val_loader = DataLoader(val_data,shuffle=True,batch_size=16)

In [None]:
from sklearn.metrics import f1_score
def multi_acc(y_pred, y_test):
  """Return the fraction of rows whose top-scoring column index equals ``y_test``.

  Args:
    y_pred: 2-D tensor of per-class scores, one row per example.
    y_test: 1-D tensor holding one expected column index per example.

  Returns:
    0-dim float tensor: number of matching rows divided by ``y_test.size(0)``.
  """
  predicted = torch.argmax(torch.log_softmax(y_pred, dim=1), dim=1)
  n_hits = torch.eq(predicted, y_test).sum().float()
  n_rows = float(y_test.size(0))
  return n_hits / n_rows

import time

EPOCHS = 5

def train(model, train_loader, val_loader, optimizer, epochs=None):
  """Fine-tune ``model`` and print loss/accuracy for every epoch.

  Args:
    model: sequence-classification model called with ``token_type_ids``,
      ``attention_mask`` and ``labels``; its output must expose ``loss`` and
      ``logits``.
    train_loader: yields (token ids, attention mask, segment ids, labels)
      batches, where labels are the 2-column one-hot rows built by load_data.
    val_loader: same batch layout, used for evaluation only.
    optimizer: optimizer already bound to ``model``'s parameters.
    epochs: number of passes over ``train_loader``; defaults to ``EPOCHS``.

  Uses the notebook-level ``device`` and ``multi_acc``. Prints one summary
  line and one timing line per epoch; returns nothing.
  """
  n_epochs = EPOCHS if epochs is None else epochs
  # Guard the per-batch averages against an empty loader.
  n_train_batches = max(len(train_loader), 1)
  n_val_batches = max(len(val_loader), 1)

  for epoch in range(n_epochs):
    start = time.time()
    model.train()
    total_train_loss = 0
    total_train_acc  = 0
    for pair_token_ids, mask_ids, seg_ids, y in train_loader:
      optimizer.zero_grad()
      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      seg_ids = seg_ids.to(device)
      labels = y.to(device)

      outputs = model(pair_token_ids,
                      token_type_ids=seg_ids,
                      attention_mask=mask_ids,
                      labels=labels)
      # Read the fields by name instead of relying on the order of .values().
      loss, prediction = outputs.loss, outputs.logits

      # Labels are one-hot rows, so the class index is the position of the 1.
      # Comparing the predicted index with labels[:, 0] (as before) is true
      # exactly when the prediction is wrong, i.e. it reported 1 - accuracy.
      acc = multi_acc(prediction, labels.argmax(dim=1))

      loss.backward()
      optimizer.step()

      total_train_loss += loss.item()
      total_train_acc  += acc.item()

    train_acc  = total_train_acc/n_train_batches
    train_loss = total_train_loss/n_train_batches

    model.eval()
    total_val_acc  = 0
    total_val_loss = 0
    # No gradients are produced under no_grad, so there is nothing to zero.
    with torch.no_grad():
      for pair_token_ids, mask_ids, seg_ids, y in val_loader:
        pair_token_ids = pair_token_ids.to(device)
        mask_ids = mask_ids.to(device)
        seg_ids = seg_ids.to(device)
        labels = y.to(device)

        outputs = model(pair_token_ids,
                        token_type_ids=seg_ids,
                        attention_mask=mask_ids,
                        labels=labels)
        loss, prediction = outputs.loss, outputs.logits

        acc = multi_acc(prediction, labels.argmax(dim=1))

        total_val_loss += loss.item()
        total_val_acc  += acc.item()

    val_acc  = total_val_acc/n_val_batches
    val_loss = total_val_loss/n_val_batches
    end = time.time()
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)

    print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
    print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

In [None]:
# Run the training loop defined in the previous cell on the DeBERTa model.
train(model, train_loader, val_loader, optimizer)

Epoch 1: train_loss: 0.6940 train_acc: 0.5091 | val_loss: 0.6945 val_acc: 0.5082
00:05:06.81
Epoch 2: train_loss: 0.6936 train_acc: 0.4981 | val_loss: 0.6931 val_acc: 0.4855
00:05:06.80
Epoch 3: train_loss: 0.6933 train_acc: 0.4964 | val_loss: 0.6930 val_acc: 0.4897
00:05:06.80
Epoch 4: train_loss: 0.6934 train_acc: 0.4931 | val_loss: 0.6933 val_acc: 0.5103
00:05:06.83
Epoch 5: train_loss: 0.6936 train_acc: 0.5015 | val_loss: 0.6935 val_acc: 0.5124
00:05:06.83


In [None]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from matplotlib import pyplot
import sklearn
from sklearn.metrics import roc_auc_score

from sklearn.metrics import f1_score

def multi_acc(y_pred, y_test, total_roc, num_iter, verbose=False):
  """Batch accuracy plus a running ROC AUC total for a binary problem.

  Args:
    y_pred: 2-D tensor of per-class scores with two columns, one row per
      example.
    y_test: 1-D tensor with the expected class index (0 or 1) per example.
    total_roc: running sum of ROC AUC values from previous batches.
    num_iter: number of previous batches that contributed to ``total_roc``.
    verbose: when True, print the per-batch predictions, scores and ROC AUC.
      Off by default because one print per batch floods the notebook output.

  Returns:
    (acc, total_roc, num_iter): ``acc`` is a 0-dim float tensor; the other
    two are the running values, advanced only when this batch yielded a
    defined ROC AUC.
  """
  pred = torch.log_softmax(y_pred, dim=1).argmax(dim=1)
  acc = (pred == y_test).sum().float() / float(y_test.size(0))
  if verbose:
    print('pred', pred)

  # ROC AUC is undefined when the batch contains a single class; skip it
  # explicitly instead of relying on the exception raised by scikit-learn.
  if torch.unique(y_test).numel() < 2:
    return acc, total_roc, num_iter

  # Rank examples by the continuous probability of class 1. Hard argmax
  # labels collapse the ROC curve to a single operating point.
  scores = torch.softmax(y_pred, dim=1)[:, 1]
  if verbose:
    print('prob', scores)

  try:
    # detach(): the scores carry gradients while training and cannot be
    # converted to numpy otherwise.
    roc_auc = roc_auc_score(y_test.detach().cpu().numpy(),
                            scores.detach().cpu().numpy())
  except ValueError:
    # Best-effort metric: keep the running values when scikit-learn rejects the batch.
    return acc, total_roc, num_iter

  if verbose:
    print('roc', roc_auc)
  return acc, total_roc + roc_auc, num_iter + 1

import time

EPOCHS = 5


def train(model, train_loader, val_loader, optimizer, epochs=None, verbose=False):
  """Fine-tune ``model`` and print loss, accuracy and validation ROC AUC per epoch.

  Args:
    model: sequence-classification model called with ``token_type_ids``,
      ``attention_mask`` and ``labels``; its output must expose ``loss`` and
      ``logits``.
    train_loader: yields (token ids, attention mask, segment ids, labels)
      batches, where labels are the 2-column one-hot rows built by load_data.
    val_loader: same batch layout, used for evaluation only.
    optimizer: optimizer already bound to ``model``'s parameters.
    epochs: number of passes over ``train_loader``; defaults to ``EPOCHS``.
    verbose: when True, print the accuracy of every training batch.

  Uses the notebook-level ``device`` and ``multi_acc``; the latter is called
  as ``multi_acc(logits, class_indices, running_total, running_count)`` and
  returns ``(acc, running_total, running_count)`` already advanced by the
  current batch. Returns nothing.
  """
  n_epochs = EPOCHS if epochs is None else epochs
  # Guard the per-batch averages against an empty loader.
  n_train_batches = max(len(train_loader), 1)
  n_val_batches = max(len(val_loader), 1)

  for epoch in range(n_epochs):
    start = time.time()
    model.train()
    total_train_loss = 0
    total_train_acc  = 0
    total_train_roc = 0
    train_roc_batches = 0
    for pair_token_ids, mask_ids, seg_ids, y in train_loader:
      optimizer.zero_grad()
      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      seg_ids = seg_ids.to(device)
      labels = y.to(device)

      outputs = model(pair_token_ids,
                      token_type_ids=seg_ids,
                      attention_mask=mask_ids,
                      labels=labels)
      # Read the fields by name instead of relying on the order of .values().
      loss, prediction = outputs.loss, outputs.logits

      # Labels are one-hot rows, so the class index is the position of the 1.
      # Comparing the predicted index with labels[:, 0] (as before) is true
      # exactly when the prediction is wrong.
      # multi_acc returns the running values already updated, so they are
      # assigned back rather than left at their initial zeros.
      acc, total_train_roc, train_roc_batches = multi_acc(
          prediction, labels.argmax(dim=1), total_train_roc, train_roc_batches)

      if verbose:
        print('acc1',acc)
      loss.backward()
      optimizer.step()

      total_train_loss += loss.item()
      total_train_acc  += acc.item()

    train_acc  = total_train_acc/n_train_batches
    train_loss = total_train_loss/n_train_batches

    model.eval()
    total_val_acc  = 0
    total_val_loss = 0
    total_roc = 0
    number_of_iter = 0

    # No gradients are produced under no_grad, so there is nothing to zero.
    with torch.no_grad():
      for pair_token_ids, mask_ids, seg_ids, y in val_loader:
        pair_token_ids = pair_token_ids.to(device)
        mask_ids = mask_ids.to(device)
        seg_ids = seg_ids.to(device)
        labels = y.to(device)

        outputs = model(pair_token_ids,
                        token_type_ids=seg_ids,
                        attention_mask=mask_ids,
                        labels=labels)
        loss, prediction = outputs.loss, outputs.logits

        # Assign, do not add: the returned values are running totals. Adding
        # them onto the accumulators again counted every batch twice over.
        acc, total_roc, number_of_iter = multi_acc(
            prediction, labels.argmax(dim=1), total_roc, number_of_iter)

        total_val_loss += loss.item()
        total_val_acc  += acc.item()

    val_acc  = total_val_acc/n_val_batches
    val_loss = total_val_loss/n_val_batches
    end = time.time()
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)

    # Mean validation ROC AUC over the batches where it was defined.
    avg_roc = total_roc/number_of_iter if number_of_iter else float('nan')

    print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
    print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
    print('roc',total_roc)
    print('num',number_of_iter)
    print('avg',avg_roc)


In [None]:
# Runs the redefined, ROC-tracking train(). `model` and `optimizer` are not
# re-created in between, so when cells run in order this continues from the
# weights left by the earlier training run.
train(model, train_loader, val_loader, optimizer)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
prob tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
roc 0.5
pred tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
prob tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
roc 0.5
pred tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
prob tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
roc 0.5
pred tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
prob tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
roc 0.5
pred tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
prob tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
roc 0.5
pred tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
prob tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
roc 0.5
pred tensor([1, 1,

The average ROC value while considering only the iterations with values: 0.5001083403025925

The average ROC value while considering all the iterations: 0.46704948878159613