In [1]:
import os
import pandas as pd
import numpy as np
import transformers
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary
#from torchmetrics import Accuracy, F1Score
from tqdm import tqdm

pd.set_option('display.max_colwidth', None)
RAW_DATASET_DIR = "raw_datasets/SST_2"

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

### Word piece tokenizer

In [3]:
deberta_model_name = "microsoft/deberta-base"

In [4]:
from transformers import DebertaTokenizer
tokenizer = DebertaTokenizer.from_pretrained(deberta_model_name)

sequence = "Hello, my dog is cute"

In [5]:
tokenized_sequence = tokenizer.tokenize(sequence)
print(tokenized_sequence)

['Hello', ',', 'Ġmy', 'Ġdog', 'Ġis', 'Ġcute']


In [6]:
encoded_sequence = tokenizer(sequence)["input_ids"]
print(encoded_sequence)

[1, 31414, 6, 127, 2335, 16, 11962, 2]


In [7]:
decoded_sequence = tokenizer.decode(encoded_sequence)
print(decoded_sequence)

[CLS]Hello, my dog is cute[SEP]


### Dataloader

In [8]:
def fun1(review):
    review = review.lower() # lowercase, standardize
    return review

def fun2(x):
    if x=='negative':
        return 0
    elif x=='positive':
        return 1
    else:
        return 2

# Classification0,2 are imbalanced

#df = pd.read_excel('ClassificationDataset2.xlsx',names = ['sentiment','review']) # classification2
#df = pd.read_excel('ClassificationDataset0.xlsx',names=['sentiment','review'],header = None) #Classification0
#df = pd.read_csv('all-data.csv',encoding="ISO-8859-1",names=['sentiment','review'],header = None) # all-data
df = pd.read_csv('ClassificationDataset1_new.csv') # ../../Sentiment_classification(Movie_reviews)/train.csv, Classification1_new.csv

NUM_CLASSES = 2
df = df.dropna()
df = df.sample(frac=1) # shuffles the data

# get rid of punctuation
df['review'] = df['review'].apply(fun1)
#df['sentiment'] = df['sentiment'].apply(fun2)  # Classification0,all_data
#df['sentiment'] = df['sentiment'].apply(lambda x: x-1) # classification2
df.head()

Unnamed: 0,review,sentiment
12256,work was being carried out which we weren t aware of the rooms were tiny very little space to get around the bed and the beds were small our feet hung over the edge despite having booked a deluxe room there was no decent public space to relax in either the theatre was freezing and the lobby was unappealing despite the lovely photos the cleanliness of the room was poor very dusty lots of marks on the walls and floor and the sheets had not been changed since the last guests the staff did try their best including moving rooms as the heater was making a noise in the first room but the move itself was very shambolic they did apologise and gave us complementary chocolates and wine but we shall not stay here again when there are lots of other lovely hotels in the same area for the same price,0
6406,4 start hotel should do more than a poor service every thing is chargeable even water the size of the room is so small and the design is very strange can not understand it the matters and pillow is very poor quality and had strong headaches every day morning,0
9007,room was very small and the bed seemed to still be dirty after it had been cleaned,0
9648,the beds were comfy facilities in the room were good room service was excellent very delicious and quick central location,1
15640,sofa bed uncomfortable for children short wait for a table at breakfast,0


In [9]:
class DeBertaDataset(Dataset):
    def __init__(self, df, tokenizer,n_folds = 1,leave_out_fold = 0,split = 'train', max_length_sentence=100):
        super(DeBertaDataset, self).__init__()

        texts = list(df['review'].values)
        labels = list(df['sentiment'].values)
        
        if n_folds == 1:
            # 80-20 split
            if split == 'train':
                X,Y = texts[:int(len(texts)*0.8)], labels[:int(len(texts)*0.8)]
            else:
                X,Y = texts[int(len(texts)*0.8):], labels[int(len(texts)*0.8):]
        else:
            each_fold = int(len(texts)/n_folds) 
            if split == 'train':
                print(leave_out_fold*each_fold)
                X_l,Y_l = texts[: leave_out_fold*each_fold ], labels[: leave_out_fold*each_fold ]
                X_r,Y_r = texts[(leave_out_fold+1)*each_fold:], labels[(leave_out_fold+1)*each_fold :]
                X_l.extend(X_r)
                Y_l.extend(Y_r)
                X = X_l
                Y = Y_l
            else:
                X,Y = texts[leave_out_fold*each_fold: (leave_out_fold+1)*each_fold ], labels[leave_out_fold*each_fold : (leave_out_fold+1)*each_fold ]

        self.X = X
        self.Y = Y
        self.tokenizer=tokenizer
        self.max_length = max_length_sentence
        
    def __len__(self):
        return len(self.Y)
    
    def __getitem__(self, index):
        
        text1 = self.X[index]
        
        inputs = self.tokenizer.encode_plus(
            text1 ,
            None, # since we have only 1 sentence as input
            pad_to_max_length=True,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=self.max_length,
        )
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long).to(device),
            'mask': torch.tensor(mask, dtype=torch.long).to(device),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long).to(device),
            'target': torch.tensor(self.Y[index], dtype=torch.long).to(device)
        }

In [10]:
train_dataset = DeBertaDataset(df,tokenizer,split='train')
test_dataset = DeBertaDataset(df,tokenizer,split='val')

print("Train data size == ",len(train_dataset))
print("Val data size == ",len(test_dataset))

Train data size ==  14398
Val data size ==  3600


In [11]:
BATCH_SIZE = 32

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=BATCH_SIZE)

In [12]:
# testing data loaders
dict = next(iter(train_dataloader))
dict

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.




{'ids': tensor([[   1,    5, 3267,  ...,    0,    0,    0],
         [   1,  115,   33,  ...,    0,    0,    0],
         [   1, 2579, 2382,  ...,    0,    0,    0],
         ...,
         [   1,  182, 2579,  ...,    0,    0,    0],
         [   1, 2362, 2430,  ...,    0,    0,    0],
         [   1,    5, 7676,  ...,    0,    0,    0]], device='cuda:0'),
 'mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0'),
 'target': tensor([0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
         1, 0, 1, 0, 0, 1, 0, 1], devic

In [13]:
tokenizer.decode(dict['ids'][5])

'[CLS] breakfast facilities were overcrowded[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]'

### Model

In [14]:
op = tokenizer.encode_plus(
            "I'm sentence 1", # text
            None, # text pair
            pad_to_max_length=True,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=10,
        )
op

{'input_ids': [1, 100, 437, 3645, 112, 2, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]}

## Training

In [15]:
from transformers import AutoTokenizer, DebertaForSequenceClassification

model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base",num_labels = NUM_CLASSES).to(device)
lr = 5e-5
optimizer= optim.Adam(model.parameters(),lr= lr)

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier

In [16]:
epochs = 3
train_losses = []

for epoch in range(epochs):
    model.train()
    loop=tqdm(enumerate(train_dataloader),leave=False,total=len(train_dataloader))
    print(epoch)
    total_matches = 0
    for batch, dl in loop:
        
        # input
        ids, token_type_ids, mask, label = dl['ids'], dl['token_type_ids'], dl['mask'], dl['target']
        optimizer.zero_grad()
        

        output = model(ids,mask,token_type_ids, labels=label)
        loss = output.loss
        loss.backward()
        optimizer.step()

        train_losses.append(loss)
        logits = output.logits
        pred = logits.argmax(dim=1)
        
        # prediction
        matches = (torch.sum(pred == label)).item()
        accuracy = matches/BATCH_SIZE
        total_matches += matches

        # Show progress while training
        loop.set_description_str(f"Epoch={epoch}/{epochs} loss={loss.item()} accuracy={accuracy}")

    
    print(f"Train Accuracy :{epoch} = {total_matches/len(train_dataset)}") 

  0%|          | 0/450 [00:00<?, ?it/s]

0


  0%|          | 0/450 [00:00<?, ?it/s]                                                                

Train Accuracy :0 = 0.9319349909709682
1


  0%|          | 0/450 [00:00<?, ?it/s]                                                                

Train Accuracy :1 = 0.9640922350326434
2


                                                                                                       

Train Accuracy :2 = 0.9722183636616196




In [21]:
# save model
model_name = f"DeBERTaForClassification_reviews0_{epochs}_{lr}_FULL.pt"
torch.save(model.state_dict(), f"{model_name}")

In [18]:
model.load_state_dict(torch.load('DeBERTaForClassification_reviews1_3_5e-05_FULL.pt'))

<All keys matched successfully>

## Tesing

In [19]:
# test dataset
test_losses = []
total_correct = []
total_pred = []

model.eval()
loop=tqdm(enumerate(test_dataloader),leave=False,total=len(test_dataloader))
total_matches = 0
with torch.no_grad():
    for batch, dl in loop:
        ids, token_type_ids, mask, label = dl['ids'], dl['token_type_ids'], dl['mask'], dl['target']
        optimizer.zero_grad()

        output = model(ids,mask,token_type_ids, labels=label)
        loss = output.loss

        logits = output.logits
        pred = logits.argmax(dim=1)
        test_losses.append(loss)

        matches = torch.sum(pred == label)
        total_pred.extend(list(pred.cpu().numpy()))
        total_correct.extend(list(label.cpu().numpy()))
        total_matches += matches.item()

        # Show progress while training
        loop.set_description(f'loss={loss.item()}')


    print(f"Test Accuracy :{total_matches/len(test_dataset)}")        

                                                                            

Test Accuracy :0.9755555555555555




In [20]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

print(classification_report(total_correct, total_pred))

macroF1 = f1_score(total_correct, total_pred, average='macro')
microF1 = f1_score(total_correct, total_pred, average='micro')

print("\n------------- Micro F1 Score == {} ------------".format(microF1))
print("------------- Macro F1 Score == {} ------------\n".format(macroF1))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      1744
           1       0.98      0.97      0.98      1856

    accuracy                           0.98      3600
   macro avg       0.98      0.98      0.98      3600
weighted avg       0.98      0.98      0.98      3600


------------- Micro F1 Score == 0.9755555555555555 ------------
------------- Macro F1 Score == 0.9755388782043066 ------------

