In [1]:
import pickle
import re

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from sklearn import metrics
from sklearn import model_selection
from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [2]:
# Select the compute device: CUDA when a GPU is visible to torch, otherwise the CPU.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Report what hardware the run will use.
    print("Available total GPU's : ",torch.cuda.device_count() )
    print("GPU used is : ",torch.cuda.get_device_name(0))

In [88]:
# Constants
# Name of the pretrained checkpoint; used for both the tokenizer and the model.
BERT_PATH = 'bert-base-uncased'
# Sequence length passed to the tokenizer as max_length (inputs are padded to it).
MAX_LEN = 128
# Built from BERT_PATH so the tokenizer and the model always use the same checkpoint.
TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_PATH)
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 64
EPOCHS = 2
MODEL_PATH = 'train_model.pth'
TRAINING_FILE = 'balaced_train_data_chunk.csv'

In [76]:
# Model Architecture 

class BERTBaseUncased(nn.Module):
    """BERT encoder followed by dropout and a 3-way linear classification head."""

    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_PATH)
        self.bert_drop = nn.Dropout(0.3)
        # 768 input features (the encoder's hidden size) -> 3 class logits.
        self.out = nn.Linear(768, 3)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw (un-normalised) class logits for a batch.

        Args:
            ids: token id tensor fed to the encoder.
            mask: attention mask passed as ``attention_mask``.
            token_type_ids: segment ids passed through unchanged.

        Returns:
            Tensor whose last dimension has size 3 (one logit per class).
        """
        encoder_outputs = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Take the second output (the pooled representation) by index rather than
        # by tuple-unpacking: integer indexing works both when the encoder returns
        # a plain tuple and when it returns a dict-like ModelOutput, whereas
        # unpacking a ModelOutput yields its key strings instead of tensors.
        pooled_output = encoder_outputs[1]
        dropped = self.bert_drop(pooled_output)
        return self.out(dropped)


In [77]:
# Dataset Preparation

class BERTDataset:
    """Indexable dataset that converts (text, label) pairs into model-ready tensors."""

    def __init__(self, review, target):
        self.review, self.target = review, target
        self.max_len = MAX_LEN
        self.tokenizer = TOKENIZER

    def __len__(self):
        """Number of samples, taken from the review sequence."""
        return len(self.review)

    def __getitem__(self, item):
        """Tokenize sample ``item`` and return a dict of tensors.

        Keys (in order): "ids", "mask", "token_type_ids" (all int64) and
        "targets" (float32).
        """
        # Coerce to str and collapse every whitespace run into a single space.
        text = " ".join(str(self.review[item]).split())

        encoded = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
        )

        # Output key -> tokenizer key; order here fixes the order of the result dict.
        field_map = (
            ("ids", "input_ids"),
            ("mask", "attention_mask"),
            ("token_type_ids", "token_type_ids"),
        )
        sample = {
            out_key: torch.tensor(encoded[enc_key], dtype=torch.long)
            for out_key, enc_key in field_map
        }
        sample["targets"] = torch.tensor(self.target[item], dtype=torch.float)
        return sample


In [78]:
# Sentence Prediction 
def sentence_prediction(sentence):
    """Predict the class index of a single sentence with the global ``model``.

    The global ``model`` must already exist (it is created in the model
    initialisation cell); calling this earlier raises ``NameError``.

    Args:
        sentence: text to classify; converted with ``str()`` and
            whitespace-normalised before tokenisation.

    Returns:
        numpy array of shape (1,) holding the index of the highest-scoring class.
    """
    tokenizer = TOKENIZER
    max_len = MAX_LEN
    review = " ".join(str(sentence).split())

    inputs = tokenizer.encode_plus(
        review, None, add_special_tokens=True, max_length=max_len
    )

    ids = inputs["input_ids"]
    mask = inputs["attention_mask"]
    token_type_ids = inputs["token_type_ids"]

    # Right-pad every sequence with zeros up to max_len. A non-positive
    # padding_length multiplies the list by <= 0 and therefore adds nothing.
    padding_length = max_len - len(ids)
    ids = ids + ([0] * padding_length)
    mask = mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)

    # unsqueeze(0) adds the batch dimension (batch of one).
    ids = torch.tensor(ids, dtype=torch.long).unsqueeze(0).to(device)
    mask = torch.tensor(mask, dtype=torch.long).unsqueeze(0).to(device)
    token_type_ids = torch.tensor(token_type_ids, dtype=torch.long).unsqueeze(0).to(device)

    # Inference must run in eval mode (dropout disabled) and without autograd
    # bookkeeping; the previous train/eval mode is restored afterwards so the
    # call has no lasting effect on the model.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    finally:
        model.train(was_training)

    # Softmax is monotonic, so the argmax of the logits is the predicted class.
    return torch.argmax(logits, dim=1).cpu().numpy()


In [9]:
# Smoke-test the predictor on one sentence. sentence_prediction() reads the global
# `model`, which is only created in the model-initialisation cell further down;
# running this cell before that one fails with NameError.
sentence_prediction('Happy Mothers day mum')

NameError: name 'model' is not defined

In [79]:
# Since we are performing multi class classification we will proceed with CrossEntropyLoss
def loss_fn(outputs, targets):
    """Cross-entropy between raw class logits and integer class targets.

    Args:
        outputs: logits produced by the model.
        targets: class indices to score against.

    Returns:
        The loss tensor returned by ``F.cross_entropy``.
    """
    return F.cross_entropy(input=outputs, target=targets)


In [80]:
# Train Function
def train_fn(data_loader, model, optimizer, device, scheduler):
    """Run one training pass over ``data_loader``.

    For every batch: forward pass, loss, backward pass, optimizer step and
    scheduler step (the scheduler advances once per batch, not per epoch).

    Returns:
        Sum of the per-batch loss values (not averaged).
    """
    model.train()
    running_loss = 0
    batch_names = ("ids", "token_type_ids", "mask", "targets")
    for _, batch in tqdm(enumerate(data_loader), total=len(data_loader)):
        # Move all four batch tensors to the target device as int64.
        ids, token_type_ids, mask, targets = (
            batch[name].to(device, dtype=torch.long) for name in batch_names
        )

        optimizer.zero_grad()
        logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

        batch_loss = loss_fn(logits, targets)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += batch_loss.item()
    return running_loss



In [81]:
# Evaluation Function 

def eval_fn(data_loader, model, device):
    """Evaluate ``model`` on ``data_loader`` without updating weights.

    Args:
        data_loader: iterable of batches with keys "ids", "token_type_ids",
            "mask" and "targets".
        model: classifier returning one logit per class.
        device: device the batch tensors are moved to.

    Returns:
        Tuple ``(fin_outputs, fin_targets, val_loss)``: the predicted class
        indices, the true class indices (both flat Python lists) and the sum of
        the per-batch loss values.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    val_loss = 0
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            targets = d["targets"].to(device, dtype=torch.long)

            logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

            # Accumulate the same loss that train_fn optimises. (Comparing the
            # two Python result lists with == yields a single bool, which is
            # not a loss.)
            val_loss += loss_fn(logits, targets).item()

            # Softmax is monotonic, so argmax over the logits gives the class.
            predictions = torch.argmax(logits, dim=1)
            fin_targets.extend(targets.cpu().numpy().tolist())
            fin_outputs.extend(predictions.cpu().numpy().tolist())
    print("validation Loss : ",val_loss)
    return fin_outputs, fin_targets, val_loss


In [49]:
def remove_urls(text):
    """Lower-case ``text`` and strip URLs plus a few social-media noise words.

    Removed, in order:
      * full URLs starting with ``http://``, ``https://`` or ``www.``
      * stand-alone ``http`` / ``https`` / ``www`` tokens (left behind when
        punctuation was already stripped from the text)
      * words starting with ``insta``, ``photo`` or ``vedio`` that have at
        least one further word character
    Whitespace runs are collapsed to a single space as the last step, so the
    removals do not leave double spaces behind.

    Args:
        text: input string.

    Returns:
        The cleaned, lower-cased string.
    """
    # Lower-case first so the patterns also match capitalised input.
    text = text.lower()
    # Parenthesised alternatives: the whole URL is removed, and ordinary words
    # that merely start with "ww" are left alone.
    text = re.sub(r'https?://\S+|www\.\S+|\b(?:https?|www)\b', '', text)
    text = re.sub(r'insta\w+', '', text)
    text = re.sub(r'photo\w+', '', text)
    # NOTE(review): 'vedio' looks like a misspelling of 'video'; kept as-is
    # because it may target a misspelling present in the data - confirm.
    text = re.sub(r'vedio\w+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [69]:
# Load the train and test chunks. The training path comes from the TRAINING_FILE
# constant so the file name is defined in a single place.
train_dataset_chunk = pd.read_csv(TRAINING_FILE)
test_dataset_chunk = pd.read_csv('test_data_chunk.csv')

In [71]:
# Class balance check: number of training rows per label value.
train_dataset_chunk.label.value_counts()

0    769
2    765
1    761
Name: label, dtype: int64

In [28]:
# Debug subset for quick experiments: shuffle the training frame and keep the rows
# labelled 0..20 after re-indexing (21 rows), and keep the test rows with index
# labels 0..2. The shuffle is seeded so the same subset is drawn on every run.
train_dataset_chunk = (
    train_dataset_chunk.sample(frac=1, random_state=42).reset_index(drop=True).loc[:20]
)
test_dataset_chunk = test_dataset_chunk.loc[:2,:]

In [29]:
# Sanity check: (rows, columns) of the train and test frames.
train_dataset_chunk.shape, test_dataset_chunk.shape

((21, 2), (3, 2))

In [31]:
# Display the test frame to inspect its columns and sample rows.
test_dataset_chunk

Unnamed: 0,id,cleaned_text
0,1.246628e+18,0 yeah i once cooked potatoes when i was 0 yea...
1,1.245898e+18,happy mothers day to all the mums stepmums gra...
2,1.244717e+18,i love the people from the uk however when i a...


In [82]:
# Source frame for the train/validation split (loaded in an earlier cell).
dfx = train_dataset_chunk

# Hold out 10% of the rows for validation, stratified on the label so both
# splits keep the class proportions; random_state fixes the split.
df_train, df_valid = model_selection.train_test_split(
    dfx, test_size=0.1, random_state=42, stratify=dfx.label.values
)

# Drop the shuffled original index from both splits.
df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)

train_dataset = BERTDataset(
    review=df_train.cleaned_text.values, target=df_train.label.values
)

# shuffle=True: reshuffle the training samples every epoch instead of feeding
# the batches in the same fixed order each time.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True
)

valid_dataset = BERTDataset(
    review=df_valid.cleaned_text.values, target=df_valid.label.values
)

# Validation order does not affect the metrics, so it is left unshuffled.
valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE
)

In [83]:
# Print the shape of every tensor in the first training batch, then stop.
for batch in train_data_loader:
    for tensor in batch.values():
        print(tensor.shape)
    break
    

torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32])


In [84]:
# Print the shape of every tensor in the first validation batch, then stop.
for batch in valid_data_loader:
    for tensor in batch.values():
        print(tensor.shape)
    break
    

torch.Size([64, 128])
torch.Size([64, 128])
torch.Size([64, 128])
torch.Size([64])


In [85]:
# Initialising Model
# Build the classifier (its constructor loads the pretrained BERT weights) and move
# it to the selected device.
model = BERTBaseUncased()
# As the last expression of the cell, this also displays the module structure.
model.to(device)

BERTBaseUncased(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [86]:
# Total number of scheduler steps. train_fn calls scheduler.step() once per batch,
# so this must be (batches per epoch) * EPOCHS. len(train_data_loader) counts the
# final partial batch too; flooring len(df_train) / TRAIN_BATCH_SIZE would
# under-count it and give 0 steps (learning rate pinned at 0) whenever the training
# set is smaller than one batch.
num_train_steps = len(train_data_loader) * EPOCHS

# 3e-5 lies in the 2e-5..5e-5 range commonly used for fine-tuning BERT; a rate as
# large as 1e-2 overwrites the pretrained weights instead of fine-tuning them.
optimizer = AdamW(model.parameters(), lr=3e-5)
# Linear decay from the initial learning rate to 0 over num_train_steps, no warmup.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
)

In [87]:
def accuracy_per_class(preds, labels):
    """Print, for each class present in ``labels``, correct / total predictions.

    Args:
        preds: sequence of predicted class indices.
        labels: sequence of true class indices, aligned with ``preds``.

    Returns:
        None; the per-class counts are written to stdout.
    """
    predicted = np.array(preds)
    actual = np.array(labels)

    for cls in np.unique(actual):
        # Boolean mask selecting the samples whose true class is ``cls``.
        in_class = actual == cls
        n_correct = np.count_nonzero(predicted[in_class] == cls)
        n_total = np.count_nonzero(in_class)
        print(f'Class: {cls}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [None]:
# Train for EPOCHS epochs, evaluate after each one, record the metrics and keep a
# checkpoint of the model with the best validation accuracy seen so far.
best_accuracy = 0
output_dict = {}
output_list =[]
target_list = []
metrics_list = []
for epoch in range(EPOCHS):

    train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
    outputs, targets, val_loss = eval_fn(valid_data_loader, model, device)

    # train_fn / eval_fn return totals accumulated over batches; divide by the
    # number of batches to get per-batch averages.
    avg_train_loss = train_loss / len(train_data_loader)
    avg_val_loss = val_loss / len(valid_data_loader)

    accuracy_per_class(outputs,targets)
    accuracy = metrics.accuracy_score(targets, outputs)

    epoch_metrics = {
        'epoch': (epoch+1),
        'avg_train_loss': avg_train_loss,
        'avg_val_loss': avg_val_loss,
        'accuracy_score': accuracy,
    }
    print(epoch_metrics)
    metrics_list.append(epoch_metrics)

    if accuracy > best_accuracy:
        # Context manager so the checkpoint file is flushed and closed even if
        # pickling fails. NOTE: this pickles the whole module object, so loading
        # it requires the BERTBaseUncased class to be importable; only unpickle
        # files from a trusted source.
        with open('model_with_best_accuracy.pth', 'wb') as checkpoint_file:
            pickle.dump(model, checkpoint_file)
        best_accuracy = accuracy

  0%|          | 0/65 [00:00<?, ?it/s]

In [5]:
# One row per epoch, one column per recorded metric.
metrics_df = pd.DataFrame(data=metrics_list)

In [None]:
# Display the per-epoch metrics table.
metrics_df