<a href="https://colab.research.google.com/github/mehedihasanbijoy/BanglaLLMs/blob/main/Text%20Classification/DistilBERT_torch_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Required Libraries

In [None]:
%%capture

!pip install -q gdown
!pip install --upgrade transformers[torch]
!pip install datasets

## Download Necessary Corpora

In [None]:
%%capture

# Download the folder named BLP2023-VITD (used to fine-tune the classifier)
!gdown "https://drive.google.com/drive/folders/1TqmhXN2hyLutFlSoVAyTWkBBIK_DaLuF?usp=sharing" --folder

# Fetch the text corpus used to fine-tune the tokenizer
!gdown "https://drive.google.com/drive/folders/1oIT7DZhd4uXTpjgBeRGSP-Fs-1Ux3m-b?usp=sharing" --folder

In [None]:
!unzip "/content/BLP2023-VITD/BLP2023-VITD.zip"

Archive:  /content/BLP2023-VITD/BLP2023-VITD.zip
replace BLP2023-VITD/dev.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: BLP2023-VITD/dev.csv    
  inflating: BLP2023-VITD/test.csv   
  inflating: BLP2023-VITD/train.csv  


## Import Libraries

In [None]:
import torch
from datasets import Dataset
from transformers import Trainer, TrainingArguments, AutoTokenizer, BertForSequenceClassification, AdamW
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Ignore all warnings
import warnings
warnings.filterwarnings("ignore")

## Fine-tune the Tokenizer

In [None]:
# Read the three paraphrase splits; they are merged below because only the
# raw text is needed here, not the split boundaries.
test = pd.read_csv("/content/BanglaParaphraseBUETNLP/test.csv")
train = pd.read_csv("/content/BanglaParaphraseBUETNLP/train.csv")
valid = pd.read_csv("/content/BanglaParaphraseBUETNLP/valid.csv")

# ignore_index=True already yields a fresh 0..n-1 index for the merged frame.
df = pd.concat([test, train, valid], ignore_index=True)

# Both sides of every paraphrase pair become text for the tokenizer.
source_texts = df['source'].to_list()
target_texts = df['target'].to_list()
all_texts = [*source_texts, *target_texts]
# For a quicker experiment, truncate here, e.g. all_texts = all_texts[:500000]
all_texts[:3]

['কিছুদিন আগে প্যারিস থেকে ঘুরে এসেছি।',
 'ভাড়া করে ফেললেন কার্নেগি হলের মতো অত্যন্ত অভিজাত অডিটোরিয়াম, যেখানে হাজার হাজার মানুষ একসাথে বসে পারফর্মেন্স দেখতে পারে।',
 'সম্পূর্ণ নিয়ন্ত্রণ হারিয়ে জাহাজ পড়লো ঘোর সমুদ্রে।']

In [None]:
# Whitelist of characters that the cleaning step keeps: the space character
# plus the Bengali signs, letters, dependent vowel signs and digits listed
# below. Every character not in this list is dropped from the corpus text.
all_considered_characters = [
    ' ',  'ঁ',  'ং',  'ঃ',  'অ',  'আ',  'ই',  'ঈ',  'উ',  'ঊ',  'ঋ',  'এ',  'ঐ',  'ও',  'ঔ',
    'ক',  'খ',  'গ',  'ঘ',  'ঙ',  'চ',  'ছ',  'জ',  'ঝ',  'ঞ',  'ট',  'ঠ',  'ড',  'ঢ',  'ণ',  'ত',
    'থ',  'দ',  'ধ',  'ন',  'প',  'ফ',  'ব',  'ভ',  'ম',  'য',  'র',  'ল',  'শ',  'ষ',  'স',  'হ',
    'ড়',   'ঢ়',   'য়',  '়',  'া',  'ি',  'ী',  'ু',  'ূ',  'ৃ',  'ে',  'ৈ',  'ো',  'ৌ',  '্',  'ৎ',
    '০',  '১',  '২',  '৩',  '৪',  '৫',  '৬',  '৭',  '৮',  '৯']

In [None]:
# Strip every character that is not on the whitelist.
# A set gives constant-time membership tests (the list was scanned once per
# character), and str.join avoids rebuilding the string on every append.
# The result is unchanged: iteration is per character, so any whitelist entry
# longer than one character can never match under either lookup.
allowed_characters = set(all_considered_characters)

all_cleaned_sentences = [
    "".join(char for char in sent if char in allowed_characters)
    for sent in tqdm(all_texts)
]

100%|██████████| 933260/933260 [01:24<00:00, 11055.66it/s]


In [None]:
pretrained_tokenizer_name = "distilbert-base-uncased"
# The uncased BERT normalizer strips accents by default, i.e. it removes
# combining marks. For Bengali that deletes vowel signs and the hasanta, which
# is why a round trip turned "মূল" into "মল". strip_accents=False keeps them, and
# the setting carries over to the tokenizer retrained from this one.
tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_name, strip_accents=False)

# Pick one cleaned sentence at random to inspect the tokenizer's behaviour.
example_sent = random.choice(all_cleaned_sentences)

print(example_sent)
print(tokenizer.tokenize(example_sent))
print(tokenizer.encode(example_sent))
print(tokenizer.decode(tokenizer.encode(example_sent)))

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

তার সফলতার মূল কারণও এই দুটো প্রবাদ
['ত', '##া', '##র', '[UNK]', 'ম', '##ল', 'ক', '##া', '##র', '##ণ', '##ও', 'এ', '##ই', 'দ', '##ট', '##ে', '##া', 'প', '##র', '##ব', '##া', '##দ']
[101, 1362, 29914, 29908, 100, 1370, 29909, 1353, 29914, 29908, 29897, 29888, 1351, 29885, 1364, 29895, 29917, 29914, 1367, 29908, 29904, 29914, 29900, 102]
[CLS] তার [UNK] মল কারণও এই দটো পরবাদ [SEP]


In [None]:
# Customize training parameters
vocab_size = 30000
min_frequency = 5

# Fine-tune the tokenizer on your custom dataset
tokenizer_finetuned = tokenizer.train_new_from_iterator(np.array(all_cleaned_sentences).reshape(-1, 1), vocab_size=vocab_size)

print(example_sent)
print(tokenizer_finetuned.tokenize(example_sent))
print(tokenizer_finetuned.encode(example_sent))
print(tokenizer_finetuned.decode(tokenizer_finetuned.encode(example_sent)))

তার সফলতার মূল কারণও এই দুটো প্রবাদ
['তার', 'সফলতার', 'মল', 'কারণও', 'এই', 'দটো', 'পরবাদ']
[2, 154, 7561, 550, 12569, 175, 1944, 14850, 3]
[CLS] তার সফলতার মল কারণও এই দটো পরবাদ [SEP]


## Process the Violence Inciting Text Detection (VITD) Corpus

In [None]:
# Load the three VITD splits extracted from the zip archive.
# Note that the validation split is stored on disk as dev.csv.
train_df = pd.read_csv("/content/BLP2023-VITD/train.csv")
val_df = pd.read_csv("/content/BLP2023-VITD/dev.csv")
test_df = pd.read_csv("/content/BLP2023-VITD/test.csv")

# Report split sizes, right-aligned to four characters so the numbers line up.
print(f"Number of instances in training set  : {str(len(train_df)).rjust(4)}")
print(f"Number of instances in validation set: {str(len(val_df)).rjust(4)}")
print(f"Number of instances in Test set      : {str(len(test_df)).rjust(4)}")

Number of instances in training set  : 2700
Number of instances in validation set: 1330
Number of instances in Test set      : 2016


In [None]:
# Class index -> readable label; the position in the list is the class id.
id2label = dict(enumerate(["Non-Violence", "Passive Violence", "Direct Violence"]))
print(f"id2label: {id2label}")

# Inverse lookup: readable label -> class index.
label2id = {name: class_id for class_id, name in id2label.items()}
print(f"label2id: {label2id}")

id2label: {0: 'Non-Violence', 1: 'Passive Violence', 2: 'Direct Violence'}
label2id: {'Non-Violence': 0, 'Passive Violence': 1, 'Direct Violence': 2}


In [None]:
# Peek at three random training rows; no random_state is set, so the rows
# shown differ from run to run.
train_df.sample(3)

Unnamed: 0,text,label
2180,ডাকাত জদি চোর কে বলে৷ ভালো হওয়ার কথা৷ কেমন লাগে,1
2288,ও কোন বিজ্ঞান মনস্ক লেখক নয়।সে একজন ইসলাম বিদ্...,1
2498,খুব সুন্দরতো এই ভিডিও ফুটেজ ধন্যবাদ,0


In [None]:
class CustomCorpus(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one row of a text/label DataFrame on access.

    Args:
        df: DataFrame with a ``text`` column and a ``label`` column.
        tokenizer: Callable applied to one text; its result must expose
            ``"input_ids"`` and ``"attention_mask"``.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the tensors for the row at *position* ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (long tensors built
            from the tokenizer output) and ``targets`` (scalar long tensor
            holding the row's label).
        """
        # Text and label are both read positionally. Looking the label up by
        # index *label* (df["label"][idx]) only works while the frame has a
        # default 0..n-1 index and would break after filtering or shuffling.
        text = self.df["text"].iloc[idx]
        label = self.df["label"].iloc[idx]

        # padding="max_length" replaces the deprecated pad_to_max_length=True.
        # token_type_ids are not requested because nothing downstream uses them.
        ids_n_masks = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        return {
            "input_ids": torch.tensor(ids_n_masks["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(ids_n_masks["attention_mask"], dtype=torch.long),
            "targets": torch.tensor(int(label), dtype=torch.long),
        }

In [None]:
max_len = 64  # every tokenized text is padded/truncated to this many tokens
batch_size = 8  # number of examples per DataLoader batch

In [None]:
# Wrap each split with the retrained tokenizer and the shared sequence length.
train_dataset, val_dataset, test_dataset = (
    CustomCorpus(split_df, tokenizer_finetuned, max_len)
    for split_df in (train_df, val_df, test_df)
)

In [None]:
# Only the training data needs shuffling. Evaluation metrics do not depend on
# batch order, so validation and test are iterated in a fixed order, which
# keeps evaluation deterministic.
train_params = {
    'batch_size': batch_size,
    'shuffle': True,
    'num_workers': 0,
}

val_params = {
    'batch_size': batch_size,
    'shuffle': False,
    'num_workers': 0,
}

test_params = {
    'batch_size': batch_size,
    'shuffle': False,
    'num_workers': 0,
}

In [None]:
# Each split uses its own parameter dict; the validation loader previously
# reused train_params by mistake.
training_loader = torch.utils.data.DataLoader(train_dataset, **train_params)
valid_loader = torch.utils.data.DataLoader(val_dataset, **val_params)
testing_loader = torch.utils.data.DataLoader(test_dataset, **test_params)

## Fine-tune DistilBERT

In [None]:
n_classes = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every RNG the notebook uses before the classifier head is initialised
# and before the shuffled training loader is iterated, so runs are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


In [None]:
from transformers import DistilBertModel

class DistillBERTClass(torch.nn.Module):
    """DistilBERT encoder followed by a small classification head.

    Head: Linear -> ReLU -> Dropout -> Linear, applied to the hidden state at
    sequence position 0.

    Args:
        n_classes: Number of output logits.
        pretrained_name: Checkpoint passed to ``DistilBertModel.from_pretrained``.
        dropout: Dropout probability used inside the head.

    NOTE(review): elsewhere in this notebook the inputs are encoded with a
    retrained tokenizer, so its token ids presumably no longer correspond to
    the rows of the pretrained embedding matrix loaded here -- confirm that
    this is intended.
    """

    def __init__(self, n_classes, pretrained_name="distilbert-base-uncased", dropout=0.5):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained(pretrained_name)
        # Read the encoder width from its config instead of hard-coding 768,
        # so other DistilBERT checkpoints work too.
        hidden_dim = self.l1.config.dim
        self.pre_classifier = torch.nn.Linear(hidden_dim, hidden_dim)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_dim, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the raw (unnormalised) class logits, one row per input sequence."""
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        # Position 0 of each sequence is used as the sequence summary vector.
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        # Functional ReLU: no need to build a new module on every forward pass.
        pooler = torch.nn.functional.relu(pooler)
        pooler = self.dropout(pooler)
        return self.classifier(pooler)

In [None]:
# Build the classifier; the bare expression on the last line displays its layers.
model = DistillBERTClass(n_classes)
model

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

In [None]:
# Move all model parameters to the selected device (GPU when available).
model.to(device)
print(f"Model is sent to {device}")

Model is sent to cuda


In [None]:
learning_rate = 2e-5  # step size handed to the optimizer

In [None]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=learning_rate)

In [None]:
# Count how many predictions agree with the gold labels
def calcuate_accu(big_idx, targets):
    """Return the number of positions where ``big_idx`` equals ``targets``.

    Despite the name this is a raw count of correct predictions, not a ratio;
    callers divide by the number of examples themselves.
    """
    hits = torch.eq(big_idx, targets)
    return hits.sum().item()

In [None]:
def train(epoch):
    """Run one optimisation pass over ``training_loader`` and print its metrics.

    Uses the notebook-level ``model``, ``training_loader``, ``device``,
    ``loss_function`` and ``optimizer``.

    Args:
        epoch: Index of the current epoch. Accepted for call compatibility;
            the function does not use it.

    Raises:
        ValueError: If the loader yields no batches (previously this surfaced
            as a bare ZeroDivisionError).
    """
    tr_loss = 0.0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0

    model.train()

    for data in training_loader:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        # Clear stale gradients, then forward / backward / update.
        optimizer.zero_grad()
        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

        tr_loss += loss.item()

        # detach() is the supported replacement for the legacy `.data` access;
        # argmax over the class dimension yields the predicted class ids.
        big_idx = outputs.detach().argmax(dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

    if nb_tr_steps == 0:
        raise ValueError("training_loader yielded no batches")

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss: {epoch_loss}, Training Accuracy: {epoch_accu:.2f}%")

    return

In [None]:
def valid(model, testing_loader, label="Validation"):
    """Evaluate ``model`` on ``testing_loader`` without updating any weights.

    Uses the notebook-level ``device``, ``loss_function`` and ``calcuate_accu``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one row of class logits per example.
        testing_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``targets`` entries.
        label: Prefix used in the printed summary. The default reproduces the
            original output; pass e.g. ``"Test"`` when scoring the test split.

    Returns:
        Accuracy in percent over all examples seen.

    Raises:
        ValueError: If the loader yields no batches.
    """
    tr_loss = 0.0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0

    model.eval()
    with torch.no_grad():
        for data in testing_loader:
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)

            # No .squeeze() here: for a final batch holding a single example it
            # would collapse the batch dimension and break both the loss and
            # the argmax over dim=1.
            outputs = model(ids, mask)

            loss = loss_function(outputs, targets)
            tr_loss += loss.item()

            big_idx = outputs.argmax(dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

    if nb_tr_steps == 0:
        raise ValueError("testing_loader yielded no batches")

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples

    print(f"{label} Loss: {epoch_loss}, {label} Accuracy: {epoch_accu:.2f}%\n")

    return epoch_accu

In [None]:
num_epochs = 100
patience = 5  # epochs without a new best validation accuracy before stopping

# Training for a fixed 100 epochs keeps the final, heavily overfit weights
# (validation loss rises after the first few epochs). Instead, remember the
# weights with the best validation accuracy, stop once it has stopped
# improving, and restore those weights afterwards.
best_val_accu = float("-inf")
best_state = None
epochs_since_best = 0

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1} / {num_epochs}")
    train(epoch)
    val_accu = valid(model, valid_loader)

    if val_accu > best_val_accu:
        best_val_accu = val_accu
        epochs_since_best = 0
        # Snapshot on the CPU so the copy does not double GPU memory use.
        best_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        epochs_since_best += 1
        if epochs_since_best >= patience:
            print(f"Stopping early after epoch {epoch+1}: best validation accuracy {best_val_accu:.2f}%")
            break

if best_state is not None:
    model.load_state_dict(best_state)

Epoch 1 / 100
Training Loss: 0.9719409841228519, Training Accuracy: 52.59%
Validation Loss: 0.8665267080960873, Validation Accuracy: 61.50%

Epoch 2 / 100
Training Loss: 0.8395286760562976, Training Accuracy: 63.26%
Validation Loss: 0.7910629211071722, Validation Accuracy: 66.02%

Epoch 3 / 100
Training Loss: 0.6440195577031762, Training Accuracy: 73.85%
Validation Loss: 0.7615021033558304, Validation Accuracy: 68.05%

Epoch 4 / 100
Training Loss: 0.4203619236049391, Training Accuracy: 84.56%
Validation Loss: 0.9003662662413305, Validation Accuracy: 66.92%

Epoch 5 / 100
Training Loss: 0.22802861262465546, Training Accuracy: 92.33%
Validation Loss: 1.0077501360416234, Validation Accuracy: 67.22%

Epoch 6 / 100
Training Loss: 0.1262825116041232, Training Accuracy: 96.11%
Validation Loss: 1.1174806339476637, Validation Accuracy: 65.79%

Epoch 7 / 100
Training Loss: 0.12811251309667246, Training Accuracy: 95.67%
Validation Loss: 1.2070014768999493, Validation Accuracy: 67.74%

Epoch 8 / 1

In [None]:
# Score the held-out test split. valid() prefixes its own printout with
# "Validation", so the loss/accuracy line it prints here refers to test data.
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

Validation Loss: 2.7520335344535334, Validation Accuracy: 67.36%

Accuracy on test data = 67.36%


In [None]:
def violence_identifier(sentence):
    """Classify one sentence and print it with its predicted label.

    Uses the notebook-level ``tokenizer_finetuned``, ``model``, ``device`` and
    ``id2label``.

    Args:
        sentence: The text to classify.
    """
    input_tokens = tokenizer_finetuned(sentence, return_tensors="pt", padding=True, truncation=True)
    input_tokens = {key: value.to(device) for key, value in input_tokens.items()}

    # Inference only: switch dropout off and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        # Pass exactly the two tensors forward() accepts, so any extra keys
        # the tokenizer might return cannot cause a TypeError.
        outputs = model(
            input_ids=input_tokens["input_ids"],
            attention_mask=input_tokens["attention_mask"],
        )
    big_idx = outputs.argmax(dim=1)

    # Print the sentence that was passed in; the old code printed the global
    # `test_example` regardless of the argument.
    print(f"{'*'*50}\nSentence  : {sentence}\nPrediction: {id2label[big_idx.item()]}\n{'*'*50}")

In [None]:
# Example sentence to run through the fine-tuned classifier
test_example = "কুত্তার বাচ্চা দেখতে শুয়োরের বাচ্চার থেকে সুন্দর।"

violence_identifier(test_example)

**************************************************
Sentence  : কুত্তার বাচ্চা দেখতে শুয়োরের বাচ্চার থেকে সুন্দর।
Prediction: Passive Violence
**************************************************
