<a href="https://colab.research.google.com/github/mehedihasanbijoy/BanglaLLMs/blob/main/Text%20Classification/SentNoB_DistilBERT_HuggingFace_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Required Libraries

In [None]:
%%capture

!pip install -q gdown
!pip install --upgrade transformers[torch]
!pip install datasets

## Download Necessary Corpora

In [None]:
%%capture

# Download the folder named SentNoB (for fine-tune the LLM)
!gdown "https://drive.google.com/drive/folders/1EjBD0TumnpbFui4EJF7msT7O9Md6qfQw?usp=sharing" --folder

# Fetch the corpus (for fine-tune the tokenizer)
!gdown "https://drive.google.com/drive/folders/1oIT7DZhd4uXTpjgBeRGSP-Fs-1Ux3m-b?usp=sharing" --folder

## Import Libraries

In [None]:
import torch
from datasets import Dataset
from transformers import Trainer, TrainingArguments, AutoTokenizer, BertForSequenceClassification, AdamW
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Ignore all warnings
import warnings
warnings.filterwarnings("ignore")

## Log In to HuggingFace-hub

In [None]:
# %%capture
# !apt install git-lfs

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

## Fine-tune the Tokenizer

In [None]:
test = pd.read_csv("/content/BanglaParaphraseBUETNLP/test.csv")
train = pd.read_csv("/content/BanglaParaphraseBUETNLP/train.csv")
valid = pd.read_csv("/content/BanglaParaphraseBUETNLP/valid.csv")

df = pd.concat([test, train, valid], ignore_index=True)
df.reset_index(drop=True, inplace=True)

source_texts = df['source'].tolist()
target_texts = df['target'].tolist()
all_texts = source_texts + target_texts
# all_texts = all_texts[:500000]
all_texts[:3]

['কিছুদিন আগে প্যারিস থেকে ঘুরে এসেছি।',
 'ভাড়া করে ফেললেন কার্নেগি হলের মতো অত্যন্ত অভিজাত অডিটোরিয়াম, যেখানে হাজার হাজার মানুষ একসাথে বসে পারফর্মেন্স দেখতে পারে।',
 'সম্পূর্ণ নিয়ন্ত্রণ হারিয়ে জাহাজ পড়লো ঘোর সমুদ্রে।']

In [None]:
all_considered_characters = [
    ' ',  'ঁ',  'ং',  'ঃ',  'অ',  'আ',  'ই',  'ঈ',  'উ',  'ঊ',  'ঋ',  'এ',  'ঐ',  'ও',  'ঔ',
    'ক',  'খ',  'গ',  'ঘ',  'ঙ',  'চ',  'ছ',  'জ',  'ঝ',  'ঞ',  'ট',  'ঠ',  'ড',  'ঢ',  'ণ',  'ত',
    'থ',  'দ',  'ধ',  'ন',  'প',  'ফ',  'ব',  'ভ',  'ম',  'য',  'র',  'ল',  'শ',  'ষ',  'স',  'হ',
    'ড়',   'ঢ়',   'য়',  '়',  'া',  'ি',  'ী',  'ু',  'ূ',  'ৃ',  'ে',  'ৈ',  'ো',  'ৌ',  '্',  'ৎ',
    '০',  '১',  '২',  '৩',  '৪',  '৫',  '৬',  '৭',  '৮',  '৯']

In [None]:
all_cleaned_sentences = []

for sent in tqdm(all_texts):
    cleaned_sent = ""
    for char in str(sent):
        if char in all_considered_characters:
            cleaned_sent += str(char)
    all_cleaned_sentences.append(cleaned_sent)

100%|██████████| 933260/933260 [01:31<00:00, 10168.91it/s]


In [None]:
pretrained_tokenizer_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_name)

example_sent = all_cleaned_sentences[random.randint(0, len(all_cleaned_sentences)-1)]

print(example_sent)
print(tokenizer.tokenize(example_sent))
print(tokenizer.encode(example_sent))
print(tokenizer.decode(tokenizer.encode(example_sent)))

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

আরবি ভাষায় শহরটি মাদাইন নামে পরিচিত
['আ', '##র', '##ব', '##ি', 'ভ', '##া', '##ষ', '##া', '##য', 'শ', '##হ', '##র', '##ট', '##ি', 'ম', '##া', '##দ', '##া', '##ই', '##ন', 'ন', '##া', '##ম', '##ে', 'প', '##র', '##ি', '##চ', '##ি', '##ত']
[101, 1348, 29908, 29904, 29915, 1369, 29914, 29911, 29914, 29907, 1374, 29913, 29908, 29895, 29915, 1370, 29914, 29900, 29914, 29885, 29902, 1366, 29914, 29906, 29917, 1367, 29908, 29915, 29892, 29915, 29898, 102]
[CLS] আরবি ভাষায শহরটি মাদাইন নামে পরিচিত [SEP]


In [None]:
# Customize training parameters
vocab_size = 30000
min_frequency = 5

# Fine-tune the tokenizer on your custom dataset
tokenizer_finetuned = tokenizer.train_new_from_iterator(np.array(all_cleaned_sentences).reshape(-1, 1), vocab_size=vocab_size)

print(example_sent)
print(tokenizer_finetuned.tokenize(example_sent))
print(tokenizer_finetuned.encode(example_sent))
print(tokenizer_finetuned.decode(tokenizer_finetuned.encode(example_sent)))

আরবি ভাষায় শহরটি মাদাইন নামে পরিচিত
['আরবি', 'ভাষায', 'শহরটি', 'মাদা', '##ইন', 'নামে', 'পরিচিত']
[2, 5900, 2128, 5662, 16685, 1587, 789, 1096, 3]
[CLS] আরবি ভাষায শহরটি মাদাইন নামে পরিচিত [SEP]


## Process the SentNoB Corpus

In [None]:
train_df = pd.read_csv("/content/SentNoB/Train.csv")
val_df = pd.read_csv("/content/SentNoB/Val.csv")
test_df = pd.read_csv("/content/SentNoB/Test.csv")

print(f"Number of instances in training set  : {str(len(train_df)).rjust(4)}")
print(f"Number of instances in validation set: {str(len(val_df)).rjust(4)}")
print(f"Number of instances in Test set      : {str(len(test_df)).rjust(4)}")

Number of instances in training set  : 12575
Number of instances in validation set: 1567
Number of instances in Test set      : 1586


In [None]:
def find_len(sent):
    return len(str(sent).split())

train_df['len'] = train_df['Data'].apply(find_len)
test_df['len'] = test_df['Data'].apply(find_len)
val_df['len'] = val_df['Data'].apply(find_len)

train_df = train_df.loc[train_df['len']>5].reset_index(drop=True).iloc[:, :-1]
# test_df = test_df.loc[test_df['len']>5].reset_index(drop=True).iloc[:, :-1]
# val_df = val_df.loc[val_df['len']>5].reset_index(drop=True).iloc[:, :-1]

train_df = train_df.sample(frac=1.)
train_df = train_df.iloc[:int(len(train_df)*0.7), :].reset_index(drop=True)
test_df = train_df.iloc[int(len(train_df)*0.7):int(len(train_df)*0.85), :].reset_index(drop=True)
val_df = train_df.iloc[int(len(train_df)*0.85):int(len(train_df)*1), :].reset_index(drop=True)

train_df = train_df.rename(columns={'Data': 'text', 'Label': 'label'})
test_df = test_df.rename(columns={'Data': 'text', 'Label': 'label'})
val_df = val_df.rename(columns={'Data': 'text', 'Label': 'label'})

print(f"Number of instances in training set  : {str(len(train_df)).rjust(5)}")
print(f"Number of instances in validation set: {str(len(val_df)).rjust(5)}")
print(f"Number of instances in Test set      : {str(len(test_df)).rjust(5)}")

Number of instances in training set  :  8093
Number of instances in validation set:  1214
Number of instances in Test set      :  1214


In [None]:
id2label = {0: "Neutral", 1: "Positive", 2: "Negative"}
print(f"id2label: {id2label}")

label2id = {v: k for k, v in id2label.items()}
print(f"label2id: {label2id}")

id2label: {0: 'Neutral', 1: 'Positive', 2: 'Negative'}
label2id: {'Neutral': 0, 'Positive': 1, 'Negative': 2}


In [None]:
hf_train_dataset = Dataset.from_pandas(train_df)
hf_val_dataset = Dataset.from_pandas(val_df)
hf_test_dataset = Dataset.from_pandas(test_df)

print(hf_train_dataset)
print(hf_val_dataset)
print(hf_test_dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 8093
})
Dataset({
    features: ['text', 'label'],
    num_rows: 1214
})
Dataset({
    features: ['text', 'label'],
    num_rows: 1214
})


In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer_finetuned(examples['text'], padding='max_length', truncation=True)

hf_train_dataset_tokenized = hf_train_dataset.map(tokenize_function, batched=True)
hf_val_dataset_tokenized = hf_val_dataset.map(tokenize_function, batched=True)
hf_test_dataset_tokenized = hf_test_dataset.map(tokenize_function, batched=True)

print(hf_train_dataset_tokenized)
print(hf_val_dataset_tokenized)
print(hf_test_dataset_tokenized)

Map:   0%|          | 0/8093 [00:00<?, ? examples/s]

Map:   0%|          | 0/1214 [00:00<?, ? examples/s]

Map:   0%|          | 0/1214 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 8093
})
Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1214
})
Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1214
})


## Fine-tune DistilBERT

In [None]:
n_classes = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=n_classes)
model.to(device)
print(f"Model is sent to {device}")

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['encoder.layer.7.intermediate.dense.bias', 'encoder.layer.10.attention.output.LayerNorm.bias', 'encoder.layer.11.intermediate.dense.bias', 'encoder.layer.7.output.LayerNorm.bias', 'encoder.layer.3.attention.output.dense.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.11.attention.self.value.bias', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.2.attention.self.key.bias', 'encoder.layer.9.attention.self.value.weight', 'encoder.layer.10.attention.self.value.weight', 'encoder.layer.10.intermediate.dense.bias', 'encoder.layer.11.output.dense.bias', 'encoder.layer.7.attention.self.query.bias', 'encoder.layer.11.attention.output.LayerNorm.weight', 'encoder.layer.9.attention.self.key.bias', 'encoder.layer.6.attention.output.dense.bias', 'classifier.weight', 'encoder.layer.4.attention.self.query.weight', 'encoder.layer.1

Model is sent to cuda


In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./DistilBERT_finetuned",
    per_device_train_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    num_train_epochs=10,
    learning_rate=2e-5,
    push_to_hub=False,
)

# Total Steps = (no. of training instances / batch size) * no. of epochs

In [None]:
# Define the custom compute_metrics function
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(p.label_ids, preds),
        "precision": precision_score(p.label_ids, preds, average='macro'),
        "recall": recall_score(p.label_ids, preds, average='macro'),
        "f1-score": f1_score(p.label_ids, preds, average='macro'),
    }

In [None]:
# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train_dataset_tokenized,
    eval_dataset=hf_val_dataset_tokenized,
    compute_metrics=compute_metrics
)

In [None]:
# Performance of the model before fine-tuning
trainer.evaluate(eval_dataset=hf_test_dataset_tokenized)

{'eval_loss': 1.0819004774093628,
 'eval_accuracy': 0.39621087314662273,
 'eval_precision': 0.13207029104887424,
 'eval_recall': 0.3333333333333333,
 'eval_f1-score': 0.18918387413962634,
 'eval_runtime': 39.7503,
 'eval_samples_per_second': 30.541,
 'eval_steps_per_second': 3.824}

In [None]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1-score
500,1.0853,1.028744,0.472817,0.480301,0.398175,0.382767
1000,0.9759,0.875841,0.647446,0.731726,0.554362,0.503109
1500,0.8508,0.771015,0.676277,0.638066,0.626173,0.620979
2000,0.8207,0.7386,0.693575,0.66902,0.665907,0.656967
2500,0.7152,0.621291,0.733114,0.730993,0.735314,0.719573
3000,0.7509,0.593592,0.757825,0.753317,0.687643,0.684307
3500,0.6201,0.614222,0.76112,0.767172,0.688166,0.692686
4000,0.6208,0.473973,0.8229,0.80296,0.792654,0.7954
4500,0.5626,0.441798,0.845964,0.823986,0.832314,0.827268
5000,0.5291,0.41462,0.862438,0.846395,0.843374,0.844083


TrainOutput(global_step=10120, training_loss=0.5748174045396888, metrics={'train_runtime': 8686.6888, 'train_samples_per_second': 9.317, 'train_steps_per_second': 1.165, 'total_flos': 2.129376889672704e+16, 'train_loss': 0.5748174045396888, 'epoch': 10.0})

In [None]:
# Performance of the model after fine-tuning
trainer.evaluate(eval_dataset=hf_test_dataset_tokenized)

{'eval_loss': 0.2424594908952713,
 'eval_accuracy': 0.9448105436573312,
 'eval_precision': 0.9458403522731008,
 'eval_recall': 0.9341830338950569,
 'eval_f1-score': 0.9392873797834974,
 'eval_runtime': 42.267,
 'eval_samples_per_second': 28.722,
 'eval_steps_per_second': 3.596,
 'epoch': 10.0}

In [None]:
def violence_identifier(sentence):
    # Tokenize the example text
    input_tokens = tokenizer_finetuned(sentence, return_tensors="pt", padding=True, truncation=True)

    # Move the inputs to the same device as the model
    input_tokens = {key: value.to(model.device) for key, value in input_tokens.items()}

    # Pass the tokenized input through the model
    outputs = model(**input_tokens)

    # Get the predicted class probabilities
    logits = outputs.logits
    probabilities = logits.softmax(dim=1)

    # Get the predicted class index
    predicted_class = torch.argmax(probabilities).item()

    print(f"{'*'*50}\nSentence  : {test_example}\nPrediction: {id2label[predicted_class]}\n{'*'*50}")

In [None]:
# Example sentence
test_example = "কুত্তার বাচ্চা দেখতে শুয়োরের বাচ্চার থেকে সুন্দর।"

violence_identifier(test_example)

**************************************************
Sentence  : কুত্তার বাচ্চা দেখতে শুয়োরের বাচ্চার থেকে সুন্দর।
Prediction: Negative
**************************************************


## Push the model to HuggingFace

In [None]:
# model.push_to_hub("DistilBERT-Bangla-Sentiment-Analysis-VITD")

## Utilize the Fine-tuned Model from HuggingFace

In [None]:
# model = BertForSequenceClassification.from_pretrained('mehedihasanbijoy/DistilBERT-Bangla-Sentiment-Analysis-VITD', num_labels=n_classes)
# model.to(device)
# print(f"Model is sent to {device}")

In [None]:
# # Example sentence
# test_example = "কুত্তার বাচ্চা দেখতে শুয়োরের বাচ্চার থেকে সুন্দর।"

# violence_identifier(test_example)