<a href="https://colab.research.google.com/github/mehedihasanbijoy/BanglaLLMs/blob/main/Text%20Classification/DistilBERT_HuggingFace_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Required Libraries

In [None]:
%%capture

!pip install -q gdown
!pip install --upgrade transformers[torch]
!pip install datasets

## Download Necessary Corpora

In [None]:
%%capture

# Download the folder named BLP2023-VITD (for fine-tune the LLM)
!gdown "https://drive.google.com/drive/folders/1TqmhXN2hyLutFlSoVAyTWkBBIK_DaLuF?usp=sharing" --folder

# Fetch the corpus (for fine-tune the tokenizer)
!gdown "https://drive.google.com/drive/folders/1oIT7DZhd4uXTpjgBeRGSP-Fs-1Ux3m-b?usp=sharing" --folder

In [None]:
!unzip "/content/BLP2023-VITD/BLP2023-VITD.zip"

Archive:  /content/BLP2023-VITD/BLP2023-VITD.zip
replace BLP2023-VITD/dev.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: BLP2023-VITD/dev.csv    
  inflating: BLP2023-VITD/test.csv   
  inflating: BLP2023-VITD/train.csv  


## Import Libraries

In [None]:
import torch
from datasets import Dataset
from transformers import Trainer, TrainingArguments, AutoTokenizer, BertForSequenceClassification, AdamW
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Ignore all warnings
import warnings
warnings.filterwarnings("ignore")

## Log In to HuggingFace-hub

In [None]:
%%capture
!apt install git-lfs

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Fine-tune the Tokenizer

In [None]:
test = pd.read_csv("/content/BanglaParaphraseBUETNLP/test.csv")
train = pd.read_csv("/content/BanglaParaphraseBUETNLP/train.csv")
valid = pd.read_csv("/content/BanglaParaphraseBUETNLP/valid.csv")

df = pd.concat([test, train, valid], ignore_index=True)
df.reset_index(drop=True, inplace=True)

source_texts = df['source'].tolist()
target_texts = df['target'].tolist()
all_texts = source_texts + target_texts
# all_texts = all_texts[:500000]
all_texts[:3]

['কিছুদিন আগে প্যারিস থেকে ঘুরে এসেছি।',
 'ভাড়া করে ফেললেন কার্নেগি হলের মতো অত্যন্ত অভিজাত অডিটোরিয়াম, যেখানে হাজার হাজার মানুষ একসাথে বসে পারফর্মেন্স দেখতে পারে।',
 'সম্পূর্ণ নিয়ন্ত্রণ হারিয়ে জাহাজ পড়লো ঘোর সমুদ্রে।']

In [None]:
all_considered_characters = [
    ' ',  'ঁ',  'ং',  'ঃ',  'অ',  'আ',  'ই',  'ঈ',  'উ',  'ঊ',  'ঋ',  'এ',  'ঐ',  'ও',  'ঔ',
    'ক',  'খ',  'গ',  'ঘ',  'ঙ',  'চ',  'ছ',  'জ',  'ঝ',  'ঞ',  'ট',  'ঠ',  'ড',  'ঢ',  'ণ',  'ত',
    'থ',  'দ',  'ধ',  'ন',  'প',  'ফ',  'ব',  'ভ',  'ম',  'য',  'র',  'ল',  'শ',  'ষ',  'স',  'হ',
    'ড়',   'ঢ়',   'য়',  '়',  'া',  'ি',  'ী',  'ু',  'ূ',  'ৃ',  'ে',  'ৈ',  'ো',  'ৌ',  '্',  'ৎ',
    '০',  '১',  '২',  '৩',  '৪',  '৫',  '৬',  '৭',  '৮',  '৯']

In [None]:
all_cleaned_sentences = []

for sent in tqdm(all_texts):
    cleaned_sent = ""
    for char in sent:
        if char in all_considered_characters:
            cleaned_sent += str(char)
    all_cleaned_sentences.append(cleaned_sent)

100%|██████████| 933260/933260 [01:42<00:00, 9070.68it/s] 


In [None]:
pretrained_tokenizer_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_name)

example_sent = all_cleaned_sentences[random.randint(0, len(all_cleaned_sentences)-1)]

print(example_sent)
print(tokenizer.tokenize(example_sent))
print(tokenizer.encode(example_sent))
print(tokenizer.decode(tokenizer.encode(example_sent)))

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

তার তরবারি দিয়ে রক্তের ফোঁটাগুলো ঝরঝর করে নেমে আসে
['ত', '##া', '##র', 'ত', '##র', '##ব', '##া', '##র', '##ি', 'দ', '##ি', '##য', '##ে', 'র', '##ক', '##ত', '##ে', '##র', '[UNK]', '[UNK]', 'ক', '##র', '##ে', 'ন', '##ে', '##ম', '##ে', 'আ', '##স', '##ে']
[101, 1362, 29914, 29908, 1362, 29908, 29904, 29914, 29908, 29915, 1364, 29915, 29907, 29917, 1372, 29889, 29898, 29917, 29908, 100, 100, 1353, 29908, 29917, 1366, 29917, 29906, 29917, 1348, 29912, 29917, 102]
[CLS] তার তরবারি দিযে রকতের [UNK] [UNK] করে নেমে আসে [SEP]


In [None]:
# Customize training parameters
vocab_size = 30000
min_frequency = 5

# Fine-tune the tokenizer on your custom dataset
tokenizer_finetuned = tokenizer.train_new_from_iterator(np.array(all_cleaned_sentences).reshape(-1, 1), vocab_size=vocab_size)

print(example_sent)
print(tokenizer_finetuned.tokenize(example_sent))
print(tokenizer_finetuned.encode(example_sent))
print(tokenizer_finetuned.decode(tokenizer_finetuned.encode(example_sent)))

তার তরবারি দিয়ে রক্তের ফোঁটাগুলো ঝরঝর করে নেমে আসে
['তার', 'তরবারি', 'দিযে', 'রকতের', 'ফোটা', '##গলো', 'ঝর', '##ঝ', '##র', 'করে', 'নেমে', 'আসে']
[2, 154, 12203, 298, 6328, 16045, 275, 12773, 68, 69, 158, 2052, 898, 3]
[CLS] তার তরবারি দিযে রকতের ফোটাগলো ঝরঝর করে নেমে আসে [SEP]


## Process the Violence Inciting Text Detection (VITD) Corpus

In [None]:
train_df = pd.read_csv("/content/BLP2023-VITD/train.csv")
val_df = pd.read_csv("/content/BLP2023-VITD/dev.csv")
test_df = pd.read_csv("/content/BLP2023-VITD/test.csv")

print(f"Number of instances in training set  : {str(len(train_df)).rjust(4)}")
print(f"Number of instances in validation set: {str(len(val_df)).rjust(4)}")
print(f"Number of instances in Test set      : {str(len(test_df)).rjust(4)}")

Number of instances in training set  : 2700
Number of instances in validation set: 1330
Number of instances in Test set      : 2016


In [None]:
id2label = {0: "Non-Violence", 1: "Passive Violence", 2: "Direct Violence"}
print(f"id2label: {id2label}")

label2id = {v: k for k, v in id2label.items()}
print(f"label2id: {label2id}")

id2label: {0: 'Non-Violence', 1: 'Passive Violence', 2: 'Direct Violence'}
label2id: {'Non-Violence': 0, 'Passive Violence': 1, 'Direct Violence': 2}


In [None]:
hf_train_dataset = Dataset.from_pandas(train_df)
hf_val_dataset = Dataset.from_pandas(val_df)
hf_test_dataset = Dataset.from_pandas(test_df)

print(hf_train_dataset)
print(hf_val_dataset)
print(hf_test_dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 2700
})
Dataset({
    features: ['text', 'label'],
    num_rows: 1330
})
Dataset({
    features: ['text', 'label'],
    num_rows: 2016
})


In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer_finetuned(examples['text'], padding='max_length', truncation=True)

hf_train_dataset_tokenized = hf_train_dataset.map(tokenize_function, batched=True)
hf_val_dataset_tokenized = hf_val_dataset.map(tokenize_function, batched=True)
hf_test_dataset_tokenized = hf_test_dataset.map(tokenize_function, batched=True)

print(hf_train_dataset_tokenized)
print(hf_val_dataset_tokenized)
print(hf_test_dataset_tokenized)

Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1330 [00:00<?, ? examples/s]

Map:   0%|          | 0/2016 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 2700
})
Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1330
})
Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 2016
})


## Fine-tune DistilBERT

In [None]:
n_classes = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=n_classes)
model.to(device)
print(f"Model is sent to {device}")

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['encoder.layer.6.attention.output.dense.bias', 'encoder.layer.3.output.LayerNorm.weight', 'encoder.layer.4.intermediate.dense.weight', 'encoder.layer.8.output.dense.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.7.attention.self.query.bias', 'encoder.layer.5.attention.output.LayerNorm.bias', 'encoder.layer.9.attention.self.key.bias', 'encoder.layer.10.attention.self.query.weight', 'encoder.layer.9.intermediate.dense.bias', 'encoder.layer.8.attention.output.dense.bias', 'encoder.layer.6.output.LayerNorm.weight', 'encoder.layer.4.attention.self.value.weight', 'encoder.layer.2.output.dense.weight', 'encoder.layer.5.attention.output.dense.weight', 'encoder.layer.4.output.LayerNorm.bias', 'encoder.layer.1.attention.self.key.weight', 'encoder.layer.4.attention.self.value.bias', 'emb

Model is sent to cuda


In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./DistilBERT_finetuned",
    per_device_train_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    num_train_epochs=20,
    learning_rate=2e-5,
    push_to_hub=False,
)

# Total Steps = (no. of training instances / batch size) * no. of epochs

In [None]:
# Define the custom compute_metrics function
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(p.label_ids, preds),
        "precision": precision_score(p.label_ids, preds, average='macro'),
        "recall": recall_score(p.label_ids, preds, average='macro'),
        "f1-score": f1_score(p.label_ids, preds, average='macro'),
    }

In [None]:
# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train_dataset_tokenized,
    eval_dataset=hf_val_dataset_tokenized,
    compute_metrics=compute_metrics
)

In [None]:
# Performance of the model before fine-tuning
trainer.evaluate(eval_dataset=hf_test_dataset_tokenized)

{'eval_loss': 1.006896734237671,
 'eval_accuracy': 0.529265873015873,
 'eval_precision': 0.27760416666666665,
 'eval_recall': 0.328978559028659,
 'eval_f1-score': 0.2525678996273453,
 'eval_runtime': 61.582,
 'eval_samples_per_second': 32.737,
 'eval_steps_per_second': 4.092}

In [None]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1-score
500,0.9618,0.918659,0.603008,0.489371,0.401948,0.357191
1000,0.7798,0.833992,0.684211,0.735061,0.542705,0.521597
1500,0.6011,1.334124,0.663158,0.787063,0.483578,0.467487
2000,0.4427,1.066825,0.67594,0.63806,0.64626,0.62439
2500,0.356,1.437643,0.683459,0.682399,0.652038,0.651936


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1-score
500,0.9618,0.918659,0.603008,0.489371,0.401948,0.357191
1000,0.7798,0.833992,0.684211,0.735061,0.542705,0.521597
1500,0.6011,1.334124,0.663158,0.787063,0.483578,0.467487
2000,0.4427,1.066825,0.67594,0.63806,0.64626,0.62439
2500,0.356,1.437643,0.683459,0.682399,0.652038,0.651936
3000,0.2893,1.664919,0.71203,0.673028,0.64645,0.656779
3500,0.2016,1.813706,0.710526,0.662018,0.665729,0.661791
4000,0.167,1.981617,0.701504,0.678392,0.665287,0.667157
4500,0.1201,2.185323,0.685714,0.671509,0.631914,0.63836
5000,0.0765,2.32764,0.669173,0.622561,0.645353,0.630433


TrainOutput(global_step=6760, training_loss=0.30820039503673125, metrics={'train_runtime': 5506.8618, 'train_samples_per_second': 9.806, 'train_steps_per_second': 1.228, 'total_flos': 1.4208124557312e+16, 'train_loss': 0.30820039503673125, 'epoch': 20.0})

In [None]:
# Performance of the model after fine-tuning
trainer.evaluate(eval_dataset=hf_test_dataset_tokenized)

{'eval_loss': 2.628613233566284,
 'eval_accuracy': 0.6527777777777778,
 'eval_precision': 0.5831098047181787,
 'eval_recall': 0.6210422260038752,
 'eval_f1-score': 0.5879946703329345,
 'eval_runtime': 64.4665,
 'eval_samples_per_second': 31.272,
 'eval_steps_per_second': 3.909,
 'epoch': 20.0}

In [None]:
def violence_identifier(sentence):
    # Tokenize the example text
    input_tokens = tokenizer_finetuned(sentence, return_tensors="pt", padding=True, truncation=True)

    # Move the inputs to the same device as the model
    input_tokens = {key: value.to(model.device) for key, value in input_tokens.items()}

    # Pass the tokenized input through the model
    outputs = model(**input_tokens)

    # Get the predicted class probabilities
    logits = outputs.logits
    probabilities = logits.softmax(dim=1)

    # Get the predicted class index
    predicted_class = torch.argmax(probabilities).item()

    print(f"{'*'*50}\nSentence  : {test_example}\nPrediction: {id2label[predicted_class]}\n{'*'*50}")

In [None]:
# Example sentence
test_example = "কুত্তার বাচ্চা দেখতে শুয়োরের বাচ্চার থেকে সুন্দর।"

violence_identifier(test_example)

**************************************************
Sentence  : কুত্তার বাচ্চা দেখতে শুয়োরের বাচ্চার থেকে সুন্দর।
Prediction: Passive Violence
**************************************************


## Push the model to HuggingFace

In [None]:
model.push_to_hub("DistilBERT-Bangla-Sentiment-Analysis-VITD")

## Utilize the Fine-tuned Model from HuggingFace

In [None]:
model = BertForSequenceClassification.from_pretrained('mehedihasanbijoy/DistilBERT-Bangla-Sentiment-Analysis-VITD', num_labels=n_classes)
model.to(device)
print(f"Model is sent to {device}")

In [None]:
# Example sentence
test_example = "কুত্তার বাচ্চা দেখতে শুয়োরের বাচ্চার থেকে সুন্দর।"

violence_identifier(test_example)