<a href="https://colab.research.google.com/github/mehedihasanbijoy/BanglaLLMs/blob/main/Text%20Classification/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Required Libraries

In [None]:
%%capture

!pip install -q gdown
!pip install --upgrade transformers[torch]
!pip install datasets

## Download Necessary Corpora

In [None]:
%%capture

# Download the Google Drive folder named BLP2023-VITD (the labelled corpus used to fine-tune the classifier)
!gdown "https://drive.google.com/drive/folders/1TqmhXN2hyLutFlSoVAyTWkBBIK_DaLuF?usp=sharing" --folder

# Download the text corpus used to train the tokenizer
# (presumably the BanglaParaphraseBUETNLP folder read further below — confirm)
!gdown "https://drive.google.com/drive/folders/1oIT7DZhd4uXTpjgBeRGSP-Fs-1Ux3m-b?usp=sharing" --folder

In [None]:
!unzip "/content/BLP2023-VITD/BLP2023-VITD.zip"

Archive:  /content/BLP2023-VITD/BLP2023-VITD.zip
replace BLP2023-VITD/dev.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: BLP2023-VITD/dev.csv    
  inflating: BLP2023-VITD/test.csv   
  inflating: BLP2023-VITD/train.csv  


## Import Libraries

In [None]:
import torch
from datasets import Dataset
from transformers import Trainer, TrainingArguments, AutoTokenizer, BertForSequenceClassification, AdamW
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Ignore all warnings
import warnings
warnings.filterwarnings("ignore")

## Log In to the Hugging Face Hub

In [None]:
%%capture
!apt install git-lfs

In [None]:
from huggingface_hub import notebook_login

# Opens the interactive login widget; a token with write access is needed
# for the push_to_hub call later in this notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Fine-tune the Tokenizer

In [None]:
# Read the three splits of the paraphrase corpus; the split boundaries do not
# matter here because everything is pooled to train the tokenizer.
test = pd.read_csv("/content/BanglaParaphraseBUETNLP/test.csv")
train = pd.read_csv("/content/BanglaParaphraseBUETNLP/train.csv")
valid = pd.read_csv("/content/BanglaParaphraseBUETNLP/valid.csv")

# ignore_index=True already produces a fresh 0..n-1 index for the combined frame.
df = pd.concat([test, train, valid], ignore_index=True)

# Both sides of every paraphrase pair are used as raw text.
source_texts = df['source'].to_list()
target_texts = df['target'].to_list()
all_texts = [*source_texts, *target_texts]
# (For a quicker experiment, slice all_texts here, e.g. to the first 500000 items.)
all_texts[:3]

['কিছুদিন আগে প্যারিস থেকে ঘুরে এসেছি।',
 'ভাড়া করে ফেললেন কার্নেগি হলের মতো অত্যন্ত অভিজাত অডিটোরিয়াম, যেখানে হাজার হাজার মানুষ একসাথে বসে পারফর্মেন্স দেখতে পারে।',
 'সম্পূর্ণ নিয়ন্ত্রণ হারিয়ে জাহাজ পড়লো ঘোর সমুদ্রে।']

In [None]:
# Whitelist of characters kept by the cleaning step below: the space, Bengali
# letters, signs and digits. Any character not listed here is dropped from the text.
# NOTE(review): the cleaning loop tests one code point at a time, so an entry that
# consists of more than one code point can never match on its own — confirm whether
# the nukta letters in the fourth row are precomposed or letter + nukta.
all_considered_characters = [
    ' ',  'ঁ',  'ং',  'ঃ',  'অ',  'আ',  'ই',  'ঈ',  'উ',  'ঊ',  'ঋ',  'এ',  'ঐ',  'ও',  'ঔ',
    'ক',  'খ',  'গ',  'ঘ',  'ঙ',  'চ',  'ছ',  'জ',  'ঝ',  'ঞ',  'ট',  'ঠ',  'ড',  'ঢ',  'ণ',  'ত',
    'থ',  'দ',  'ধ',  'ন',  'প',  'ফ',  'ব',  'ভ',  'ম',  'য',  'র',  'ল',  'শ',  'ষ',  'স',  'হ',
    'ড়',   'ঢ়',   'য়',  '়',  'া',  'ি',  'ী',  'ু',  'ূ',  'ৃ',  'ে',  'ৈ',  'ো',  'ৌ',  '্',  'ৎ',
    '০',  '১',  '২',  '৩',  '৪',  '৫',  '৬',  '৭',  '৮',  '৯']

In [None]:
# Keep only whitelisted characters in every sentence.
# A frozenset gives O(1) membership tests (the list was scanned once per character),
# and str.join avoids building each sentence by repeated string concatenation.
# The result is identical to filtering against the list directly.
allowed_characters = frozenset(all_considered_characters)

all_cleaned_sentences = [
    "".join(char for char in sent if char in allowed_characters)
    for sent in tqdm(all_texts)
]

100%|██████████| 933260/933260 [01:17<00:00, 12043.74it/s]


In [None]:
pretrained_tokenizer_name = "bert-base-uncased"

# strip_accents=False is essential for Bengali: the uncased BERT normalizer otherwise
# removes combining marks, which deletes the hasanta, the nukta and several vowel signs
# from every word. The setting is carried over to any tokenizer trained from this one.
tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_name, strip_accents=False)

# Pick one cleaned sentence at random to inspect how the stock vocabulary handles it.
example_sent = random.choice(all_cleaned_sentences)

print(example_sent)
print(tokenizer.tokenize(example_sent))
print(tokenizer.encode(example_sent))
print(tokenizer.decode(tokenizer.encode(example_sent)))

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

তাইচুং এর সাংস্কৃতিক বিষয়ক ব্যুরোর প্রধান সচিব আন্দ্রেই ইশান ইয়াং বলেন উয়াং এর উৎসর্গীকরণ এবং তাকে সাহায্য করার জন্য ছাত্রের প্রচেষ্টা দেখে জনগণ খুব প্রভাবিত হয়েছিল এবং এটি দ্রুত একটি জাতীয় অনুষ্ঠানে পরিণত হয়েছিল
['ত', '##া', '##ই', '##চ', '##ং', 'এ', '##র', 'স', '##া', '##ং', '##স', '##ক', '##ত', '##ি', '##ক', 'ব', '##ি', '##ষ', '##য', '##ক', 'ব', '##য', '##র', '##ে', '##া', '##র', 'প', '##র', '##ধ', '##া', '##ন', 'স', '##চ', '##ি', '##ব', 'আ', '##ন', '##দ', '##র', '##ে', '##ই', 'ই', '##শ', '##া', '##ন', 'ই', '##য', '##া', '##ং', 'ব', '##ল', '##ে', '##ন', 'উ', '##য', '##া', '##ং', 'এ', '##র', '[UNK]', 'এ', '##ব', '##ং', 'ত', '##া', '##ক', '##ে', 'স', '##া', '##হ', '##া', '##য', '##য', 'ক', '##র', '##া', '##র', 'জ', '##ন', '##য', 'ছ', '##া', '##ত', '##র', '##ে', '##র', 'প', '##র', '##চ', '##ে', '##ষ', '##ট', '##া', 'দ', '##ে', '##খ', '##ে', 'জ', '##ন', '##গ', '##ণ', 'খ', '##ব', 'প', '##র', '##ভ', '##া', '##ব', '##ি', '##ত', 'হ', '##য', '##ে', '##ছ', '##ি', '##ল', 'এ', '##ব', '##ং

In [None]:
# Customize training parameters
vocab_size = 30000
min_frequency = 5
batch_size = 1000

# Feed the corpus in batches of plain Python lists instead of materialising a second
# copy of every sentence in a NumPy array.
training_corpus = (
    all_cleaned_sentences[start:start + batch_size]
    for start in range(0, len(all_cleaned_sentences), batch_size)
)

# Train a new vocabulary with the same pipeline (normalizer, pre-tokenizer, special
# tokens) as the base tokenizer. min_frequency is forwarded to the underlying trainer;
# previously it was defined but never used.
tokenizer_finetuned = tokenizer.train_new_from_iterator(
    training_corpus,
    vocab_size=vocab_size,
    length=len(all_cleaned_sentences),  # total number of sentences, for progress reporting
    min_frequency=min_frequency,
)

print(example_sent)
print(tokenizer_finetuned.tokenize(example_sent))
print(tokenizer_finetuned.encode(example_sent))
print(tokenizer_finetuned.decode(tokenizer_finetuned.encode(example_sent)))

তাইচুং এর সাংস্কৃতিক বিষয়ক ব্যুরোর প্রধান সচিব আন্দ্রেই ইশান ইয়াং বলেন উয়াং এর উৎসর্গীকরণ এবং তাকে সাহায্য করার জন্য ছাত্রের প্রচেষ্টা দেখে জনগণ খুব প্রভাবিত হয়েছিল এবং এটি দ্রুত একটি জাতীয় অনুষ্ঠানে পরিণত হয়েছিল
['তাই', '##চ', '##ং', 'এর', 'সাংসকতিক', 'বিষযক', 'বযরোর', 'পরধান', 'সচিব', 'আনদরে', '##ই', 'ইশ', '##ান', 'ইযাং', 'বলেন', 'উ', '##যাং', 'এর', 'উৎসরগ', '##ীকরণ', 'এবং', 'তাকে', 'সাহাযয', 'করার', 'জনয', 'ছাতরের', 'পরচেষটা', 'দেখে', 'জনগণ', 'খব', 'পরভাবিত', 'হযেছিল', 'এবং', 'এটি', 'দরত', 'একটি', 'জাতীয', 'অনষঠানে', 'পরিণত', 'হযেছিল']
[2, 449, 90, 109, 237, 5261, 3911, 21111, 684, 6980, 10886, 97, 9625, 132, 12037, 439, 11, 7556, 237, 7067, 6238, 182, 347, 1247, 295, 212, 13919, 3402, 842, 2730, 400, 2937, 397, 182, 486, 1063, 238, 1341, 4133, 1573, 397, 3]
[CLS] তাইচং এর সাংসকতিক বিষযক বযরোর পরধান সচিব আনদরেই ইশান ইযাং বলেন উযাং এর উৎসরগীকরণ এবং তাকে সাহাযয করার জনয ছাতরের পরচেষটা দেখে জনগণ খব পরভাবিত হযেছিল এবং এটি দরত একটি জাতীয অনষঠানে পরিণত হযেছিল [SEP]


## Process the Violence Inciting Text Detection (VITD) Corpus

In [None]:
# Load the three VITD splits extracted from the archive.
train_df = pd.read_csv("/content/BLP2023-VITD/train.csv")
val_df = pd.read_csv("/content/BLP2023-VITD/dev.csv")
test_df = pd.read_csv("/content/BLP2023-VITD/test.csv")

# Report split sizes, right-aligned to a width of four characters.
print(f"Number of instances in training set  : {len(train_df):>4}")
print(f"Number of instances in validation set: {len(val_df):>4}")
print(f"Number of instances in Test set      : {len(test_df):>4}")

Number of instances in training set  : 2700
Number of instances in validation set: 1330
Number of instances in Test set      : 2016


In [None]:
# Class index -> human-readable label.
id2label = {
    0: "Non-Violence",
    1: "Passive Violence",
    2: "Direct Violence",
}
print(f"id2label: {id2label}")

# Inverse mapping: label -> class index.
label2id = dict(zip(id2label.values(), id2label.keys()))
print(f"label2id: {label2id}")

id2label: {0: 'Non-Violence', 1: 'Passive Violence', 2: 'Direct Violence'}
label2id: {'Non-Violence': 0, 'Passive Violence': 1, 'Direct Violence': 2}


In [None]:
# Wrap each pandas split in a Hugging Face Dataset (train, validation, test order).
hf_train_dataset, hf_val_dataset, hf_test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# Show the schema and row count of every split.
for hf_split in (hf_train_dataset, hf_val_dataset, hf_test_dataset):
    print(hf_split)

Dataset({
    features: ['text', 'label'],
    num_rows: 2700
})
Dataset({
    features: ['text', 'label'],
    num_rows: 1330
})
Dataset({
    features: ['text', 'label'],
    num_rows: 2016
})


In [None]:
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the notebook's trained tokenizer.

    Sequences are truncated and padded to the tokenizer's maximum length.
    Returns whatever ``tokenizer_finetuned`` returns for the batch.
    """
    texts = examples['text']
    return tokenizer_finetuned(texts, truncation=True, padding='max_length')

# Apply the tokenizer to every split in batched mode (train, validation, test order).
hf_train_dataset_tokenized, hf_val_dataset_tokenized, hf_test_dataset_tokenized = (
    raw_split.map(tokenize_function, batched=True)
    for raw_split in (hf_train_dataset, hf_val_dataset, hf_test_dataset)
)

# Show the resulting columns and row counts.
for tokenized_split in (
    hf_train_dataset_tokenized,
    hf_val_dataset_tokenized,
    hf_test_dataset_tokenized,
):
    print(tokenized_split)

Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1330 [00:00<?, ? examples/s]

Map:   0%|          | 0/2016 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2700
})
Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1330
})
Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2016
})


## Fine-tune BERT

In [None]:
# Number of target classes, derived from the label map so the two cannot drift apart.
n_classes = len(id2label)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Store the label names in the model config so that the saved / pushed model reports
# readable labels instead of LABEL_0..LABEL_2.
# NOTE(review): the word embeddings of 'bert-base-uncased' were learned for its own
# vocabulary, while inputs are encoded with `tokenizer_finetuned`, whose ids refer to a
# newly trained vocabulary — the pretrained embedding rows therefore start out unrelated
# to the Bengali tokens. Consider a checkpoint pretrained with a matching tokenizer.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=n_classes,
    id2label=id2label,
    label2id=label2id,
)
model.to(device)
print(f"Model is sent to {device}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is sent to cuda


In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./BERT_finetuned",
    per_device_train_batch_size=8,
    eval_strategy="steps",             # current name of the deprecated `evaluation_strategy`
    eval_steps=500,
    save_steps=500,                    # checkpoint at every evaluation so the best step can be restored
    save_total_limit=2,                # keep disk usage bounded; the best checkpoint is always retained
    load_best_model_at_end=True,       # validation loss climbs steadily late in training, so do not keep the last weights
    metric_for_best_model="f1-score",  # key returned by compute_metrics (the "eval_" prefix is added automatically)
    greater_is_better=True,
    num_train_epochs=20,
    learning_rate=2e-5,
    push_to_hub=False,
)

# Total steps = ceil(no. of training instances / batch size) * no. of epochs

In [None]:
def compute_metrics(p):
    """Compute accuracy and macro-averaged precision / recall / F1 for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (argmax over the last axis gives the predicted
           class) and ``label_ids`` (the reference labels).

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1-score".
    """
    y_true = p.label_ids
    y_pred = p.predictions.argmax(-1)

    metrics = {"accuracy": accuracy_score(y_true, y_pred)}
    macro_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1-score", f1_score),
    )
    for metric_name, scorer in macro_scorers:
        metrics[metric_name] = scorer(y_true, y_pred, average='macro')
    return metrics

In [None]:
# Assemble the Trainer: validation data is used for the periodic evaluations,
# and compute_metrics adds accuracy / precision / recall / F1 to every report.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=hf_train_dataset_tokenized,
    eval_dataset=hf_val_dataset_tokenized,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Baseline: test-set performance before fine-tuning.
# The classification head is newly initialized at this point, so these numbers
# reflect an untrained classifier.
trainer.evaluate(eval_dataset=hf_test_dataset_tokenized)

{'eval_loss': 1.034454345703125,
 'eval_accuracy': 0.5441468253968254,
 'eval_precision': 0.4035436330518298,
 'eval_recall': 0.33395641080305505,
 'eval_f1-score': 0.2366489092667848,
 'eval_runtime': 61.5264,
 'eval_samples_per_second': 32.766,
 'eval_steps_per_second': 4.096}

In [None]:
# Run fine-tuning; metrics on the validation split are reported at the configured step interval.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1-score
500,0.9568,0.95899,0.616541,0.441464,0.422357,0.387252
1000,0.7764,0.92376,0.65188,0.622306,0.507924,0.500468
1500,0.547,1.165434,0.671429,0.625762,0.569601,0.58569
2000,0.4432,1.597026,0.67594,0.683746,0.553797,0.568706
2500,0.3167,2.032288,0.655639,0.595294,0.576368,0.583552
3000,0.2264,2.100818,0.66391,0.614346,0.588489,0.596578
3500,0.1702,2.207741,0.669925,0.619548,0.590247,0.601256
4000,0.1261,2.269895,0.675188,0.641611,0.577073,0.594933
4500,0.0804,2.622622,0.665414,0.621114,0.578603,0.593065
5000,0.0433,2.772049,0.666165,0.624809,0.583909,0.598105


TrainOutput(global_step=6760, training_loss=0.27799435360191843, metrics={'train_runtime': 5494.407, 'train_samples_per_second': 9.828, 'train_steps_per_second': 1.23, 'total_flos': 1.4208124557312e+16, 'train_loss': 0.27799435360191843, 'epoch': 20.0})

In [None]:
# Test-set performance after fine-tuning (compare with the baseline evaluation above the training cell).
trainer.evaluate(eval_dataset=hf_test_dataset_tokenized)

{'eval_loss': 3.039658308029175,
 'eval_accuracy': 0.65625,
 'eval_precision': 0.5837544173394584,
 'eval_recall': 0.566123597451191,
 'eval_f1-score': 0.5631878176924914,
 'eval_runtime': 64.0705,
 'eval_samples_per_second': 31.465,
 'eval_steps_per_second': 3.933,
 'epoch': 20.0}

In [None]:
def violence_identifier(sentence):
    """Classify a single sentence with the current model and print the predicted label.

    Uses the notebook-level ``tokenizer_finetuned``, ``model`` and ``id2label``.

    Args:
        sentence: One text string to classify.

    Returns:
        None. A banner containing the sentence and its predicted label is printed.
    """
    # Tokenize the sentence into PyTorch tensors
    encoded = tokenizer_finetuned(sentence, return_tensors="pt", padding=True, truncation=True)

    # Move the inputs to the same device as the model
    encoded = {key: value.to(model.device) for key, value in encoded.items()}

    # Inference only: switch off dropout and skip gradient bookkeeping
    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits

    # Class probabilities, then the most likely class along the class dimension
    probabilities = logits.softmax(dim=1)
    predicted_class = torch.argmax(probabilities, dim=1).item()

    # Report the sentence that was passed in (previously the global `test_example` was printed)
    print(f"{'*'*50}\nSentence  : {sentence}\nPrediction: {id2label[predicted_class]}\n{'*'*50}")

In [None]:
# Example sentence (Bengali) used as a quick sanity check of the fine-tuned classifier
test_example = "কুত্তার বাচ্চা দেখতে শুয়োরের বাচ্চার থেকে সুন্দর।"

# Prints a banner with the sentence and the predicted class label
violence_identifier(test_example)

**************************************************
Sentence  : কুত্তার বাচ্চা দেখতে শুয়োরের বাচ্চার থেকে সুন্দর।
Prediction: Passive Violence
**************************************************


## Push the Model to the Hugging Face Hub

In [None]:
hub_repo_name = "BERT-Bangla-Sentiment-Analysis-VITD"

# Upload the fine-tuned weights and config.
model.push_to_hub(hub_repo_name)

# Upload the custom tokenizer to the same repository: the model was trained on ids
# produced by `tokenizer_finetuned`, so it cannot be used without that vocabulary.
tokenizer_finetuned.push_to_hub(hub_repo_name)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mehedihasanbijoy/BERT-Bangla-Sentiment-Analysis-VITD/commit/711b26bd2f69fe6e1d0dece16e73ca5f51effb5d', commit_message='Upload BertForSequenceClassification', commit_description='', oid='711b26bd2f69fe6e1d0dece16e73ca5f51effb5d', pr_url=None, pr_revision=None, pr_num=None)

## Utilize the Fine-tuned Model from HuggingFace

In [None]:
# Reload the published checkpoint from the Hub and place it on the active device.
hub_checkpoint = 'mehedihasanbijoy/BERT-Bangla-Sentiment-Analysis-VITD'
model = BertForSequenceClassification.from_pretrained(hub_checkpoint, num_labels=n_classes)
model = model.to(device)
print(f"Model is sent to {device}")

config.json:   0%|          | 0.00/881 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Model is sent to cuda


In [None]:
# Same example sentence, now classified with the model reloaded from the Hub
# (violence_identifier reads the notebook-level `model`, which was reassigned above)
test_example = "কুত্তার বাচ্চা দেখতে শুয়োরের বাচ্চার থেকে সুন্দর।"

violence_identifier(test_example)

**************************************************
Sentence  : কুত্তার বাচ্চা দেখতে শুয়োরের বাচ্চার থেকে সুন্দর।
Prediction: Passive Violence
**************************************************
