<a href="https://colab.research.google.com/github/mehedihasanbijoy/BanglaLLMs/blob/main/Text%20Classification/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Required Libraries

In [1]:
%%capture

!pip install -q gdown
!pip install --upgrade transformers[torch]
!pip install datasets

## Download Necessary Corpora

In [2]:
%%capture

# Download the folder named BLP2023-VITD (for fine-tune the LLM)
!gdown "https://drive.google.com/drive/folders/1TqmhXN2hyLutFlSoVAyTWkBBIK_DaLuF?usp=sharing" --folder

# Fetch the corpus (for fine-tune the tokenizer)
!gdown "https://drive.google.com/drive/folders/1oIT7DZhd4uXTpjgBeRGSP-Fs-1Ux3m-b?usp=sharing" --folder

In [4]:
!unzip "/content/BLP2023-VITD/BLP2023-VITD.zip"

Archive:  /content/BLP2023-VITD/BLP2023-VITD.zip
replace BLP2023-VITD/dev.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: BLP2023-VITD/dev.csv    
  inflating: BLP2023-VITD/test.csv   
  inflating: BLP2023-VITD/train.csv  


## Import Libraries

In [5]:
import torch
from datasets import Dataset
from transformers import Trainer, TrainingArguments, AutoTokenizer, BertForSequenceClassification, AdamW
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Ignore all warnings
import warnings
warnings.filterwarnings("ignore")

## Log In to HuggingFace-hub

In [22]:
%%capture
!apt install git-lfs

In [23]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Fine-tune the Tokenizer

In [6]:
test = pd.read_csv("/content/BanglaParaphraseBUETNLP/test.csv")
train = pd.read_csv("/content/BanglaParaphraseBUETNLP/train.csv")
valid = pd.read_csv("/content/BanglaParaphraseBUETNLP/valid.csv")

df = pd.concat([test, train, valid], ignore_index=True)
df.reset_index(drop=True, inplace=True)

source_texts = df['source'].tolist()
target_texts = df['target'].tolist()
all_texts = source_texts + target_texts
# all_texts = all_texts[:500000]
all_texts[:3]

['কিছুদিন আগে প্যারিস থেকে ঘুরে এসেছি।',
 'ভাড়া করে ফেললেন কার্নেগি হলের মতো অত্যন্ত অভিজাত অডিটোরিয়াম, যেখানে হাজার হাজার মানুষ একসাথে বসে পারফর্মেন্স দেখতে পারে।',
 'সম্পূর্ণ নিয়ন্ত্রণ হারিয়ে জাহাজ পড়লো ঘোর সমুদ্রে।']

In [7]:
all_considered_characters = [
    ' ',  'ঁ',  'ং',  'ঃ',  'অ',  'আ',  'ই',  'ঈ',  'উ',  'ঊ',  'ঋ',  'এ',  'ঐ',  'ও',  'ঔ',
    'ক',  'খ',  'গ',  'ঘ',  'ঙ',  'চ',  'ছ',  'জ',  'ঝ',  'ঞ',  'ট',  'ঠ',  'ড',  'ঢ',  'ণ',  'ত',
    'থ',  'দ',  'ধ',  'ন',  'প',  'ফ',  'ব',  'ভ',  'ম',  'য',  'র',  'ল',  'শ',  'ষ',  'স',  'হ',
    'ড়',   'ঢ়',   'য়',  '়',  'া',  'ি',  'ী',  'ু',  'ূ',  'ৃ',  'ে',  'ৈ',  'ো',  'ৌ',  '্',  'ৎ',
    '০',  '১',  '২',  '৩',  '৪',  '৫',  '৬',  '৭',  '৮',  '৯']

In [8]:
all_cleaned_sentences = []

for sent in tqdm(all_texts):
    cleaned_sent = ""
    for char in sent:
        if char in all_considered_characters:
            cleaned_sent += str(char)
    all_cleaned_sentences.append(cleaned_sent)

100%|██████████| 933260/933260 [01:28<00:00, 10570.51it/s]


In [9]:
pretrained_tokenizer_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_name)

example_sent = all_cleaned_sentences[random.randint(0, len(all_cleaned_sentences)-1)]

print(example_sent)
print(tokenizer.tokenize(example_sent))
print(tokenizer.encode(example_sent))
print(tokenizer.decode(tokenizer.encode(example_sent)))

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

এরপর কী হবে তা নিয়ে অবশ্য নিজেরাও নিশ্চিত ছিলো না তারা
['এ', '##র', '##প', '##র', 'ক', '##ী', 'হ', '##ব', '##ে', 'ত', '##া', 'ন', '##ি', '##য', '##ে', 'অ', '##ব', '##শ', '##য', 'ন', '##ি', '##জ', '##ে', '##র', '##া', '##ও', 'ন', '##ি', '##শ', '##চ', '##ি', '##ত', 'ছ', '##ি', '##ল', '##ে', '##া', 'ন', '##া', 'ত', '##া', '##র', '##া']
[101, 1351, 29908, 29903, 29908, 1353, 29916, 1377, 29904, 29917, 1362, 29914, 1366, 29915, 29907, 29917, 1347, 29904, 29910, 29907, 1366, 29915, 29894, 29917, 29908, 29914, 29888, 1366, 29915, 29910, 29892, 29915, 29898, 1357, 29915, 29909, 29917, 29914, 1366, 29914, 1362, 29914, 29908, 29914, 102]
[CLS] এরপর কী হবে তা নিযে অবশয নিজেরাও নিশচিত ছিলো না তারা [SEP]


In [10]:
# Customize training parameters
vocab_size = 30000
min_frequency = 5

# Fine-tune the tokenizer on your custom dataset
tokenizer_finetuned = tokenizer.train_new_from_iterator(np.array(all_cleaned_sentences).reshape(-1, 1), vocab_size=vocab_size)

print(example_sent)
print(tokenizer_finetuned.tokenize(example_sent))
print(tokenizer_finetuned.encode(example_sent))
print(tokenizer_finetuned.decode(tokenizer_finetuned.encode(example_sent)))

এরপর কী হবে তা নিয়ে অবশ্য নিজেরাও নিশ্চিত ছিলো না তারা
['এরপর', 'কী', 'হবে', 'তা', 'নিযে', 'অবশয', 'নিজেরাও', 'নিশচিত', 'ছিলো', 'না', 'তারা']
[2, 647, 555, 306, 157, 257, 931, 26256, 1410, 1090, 166, 266, 3]
[CLS] এরপর কী হবে তা নিযে অবশয নিজেরাও নিশচিত ছিলো না তারা [SEP]


## Process the Violence Inciting Text Detection (VITD) Corpus

In [12]:
train_df = pd.read_csv("/content/BLP2023-VITD/train.csv")
val_df = pd.read_csv("/content/BLP2023-VITD/dev.csv")
test_df = pd.read_csv("/content/BLP2023-VITD/test.csv")

print(f"Number of instances in training set  : {str(len(train_df)).rjust(4)}")
print(f"Number of instances in validation set: {str(len(val_df)).rjust(4)}")
print(f"Number of instances in Test set      : {str(len(test_df)).rjust(4)}")

Number of instances in training set  : 2700
Number of instances in validation set: 1330
Number of instances in Test set      : 2016


In [13]:
id2label = {0: "Non-Violence", 1: "Passive Violence", 2: "Direct Violence"}
print(f"id2label: {id2label}")

label2id = {v: k for k, v in id2label.items()}
print(f"label2id: {label2id}")

id2label: {0: 'Non-Violence', 1: 'Passive Violence', 2: 'Direct Violence'}
label2id: {'Non-Violence': 0, 'Passive Violence': 1, 'Direct Violence': 2}


In [14]:
hf_train_dataset = Dataset.from_pandas(train_df)
hf_val_dataset = Dataset.from_pandas(val_df)
hf_test_dataset = Dataset.from_pandas(test_df)

print(hf_train_dataset)
print(hf_val_dataset)
print(hf_test_dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 2700
})
Dataset({
    features: ['text', 'label'],
    num_rows: 1330
})
Dataset({
    features: ['text', 'label'],
    num_rows: 2016
})


In [15]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer_finetuned(examples['text'], padding='max_length', truncation=True)

hf_train_dataset_tokenized = hf_train_dataset.map(tokenize_function, batched=True)
hf_val_dataset_tokenized = hf_val_dataset.map(tokenize_function, batched=True)
hf_test_dataset_tokenized = hf_test_dataset.map(tokenize_function, batched=True)

print(hf_train_dataset_tokenized)
print(hf_val_dataset_tokenized)
print(hf_test_dataset_tokenized)

Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1330 [00:00<?, ? examples/s]

Map:   0%|          | 0/2016 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2700
})
Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1330
})
Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2016
})


## Fine-tune BERT

In [16]:
n_classes = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=n_classes)
model.to(device)
print(f"Model is sent to {device}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is sent to cuda


In [24]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./BERT_finetuned",
    per_device_train_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    num_train_epochs=50,
    learning_rate=2e-5,
    push_to_hub=False,
)

# Total Steps = (no. of training instances / batch size) * no. of epochs

In [25]:
# Define the custom compute_metrics function
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(p.label_ids, preds),
        "precision": precision_score(p.label_ids, preds, average='macro'),
        "recall": recall_score(p.label_ids, preds, average='macro'),
        "f1-score": f1_score(p.label_ids, preds, average='macro'),
    }

In [26]:
# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train_dataset_tokenized,
    eval_dataset=hf_val_dataset_tokenized,
    compute_metrics=compute_metrics
)

In [27]:
# Performance of the model before fine-tuning
trainer.evaluate(eval_dataset=hf_test_dataset_tokenized)

{'eval_loss': 0.936097264289856,
 'eval_accuracy': 0.5436507936507936,
 'eval_precision': 0.1812169312169312,
 'eval_recall': 0.3333333333333333,
 'eval_f1-score': 0.23479005998286204,
 'eval_runtime': 65.4452,
 'eval_samples_per_second': 30.804,
 'eval_steps_per_second': 3.851}

In [None]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1-score
500,1.0033,0.988675,0.539098,0.179699,0.333333,0.233512
1000,0.9986,0.98365,0.539098,0.179699,0.333333,0.233512
1500,0.9944,0.987364,0.539098,0.179699,0.333333,0.233512
2000,0.9951,0.983839,0.539098,0.179699,0.333333,0.233512
2500,0.9963,0.980099,0.539098,0.179699,0.333333,0.233512
3000,0.9986,0.987746,0.539098,0.179699,0.333333,0.233512
3500,0.9927,0.980256,0.539098,0.179699,0.333333,0.233512
4000,0.9933,0.987973,0.539098,0.179699,0.333333,0.233512
4500,0.9908,0.980619,0.539098,0.179699,0.333333,0.233512
5000,0.9951,0.983995,0.539098,0.179699,0.333333,0.233512


In [None]:
# Performance of the model after fine-tuning
trainer.evaluate(eval_dataset=hf_test_dataset_tokenized)

In [None]:
def violence_identifier(sentence):
    # Tokenize the example text
    input_tokens = tokenizer_finetuned(sentence, return_tensors="pt", padding=True, truncation=True)

    # Move the inputs to the same device as the model
    input_tokens = {key: value.to(model.device) for key, value in input_tokens.items()}

    # Pass the tokenized input through the model
    outputs = model(**input_tokens)

    # Get the predicted class probabilities
    logits = outputs.logits
    probabilities = logits.softmax(dim=1)

    # Get the predicted class index
    predicted_class = torch.argmax(probabilities).item()

    print(f"{'*'*50}\nSentence  : {test_example}\nPrediction: {id2label[predicted_class]}\n{'*'*50}")

In [None]:
# Example sentence
test_example = "কুত্তার বাচ্চা দেখতে শুয়োরের বাচ্চার থেকে সুন্দর।"

violence_identifier(test_example)

## Push the model to HuggingFace

In [None]:
model.push_to_hub("BERT-Bangla-Sentiment-Analysis-VITD")

## Utilize the Fine-tuned Model from HuggingFace

In [None]:
model = BertForSequenceClassification.from_pretrained('mehedihasanbijoy/BERT-Bangla-Sentiment-Analysis-VITD', num_labels=n_classes)
model.to(device)
print(f"Model is sent to {device}")

In [None]:
# Example sentence
test_example = "কুত্তার বাচ্চা দেখতে শুয়োরের বাচ্চার থেকে সুন্দর।"

violence_identifier(test_example)