# Install necessary libraries and datasets

In [37]:
%%capture

# Install
!pip install -q gdown
!pip install --upgrade transformers[torch]
!pip install datasets

# Download the folder named Corpus
!gdown "https://drive.google.com/drive/folders/17kqztJuoqRBajJXYHDK3IoPXqraBC_VD?usp=sharing" --folder

# Import necessary libraries

In [38]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
import torch

# Convert a dataframe to HuggingFace dataset

In [39]:
df = pd.read_csv("/content/Corpus/sentiment_classification.csv")
df.sample(3)

Unnamed: 0,text,label
316,একটা ভাল উদ্যেগ,0
188,"কোনো খুন হয়নি মিথ্যে কথা,, আওয়ামীলিগ মরলে এই ড...",1
735,আলহামদুলিল্লাহ অনেক অনেক ধন্যবাদ ভাই জান তোমাক...,0


In [40]:
id2label = {0: "Non-Violence", 1: "Passive Violence", 2: "Direct Violence"}

label2id = {v: k for k, v in id2label.items()}
label2id

{'Non-Violence': 0, 'Passive Violence': 1, 'Direct Violence': 2}

In [41]:

hf_dataset = Dataset.from_pandas(df)
hf_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1330
})

# Test the transformed dataset with HF models and pipeline

In [50]:
# Load the DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=3, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

In [52]:
hf_dataset_tokenized = hf_dataset.map(tokenize_function, batched=True)
hf_dataset_tokenized

Map:   0%|          | 0/1330 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1330
})

In [53]:
hf_dataset_tokenized.to_pandas().sample(3)

Unnamed: 0,text,label,input_ids,attention_mask
893,আল্লাহ্ ছাড়্ দেন; কিন্তু ছেড়ে দেন না।ঝড়ে সেই গ...,0,"[101, 76837, 111240, 29621, 26145, 111240, 954...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
115,মাসজিদের দরজা খুলে দিল কে ।।তার পিছনের শক্তি ক...,0,"[101, 18601, 15258, 110013, 16755, 965, 11128,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
526,কাদের কে জংগী বলে তোরা নিজেরই জংগী। আভিজিতের ম...,2,"[101, 948, 101655, 11421, 82937, 955, 22756, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [54]:
# Define the custom compute_metrics function
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    return {"accuracy": accuracy_score(p.label_ids, preds)}

In [55]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./distilbert_model",
    per_device_train_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=1000,
    num_train_epochs=10,
    learning_rate=1e-5,
    push_to_hub=False,
)

In [56]:
# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset_tokenized,
    eval_dataset=hf_dataset_tokenized,
    compute_metrics=compute_metrics
)

In [57]:
trainer.evaluate()

{'eval_loss': 1.1524354219436646,
 'eval_accuracy': 0.1481203007518797,
 'eval_runtime': 25.203,
 'eval_samples_per_second': 52.772,
 'eval_steps_per_second': 6.626}

In [58]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
200,No log,0.790153,0.675188
400,No log,0.672365,0.731579
600,0.836600,0.520746,0.801504
800,0.836600,0.381061,0.858647
1000,0.519600,0.286913,0.910526
1200,0.519600,0.235954,0.927068
1400,0.519600,0.193338,0.943609
1600,0.308000,0.176265,0.949624


TrainOutput(global_step=1670, training_loss=0.5203932139687909, metrics={'train_runtime': 893.7899, 'train_samples_per_second': 14.88, 'train_steps_per_second': 1.868, 'total_flos': 1761847821619200.0, 'train_loss': 0.5203932139687909, 'epoch': 10.0})

# Test the model

In [65]:
# Example sentence
test_example = "কুত্তার বাচ্চা দেখতে শুয়োরের বাচ্চার থেকে সুন্দর।"

# Tokenize the example text
input_tokens = tokenizer(test_example, return_tensors="pt", padding=True, truncation=True)
print(f"input_tokens: {input_tokens}\n\n")

# Move the inputs to the same device as the model
input_tokens = {key: value.to(model.device) for key, value in input_tokens.items()}
print(f"input_tokens: {input_tokens}\n\n")

# Pass the tokenized input through the model
outputs = model(**input_tokens)
print(f"outputs: {outputs}\n\n")

# Get the predicted class probabilities
logits = outputs.logits
probabilities = logits.softmax(dim=1)
print(f"logits: {logits}\n\n")
print(f"probabilities: {probabilities}\n\n")

# Get the predicted class index
predicted_class = torch.argmax(probabilities).item()
print(f"predicted_class: {predicted_class}\n\n")

print(f"{'*'*50}\nSentence  : {test_example}\nPrediction: {id2label[predicted_class]}\n{'*'*50}")

input_tokens: {'input_ids': tensor([[   101,    948,  98191,  72659,  11128,  17660,  39427, 111240,  39427,
          12079,    965,  11199, 109967,    976,  16166,  13228,  47719,  11421,
          17660,  39427, 111240,  85746,  14339,    978,  64437,  72614,  11128,
            920,    102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]])}


input_tokens: {'input_ids': tensor([[   101,    948,  98191,  72659,  11128,  17660,  39427, 111240,  39427,
          12079,    965,  11199, 109967,    976,  16166,  13228,  47719,  11421,
          17660,  39427, 111240,  85746,  14339,    978,  64437,  72614,  11128,
            920,    102]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]], device='cuda:0')}


outputs: SequenceClassifierOutput(loss=None, logits=tensor([[-1.6918,  3.2139, -0.5135]], device='cuda:0',
     