In [None]:
!pip install transformers datasets torch accelerate -U


In [3]:
import transformers
from datasets import load_dataset

dataset = load_dataset('csv', data_files={'train': '/kaggle/input/typeofhate/type_train.csv', 'test': '/kaggle/input/typeofhate/encoded_test.csv'})


In [4]:
from transformers import AutoTokenizer

# Choose the model checkpoint (BERT or RoBERTa that supports Bangla, such as 'bert-base-multilingual-cased')
model_checkpoint = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    # Tokenize the text
    tokenized_inputs = tokenizer(examples['sentence'], padding="max_length", truncation=True, max_length=512)
    
    # Include the labels
    tokenized_inputs['labels'] = examples['hate speech']
    
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/5029 [00:00<?, ? examples/s]

In [5]:
from transformers import AutoModelForSequenceClassification

num_labels = 16  
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=1,
    report_to = 'none',
    load_best_model_at_end=True,
)


In [5]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [7]:
from datasets import load_metric

accuracy_metric = load_metric("accuracy")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
f1_metric = load_metric("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    
    # Calculate all metrics
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='weighted')
    recall = recall_metric.compute(predictions=predictions, references=labels, average='weighted')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')
    
    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"]
    }


  accuracy_metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [9]:
from transformers import Trainer, DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="pt")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()


2024-06-08 09:01:27.532545: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-08 09:01:27.532651: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-08 09:01:27.641400: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.311,0.272223,0.897196,0.897441,0.897196,0.897227
2,0.2466,0.267783,0.900577,0.902277,0.900577,0.900604
3,0.2064,0.251322,0.909127,0.909418,0.909127,0.909156
4,0.1592,0.274622,0.907536,0.908076,0.907536,0.907571
5,0.127,0.305998,0.903162,0.904347,0.903162,0.903196




TrainOutput(global_step=6285, training_loss=0.21526377092966886, metrics={'train_runtime': 10657.387, 'train_samples_per_second': 18.87, 'train_steps_per_second': 0.59, 'total_flos': 5.292091568504832e+16, 'train_loss': 0.21526377092966886, 'epoch': 5.0})

In [10]:
results = trainer.evaluate()
print(results)



{'eval_loss': 0.2513217329978943, 'eval_accuracy': 0.9091270630344005, 'eval_precision': 0.9094180713281026, 'eval_recall': 0.9091270630344005, 'eval_f1': 0.9091557583421215, 'eval_runtime': 98.5348, 'eval_samples_per_second': 51.038, 'eval_steps_per_second': 1.603, 'epoch': 5.0}
