In [None]:
from google.colab import drive

# Mount Google Drive so the project data is reachable from the Colab VM.
drive.mount('/content/drive')

# Every project path hangs off one Drive folder; derive them from a single
# root so moving the project means editing one line only.
PROJECT_DIR = "/content/drive/MyDrive/aml-project"
TRAINING_DATA_PATH = f"{PROJECT_DIR}/TRAINING_DATA"
TEST_DATA_PATH = f"{PROJECT_DIR}/TEST_UNIQ_DATA"
# CSV manifests that sit next to the audio files they describe.
TRAIN_CSV = f"{TRAINING_DATA_PATH}/training_dataset.csv"
TEST_CSV = f"{TEST_DATA_PATH}/test_dataset.csv"

print("Training CSV:", TRAIN_CSV)
print("Test CSV:", TEST_CSV)

!pip install transformers datasets torchaudio soundfile evaluate -q

import os
import torch
import torchaudio
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoFeatureExtractor,
    Wav2Vec2ForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import precision_recall_fscore_support
import evaluate

# Alternative way to switch off Weights & Biases logging (left disabled):
# os.environ["WANDB_DISABLED"] = "true"

# Hub checkpoint whose feature-extractor configuration is used for the audio.
MODEL_CHECKPOINT = "tanmoyio/wav2vec2-large-xlsr-bengali"
processor = AutoFeatureExtractor.from_pretrained(MODEL_CHECKPOINT)

# Metric objects are loaded once here and reused by compute_metrics.
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")

def preprocess_data(batch, data_dir, max_length=64000):
    """Load one audio file and convert it into fixed-length model inputs.

    Args:
        batch: A single dataset row (despite the name). Its ``"file"`` entry is
            a path relative to ``data_dir`` and its ``"label"`` entry is a string.
        data_dir: Directory that contains the audio files.
        max_length: Number of samples each clip is padded or truncated to.
            Defaults to 64000, the value that used to be hard-coded here.

    Returns:
        The same row with ``"input_values"`` added and ``"label"`` replaced by
        0 when the label is "non-toxic" (ignoring case and surrounding
        whitespace) and 1 otherwise.
    """
    waveform, sample_rate = torchaudio.load(os.path.join(data_dir, batch["file"]))

    # torchaudio returns (channels, samples). squeeze(0) only flattened mono
    # files; averaging over channels gives a 1-D signal for mono and
    # multi-channel audio alike (and is a no-op on the values of a mono clip).
    waveform = waveform.mean(dim=0)

    # The feature extractor rejects audio whose rate differs from the rate it
    # was configured with, so bring every clip to that rate first.
    target_rate = processor.sampling_rate
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)

    inputs = processor(
        waveform.numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    # The extractor returns a batch of size one; keep the single example.
    batch["input_values"] = inputs.input_values[0]
    batch["label"] = 0 if batch["label"].lower().strip() == "non-toxic" else 1
    return batch


def collate_fn(batch):
    """Combine per-example features into one tensor batch.

    Args:
        batch: Non-empty list of examples, each with an ``"input_values"``
            sequence (list, NumPy array or tensor; all of equal length) and an
            integer ``"label"``.

    Returns:
        Dict with ``"input_values"`` as a float32 tensor of shape
        (len(batch), sequence_length) and ``"labels"`` as an int64 tensor of
        shape (len(batch),).
    """
    # torch.tensor() on a list of multi-element tensors raises, so convert each
    # example on its own and stack; this accepts lists, arrays and tensors.
    input_values = torch.stack(
        [torch.as_tensor(item["input_values"], dtype=torch.float32) for item in batch]
    )
    labels = torch.tensor([int(item["label"]) for item in batch], dtype=torch.long)
    return {"input_values": input_values, "labels": labels}


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 / precision / recall for an eval pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (reference class ids).

    Returns:
        Dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``.
    """
    scores = eval_pred.predictions
    # Predictions may arrive as a tuple when the model emits extra outputs;
    # presumably the class scores come first in that case -- TODO confirm.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = np.argmax(scores, axis=1)
    refs = eval_pred.label_ids

    acc = accuracy_metric.compute(predictions=preds, references=refs)
    f1 = f1_metric.compute(predictions=preds, references=refs, average="weighted")
    # zero_division=0 keeps the score at 0 for a class that is never predicted
    # without emitting an undefined-metric warning on every evaluation.
    precision, recall, _, _ = precision_recall_fscore_support(
        refs, preds, average="weighted", zero_division=0
    )

    return {
        "accuracy": acc["accuracy"],
        "f1": f1["f1"],
        "precision": precision,
        "recall": recall
    }



# Read the CSV manifests (one row per audio file) and wrap them as datasets.
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

dataset_train = Dataset.from_pandas(df_train)
dataset_test = Dataset.from_pandas(df_test)

# Hold out 20% of the training data for validation. The fixed seed makes the
# split identical on every run, so metrics are comparable between runs.
dataset_train = dataset_train.train_test_split(test_size=0.2, seed=42)

# map() without batched=True hands preprocess_data one example at a time.
dataset_train = dataset_train.map(lambda example: preprocess_data(example, TRAINING_DATA_PATH))
dataset_test = dataset_test.map(lambda example: preprocess_data(example, TEST_DATA_PATH))

# The projector and classifier weights are not part of the checkpoint and are
# newly initialised on load; seeding torch first is intended to make that
# initialisation repeatable between runs.
torch.manual_seed(42)

model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "tanmoyio/wav2vec2-large-xlsr-bengali",
    num_labels=2
)

# Optional (currently disabled): freeze the feature encoder and the first N
# encoder layers so they are not updated during fine-tuning.
# `freeze_feature_encoder` replaces the deprecated `freeze_feature_extractor`.
# model.freeze_feature_encoder()
# N = 8
# for layer in model.wav2vec2.encoder.layers[:N]:
#     for param in layer.parameters():
#         param.requires_grad = False

# Hyper-parameters for fine-tuning. Effective train batch size per device is
# per_device_train_batch_size * gradient_accumulation_steps = 32.
training_args = TrainingArguments(
    output_dir="toxic_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    # Keep only a couple of checkpoints on disk instead of one per epoch; the
    # best checkpoint is still retained for load_best_model_at_end.
    save_total_limit=2,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # rather than failing when no GPU runtime is attached.
    fp16=torch.cuda.is_available(),
    seed=42,
    report_to="none"
)

# Validation uses the 20% split carved out of the training data; the separate
# test CSV is only evaluated after training.
# `processing_class` replaces the deprecated `tokenizer` argument (available
# from transformers 4.46 on).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train["train"],
    eval_dataset=dataset_train["test"],
    processing_class=processor,
    data_collator=collate_fn,
    compute_metrics=compute_metrics
)

# Fine-tune the model; training_args requests an evaluation and a checkpoint
# at the end of every epoch.
trainer.train()
print("Evaluating on held-out test set...")
# Score the dataset built from TEST_CSV, which is separate from the
# train/validation split used during training.
test_results = trainer.evaluate(eval_dataset=dataset_test)
print(test_results)

# Save the model and the feature extractor side by side on Drive so both can
# be reloaded from the same directory.
SAVE_DIR = "/content/drive/MyDrive/aml-project/wav2vec2_bangla_toxic_model"
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)
print(f"✅ Model saved at {SAVE_DIR}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training CSV: /content/drive/MyDrive/aml-project/TRAINING_DATA/training_dataset.csv
Test CSV: /content/drive/MyDrive/aml-project/TEST_UNIQ_DATA/test_dataset.csv




Map:   0%|          | 0/3179 [00:00<?, ? examples/s]

  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Map:   0%|          | 0/795 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at tanmoyio/wav2vec2-large-xlsr-bengali and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1174,0.075335,0.984906,0.984905,0.985105,0.984906


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1174,0.075335,0.984906,0.984905,0.985105,0.984906
2,0.0408,0.034585,0.989937,0.989936,0.990047,0.989937
3,0.037,0.008919,0.998742,0.998742,0.998745,0.998742
4,0.003,0.019776,0.994969,0.994968,0.995018,0.994969
5,0.0026,0.004122,0.998742,0.998742,0.998745,0.998742


Evaluating on held-out test set...


{'eval_loss': 0.0026763915084302425, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_runtime': 9.8909, 'eval_samples_per_second': 20.221, 'eval_steps_per_second': 2.528, 'epoch': 5.0}
✅ Model saved at /content/drive/MyDrive/aml-project/wav2vec2_bangla_toxic_model


In [None]:
!pip install --upgrade transformers


