# **Team Information**
* Team Name: Neuralsight
* Members: Utpal Barua, Nuzhat Tabassum Medha
* Model: [https://huggingface.co/utpal07/wav2vec2-bangla-finetuned-xlarge](https://huggingface.co/utpal07/wav2vec2-bangla-finetuned-xlarge)
* Libraries: Transformers, Datasets, Evaluate, Librosa, Soundfile, Audiomentations, PyTorch
* Dataset: Custom Bangla audio dataset from /kaggle/input/shobdotori, including training audio files organized by dialects (e.g., Rajshahi) and annotations in CSV format.

# **SECTION 1: Environment Setup & Dependencies**

In [1]:
# Install required packages with specific versions
!pip uninstall -y pyarrow protobuf google-cloud-bigquery-storage
!pip install -U pyarrow==16.1.0 protobuf==4.25.3 datasets==2.21.0 \
    evaluate==0.4.3 librosa soundfile jiwer --no-cache-dir --quiet
!pip install jiwer
!pip install audiomentations==0.27.0
!pip install --upgrade protobuf

Found existing installation: pyarrow 16.1.0
Uninstalling pyarrow-16.1.0:
  Successfully uninstalled pyarrow-16.1.0
Found existing installation: protobuf 6.33.1
Uninstalling protobuf-6.33.1:
  Successfully uninstalled protobuf-6.33.1
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m213.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m300.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.7/260.7 kB[0m [31m339.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
audiomentations 0.27.0 requires librosa<0.10.0,>0.7.2, but you have librosa 0.11.0 which is incompatible.

In [None]:
# Import core libraries
import os
import pandas as pd
import torch
import librosa
import soundfile as sf
import numpy as np
from tqdm.auto import tqdm
from collections import Counter
import warnings


# Import transformers components
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor
from transformers import TrainingArguments,Trainer
from transformers import EarlyStoppingCallback

# Import ML utilities
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import evaluate

warnings.filterwarnings('ignore')

In [3]:
# Set device
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\nTraining on device: {DEVICE}")


Training on device: cuda


# **SECTION 2: Disable W&B and Unnecessary Warnings**

In [4]:
# Disable Weights & Biases
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_SILENT"] = "true"
os.environ["WANDB_API_KEY"] = "dummy"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"

In [5]:
# Uninstall wandb completely
!pip uninstall -y wandb

# Verify wandb is not imported
import sys
print("wandb in modules:", "wandb" in sys.modules)

[0mwandb in modules: False


# **SECTION 3: Verify Package Versions**

In [6]:
import google.protobuf
import transformers
import datasets
import peft
import accelerate

print("PACKAGE VERSIONS")
print(f"Accelerate:    {accelerate.__version__}")
print(f"Protobuf:      {google.protobuf.__version__}")
print(f"Transformers:  {transformers.__version__}")
print(f"Datasets:      {datasets.__version__}")
print(f"PEFT:          {peft.__version__}")

PACKAGE VERSIONS
Accelerate:    1.9.0
Protobuf:      6.33.1
Transformers:  4.53.3
Datasets:      2.21.0
PEFT:          0.16.0


# **SECTION 4: Install Audio Augmentation Library**

In [7]:
from audiomentations import Compose, AddGaussianNoise, PitchShift, TimeStretch

# Define augmentation pipeline
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.4),
    PitchShift(min_semitones=-2, max_semitones=2, p=0.4),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.3)
])

# **SECTION 5: Configuration Class**

In [8]:
# SECTION 5: Configuration Class

class Config:
    model_name = "utpal07/wav2vec2-bangla-finetuned-large"
    
    # Audio processing
    sampling_rate = 16000
    max_duration = 10.0
    max_length = int(sampling_rate * max_duration)
    
    # Training hyperparameters
    batch_size = 2
    gradient_accumulation_steps = 8
    learning_rate = 5e-5
    num_epochs = 30
    weight_decay = 0.01
    warmup_steps = 500
    
    # Logging and checkpointing
    logging_steps = 100
    save_total_limit = 2
    save_strategy = "epoch"
    eval_strategy = "epoch"
    load_best_model_at_end = True
    metric_for_best_model = "wer"
    greater_is_better = False
    fp16 = True

config = Config()

# **SECTION 6: Utility Functions**

In [9]:
# SECTION 6: Utility Functions

def safe_normalize(audio):
    max_val = np.max(np.abs(audio))
    if max_val > 0:
        return audio / max_val
    return audio

In [10]:
#Load all training data from dialect folders

def load_dataset(base_path):
    
    audio_paths = []
    labels = []
    
    train_path = os.path.join(base_path, "Train")
    annotation_path = os.path.join(base_path, "Train_annotation")
    
    # Load annotations from CSV files
    for csv_file in os.listdir(annotation_path):
        if csv_file.endswith('.csv'):
            dialect = csv_file.replace('.csv', '')
            csv_path = os.path.join(annotation_path, csv_file)
            df = pd.read_csv(csv_path)
            
            for _, row in df.iterrows():
                audio_file = row['audio']
                text = row['text']
                
                # Construct full audio path
                dialect_folder = os.path.join(train_path, dialect)
                audio_path = os.path.join(dialect_folder, audio_file)
                
                if os.path.exists(audio_path):
                    audio_paths.append(audio_path)
                    labels.append(text)
    
    return audio_paths, labels

In [11]:
#Split data into train and validation sets

def prepare_data_for_training(audio_paths, labels, test_size=0.1):
    
    train_paths, val_paths, train_labels, val_labels = train_test_split(
        audio_paths, 
        labels, 
        test_size=test_size, 
        random_state=42, 
        shuffle=True
    )
    return train_paths, val_paths, train_labels, val_labels

# **SECTION 7: Custom Dataset Class**

In [12]:
class BanglaDataset(Dataset):
    
    def __init__(self, audio_paths, labels, processor, max_duration=10.0, 
                 sampling_rate=16000, augment_audio=False):
        self.audio_paths = audio_paths
        self.labels = labels
        self.processor = processor
        self.max_duration = max_duration
        self.sampling_rate = sampling_rate
        self.augment_audio = augment_audio
    
    def __len__(self):
        return len(self.audio_paths)
    
    def __getitem__(self, idx):
        audio_path = self.audio_paths[idx]
        label = self.labels[idx]
        
        # Load audio file
        speech_array, sr = sf.read(audio_path)
        
        # Resample if necessary
        if sr != self.sampling_rate:
            speech_array = librosa.resample(
                speech_array, 
                orig_sr=sr, 
                target_sr=self.sampling_rate
            )
        
        # Normalize audio
        speech_array = safe_normalize(speech_array)
        
        # Apply augmentation (only for training)
        if self.augment_audio:
            speech_array = augment(
                samples=speech_array, 
                sample_rate=self.sampling_rate
            )
        
        # Encode labels
        if isinstance(label, str) and len(label.strip()) > 0:
            labels_ids = self.processor.tokenizer(label).input_ids
        else:
            labels_ids = [self.processor.tokenizer.pad_token_id]
        
        # Process audio into model input
        inputs = self.processor(
            speech_array,
            sampling_rate=self.sampling_rate,
            return_tensors="pt",
            padding=True,
            max_length=int(self.max_duration * self.sampling_rate),
            truncation=True
        )
        
        return {
            "input_values": inputs.input_values[0],
            "attention_mask": inputs.attention_mask[0],
            "labels": torch.tensor(labels_ids, dtype=torch.long)
        }

# **SECTION 8: Data Collator for CTC**

In [13]:
class DataCollatorCTCManual:
    
    def __init__(self, processor, padding_value=0.0, label_padding_value=-100):
        self.processor = processor
        self.padding_value = padding_value
        self.label_padding_value = label_padding_value
    
    def __call__(self, batch):
        # Extract input values and labels
        input_values = [torch.tensor(x["input_values"]) for x in batch]
        labels = [torch.tensor(x["labels"]) for x in batch]
        
        # Pad sequences to max length in batch
        input_values_padded = pad_sequence(
            input_values, 
            batch_first=True, 
            padding_value=self.padding_value
        )
        labels_padded = pad_sequence(
            labels, 
            batch_first=True, 
            padding_value=self.label_padding_value
        )
        
        return {
            "input_values": input_values_padded, 
            "labels": labels_padded
        }

# **SECTION 9: Metrics Computation**

In [14]:
def compute_metrics(pred):
    
    wer_metric = evaluate.load("wer")
    
    pred_logits = pred.predictions
    pred_ids = pred_logits.argmax(-1)
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# **SECTION 10: Load and Prepare Data**

In [15]:
# Load and Prepare Data

# Set base path
base_path = "/kaggle/input/shobdotori"

# Load dataset
audio_paths, labels = load_dataset(base_path)

print("DATASET STATISTICS")
print(f"Total samples:     {len(audio_paths)}")
print(f"Sample audio path: {audio_paths[0]}")
print(f"Sample label:      {labels[0]}")

# Split into train and validation
train_paths, val_paths, train_labels, val_labels = prepare_data_for_training(
    audio_paths, labels, test_size=0.1
)

print(f"\nTraining samples:   {len(train_paths)}")
print(f"Validation samples: {len(val_paths)}")

DATASET STATISTICS
Total samples:     3350
Sample audio path: /kaggle/input/shobdotori/Train/Rajshahi/male_rajshahi_2.wav
Sample label:      তুমি কি নতুন বই পড়তে চাও?

Training samples:   3015
Validation samples: 335


# **SECTION 11: Initialize Model and Processor**

In [16]:
# Load tokenizer and feature extractor
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(config.model_name)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(config.model_name)

processor = Wav2Vec2Processor(
    tokenizer=tokenizer,
    feature_extractor=feature_extractor
)

In [17]:
# Load pre-trained model
model = Wav2Vec2ForCTC.from_pretrained(
    config.model_name,
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.1,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
    ignore_mismatched_sizes=True
)

In [18]:
# Freeze feature encoder (only fine-tune upper layers)
model.freeze_feature_encoder()

# **SECTION 12: Create Datasets**

In [19]:
# Create training dataset (with augmentation)
train_dataset = BanglaDataset(
    train_paths, 
    train_labels, 
    processor,
    augment_audio=True
)

# Create validation dataset (no augmentation)
val_dataset = BanglaDataset(
    val_paths, 
    val_labels, 
    processor,
    augment_audio=False
)

In [20]:
# Initialize data collator
data_collator = DataCollatorCTCManual(processor=processor)

# **SECTION 13: Training Configuration**

In [21]:
training_args = TrainingArguments(
    output_dir="./wav2vec2-bangla-checkpoints",
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    learning_rate=config.learning_rate,
    weight_decay=config.weight_decay,
    warmup_steps=config.warmup_steps,
    num_train_epochs=config.num_epochs,
    logging_steps=config.logging_steps,
    fp16=config.fp16,
    save_strategy=config.save_strategy,
    eval_strategy=config.eval_strategy,
    save_total_limit=config.save_total_limit,
    load_best_model_at_end=config.load_best_model_at_end,
    metric_for_best_model=config.metric_for_best_model,
    greater_is_better=config.greater_is_better,
    report_to=None,
    group_by_length=True
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# **SECTION 14: Initialize Trainer**

In [22]:
# SECTION 14: Initialize Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# **SECTION 15: Train Model**

In [23]:
# Start training
trainer.train()

Epoch,Training Loss,Validation Loss,Wer
1,No log,0.6178,0.329793
2,0.774100,0.642722,0.332588
3,0.698500,0.652963,0.333147
4,0.649000,0.645762,0.330911
5,0.642800,0.633509,0.333706
6,0.618400,0.632305,0.330911
7,0.586900,0.623879,0.338178
8,0.559200,0.649194,0.334824
9,0.552700,0.631729,0.335942
10,0.532100,0.64011,0.330911


Downloading builder script: 0.00B [00:00, ?B/s]

TrainOutput(global_step=2850, training_loss=0.5012007723356549, metrics={'train_runtime': 17072.6299, 'train_samples_per_second': 5.298, 'train_steps_per_second': 0.167, 'total_flos': 1.1396695447827616e+19, 'train_loss': 0.5012007723356549, 'epoch': 30.0})

#  **SECTION 16: Save Model**

In [24]:
# Save fine-tuned model
model_save_path = "/kaggle/working/wav2vec2-bangla-finetuned"
trainer.save_model(model_save_path)
processor.save_pretrained(model_save_path)

print(f"\nModel saved to {model_save_path}")


Model saved to /kaggle/working/wav2vec2-bangla-finetuned


# **SECTION 17: Evaluate Model (Validation Set)**

In [25]:
!pip install Levenshtein

import Levenshtein

# Get predictions on validation set
predictions = trainer.predict(val_dataset)
pred_ids = np.argmax(predictions.predictions, axis=-1)
pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)

# Decode labels
label_ids = predictions.label_ids
label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

# Calculate normalized Levenshtein similarity
similarities = []
for p, r in zip(pred_str, label_str):
    dist = Levenshtein.distance(p, r)
    max_len = max(len(p), len(r))
    sim = 1 - dist / max_len if max_len > 0 else 1.0
    similarities.append(sim)

print("EVALUATION RESULTS")
print(f"Normalized Levenshtein Similarity: {np.mean(similarities):.4f}")

Collecting Levenshtein
  Downloading levenshtein-0.27.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.7 kB)
Downloading levenshtein-0.27.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/153.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: Levenshtein
Successfully installed Levenshtein-0.27.3


EVALUATION RESULTS
Normalized Levenshtein Similarity: 0.9048


# **SECTION 18: Generate Test Predictions**

In [27]:
# Load saved model for inference
model_path = "/kaggle/working/wav2vec2-bangla-finetuned"
processor = Wav2Vec2Processor.from_pretrained(model_path)
model = Wav2Vec2ForCTC.from_pretrained(model_path).to("cuda")


def load_and_prepare_audio(audio_path, sampling_rate=16000):
    audio, sr = sf.read(audio_path)
    
    # Convert stereo to mono if needed
    if len(audio.shape) > 1:
        audio = np.mean(audio, axis=1)
    
    # Resample if necessary
    if sr != sampling_rate:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
    
    # Normalize
    max_val = np.max(np.abs(audio))
    return audio / max_val if max_val > 0 else np.zeros_like(audio)


def transcribe_audio(audio_path):
    audio_input = load_and_prepare_audio(audio_path)
    inputs = processor(
        audio_input, 
        sampling_rate=16000, 
        return_tensors="pt", 
        padding=True
    )
    
    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda")).logits
    
    pred_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
    
    return transcription.strip()


# Process test set
test_dir = os.path.join(base_path, "Test")
test_files = sorted([f for f in os.listdir(test_dir) if f.endswith(".wav")])

predictions = []
for test_file in tqdm(test_files, desc="Transcribing Test Set"):
    audio_path = os.path.join(test_dir, test_file)
    text = transcribe_audio(audio_path)
    predictions.append({"audio": test_file, "text": text})

Transcribing Test Set:   0%|          | 0/450 [00:00<?, ?it/s]

# **SECTION 19: Save Submission**

In [28]:
# Save predictions to CSV
save_path = "/kaggle/working/finalsubmission.csv"
pd.DataFrame(predictions).to_csv(save_path, index=False, encoding="utf-8")

print(f" Submission saved to {save_path}")
print(f" Total predictions: {len(predictions)}")

 Submission saved to /kaggle/working/finalsubmission.csv
 Total predictions: 450
