In [None]:
import os
import torch
import pandas as pd
import librosa
import numpy as np
from torch.utils.data import Dataset
from transformers import AutoFeatureExtractor, Wav2Vec2ForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Root of the project checkout; project-relative paths are built from it.
PROJECT_ROOT = "C:/dev/voicevibe"
# The joined component must be relative: a leading "/" makes os.path.join
# throw away PROJECT_ROOT and resolve from the drive/filesystem root instead.
METADATA_PATH = os.path.join(PROJECT_ROOT, "data/processed/metadata.csv")

In [None]:
# Directory holding one sub-folder of audio clips per emotion.
AUDIO_BASE_PATH = "C:/dev/archive/Emotions"
if not os.path.isdir(AUDIO_BASE_PATH):
    # Stop here with a clear message instead of continuing and failing later
    # with an unrelated error once the audio root is first used.
    raise FileNotFoundError(f"Audio directory not found at {AUDIO_BASE_PATH}")
print("Using HIGH-SPEED local disk for audio data.")

In [None]:
# Hub identifier of the pretrained checkpoint that gets fine-tuned.
MODEL_NAME = "facebook/wav2vec2-base-960h"
# Directory handed to the trainer as its output location.
OUTPUT_DIR = "C:/dev/voicevibe/models/wav2vec2-finetuned"

In [None]:
# Set the WANDB_DISABLED environment switch before any training code runs.
os.environ["WANDB_DISABLED"] = "true"

# Echo the resolved locations so a misconfigured path is visible right away.
print(
    "Starting training script...",
    f"Metadata: {METADATA_PATH}",
    f"Audio: {AUDIO_BASE_PATH}",
    sep="\n",
)

In [None]:
# Read the metadata table, failing with an explicit message when it is absent.
if os.path.exists(METADATA_PATH):
    df = pd.read_csv(METADATA_PATH)
else:
    raise FileNotFoundError(f"Metadata not found at {METADATA_PATH}")

def clean_filename(path):
    """Return the last path component of ``path``.

    The value is converted with ``str()`` and backslashes are treated as
    separators, so Windows-style and POSIX-style paths give the same result.
    """
    normalised = str(path).replace('\\', '/')
    # Everything after the final separator (the whole string if there is none).
    return normalised.rsplit('/', 1)[-1]

# Derive a bare file name for every row from its recorded path.
df['filename'] = df['path'].map(clean_filename)
print(f"Loaded {len(df)} rows. Filenames cleaned.")

In [None]:

class AudioDataset(Dataset):
    """Map-style dataset yielding one feature-extracted audio clip per metadata row.

    Every row of ``dataframe`` must provide an ``emotion`` and a ``filename``
    value; the clip is read from ``<audio_root>/<Emotion>/<filename>`` where
    ``<Emotion>`` is the capitalised emotion name.

    Args:
        dataframe: Metadata rows to serve. The index is reset so items are
            addressed by position.
        audio_root: Directory containing one sub-folder per emotion.
        feature_extractor: Callable that turns a waveform into model inputs;
            its result must expose ``input_values``.
        label_map: Mapping from the values of the ``emotion`` column to
            integer class ids.
    """

    # Sampling rate used both for decoding and for feature extraction.
    SAMPLE_RATE = 16000

    # Alternative on-disk spellings of an emotion folder. They are consulted
    # only when the clip is missing under the primary folder name, so existing
    # layouts keep resolving exactly as before.
    _FOLDER_ALIASES = {"Surprised": "Suprised", "Suprised": "Surprised"}

    def __init__(self, dataframe, audio_root, feature_extractor, label_map):
        self.df = dataframe.reset_index(drop=True)
        self.audio_root = audio_root
        self.feature_extractor = feature_extractor
        self.label_map = label_map

    def __len__(self):
        """Return the number of metadata rows."""
        return len(self.df)

    def _resolve_audio_path(self, row):
        """Return the clip path for ``row``, trying a known folder alias if needed."""
        folder = row['emotion'].capitalize()
        primary = os.path.join(self.audio_root, folder, row['filename'])
        alias = self._FOLDER_ALIASES.get(folder)
        if alias is not None and not os.path.exists(primary):
            alternative = os.path.join(self.audio_root, alias, row['filename'])
            if os.path.exists(alternative):
                return alternative
        return primary

    def _load_example(self, audio_path, emotion):
        """Decode ``audio_path`` and build the ``input_values``/``labels`` pair."""
        # librosa resamples to SAMPLE_RATE while decoding.
        speech, _ = librosa.load(audio_path, sr=self.SAMPLE_RATE)
        inputs = self.feature_extractor(
            speech,
            sampling_rate=self.SAMPLE_RATE,
            padding=True,
            return_tensors="pt"
        )
        return {
            # Drop the leading batch axis added by return_tensors="pt".
            "input_values": inputs.input_values.squeeze(0),
            "labels": torch.tensor(self.label_map[emotion], dtype=torch.long),
        }

    def __getitem__(self, idx):
        """Return the example at ``idx``, falling back to later rows on failure.

        A clip that cannot be loaded is reported on stdout and the following
        row (wrapping around) is tried instead. Each row is attempted at most
        once, so a dataset in which nothing loads ends with an error rather
        than recursing forever.

        Returns:
            dict with ``input_values`` (1-D tensor) and ``labels`` (scalar
            ``torch.long`` tensor).

        Raises:
            IndexError: ``idx`` is outside the dataset.
            RuntimeError: no row of the dataset could be loaded.
        """
        total = len(self)
        current = idx
        for _ in range(max(total, 1)):
            # Row lookup stays outside the try block so that an out-of-range
            # index surfaces as IndexError instead of triggering the fallback.
            row = self.df.iloc[current]
            audio_path = self._resolve_audio_path(row)
            try:
                return self._load_example(audio_path, row['emotion'])
            except Exception as e:
                # Best effort: decoding can fail in many backend-specific ways.
                print(f"Error loading {audio_path}: {e}")
                current = (current + 1) % total
        raise RuntimeError(
            f"Could not load any audio file under {self.audio_root} "
            f"(tried {total} rows starting at index {idx})"
        )

In [None]:
# Class names in sorted order; a class id is the position in this list.
emotions = sorted(df['emotion'].unique())
# Forward (name -> id) and reverse (id -> name) lookups over the same ordering.
label_map = dict(zip(emotions, range(len(emotions))))
id2label = dict(enumerate(emotions))
print(f"Labels: {label_map}")

def compute_metrics(p):
    """Compute accuracy and weighted F1 for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; argmax is taken
            over axis 1) and ``label_ids`` (reference class ids).

    Returns:
        dict with the keys ``accuracy`` and ``f1``.
    """
    y_true = p.label_ids
    # Highest-scoring class per sample.
    y_pred = np.argmax(p.predictions, axis=1)
    accuracy = accuracy_score(y_true, y_pred)
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    return {'accuracy': accuracy, 'f1': weighted_f1}

In [None]:
# Preprocessing that matches the pretrained checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)

# One dataset per split, selected through the metadata 'split' column.
train_dataset, val_dataset = (
    AudioDataset(
        df[df['split'] == split_name],
        AUDIO_BASE_PATH,
        feature_extractor,
        label_map,
    )
    for split_name in ('train', 'val')
)

# Load the pretrained checkpoint with a classification head sized to the
# emotion classes; both label lookups are passed to from_pretrained.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    MODEL_NAME,
    label2id=label_map,
    id2label=id2label,
    num_labels=len(emotions),
    ignore_mismatched_sizes=True,
)
# Keep the convolutional feature encoder fixed to speed up training.
model.freeze_feature_encoder()

# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Evaluate and checkpoint on the same schedule so the best checkpoint can
    # be restored by load_best_model_at_end.
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # effective batch of 16 samples per device
    num_train_epochs=5,
    learning_rate=3e-5,
    # NOTE(review): on a small training split 500 warmup steps may exceed the
    # total number of optimizer steps -- confirm against the dataset size.
    warmup_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Mixed precision needs a CUDA device; enabling it unconditionally makes
    # TrainingArguments fail on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# Wire model, data, preprocessing and metrics into the training loop.
# NOTE(review): newer transformers releases appear to rename the `tokenizer`
# argument to `processing_class` -- confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=feature_extractor,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
print("Starting Training...")
trainer.train()
# Training alone only leaves per-epoch checkpoint sub-folders behind. Write
# the model restored by load_best_model_at_end to the top level of OUTPUT_DIR
# so the path reported below can be loaded directly.
trainer.save_model(OUTPUT_DIR)
print(f"Training Complete! Model saved to {OUTPUT_DIR}")
