In [None]:
import os
import torch
import pandas as pd
import librosa
import numpy as np
from torch.utils.data import Dataset
from transformers import AutoFeatureExtractor, Wav2Vec2ForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import shutil

In [None]:
# --- Pretrained checkpoint to fine-tune ---
MODEL_NAME = "facebook/wav2vec2-base-960h"

# --- Project folder and the files derived from it ---
# (the path suggests a mounted Google Drive; confirm the mount step runs first)
DRIVE_PROJECT_ROOT = "/content/drive/MyDrive/Colab_VigilAudio"
METADATA_PATH = os.path.join(DRIVE_PROJECT_ROOT, "metadata.csv")
OUTPUT_DIR = os.path.join(DRIVE_PROJECT_ROOT, "wav2vec2-finetuned")

# --- Paths on the local runtime disk ---
LOCAL_DATA_PATH = "/content/Emotions"
LOCAL_OUTPUT = "/content/wav2vec2-finetuned"

# Set before training starts; presumably switches off Weights & Biases logging.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
def setup_data(drive_root=None, local_path=None):
    """Make sure the audio dataset is available at ``local_path``.

    If ``local_path`` already exists nothing is done. Otherwise the
    ``Emotions`` folder under ``drive_root`` is copied there. The copy is
    written to a sibling ``<local_path>.partial`` directory first and renamed
    only once it has finished, so an interrupted copy is never mistaken for a
    complete dataset on the next run.

    Args:
        drive_root: Directory containing the ``Emotions`` folder. Defaults to
            ``DRIVE_PROJECT_ROOT``.
        local_path: Destination directory for the dataset. Defaults to
            ``LOCAL_DATA_PATH``.

    Returns:
        None. Progress is reported with ``print``. A missing source folder is
        reported but does not raise.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth for callers that pass nothing.
    drive_root = DRIVE_PROJECT_ROOT if drive_root is None else drive_root
    local_path = LOCAL_DATA_PATH if local_path is None else local_path

    if os.path.exists(local_path):
        print("Data already exists on local disk.")
        return

    drive_data = os.path.join(drive_root, "Emotions")
    if not os.path.exists(drive_data):
        # Neither location has the data; say so instead of claiming a local
        # copy exists (this branch is only reached when it does not).
        print(
            f"Drive data not found at {drive_data} and no local copy at {local_path}; "
            "audio loading will fail until the dataset is placed there."
        )
        return

    print("Copying data to local disk (this takes ~3 mins)...")
    staging = os.path.normpath(local_path) + ".partial"
    if os.path.exists(staging):
        # Leftover from a previously interrupted copy.
        shutil.rmtree(staging)
    shutil.copytree(drive_data, staging)
    os.rename(staging, local_path)
    print("Data copy complete.")

In [None]:
class AudioDataset(Dataset):
    """Dataset that turns metadata rows into model-ready audio examples.

    Each row of ``dataframe`` must provide a ``'path'`` column (only its file
    name is used; backslashes are treated as separators) and an ``'emotion'``
    column. The audio file is looked up at
    ``<audio_root>/<Emotion capitalised>/<file name>``, loaded at 16 kHz with
    ``librosa`` and passed through ``feature_extractor``.

    Args:
        dataframe: Metadata rows; the index is reset internally.
        audio_root: Directory containing one sub-folder per emotion.
        feature_extractor: Callable invoked as
            ``feature_extractor(speech, sampling_rate=16000, padding=True,
            return_tensors="pt")`` whose result exposes ``input_values``.
        label_map: Mapping from the raw ``'emotion'`` value to an integer id.
    """

    # The emotion folder may be spelled either way on disk; when the folder
    # derived from the metadata is missing, the other spelling is tried.
    _FOLDER_ALIASES = {"Surprised": "Suprised", "Suprised": "Surprised"}

    def __init__(self, dataframe, audio_root, feature_extractor, label_map):
        self.df = dataframe.reset_index(drop=True)
        self.audio_root = audio_root
        self.feature_extractor = feature_extractor
        self.label_map = label_map
        # Indices whose failure has already been reported (avoids repeating
        # the same message every epoch).
        self._reported_failures = set()

    def __len__(self):
        return len(self.df)

    def _resolve_audio_path(self, row):
        """Return the on-disk path for ``row``, trying the alternate folder spelling."""
        filename = os.path.basename(row['path'].replace('\\', '/'))
        folder = row['emotion'].capitalize()
        audio_path = os.path.join(self.audio_root, folder, filename)
        alias = self._FOLDER_ALIASES.get(folder)
        if alias is not None and not os.path.exists(audio_path):
            alias_path = os.path.join(self.audio_root, alias, filename)
            if os.path.exists(alias_path):
                return alias_path
        return audio_path

    def _load_example(self, idx):
        """Load and featurise the single row at position ``idx`` (may raise)."""
        row = self.df.iloc[idx]
        audio_path = self._resolve_audio_path(row)
        speech, _ = librosa.load(audio_path, sr=16000)
        inputs = self.feature_extractor(speech, sampling_rate=16000, padding=True, return_tensors="pt")
        return {
            "input_values": inputs.input_values.squeeze(0),
            "labels": torch.tensor(self.label_map[row['emotion']], dtype=torch.long),
        }

    def __getitem__(self, idx):
        """Return the example at ``idx``, falling forward past unreadable rows.

        If the requested row cannot be loaded, the following rows are tried
        in order (wrapping around) so that one corrupt file does not abort
        training. Every row is tried at most once.

        Raises:
            IndexError: ``idx`` is outside ``[-len(self), len(self))``.
            RuntimeError: no row in the dataset could be loaded; the last
                underlying error is attached as ``__cause__``.
        """
        total = len(self)
        if not -total <= idx < total:
            # Required for the sequence protocol: iteration stops on IndexError.
            raise IndexError(f"index {idx} is out of range for dataset of size {total}")

        last_error = None
        for offset in range(total):
            candidate = (idx + offset) % total
            try:
                return self._load_example(candidate)
            except Exception as exc:  # best-effort: skip rows that fail to load
                last_error = exc
                if candidate not in self._reported_failures:
                    self._reported_failures.add(candidate)
                    print(f"Warning: skipping sample {candidate}: {exc!r}")

        raise RuntimeError(
            f"None of the {total} samples could be loaded from {self.audio_root}"
        ) from last_error

def compute_metrics(p):
    """Score an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, reduced with
            ``argmax`` over axis 1) and ``label_ids`` (reference class ids).

    Returns:
        dict with ``'accuracy'`` and weighted-average ``'f1'``.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    true_ids = p.label_ids

    metrics = {}
    metrics['accuracy'] = accuracy_score(true_ids, predicted_ids)
    metrics['f1'] = f1_score(true_ids, predicted_ids, average='weighted')
    return metrics

In [None]:


# --- 1. Make sure the audio files are on the local disk ---------------------
setup_data()

# --- 2. Load metadata and build the label vocabulary ------------------------
if not os.path.exists(METADATA_PATH):
    # Stop the cell here: nothing below can run without the metadata file.
    raise FileNotFoundError(f"Error: Metadata not found at {METADATA_PATH}")

df = pd.read_csv(METADATA_PATH)
emotions = sorted(df['emotion'].unique())
label_map = {name: i for i, name in enumerate(emotions)}
id2label = {i: name for name, i in label_map.items()}

train_df = df[df['split'] == 'train']
val_df = df[df['split'] == 'val']
if train_df.empty or val_df.empty:
    # Fail early with a clear message rather than deep inside the Trainer.
    raise ValueError(
        f"Expected non-empty 'train' and 'val' splits in {METADATA_PATH}; "
        f"got {len(train_df)} train and {len(val_df)} val rows."
    )

# --- 3. Datasets and model ---------------------------------------------------
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
train_ds = AudioDataset(train_df, LOCAL_DATA_PATH, feature_extractor, label_map)
val_ds = AudioDataset(val_df, LOCAL_DATA_PATH, feature_extractor, label_map)

model = Wav2Vec2ForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(emotions),
    id2label=id2label,
    label2id=label_map,
    ignore_mismatched_sizes=True,
)
model.freeze_feature_encoder()

# --- 4. Training configuration ----------------------------------------------
training_args = TrainingArguments(
    output_dir="/content/checkpoints",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    learning_rate=3e-5,
    warmup_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Mixed precision is only requested when a CUDA device is present, so the
    # cell still runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # NOTE(review): newer transformers releases name this argument
    # `processing_class`; confirm against the installed version.
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# --- 5. Train, save, and package --------------------------------------------
print("Starting Training (Restored Logic)...")
trainer.train()

print("Saving final model locally...")
trainer.save_model(LOCAL_OUTPUT)
# NOTE(review): OUTPUT_DIR is defined in the config cell but nothing is written
# to it here; the model exists only under LOCAL_OUTPUT and the zip below.
# Confirm that is intended.

print("Zipping for download...")
shutil.make_archive("/content/wav2vec2_model", 'zip', LOCAL_OUTPUT)
print("DONE! Please download /content/wav2vec2_model.zip")


