In [None]:
# Session setup: enable IPython's autoreload extension.
# GPU note: this cell does not select a GPU; the runtime type is chosen in the
# Colab UI (Runtime ▷ Change runtime type ▷ GPU).
%load_ext autoreload
# Mode 2 asks autoreload to re-import modules before each execution, so edits
# to the project's .py files are picked up without restarting the kernel.
%autoreload 2


In [None]:
# Mount Google Drive and declare the project directories that live on it.
from google.colab import drive

# First run in a session asks for authorization (→ enter OAuth code).
drive.mount('/content/drive')

# Root of the project on Drive; the other paths hang off it and are read by
# later cells (download target, HF cache, checkpoint location).
DRIVE_ROOT = '/content/drive/MyDrive/dlfa_capstone'
DATA_ROOT = DRIVE_ROOT + '/meld_data'
CKPT_ROOT = DRIVE_ROOT + '/checkpoints'
    


In [None]:
#Clone or Pull Repo
import pathlib

REPO_URL = "https://github.com/mehulbhardwaj/emotion-classifier-capstone.git"
REPO_DIR = "/content/emotion-classifier-capstone"

if pathlib.Path(REPO_DIR).exists():
    %cd $REPO_DIR
    !git pull --quiet
else:
    !git clone $REPO_URL $REPO_DIR --quiet
    %cd $REPO_DIR


In [None]:
# CELL TO DOWNLOAD MELD RAW DATA (CSVs and MP4s)
# This cell uses scripts/download_meld_dataset.py to fetch the complete


print("Starting MELD Raw Dataset Download Process...")

DRIVE_RAW_MELD_PATH = f"{DRIVE_ROOT}/meld_data/raw/" 

print(f"Target directory for raw MELD data (CSVs, MP4s): {DRIVE_RAW_MELD_PATH}")

# Create the target directory if it doesn't exist to avoid issues with the script
import os
os.makedirs(DRIVE_RAW_MELD_PATH, exist_ok=True)

# Run the download script.
# This script handles downloading the main MELD.Raw.tar.gz, extracting its contents (CSVs, video tarballs),
# and then extracting the MP4 videos from those inner tarballs.
# You can add --force_download_main or --force_extract_videos if you need to re-run parts of the process.
%cd /content/emotion-classifier-capstone
!python scripts/download_meld_dataset.py --data_dir "{DRIVE_RAW_MELD_PATH}"

print("\n---------------------------------------------------------------------")
print("MELD Raw Data Download and Initial Extraction Process Finished.")
print(f"All downloaded and extracted raw data should be available in: {DRIVE_RAW_MELD_PATH}")
print("1. Convert these MP4 videos to WAV audio files (e.g., using a script like preprocess_meld.py).")


In [None]:
#Cell 3 – Install dependencies via pip
!pip install -U pip wheel
!pip install -q -r requirements.txt

# Cache HF models & datasets on Drive
import os
os.environ["HF_HOME"]        = f"{DRIVE_ROOT}/hf_cache"
os.environ["HF_DATASETS_CACHE"] = f"{DRIVE_ROOT}/hf_cache"


In [None]:
# Cell 4 (revised) – generate the Colab-specific YAML config for mlp_fusion.
# Writes configs/mlp_fusion_colab.yaml (relative to the current working
# directory, i.e. the repository root) and echoes its content.
import pathlib
import textwrap

# DRIVE_ROOT, DATA_ROOT and CKPT_ROOT are intentionally NOT redefined here; the
# values from the Drive-mount cell are reused. A previous copy of this cell set
# DATA_ROOT to '<DRIVE_ROOT>/datasets/meld_data', which disagreed with both the
# Drive-mount cell ('<DRIVE_ROOT>/meld_data') and the download cell (which
# writes under '<DRIVE_ROOT>/meld_data/raw/').
# If your prepared dataset really lives elsewhere, change DATA_ROOT in the
# Drive-mount cell so every cell sees the same value.

# NOTE(review): learning_rate below is 0.00003 (3e-5) while the training cell
# passes --learning_rate 3e-4 on the command line. The YAML text is left
# untouched; decide which value is intended and make the two agree.
colab_mlp_fusion_yaml_content = f"""
architecture_name: "mlp_fusion"
experiment_name: "mlp_fusion_colab_run" # Specific experiment name for this run

paths:
  data_root: {DATA_ROOT} # Points to your Drive path
  output_dir_root: {CKPT_ROOT} # Points to your Drive path

# --- General Project Settings ---
dataset_name: "meld"
input_mode: "audio_text" # Assuming this is what mlp_fusion uses
random_seed: 42
device_name: "cuda" # Colab provides GPU
# num_dataloader_workers: 2 # Set a reasonable default for Colab

# --- Data Preparation (Ensure these are false if using existing dataset) ---
run_mp4_to_wav_conversion: false
run_hf_dataset_creation: false

# --- Training Parameters (copy from your local mlp_fusion_default.yaml or adjust) ---
num_epochs: 1 # As per your Cell 5
batch_size: 16 # As per your Cell 5
learning_rate: 0.00003 # Adjusted from your 3e-4, check if this is intended
optimizer_name: "AdamW"
lr_scheduler_name: "linear" # Or null if not used
# ... other training params from your local mlp_fusion_default.yaml ...

# --- Dataset Specific Limits ---
limit_dialogues_train: 50 # As per your Cell 5
limit_dialogues_dev: 10   # As per your Cell 5
limit_dialogues_test: 10  # As per your Cell 5

# --- MLP Fusion Specific Parameters (Copy from your local mlp_fusion_default.yaml) ---
text_encoder_model_name: "distilroberta-base"
audio_encoder_model_name: "facebook/wav2vec2-base-960h"
mlp_hidden_size: 768 # Or whatever your model expects
mlp_dropout_rate: 0.1 # Or whatever your model expects

# Add any other necessary parameters from your local mlp_fusion_default.yaml
"""

colab_config_path = pathlib.Path("configs/mlp_fusion_colab.yaml")
# Create configs/ if it is missing so write_text cannot fail on a fresh checkout.
colab_config_path.parent.mkdir(parents=True, exist_ok=True)
colab_config_path.write_text(colab_mlp_fusion_yaml_content)
print(f"Generated Colab config at: {colab_config_path}")
print("--- Config Content ---")
print(colab_mlp_fusion_yaml_content)



In [None]:
# Cell 5 – Run (or resume) training
# ────────────────────────────────────────────────────────────────────────────
# Launches main.py in training mode with configs/mlp_fusion_colab.yaml. Both
# paths are relative, so the working directory must be the repository root.
# Per the original author's note, the run picks up `last.ckpt` if present and
# writes new checkpoints every 500 steps; that logic lives in main.py and is
# not visible from this notebook (TODO confirm).
# NOTE(review): most flags below repeat values already in the YAML config, but
# two differ: --learning_rate (3e-4 here, 0.00003 in the YAML) and
# --experiment_name (mlp_fusion here, mlp_fusion_colab_run in the YAML).
# Presumably the command-line values take precedence — verify against main.py.
# ────────────────────────────────────────────────────────────────────────────
!python main.py \
   --config_file configs/mlp_fusion_colab.yaml \
   --architecture mlp_fusion \
   --train_model \
   --num_epochs 1 \
   --limit_dialogues_train 50 \
   --limit_dialogues_dev 10 \
   --limit_dialogues_test 10 \
   --batch_size 16 \
   --learning_rate 3e-4 \
   --experiment_name mlp_fusion


In [None]:
#Cell 6 – Validation / Inference
!python main.py \
   --config_file configs/path_colab.yaml \
   --evaluate_model {CKPT_ROOT}/mlp_fusion/last.ckpt
