# ML-SUPERB Pipeline

## Install necessary packages

In [None]:
## Uncomment if using Colab
# !git clone https://github.com/olijacklu/MLSuperb-Project.git

# !cp -r /content/MLSuperb-Project/requirements.txt /content/
# !cp -r /content/MLSuperb-Project/config/ /content/
# !cp -r /content/MLSuperb-Project/data/ /content/
# !cp -r /content/MLSuperb-Project/evaluation/ /content/
# !cp -r /content/MLSuperb-Project/models/ /content/
# !cp -r /content/MLSuperb-Project/training/ /content/

In [None]:
!pip install -r requirements.txt

## Import libraries and methods

In [None]:
import os
import json
from tqdm.notebook import tqdm
import torch

from config.config import TRAIN_PAIRS, TORCH_DEFAULT_TYPE
from data.preprocess import preprocess_data
from models.utils import load_model, clean_memory
from training.monolingual import train_and_evaluate_monolingual
from training.multilingual import train_and_evaluate_multilingual
from evaluation.test import test_model
from evaluation.analysis import analyze_layer_weights

## Set base directory (e.g. the current working directory; it must be the directory in which the data folder is stored)

In [None]:
# Important: set this to the directory where the data is stored and where
# any results should be saved.
base_dir = '/content/drive/MyDrive/MVA/NLP/AlgorithmsSpeechNLP'

# Apply the project-wide default dtype from the config module.
torch.set_default_dtype(TORCH_DEFAULT_TYPE)

# Use CUDA when PyTorch reports it as available; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

## Preprocess data (creates JSON for easy lookup of data)

In [None]:
# Build the dataset lookup and write it to a JSON file under base_dir.
datasets = preprocess_data()

lookup_path = f'{base_dir}/ml_superb_dataset.json'
with open(lookup_path, 'w') as f:
    json.dump(datasets, f, indent=2)

# Report the number of entries in the lookup.
pair_count = len(datasets)
print(f"Found {pair_count} language-source pairs")

## Open JSON lookup file

In [None]:
# Read the JSON lookup back from base_dir.
lookup_path = f'{base_dir}/ml_superb_dataset.json'
with open(lookup_path, 'r') as f:
    datasets = json.load(f)

# Report the number of entries that were loaded.
pair_count = len(datasets)
print(f"Loaded {pair_count} language-source pairs")

## Specify whether you want to apply LoRA and/or quantize the upstream model

In [None]:
# Switches for applying LoRA and for quantizing the upstream model.
lora = False
quantize = False

# The LoRA settings are only built when LoRA is switched on; otherwise the
# config is None.
if lora:
    lora_config = {
        'r': 16,
        'lora_alpha': 32,
        'target_modules': ["k_proj", "q_proj", "v_proj"],
        'lora_dropout': 0.1,
        'bias': "none",
    }
else:
    lora_config = None

## Choose the upstream model you want to use and load it

In [None]:
# Name of the upstream model handed to load_model; swap in the commented
# line below to use the alternative model instead.
model_name = "facebook/hubert-base-ls960"
# model_name = "facebook/wav2vec2-xls-r-300m"

# load_model returns the upstream model together with its feature extractor.
upstream_model, feature_extractor = load_model(
    model_name,
    device=device,
    quantize=quantize,
)

## Choose which experiments you want to run

In [None]:
# Gates the monolingual ASR loop over TRAIN_PAIRS.
run_monolingual = False

# Each flag gates one multilingual task: "asr", "lid" and "asr+lid".
run_multilingual_asr = False
run_multilingual_lid = False
run_multilingual_joint = False

## Monolingual experiments (ASR)

In [None]:
# Monolingual experiments: one ASR model per entry in TRAIN_PAIRS.
# Both dictionaries are keyed by language and stay empty when the
# experiment is disabled.
models_by_language, monolingual_results = {}, {}

if run_monolingual:
    # Last path component of the model name, reused in every output file name.
    model_tag = model_name.split('/')[-1]

    progress = tqdm(TRAIN_PAIRS.items(), desc="Monolingual Experiments")
    for lang, data_pair in progress:
        print(f"\nRunning Monolingual ASR for {lang} ({data_pair})")

        # Train and evaluate on this language / data pair.
        model, results, char_mappings = train_and_evaluate_monolingual(
            lang=lang,
            data_pair=data_pair,
            upstream_model=upstream_model,
            feature_extractor=feature_extractor,
            datasets=datasets,
            device=device,
            lora_config=lora_config,
            quantize=quantize,
        )

        # Run the test step with the character mappings from training.
        test_model(
            model=model,
            feature_extractor=feature_extractor,
            datasets=datasets,
            char_mappings=char_mappings,
            model_type="monolingual",
            data_pair=data_pair,
            device=device,
        )

        # Keep the results and the model; the models are needed for the
        # layer-weight analysis after the loop.
        monolingual_results[lang] = results
        models_by_language[lang] = model

        # Save this language's weights under base_dir.
        checkpoint_path = os.path.join(base_dir, f"{model_tag}_monolingual_asr_{lang}.pt")
        torch.save(model.state_dict(), checkpoint_path)

        clean_memory()

    # Analyse layer weights across all trained models and save the figure.
    analyze_layer_weights(
        models_by_language,
        title=f"{model_tag.capitalize()} Monolingual Layer Weights",
        save_path=os.path.join(base_dir, f"{model_tag}_monolingual_layer_weights.png"),
    )

    # Write the per-language results to a JSON file.
    results_path = os.path.join(base_dir, f"{model_tag}_monolingual_results.json")
    with open(results_path, 'w') as f:
        json.dump(monolingual_results, f, indent=2)

## Multilingual experiments (ASR, LID, ASR + LID)

In [None]:
# Multilingual experiments
# Both dictionaries are keyed by task name ("asr", "lid", "asr+lid") and only
# receive entries for the tasks that are enabled.
multilingual_results = {}
multilingual_models = {}


def _run_multilingual_task(task, banner):
    """Train, test, record and checkpoint one multilingual model.

    The three multilingual tasks run the same sequence of steps, so it lives
    here once instead of being copied per task.

    Args:
        task: Task key ("asr", "lid" or "asr+lid"). It is forwarded to the
            training and test helpers, used as the key in
            ``multilingual_results`` / ``multilingual_models``, and embedded
            in the checkpoint file name.
        banner: Message printed before training starts.

    Returns:
        The ``(model, results, char_mappings)`` triple returned by
        ``train_and_evaluate_multilingual``.
    """
    print(banner)
    model, results, char_mappings = train_and_evaluate_multilingual(
        upstream_model=upstream_model,
        feature_extractor=feature_extractor,
        datasets=datasets,
        task=task,
        device=device,
        lora_config=lora_config,
        quantize=quantize,
    )

    test_model(
        model=model,
        feature_extractor=feature_extractor,
        datasets=datasets,
        char_mappings=char_mappings,
        model_type="multilingual",
        task=task,
        device=device,
    )

    # Keep the model referenced: the layer-weight analysis at the end of the
    # cell needs every trained model.
    multilingual_results[task] = results
    multilingual_models[task] = model

    checkpoint_name = f"{model_name.split('/')[-1]}_multilingual_{task}.pt"
    torch.save(model.state_dict(), os.path.join(base_dir, checkpoint_name))

    clean_memory()

    return model, results, char_mappings


if run_multilingual_asr:
    # ASR task
    asr_model, asr_results, asr_char_mappings = _run_multilingual_task(
        "asr", "\nRunning Multilingual ASR"
    )

if run_multilingual_lid:
    # LID task
    lid_model, lid_results, lid_char_mappings = _run_multilingual_task(
        "lid", "\nRunning LID"
    )

if run_multilingual_joint:
    # ASR+LID task
    joint_model, joint_results, joint_char_mappings = _run_multilingual_task(
        "asr+lid", "\nRunning joint ASR+LID"
    )

if run_multilingual_asr or run_multilingual_lid or run_multilingual_joint:
    # Last path component of the model name, reused in the output file names.
    model_tag = model_name.split('/')[-1]

    # Analyse layer weights across the trained models and save the figure.
    analyze_layer_weights(
        multilingual_models,
        title=f"{model_tag.capitalize()} Multilingual Layer Weights",
        save_path=os.path.join(base_dir, f"{model_tag}_multilingual_layer_weights.png")
    )

    # Write the per-task results to a JSON file.
    with open(os.path.join(base_dir, f"{model_tag}_multilingual_results.json"), 'w') as f:
        json.dump(multilingual_results, f, indent=2)