## Setting up Conda on Colab to create a custom environment

In [1]:
# Install condacolab into the Colab runtime and use it to bootstrap a conda
# (Miniforge) distribution.
# NOTE: condacolab.install() restarts the kernel (see the recorded output of
# this cell), so keep this in its own cell and run the following cells only
# after the restart has finished.
!pip install -q condacolab
import condacolab
condacolab.install()

‚è¨ Downloading https://github.com/jaimergp/miniforge/releases/download/24.11.2-1_colab/Miniforge3-colab-24.11.2-1_colab-Linux-x86_64.sh...
üì¶ Installing...
üìå Adjusting configuration...
ü©π Patching environment...
‚è≤ Done in 0:00:07
üîÅ Restarting kernel...


## Creating the custom environment

In [None]:
# Build a separate Python 3.8 conda environment named 'w2v_hpc' and install the
# inference dependencies into it. The notebook kernel itself is not modified;
# everything below is installed through the environment's own pip.
print("--- CREATING ENVIRONMENT 'w2v_hpc' ---")
!conda create --name w2v_hpc python=3.8 -y

# Interpreter and pip of the new environment (PY38 is used later to launch the
# inference script).
PY38 = "/usr/local/envs/w2v_hpc/bin/python"
PIP38 = "/usr/local/envs/w2v_hpc/bin/pip"

# Downgrade pip inside the conda environment to allow installing old libraries.
!{PIP38} install "pip<24.0"

print("\n--- INSTALLING DEPENDENCIES ---")
# PyTorch wheels are taken from the CUDA 11.8 index.
!{PIP38} install torch torchaudio --index-url https://download.pytorch.org/whl/cu118
!{PIP38} install soundfile librosa tqdm
# Exact pins; presumably the versions the fairseq revision below expects -
# TODO confirm before changing them.
!{PIP38} install "omegaconf==2.0.6" "hydra-core==1.0.7" "antlr4-python3-runtime==4.8"
!{PIP38} install "numpy<1.24"


print("\n--- CLONING & INSTALLING FAIRSEQ (TEMP LOCATION) ---")
# Start from an empty build directory so that re-running this cell does not
# fail on an existing clone.
!rm -rf /content/temp_build
!mkdir -p /content/temp_build
# NOTE: %cd changes the notebook's working directory, and it stays changed
# after this cell finishes.
%cd /content/temp_build

!git clone https://github.com/Open-Speech-EkStep/fairseq
%cd fairseq

# Pin fairseq to one specific commit so the installed code does not drift.
print("Checking out HPC Commit: 9589463...")
!git checkout 958946363b4e91f1f6d8ce47605488c45a75c4a1

print("Installing Fairseq...")
!{PIP38} install .

print("\n‚úÖ Environment Setup Complete.")

## Importing Required Libraries

In [2]:
import sys
import os
import shutil
import subprocess
from huggingface_hub import hf_hub_download
import IPython.display as ipd
import soundfile as sf
import librosa
import numpy as np

## Helper Functions for Inference

In [3]:
# Per-language settings, keyed by the short code assigned to SECONDARY_LANG.
#   folder    - repository sub-folder used to build the diarization checkpoint path
#   w2v_file  - file name of the fine-tuned checkpoint under fine_tuned_models/
#               (used verbatim as a remote file name, so keep the spelling as is)
#   code      - language code embedded in the diarization checkpoint file name
#   full_name - language name handed to the inference script
LANG_CONFIG = dict(
    guj=dict(
        folder="mscs_guj_eng",
        w2v_file="checkpoint_best_gujrati.pt",
        code="guj",
        full_name="Gujarati",
    ),
    tam=dict(
        folder="mscs_tam_eng",
        w2v_file="checkpoint_best_tamil.pt",
        code="tam",
        full_name="Tamil",
    ),
    tel=dict(
        folder="mscs_tel_eng",
        w2v_file="checkpoint_best_telegu.pt",
        code="tel",
        full_name="Telugu",
    ),
)

# Per-pooling settings, keyed by the short name assigned to POOLING_TYPE.
#   folder    - repository sub-folder used to build the diarization checkpoint path
#   ckpt_type - pooling tag embedded in the diarization checkpoint file name
#   arg_type  - value passed to the inference script as --model_type
POOLING_CONFIG = dict(
    attn=dict(
        folder="attn_pooling",
        ckpt_type="attention",
        arg_type="attention",
    ),
    stat=dict(
        folder="stat_pooling",
        ckpt_type="stat",
        arg_type="base",
    ),
)

def run_command(cmd, log_file=None):
    """Run a shell command, echoing its output live and optionally logging it.

    stderr is merged into stdout and every line is printed as soon as it
    arrives. Printed lines have trailing whitespace removed but keep their
    leading indentation (so tracebacks stay readable); the log file receives
    each line unmodified.

    Args:
        cmd: Command line to execute. It is run through the shell
            (``shell=True``), so build it only from trusted values.
        log_file: Optional path. When given, the file is created/overwritten
            with the command's combined output.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    print(f"Running: {cmd}")

    f_log = None
    if log_file:
        print(f"Saving metrics logs to: {log_file}")
        f_log = open(log_file, 'w', encoding='utf-8')

    try:
        # Used as a context manager so the pipe is closed and the child is
        # waited for on every exit path.
        with subprocess.Popen(
            cmd,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            encoding='utf-8',
            errors='replace'
        ) as process:
            try:
                # Iterating blocks until the next line and stops at EOF, so
                # there is no polling loop that could spin.
                for line in process.stdout:
                    print(line.rstrip())
                    if f_log:
                        f_log.write(line)
                        f_log.flush()
            except BaseException:
                # Interrupted (e.g. the cell was stopped) or logging failed:
                # stop the child instead of waiting for it to finish.
                process.kill()
                raise

        # returncode is populated by the wait performed when the with-block exits.
        if process.returncode != 0:
            raise subprocess.CalledProcessError(process.returncode, cmd)

    finally:
        if f_log:
            f_log.close()

def setup_codebase():
    """Replace any existing checkout at CODE_DIR with a fresh clone of GITHUB_REPO.

    Side effects: the process working directory is changed to the parent of
    CODE_DIR, and an existing CODE_DIR is deleted before cloning.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` fails.
    """
    print(f"\n--- 1. CLONING CODEBASE ---")

    parent_dir = os.path.dirname(CODE_DIR)

    # Make sure the clone target's parent exists, then move there once. This
    # also guarantees the current directory is not inside the tree that is
    # about to be deleted.
    os.makedirs(parent_dir, exist_ok=True)
    os.chdir(parent_dir)

    if os.path.exists(CODE_DIR):
        print(f"Removing existing code at {CODE_DIR}...")
        shutil.rmtree(CODE_DIR)

    print(f"Cloning {GITHUB_REPO}...")
    # Quote both arguments, as run_inference_script does, so paths containing
    # spaces survive the shell.
    run_command(f'git clone "{GITHUB_REPO}" "{CODE_DIR}"')
    print("‚úÖ Code cloned successfully.")

def download_models_robust():
    """Download the checkpoints for SECONDARY_LANG / POOLING_TYPE into DOWNLOAD_DIR.

    DOWNLOAD_DIR is emptied first, the three files are fetched from HF_REPO_ID,
    and each one is moved from its repository sub-folder to the top level of
    DOWNLOAD_DIR. The sub-folders are removed afterwards.

    Returns:
        dict: Maps "W2V_PT", "CLSRIL_BASE" and "DIAR_CKPT" to the flattened
        local path of the corresponding file.

    Raises:
        KeyError: If SECONDARY_LANG or POOLING_TYPE is not a configured option.
        FileNotFoundError: If a file cannot be located after its download.
    """
    print(f"\n--- 2. DOWNLOADING MODELS ({SECONDARY_LANG}/{POOLING_TYPE}) ---")

    # Validate the selection before touching the disk, so a typo does not wipe
    # previously downloaded models.
    if SECONDARY_LANG not in LANG_CONFIG:
        raise KeyError(
            f"Unknown SECONDARY_LANG {SECONDARY_LANG!r}; choose from {sorted(LANG_CONFIG)}"
        )
    if POOLING_TYPE not in POOLING_CONFIG:
        raise KeyError(
            f"Unknown POOLING_TYPE {POOLING_TYPE!r}; choose from {sorted(POOLING_CONFIG)}"
        )
    l_cfg = LANG_CONFIG[SECONDARY_LANG]
    p_cfg = POOLING_CONFIG[POOLING_TYPE]

    if os.path.exists(DOWNLOAD_DIR):
        shutil.rmtree(DOWNLOAD_DIR)
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    diar_ckpt = f"{l_cfg['folder']}/{p_cfg['folder']}/mscs_{l_cfg['code']}_en_{p_cfg['ckpt_type']}_model.ckpt"

    files = {
        "W2V_PT": f"fine_tuned_models/{l_cfg['w2v_file']}",
        "CLSRIL_BASE": "base_model/CLSRIL-23.pt",
        "DIAR_CKPT": diar_ckpt
    }

    final_paths = {}
    for key, repo_path in files.items():
        print(f"Fetching {repo_path}...")
        hf_hub_download(
            repo_id=HF_REPO_ID,
            filename=repo_path,
            local_dir=DOWNLOAD_DIR
        )

        nested = os.path.join(DOWNLOAD_DIR, repo_path)
        flat = os.path.join(DOWNLOAD_DIR, os.path.basename(repo_path))

        if os.path.exists(nested):
            # Flatten: move the file out of its repository sub-folder.
            os.rename(nested, flat)
        elif not os.path.exists(flat):
            # Fail here with a clear message rather than returning an
            # incomplete mapping that breaks later with a bare KeyError.
            raise FileNotFoundError(f"Could not find downloaded file at {nested}")
        final_paths[key] = flat

    print("Cleaning up empty nested folders...")
    # Derive the folders from the files just fetched, so this list cannot
    # drift from LANG_CONFIG / POOLING_CONFIG.
    top_level_folders = {
        repo_path.split("/", 1)[0]
        for repo_path in files.values()
        if "/" in repo_path
    }
    for folder in sorted(top_level_folders):
        folder_path = os.path.join(DOWNLOAD_DIR, folder)
        if os.path.isdir(folder_path):
            shutil.rmtree(folder_path)

    return final_paths

def run_inference_script(model_paths, input_audio, output_dir, ground_truth=None):
    """Run ``rttm_maker_infer.py`` from CODE_DIR with the PY38 interpreter.

    The script's combined output is streamed to the console and saved to
    ``inference_metrics.log`` inside ``output_dir``.

    Args:
        model_paths: Mapping that provides the "DIAR_CKPT" and "W2V_PT"
            checkpoint paths.
        input_audio: Path of the audio file to process.
        output_dir: Directory for labels.txt, predictions.rttm, the log file
            and the ``audio_segments`` sub-folder; created if missing.
        ground_truth: Optional path forwarded as ``--ground_truth_tsv``.

    Raises:
        FileNotFoundError: If the PY38 interpreter does not exist.
        subprocess.CalledProcessError: If the inference script fails.

    Side effects: changes the process working directory to CODE_DIR.
    """
    print(f"\n--- 3. RUNNING INFERENCE ---")

    if not os.path.exists(PY38):
        raise FileNotFoundError(f"Python executable not found at {PY38}.")

    # Resolve every caller-supplied path BEFORE changing directory; otherwise
    # relative paths would silently be interpreted inside CODE_DIR.
    input_audio = os.path.abspath(input_audio)
    output_dir = os.path.abspath(output_dir)
    diar_ckpt = os.path.abspath(model_paths["DIAR_CKPT"])
    w2v_ckpt = os.path.abspath(model_paths["W2V_PT"])
    if ground_truth:
        ground_truth = os.path.abspath(ground_truth)

    os.chdir(CODE_DIR)

    # Creating the nested folder also creates output_dir itself.
    audio_segments_dir = os.path.join(output_dir, "audio_segments")
    os.makedirs(audio_segments_dir, exist_ok=True)

    lang_full_name = LANG_CONFIG[SECONDARY_LANG]["full_name"]
    model_arg_type = POOLING_CONFIG[POOLING_TYPE]["arg_type"]

    # NOTE(review): --lang and --device are fixed values here; their meaning is
    # defined by rttm_maker_infer.py - confirm there before changing them.
    cmd_parts = [
        f'{PY38} rttm_maker_infer.py',
        f'--input_audio "{input_audio}"',
        f'--model_path "{diar_ckpt}"',
        f'--model_type "{model_arg_type}"',
        f'--w2v_path "{w2v_ckpt}"',
        f'--lang 3',
        f'--fll_name "{lang_full_name}"',
        f'--device 0',
        f'--output_labels_txt "{output_dir}/labels.txt"',
        f'--output_rttm "{output_dir}/predictions.rttm"',
        f'--segment_output_dir "{audio_segments_dir}"'
    ]

    if ground_truth:
        cmd_parts.append(f'--ground_truth_tsv "{ground_truth}"')

    cmd = ' '.join(cmd_parts)

    log_path = os.path.join(output_dir, "inference_metrics.log")

    run_command(cmd, log_file=log_path)

    print(f"\n‚úÖ SUCCESS! Output saved to: {output_dir}")
    print(f"üìÑ Metrics (JER/DER) saved to: {log_path}")

def pipeline(input_audio, output_dir, ground_truth_tsv=None):
    """Run the whole flow: clone the code, download the models, run inference.

    Args:
        input_audio: Path of the audio file to process.
        output_dir: Directory that receives the inference outputs.
        ground_truth_tsv: Optional ground-truth file forwarded to the
            inference step.
    """
    setup_codebase()
    model_paths = download_models_robust()
    run_inference_script(
        model_paths,
        input_audio,
        output_dir,
        ground_truth_tsv,
    )

# Functions for playing and preprocessing user input audio file

In [4]:
def play_wav_file(file_path):
    """Show an inline audio player for a sound file and print basic facts about it.

    Prints the file name, sample rate and duration, then displays an
    ``IPython.display.Audio`` widget. Problems are reported with a printed
    message instead of an exception.

    Args:
        file_path: Path of the audio file to play.

    Returns:
        None.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    try:
        data, samplerate = sf.read(file_path)

        print(f"Playing: {os.path.basename(file_path)}")
        print(f"Sample Rate: {samplerate} Hz")
        # len(data) is the number of frames for mono and multi-channel data alike.
        print(f"Duration: {len(data)/samplerate:.2f} seconds")

        # soundfile returns multi-channel audio as (frames, channels), whereas
        # IPython's Audio expects (channels, samples); transpose in that case.
        playable = data.T if data.ndim == 2 else data

        ipd.display(ipd.Audio(playable, rate=samplerate))

    except Exception as e:
        print(f"Error reading audio file: {e}")

def preprocess_audio(input_path, output_path=None, target_sr=16000):
    """Convert an audio file to mono at ``target_sr`` Hz and save the result.

    Problems are reported with a printed message instead of an exception.

    Args:
        input_path: Path of the source audio file.
        output_path: Destination path. When omitted, the result is written
            next to the input as ``<name>_<kHz>k<ext>`` (``_16k`` for the
            default rate).
        target_sr: Sampling rate of the written file, in Hz (default 16000).

    Returns:
        The path of the written file, or None if the input is missing or the
        conversion failed.
    """
    if not os.path.exists(input_path):
        print(f"Error: File not found at {input_path}")
        return None

    print(f"Processing: {input_path}")

    try:
        # With an explicit sr, librosa reports the rate of the returned
        # (resampled) signal, not the rate of the source file, so the second
        # value carries no extra information.
        wav, _ = librosa.load(input_path, sr=target_sr, mono=True)

        print(f"  - Original loaded as: {target_sr} Hz (Resampled from source)")
        print(f"  - Channels: Mono (Converted)")
        print(f"  - Duration: {len(wav)/target_sr:.2f} seconds")

        if output_path is None:
            filename, ext = os.path.splitext(input_path)
            output_path = f"{filename}_{target_sr // 1000}k{ext}"

        sf.write(output_path, wav, target_sr)
        print(f"‚úÖ Saved processed file to: {output_path}")
        return output_path

    except Exception as e:
        print(f"‚ùå Error processing audio: {e}")
        return None

## Please upload the audio file and set its file name below

In [22]:
# File name of the uploaded recording; it is read from /content.
AUDIO_FILENAME = "tamil.wav"
input_file=f"/content/{AUDIO_FILENAME}"
# The converted copy is written here; the inference cell further down reads
# this same path, so keep the "16k_" prefix in sync with it.
output=f"/content/16k_{AUDIO_FILENAME}"
preprocess_audio(input_file, output)
# Play back the converted file as a quick check.
play_wav_file(output)

Processing: /content/tamil.wav
  - Original loaded as: 16000 Hz (Resampled from source)
  - Channels: Mono (Converted)
  - Duration: 7.17 seconds
‚úÖ Saved processed file to: /content/16k_tamil.wav
Playing: 16k_tamil.wav
Sample Rate: 16000 Hz
Duration: 7.17 seconds


## Please set the secondary language here
### Set SECONDARY_LANG to one of "tam", "tel" or "guj"

In [19]:
# Source of the inference code and the local folder it is cloned into
# (the folder is deleted and re-cloned on every pipeline run).
GITHUB_REPO = "https://github.com/naman-6420/langdiarization.git"
CODE_DIR = "/content/langdiarization_code"

# Hugging Face repository holding the model checkpoints and the local folder
# they are downloaded to (also emptied on every pipeline run).
HF_REPO_ID = "naman-1901/langdiarization"
DOWNLOAD_DIR = "/content/models_download"

# Interpreter of the 'w2v_hpc' conda environment; repeated here so this cell
# works even when the environment-creation cell was not run in this session.
PY38 = "/usr/local/envs/w2v_hpc/bin/python"

SECONDARY_LANG = "tam"  #options to select from:-guj,tam,tel
POOLING_TYPE = "attn"   #options to select from:-attn,stat

# Requires AUDIO_FILENAME from the upload/preprocessing cell; this is the
# converted file written by that cell.
INPUT_AUDIO_PATH = f"/content/16k_{AUDIO_FILENAME}"
# GROUND_TRUTH_PATH = f"/content/drive/MyDrive/MSCS dataset/PartB_Gujarati/Dev/Transcription_LT_Sequence_Frame_Level_200_actual.tsv" # Optional
# NOTE: the results folder is named after the audio file including its
# extension (e.g. /content/results/tamil.wav); the playback cell relies on this.
OUTPUT_DIRECTORY = f"/content/results/{AUDIO_FILENAME}"

pipeline(INPUT_AUDIO_PATH, OUTPUT_DIRECTORY)


--- 1. CLONING CODEBASE ---
Removing existing code at /content/langdiarization_code...
Cloning https://github.com/naman-6420/langdiarization.git...
Running: git clone https://github.com/naman-6420/langdiarization.git /content/langdiarization_code
Cloning into '/content/langdiarization_code'...
‚úÖ Code cloned successfully.

--- 2. DOWNLOADING MODELS (tam/attn) ---
Fetching fine_tuned_models/checkpoint_best_tamil.pt...


fine_tuned_models/checkpoint_best_tamil.(‚Ä¶):   0%|          | 0.00/1.13G [00:00<?, ?B/s]

Fetching base_model/CLSRIL-23.pt...


base_model/CLSRIL-23.pt:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

Fetching mscs_tam_eng/attn_pooling/mscs_tam_en_attention_model.ckpt...


mscs_tam_eng/attn_pooling/mscs_tam_en_at(‚Ä¶):   0%|          | 0.00/65.0M [00:00<?, ?B/s]

Cleaning up empty nested folders...

--- 3. RUNNING INFERENCE ---
Running: /usr/local/envs/w2v_hpc/bin/python rttm_maker_infer.py --input_audio "/content/16k_tamil.wav" --model_path "/content/models_download/mscs_tam_en_attention_model.ckpt" --model_type "attention" --w2v_path "/content/models_download/checkpoint_best_tamil.pt" --lang 3 --fll_name "Tamil" --device 0 --output_labels_txt "/content/results/tamil.wav/labels.txt" --output_rttm "/content/results/tamil.wav/predictions.rttm" --segment_output_dir "/content/results/tamil.wav/audio_segments"
Saving metrics logs to: /content/results/tamil.wav/inference_metrics.log
Using device: cpu
Loading Wav2Vec2 from: /content/models_download/checkpoint_best_tamil.pt...
Loading attention model...
Processing audio: /content/16k_tamil.wav
Saved labels to: /content/results/tamil.wav/labels.txt
Saved RTTM to: /content/results/tamil.wav/predictions.rttm
Success.

‚úÖ SUCCESS! Output saved to: /content/results/tamil.wav
üìÑ Metrics (JER/DER) saved t

# Playing the extracted English Audio Clips


In [20]:
# Play every .wav clip found in the English segment folder of the results,
# in file-name order.
english_segments_dir = f"/content/results/{AUDIO_FILENAME}/audio_segments/English"

if not os.path.isdir(english_segments_dir):
    print(f"English audio segments directory not found: {english_segments_dir}")
else:
    print(f"\n--- Playing English Audio Segments from: {english_segments_dir} ---")
    audio_files = sorted(
        name for name in os.listdir(english_segments_dir) if name.endswith('.wav')
    )

    if audio_files:
        for audio_file in audio_files:
            full_path = os.path.join(english_segments_dir, audio_file)
            play_wav_file(full_path)
    else:
        print("No .wav files found in the English segments directory.")


--- Playing English Audio Segments from: /content/results/tamil.wav/audio_segments/English ---
Playing: 16k_tamil_006_English_01.wav
Sample Rate: 16000 Hz
Duration: 0.40 seconds


Playing: 16k_tamil_008_English_02.wav
Sample Rate: 16000 Hz
Duration: 0.80 seconds


# Telugu-English Transcript
## Ee roj **morning** nenu park lo nadustunte oka **stranger** ni kalisa. Atanu naku oka manchi **advice** ichadu, dani valla na **mindset** koncham marindi. Ippudu nenu na **goals** meeda **focus** chestunna.

# Gujarati-English Transcript
## Gujarat! Su che gujarat ? Ek roaring lion je atla varsho thi India na agragani rajyo ma aave che , anek moti hastiyo jemnu India mate contribution che jemke Narendra Modi, Sardar Patel, Tatas. World famous garba and kite festival jya duniya aakhi gujarat ma aave.