In [None]:
# Install dependencies and Kaldi
!sudo apt-get update
!sudo apt-get install -y cmake make gcc g++ git subversion python3-dev python3-pip
!sudo apt-get install -y sox wget zlib1g-dev automake autoconf unzip

!git clone https://github.com/kaldi-asr/kaldi.git
!cd kaldi/tools && extras/install_mkl.sh
!cd kaldi/tools && make -j 4
!cd kaldi/src && ./configure --shared && make depend -j 4 && make -j 4

# Install additional Python packages
!pip install numpy pandas librosa

# Upload your training_data folder to Colab or mount your Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Source of the normalize_text.py script.
# A raw string is used so the regex backslashes below reach the generated file
# exactly as written (no double-escaping, no invalid-escape warnings).
normalize_text_script = r"""
import os
import re

# Spoken forms for abbreviations and symbols. Alphanumeric keys are matched as
# whole words; symbol keys are replaced wherever they occur.
abbreviation_mapping = {
    "mr": "mister",
    "mrs": "missus",
    "dr": "doctor",
    "st": "saint",
    "jr": "junior",
    "sr": "senior",
    "$": "dollar",
    "₹": "rupee",
    "€": "euro",
    "£": "pound",
    "&": "and"
}

# Words for 0-19 and the multiples of ten, used by num2words.
_NUMBER_WORDS = {
    0: 'zero', 1: 'one', 2: 'two', 3: 'three', 4: 'four', 5: 'five',
    6: 'six', 7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten', 11: 'eleven',
    12: 'twelve', 13: 'thirteen', 14: 'fourteen', 15: 'fifteen', 16: 'sixteen',
    17: 'seventeen', 18: 'eighteen', 19: 'nineteen', 20: 'twenty', 30: 'thirty',
    40: 'forty', 50: 'fifty', 60: 'sixty', 70: 'seventy', 80: 'eighty', 90: 'ninety'
}

# Scale words, largest first, so the first match is the leading group.
_SCALES = ((10 ** 9, 'billion'), (10 ** 6, 'million'), (10 ** 3, 'thousand'))


def normalize_text(text):
    '''Return text as a single line of lower-case words.

    Abbreviations and symbols are expanded, remaining punctuation is removed,
    digit runs are spelled out, and all whitespace (including newlines) is
    collapsed to single spaces.
    '''
    text = text.lower()

    # Replace abbreviations and symbols
    for abbr, full in abbreviation_mapping.items():
        if abbr.isalnum():
            text = re.sub(r'\b' + re.escape(abbr) + r'\b', full, text)
        else:
            # A \b-delimited pattern only matches a symbol squeezed between two
            # word characters, so "$5" would be missed; replace it directly and
            # pad with spaces so the word does not fuse with its neighbours.
            text = text.replace(abbr, ' ' + full + ' ')
    text = re.sub(r'[^\w\s]', '', text)  # Remove remaining punctuation
    text = re.sub(r'\d+', lambda match: num2words(int(match.group())), text)

    # One utterance per line: collapse newlines and repeated spaces.
    return ' '.join(text.split())


def num2words(number):
    '''Spell out a non-negative integer in English words.'''
    if number < 20:
        return _NUMBER_WORDS[number]
    if number < 100:
        tens, ones = divmod(number, 10)
        return _NUMBER_WORDS[tens * 10] + ('' if ones == 0 else ' ' + _NUMBER_WORDS[ones])
    if number < 1000:
        hundreds, rest = divmod(number, 100)
        return _NUMBER_WORDS[hundreds] + ' hundred' + ('' if rest == 0 else ' ' + num2words(rest))
    for value, name in _SCALES:
        if number >= value:
            count, rest = divmod(number, value)
            return num2words(count) + ' ' + name + ('' if rest == 0 else ' ' + num2words(rest))


def normalize_files(text_dir):
    '''Normalize every .txt file directly inside text_dir, in place.'''
    for filename in os.listdir(text_dir):
        if not filename.endswith('.txt'):
            continue
        filepath = os.path.join(text_dir, filename)
        # Explicit UTF-8 so currency symbols are read the same on any locale.
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(normalize_text(text) + '\n')


if __name__ == "__main__":
    text_dir = "training_data/text_files"
    normalize_files(text_dir)
"""

# Write the script to a file
with open('normalize_text.py', 'w') as f:
    f.write(normalize_text_script)

# Run the normalization script
!python3 normalize_text.py

# Source of the prepare_data.sh script.
# The text starts directly with the shebang: a "#!" line is only honoured when
# it is the very first line of the file.
prepare_data_script = """#!/bin/bash
# Build the Kaldi data files (wav.scp, text, utt2spk, spk2utt) in data/local
# from training_data/speech_files/*.wav and training_data/text_files/*.txt.
# The utterance id is the file name without its extension.

set -euo pipefail

# Kaldi expects its data files sorted in the C locale.
export LC_ALL=C

DATA_DIR=training_data
SPEECH_DIR="$DATA_DIR/speech_files"
TEXT_DIR="$DATA_DIR/text_files"
DEST_DIR=data/local

mkdir -p "$DEST_DIR"

# Create wav.scp (overwritten on every run, so re-running adds no duplicates)
find "$SPEECH_DIR" -name "*.wav" | while IFS= read -r file; do
  utt_id=$(basename "$file" .wav)
  echo "$utt_id $file"
done | sort > "$DEST_DIR/wav.scp"

# Create text: one line per utterance, transcript whitespace collapsed
find "$TEXT_DIR" -name "*.txt" | while IFS= read -r file; do
  utt_id=$(basename "$file" .txt)
  text=$(tr -s '[:space:]' ' ' < "$file" | sed 's/^ //; s/ $//')
  echo "$utt_id $text"
done | sort > "$DEST_DIR/text"

# Create utt2spk and spk2utt.
# Every utterance is treated as its own speaker, so both maps are identical.
awk '{print $1 " " $1}' "$DEST_DIR/wav.scp" > "$DEST_DIR/utt2spk"
cp "$DEST_DIR/utt2spk" "$DEST_DIR/spk2utt"
"""

# Write the script to a file
with open('prepare_data.sh', 'w') as f:
    f.write(prepare_data_script)

# Make the script executable
!chmod +x prepare_data.sh

# Run the script
!./prepare_data.sh

# Monophone training and decoding pipeline over the data directory data/local.
# NOTE(review): every command below uses the relative paths steps/ and utils/,
# but nothing earlier in this notebook creates them in the working directory.
# Presumably they must be linked from a Kaldi recipe directory (and a path.sh
# provided) before these cells can run — confirm.

# Create the required directories
!mkdir -p exp/make_mfcc/local mfcc

# Feature extraction: MFCCs, then CMVN statistics, both written under mfcc/
# with logs under exp/make_mfcc/local
!steps/make_mfcc.sh --nj 1 --cmd "run.pl" data/local exp/make_mfcc/local mfcc
!steps/compute_cmvn_stats.sh data/local exp/make_mfcc/local mfcc

# Create the dictionary and language models
# NOTE(review): data/local/dict is read here but is not created anywhere in
# this notebook — TODO confirm where the pronunciation dictionary comes from.
!utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang data/lang

# Train the monophone model
!steps/train_mono.sh --nj 1 --cmd "run.pl" data/local data/lang exp/mono

# Create the graph
# NOTE(review): data/lang_test is not produced by any step shown here (the
# prepare_lang.sh call above targets data/lang) — confirm its origin.
!utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph

# Decode
# NOTE(review): decoding runs on data/local, the same data used for training.
!steps/decode.sh --nj 1 --cmd "run.pl" exp/mono/graph data/local exp/mono/decode
