## Authors

* parkerqueen
* msaad039

## Resources & Articles
[Urdu Resources](http://www.cle.org.pk/software/ling_resources.htm)

## Imports

In [None]:
# Use the %pip magic rather than `!pip3`: it installs into the environment of the
# running kernel, whereas a bare `pip3` on PATH may belong to another interpreter.
%pip install tqdm

import csv
import os
import random
from random import shuffle
from shutil import copyfile

from tqdm import tqdm

## Useful Paths

In [None]:
# Resolve the Kaldi root to an absolute path once, while the working directory
# is still the notebook's starting directory. Later cells `%cd` elsewhere, and a
# relative '../../' would then be resolved against the new working directory.
# Run this cell before any `%cd`.
kaldi_root = os.path.abspath('../../')
kaldi_tools = os.path.join(kaldi_root, 'tools')
kaldi_examples = os.path.join(kaldi_root, 'egs')

## Training Kaldi

#### Installing SRILM

In [None]:
# Copy the SRILM tarball from the current directory into Kaldi's tools directory,
# switch into that directory, and run the installer script located there.
!cp srilm.tgz "$kaldi_tools"
%cd "$kaldi_tools"
!./install_srilm.sh

#### Installing Dependencies

In [None]:
# Both scripts are invoked relative to the current working directory, i.e. the
# directory entered by the preceding `%cd`.
!./extras/install_mkl.sh
!./extras/check_dependencies.sh
# Move into the `qoul` example directory; the remaining cells use paths relative to it.
# NOTE(review): if kaldi_examples holds a relative path, it is resolved against the
# current working directory here, not the notebook's starting directory - confirm
# that this actually lands in egs/qoul.
%cd "$kaldi_examples"/qoul

#### Files To Be Written

In [None]:
# Make sure the target directories exist; open(..., 'w') does not create them.
os.makedirs('data/train', exist_ok=True)
os.makedirs('data/test', exist_ok=True)

# These handles are shared by the corpus-preparation cells below and are closed
# in the "Closing Files" cell, so they are deliberately not wrapped in `with`.
# The encoding is set explicitly to UTF-8 (the RUMI transcription is read as
# UTF-8) so the output does not depend on the platform's default encoding.
traintext = open('data/train/text', 'w', encoding='utf8')
trainscp = open('data/train/wav.scp', 'w', encoding='utf8')
trainutt2spk = open('data/train/utt2spk', 'w', encoding='utf8')

testtext = open('data/test/text', 'w', encoding='utf8')
testscp = open('data/test/wav.scp', 'w', encoding='utf8')
testutt2spk = open('data/test/utt2spk', 'w', encoding='utf8')

#### Preparing CSALT Corpus

In [None]:
# Where the CSALT recordings live, and the file holding their transcriptions
wav_dir, transc_path = 'audio/csalt', 'audio/csalt_transcription.txt'

In [None]:
SPLIT_SEED = 42  # fixed seed so the 90/10 train/test split is identical on every run


def _write_csalt_entries(wavs, corpus_dir, transcripts, text_file, scp_file, utt2spk_file):
    """Append one `text`, `wav.scp` and `utt2spk` line per CSALT recording.

    Args:
        wavs: file names (not paths) of the recordings to list.
        corpus_dir: directory that contains the recordings.
        transcripts: lines of the transcription file; the integer following the
            first character of a file's stem is used as a 1-based index into
            this list.
        text_file, scp_file, utt2spk_file: open, writable text handles.

    Raises:
        ValueError: if a file stem is not of the form <char><integer>.
        IndexError: if the number is larger than the number of transcript lines.
    """
    for wav in tqdm(wavs):
        source_path = os.path.join(corpus_dir, wav)
        wavname = os.path.splitext(wav)[0]
        utt_id = f'csalt_speaker_{wavname}'
        # Index via the stem rather than slicing off a fixed 4-character
        # extension, so the lookup does not depend on the extension length.
        transcript = transcripts[int(wavname[1:]) - 1].strip()

        text_file.write(f'{utt_id} {transcript}\n')
        scp_file.write(f'{utt_id} {source_path}\n')
        utt2spk_file.write(f'{utt_id} csalt_speaker\n')


# Read explicitly as UTF-8, like the RUMI transcription, instead of relying on
# the platform default encoding.
with open(transc_path, 'r', encoding='utf8') as transc:
    lines = transc.readlines()

# os.walk returns files in filesystem order; sort first so that the seeded
# shuffle yields the same split on every machine.
wavfiles = sorted(next(os.walk(wav_dir))[2])
random.Random(SPLIT_SEED).shuffle(wavfiles)

# First 90% of the shuffled files go to training, the rest to testing.
index = int(len(wavfiles) * 0.9)
trainfiles = wavfiles[:index]
testfiles = wavfiles[index:]

_write_csalt_entries(trainfiles, wav_dir, lines, traintext, trainscp, trainutt2spk)
_write_csalt_entries(testfiles, wav_dir, lines, testtext, testscp, testutt2spk)

#### Preparing RUMI Corpus

In [None]:
# Where the RUMI recordings live (one sub-directory per speaker), and the file
# holding their transcriptions
wav_dir, transc_path = 'audio/rumi', 'audio/rumi_transcription.txt'

In [None]:
# Build a mapping {utterance name: transcript} from the RUMI transcription file.
# Each usable line has the form "<name> <transcript>", split at the first space.
with open(transc_path, 'r', encoding = 'utf8') as transc:
    lines = transc.readlines()

transcription = {}
for line in lines:
    name, separator, text = line.partition(' ')
    if not separator:
        # Blank line (e.g. a trailing newline) or a name without a transcript:
        # skip it rather than fail with an IndexError.
        continue
    transcription[name] = text.strip()

In [None]:
SPLIT_SEED = 42  # fixed seed so the 90/10 train/test split is identical on every run


def _write_rumi_entries(wavs, speaker, speaker_dir, transcripts, text_file, scp_file, utt2spk_file):
    """Append one `text`, `wav.scp` and `utt2spk` line per RUMI recording of one speaker.

    Args:
        wavs: file names (not paths) of the speaker's recordings to list.
        speaker: name of the speaker's sub-directory; becomes part of every ID.
        speaker_dir: directory that contains the speaker's recordings.
        transcripts: mapping from a file's stem to its transcript.
        text_file, scp_file, utt2spk_file: open, writable text handles.

    Raises:
        KeyError: if a recording's stem has no entry in `transcripts`.
    """
    speaker_id = f'rumi_speaker_{speaker}'
    for wav in tqdm(wavs):
        source_path = os.path.join(speaker_dir, wav)
        wavname = os.path.splitext(wav)[0]
        utt_id = f'{speaker_id}_{wavname}'

        text_file.write(f'{utt_id} {transcripts[wavname]}\n')
        scp_file.write(f'{utt_id} {source_path}\n')
        utt2spk_file.write(f'{utt_id} {speaker_id}\n')


# One generator for all speakers; speakers and files are sorted first because
# os.walk returns them in filesystem order, which would otherwise change the
# outcome of the seeded shuffle from machine to machine.
split_rng = random.Random(SPLIT_SEED)

for speaker in sorted(next(os.walk(wav_dir))[1]):
    speaker_dir = os.path.join(wav_dir, speaker)

    wavfiles = sorted(next(os.walk(speaker_dir))[2])
    split_rng.shuffle(wavfiles)

    # Per speaker: first 90% of the shuffled files for training, rest for testing.
    index = int(len(wavfiles) * 0.9)
    trainfiles = wavfiles[:index]
    testfiles = wavfiles[index:]

    _write_rumi_entries(trainfiles, speaker, speaker_dir, transcription,
                        traintext, trainscp, trainutt2spk)
    _write_rumi_entries(testfiles, speaker, speaker_dir, transcription,
                        testtext, testscp, testutt2spk)

#### Closing Files (IMPORTANT)

In [None]:
# Close every output handle so that buffered lines are flushed to disk before
# the shell scripts in the following cells read the files.
for handle in (traintext, trainscp, trainutt2spk,
               testtext, testscp, testutt2spk):
    handle.close()

#### Preparing Language Data

In [None]:
# Paths are relative to the current (recipe) directory, like the data/train and
# data/test files written above; a leading '/' would point at the filesystem root.
# printf is used instead of echo because echo's handling of '\n' differs between
# /bin/sh implementations.
!printf 'SIL\nOOV\n' > data/local/lang/silence_phones.txt
!printf 'SIL\n' > data/local/lang/optional_silence.txt
# Every distinct symbol from the 2nd field onward of lexicon.txt, minus lines
# containing OOV or SIL, one per line.
!cut -d ' ' -f 2- data/local/lang/lexicon.txt | sed 's/ /\n/g' | sed '/OOV/d' | sed '/SIL/d' | sort -u > data/local/lang/nonsilence_phones.txt

#### Creating Mandatory Symlinks

In [None]:
# Link the `steps` and `utils` directories of the wsj/s5 example into the current directory.
# -f replaces an existing link and -n stops ln from descending into it, so
# re-running this cell does not fail with "File exists".
!ln -sfn "$kaldi_examples"/wsj/s5/steps steps
!ln -sfn "$kaldi_examples"/wsj/s5/utils utils

#### Training

Data, Lexicon & Language Model Preparation

In [None]:
# Runs prep.sh from the current directory (script contents are not part of this notebook).
!./prep.sh

MFCC Feature Extraction

In [None]:
# Runs mfcc_cmvn.sh from the current directory (script contents are not part of this notebook).
!./mfcc_cmvn.sh

Training Monophone Model

In [None]:
# Runs mono.sh from the current directory (script contents are not part of this notebook).
!./mono.sh

Training & Decoding Triphone-1 Model (Deltas + Delta-Delta Model)

In [None]:
# Runs tri1.sh from the current directory (script contents are not part of this notebook).
!./tri1.sh

Training & Decoding Triphone-2 Model (LDA + MLLT Model)

In [None]:
# Runs tri2.sh from the current directory (script contents are not part of this notebook).
!./tri2.sh

Training & Decoding Triphone-3 Model (LDA + MLLT + SAT Model)

In [None]:
# Runs tri3.sh from the current directory (script contents are not part of this notebook).
!./tri3.sh

Training & Decoding SGMM2 Model

In [None]:
# Runs sgmm2.sh from the current directory (script contents are not part of this notebook).
!./sgmm2.sh

Training & Decoding MMI + SGMM2 Model

In [None]:
# Runs mmi_sgmm2.sh from the current directory (script contents are not part of this notebook).
!./mmi_sgmm2.sh