<a href="https://colab.research.google.com/github/ngoyal88/pronunciation-checker/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget https://www.openslr.org/resources/12/train-clean-100.tar.gz


--2025-05-19 13:13:00--  https://www.openslr.org/resources/12/train-clean-100.tar.gz
Resolving www.openslr.org (www.openslr.org)... 46.101.158.64
Connecting to www.openslr.org (www.openslr.org)|46.101.158.64|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://openslr.elda.org/resources/12/train-clean-100.tar.gz [following]
--2025-05-19 13:13:01--  https://openslr.elda.org/resources/12/train-clean-100.tar.gz
Resolving openslr.elda.org (openslr.elda.org)... 141.94.109.138, 2001:41d0:203:ad8a::
Connecting to openslr.elda.org (openslr.elda.org)|141.94.109.138|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6387309499 (5.9G) [application/x-gzip]
Saving to: ‘train-clean-100.tar.gz’


2025-05-19 13:16:21 (30.5 MB/s) - ‘train-clean-100.tar.gz’ saved [6387309499/6387309499]



In [None]:
!pip install torch torchaudio transformers librosa soundfile matplotlib pandas tqdm




In [None]:
import os
import numpy as np
import librosa
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder

# --- Configuration ---
# Root directory that collect_data() walks looking for .flac files.
DATASET_PATH = "/content/LibriSpeech/train-clean-100"  # Replace with LibriSpeech subset path
# Sampling rate passed to librosa.load, so every clip is resampled to this rate.
SAMPLE_RATE = 16000
# Number of MFCC coefficients per frame; LSTMSpeechModel's default input_dim is also 13.
N_MFCC = 13
# Prefer the GPU when CUDA is available, otherwise run on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Step 1: Dataset Class ---
class SpeechDataset(Dataset):
    """Fixed-length MFCC sequences paired with integer-encoded labels.

    Audio is loaded lazily in ``__getitem__``: each file is read with
    ``librosa.load`` at ``SAMPLE_RATE``, converted to ``N_MFCC`` MFCC
    coefficients per frame, and truncated or zero-padded to ``max_len`` frames.

    Args:
        file_paths: Sequence of audio file paths readable by ``librosa.load``.
        labels: One label per entry of ``file_paths``. They are mapped to
            integer ids by a ``LabelEncoder`` fitted on this list and kept in
            ``self.encoder`` so ids can be mapped back to labels.
        max_len: Number of MFCC frames in every returned item.

    Raises:
        ValueError: If ``file_paths`` and ``labels`` differ in length, or if
            ``max_len`` is not a positive number of frames.
    """

    def __init__(self, file_paths, labels, max_len=100):
        # Validate up front: a mismatch would otherwise only show up as an
        # IndexError for some index while batches are being loaded.
        if len(file_paths) != len(labels):
            raise ValueError(
                f"file_paths and labels must have the same length, "
                f"got {len(file_paths)} and {len(labels)}"
            )
        if max_len < 1:
            raise ValueError(f"max_len must be a positive number of frames, got {max_len}")

        self.file_paths = file_paths
        self.labels = labels
        self.max_len = max_len
        self.encoder = LabelEncoder()
        self.encoded_labels = self.encoder.fit_transform(labels)

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.file_paths)

    @staticmethod
    def _fit_length(frames, target_len):
        """Truncate or zero-pad ``frames`` along axis 0 to exactly ``target_len`` rows."""
        n_frames = frames.shape[0]
        if n_frames >= target_len:
            return frames[:target_len]
        return np.pad(frames, ((0, target_len - n_frames), (0, 0)))

    def __getitem__(self, idx):
        """Return ``(features, label)`` for item ``idx``.

        ``features`` is a float32 tensor of shape ``(max_len, N_MFCC)`` and
        ``label`` is a scalar long tensor holding the encoded label id.
        """
        signal, sr = librosa.load(self.file_paths[idx], sr=SAMPLE_RATE)
        # librosa returns (n_mfcc, frames); transpose so time is the first axis.
        frames = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=N_MFCC).T
        # NOTE(review): clips shorter than max_len end in all-zero frames, and
        # LSTMSpeechModel classifies from the last time step.
        frames = self._fit_length(frames, self.max_len)

        features = torch.tensor(frames, dtype=torch.float32)
        label = torch.tensor(self.encoded_labels[idx], dtype=torch.long)
        return features, label

# --- Step 2: LSTM Model ---
class LSTMSpeechModel(nn.Module):
    """Single-layer LSTM followed by a linear classifier.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output classes (described as the number of
            phonemes in the original comment).
    """

    def __init__(self, input_dim=13, hidden_dim=128, output_dim=40):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Map a ``(batch, time, input_dim)`` batch to ``(batch, output_dim)`` logits."""
        sequence_out = self.lstm(x)[0]
        # Classify from the LSTM output at the final time step only.
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

# --- Step 3: Training Script ---
def train_model(model, dataloader, epochs=10):
    """Train ``model`` in place with Adam (lr=0.001) and cross-entropy loss.

    Args:
        model: Module mapping an input batch to ``(batch, n_classes)`` logits.
        dataloader: Iterable yielding ``(inputs, targets)`` tensor pairs; it is
            iterated once per epoch.
        epochs: Number of passes over ``dataloader``.

    Returns:
        list[float]: The sample-weighted mean training loss of each epoch, in
        order. The same value is printed after every epoch.

    Raises:
        ValueError: If ``dataloader`` yields no samples during an epoch.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # Move batches to wherever the model actually lives, so a model on a
    # different device than the notebook default still trains. Adam has already
    # rejected parameter-less models, so next() cannot be exhausted here.
    device = next(model.parameters()).device

    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        # Accumulate on-device to avoid a host sync on every step.
        running_loss = torch.zeros((), device=device)
        n_samples = 0
        for x_batch, y_batch in dataloader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            output = model(x_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()

            # Weight by batch size so a short final batch does not skew the mean.
            batch_size = y_batch.size(0)
            running_loss += loss.detach() * batch_size
            n_samples += batch_size

        if n_samples == 0:
            raise ValueError("dataloader yielded no samples; nothing to train on")

        # Report the epoch mean rather than the last batch's loss, which is noisy.
        mean_loss = (running_loss / n_samples).item()
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch+1}, Loss: {mean_loss}")
    return epoch_losses

# --- Data Prep ---
def collect_data(dataset_path=None, limit=1000):
    """Collect ``.flac`` files and a label for each one.

    The directory tree is traversed in sorted order so the selected subset is
    the same on every run and every filesystem, and traversal stops as soon as
    ``limit`` files have been found.

    Args:
        dataset_path: Directory to search recursively. Defaults to the
            module-level ``DATASET_PATH`` when ``None``.
        limit: Maximum number of files to return (non-negative), or ``None``
            for no limit.

    Returns:
        tuple[list[str], list[str]]: ``(file_paths, labels)`` of equal length.
        Each label is the part of the file name before the first ``"-"``
        (a dummy label for now; in LibriSpeech-style names such as
        ``<speaker>-<chapter>-<utterance>.flac`` that is the speaker id).

    Raises:
        FileNotFoundError: If the dataset directory does not exist.
    """
    root_dir = DATASET_PATH if dataset_path is None else dataset_path
    # os.walk silently yields nothing for a missing directory; fail loudly instead.
    if not os.path.isdir(root_dir):
        raise FileNotFoundError(f"Dataset directory not found: {root_dir!r}")

    file_paths = []
    labels = []
    for root, dirs, files in os.walk(root_dir):
        # Sorting dirs in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".flac"):
                continue
            if limit is not None and len(file_paths) >= limit:
                return file_paths, labels
            file_paths.append(os.path.join(root, file))
            labels.append(file.split("-")[0])  # Dummy label for now
    return file_paths, labels

# --- Build the dataset, train the model, and save its weights ---
# Seed torch's global RNG so weight initialisation and DataLoader shuffling
# repeat across runs (GPU kernels may still introduce small differences).
torch.manual_seed(42)

file_paths, labels = collect_data()
# Fail early with a clear message instead of building a 0-class model.
if not file_paths:
    raise RuntimeError(
        f"No .flac files found under {DATASET_PATH!r}; "
        "download and extract the dataset first."
    )
dataset = SpeechDataset(file_paths, labels)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# One output unit per distinct label in the collected subset.
model = LSTMSpeechModel(output_dim=len(set(labels))).to(DEVICE)
train_model(model, dataloader)
# Persist only the learned weights (state_dict), not the pickled module.
torch.save(model.state_dict(), "pronunciation_model.pth")

Epoch 1, Loss: 1.4704666137695312
Epoch 2, Loss: 1.2330764532089233
Epoch 3, Loss: 0.5994546413421631
Epoch 4, Loss: 0.30530643463134766
Epoch 5, Loss: 0.24300143122673035
Epoch 6, Loss: 0.17126058042049408
Epoch 7, Loss: 0.09786786139011383
Epoch 8, Loss: 0.16016128659248352
Epoch 9, Loss: 0.04011086747050285
Epoch 10, Loss: 0.041672393679618835


In [None]:
import torch
import pickle

# Save full model
# NOTE(review): this reuses the file name of the state_dict checkpoint written by
# the training cell, replacing it with a pickled module. A pickled module stores
# only a reference to its class, so it can be loaded only where LSTMSpeechModel
# is importable under the same module path (a ModuleNotFoundError otherwise).
torch.save(model, "pronunciation_model.pth")

# Load and convert to pickle
# weights_only=False is required to unpickle a whole nn.Module on PyTorch >= 2.6,
# where torch.load defaults to weights_only=True. This is safe here only because
# the file was written by this notebook a moment ago; never do it for untrusted files.
model = torch.load("pronunciation_model.pth", map_location="cpu", weights_only=False)

with open("pronunciation_model.pkl", "wb") as f:
    pickle.dump(model, f)


ModuleNotFoundError: No module named 'lstm_model'