In [None]:
!pip install tensorflow-addons

Collecting tensorflow-addons
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.23.0 typeguard-2.13.3


In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Masking, Bidirectional, GRU, TimeDistributed, Dense, Activation, Lambda
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
# Download and unzip dataset
import gdown

url = 'https://drive.google.com/uc?id=1jyvhdZHn0s5Owkr21k5Ff-c96sIQLtEu'
output = 'all_wav.zip'
gdown.download(url, output, quiet=False)
!unzip -q 'all_wav.zip' -d '/content/all_wav'

url = 'https://drive.google.com/uc?id=1vqvn0F0YYhEFbzLgP9wJ36vyInUnO5b5'
output = 'dataset.csv'
gdown.download(url, output, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1jyvhdZHn0s5Owkr21k5Ff-c96sIQLtEu
From (redirected): https://drive.google.com/uc?id=1jyvhdZHn0s5Owkr21k5Ff-c96sIQLtEu&confirm=t&uuid=f6a50331-9ba4-41d1-8d72-9b3e3f794d07
To: /content/all_wav.zip
100%|██████████| 2.48G/2.48G [00:46<00:00, 53.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1vqvn0F0YYhEFbzLgP9wJ36vyInUnO5b5
To: /content/dataset.csv
100%|██████████| 2.87M/2.87M [00:00<00:00, 21.0MB/s]


'dataset.csv'

In [None]:
# Peek at the first rows of the transcript index to check its columns
df = pd.read_csv('/content/dataset.csv')
df.head(5)

Unnamed: 0,wav_filename,wav_filesize,transcript,confidence_level
0,./all_wav/Tehran_SayeRoshan0_101.wav,83044,اتفاقاتی که ندیده بودم,0.927557
1,./all_wav/Tehran_SayeRoshan0_105.wav,54468,مسجد,0.927557
2,./all_wav/Tehran_SayeRoshan0_107.wav,136036,جمع شدن مسلمین برای نمازهای جماعت,0.864152
3,./all_wav/Tehran_SayeRoshan0_108.wav,106788,همیشه برای محمدرضا پهلوی,0.927557
4,./all_wav/Tehran_SayeRoshan0_109.wav,170020,چه زمانی در کسوت شاه ایران نوکری اجانب را می‌کرد,0.854824


In [None]:
# Read the transcript index that pairs each clip with its text
df = pd.read_csv('/content/dataset.csv')

# Point the relative clip paths from the CSV at the extracted archive
wav_paths = df['wav_filename'].apply(lambda path: path.replace('./all_wav/', '/content/all_wav/all_wav/'))
df['wav_filename'] = wav_paths

# Keep only the rows whose WAV file is actually present on disk
file_exists = df['wav_filename'].apply(os.path.isfile)
df = df[file_exists]

# Persian character vocabulary: one "<symbol> <integer id>" pair per line
char_map_str = """
' 0
<SPACE> 1
ا 2
ب 3
پ 4
ت 5
ث 6
ج 7
چ 8
ح 9
خ 10
د 11
ذ 12
ر 13
ز 14
ژ 15
س 16
ش 17
ص 18
ض 19
ط 20
ظ 21
ع 22
غ 23
ف 24
ق 25
ک 26
گ 27
ل 28
م 29
ن 30
و 31
ه 32
ی 33
، 34
؟ 35
"""
# Parse the table once, then derive both lookup directions from the pairs
vocab_pairs = [entry.split() for entry in char_map_str.strip().split('\n')]
char_map = {symbol: int(idx) for symbol, idx in vocab_pairs}
index_map = {int(idx): symbol for symbol, idx in vocab_pairs}

# Id 1 decodes to a real space rather than the '<SPACE>' placeholder token
index_map[1] = ' '

# A literal space in a transcript encodes to the same id as '<SPACE>'
char_map[' '] = char_map['<SPACE>']

# Audio processing functions
def load_audio(file_path, sr=16000):
    """Load an audio file with librosa at the requested sample rate.

    Args:
        file_path: Path of the audio file, passed straight to ``librosa.load``.
        sr: Sample rate forwarded to ``librosa.load`` (default 16000).

    Returns:
        The first element of the ``librosa.load`` result (the samples); the
        second element of that result is discarded.
    """
    return librosa.load(file_path, sr=sr)[0]

def extract_features(audio, n_mfcc=13, sr=16000):
    """Compute MFCC features for a waveform and return them transposed.

    Args:
        audio: Waveform passed to ``librosa.feature.mfcc`` as ``y``.
        n_mfcc: Number of coefficients requested (default 13).
        sr: Sample rate of ``audio`` (default 16000).

    Returns:
        The transpose of the matrix returned by ``librosa.feature.mfcc``, so
        the coefficient axis becomes the last axis.
    """
    coefficients = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    return coefficients.transpose()

# Build parallel lists: MFCC matrices, encoded transcripts, and their lengths
X = []
y = []
input_lengths = []
label_lengths = []

# Characters missing from the vocabulary are encoded as the space id
space_id = char_map[' ']
for wav_path, transcript in zip(df['wav_filename'], df['transcript']):
    mfcc = extract_features(load_audio(wav_path))
    encoded = [char_map.get(symbol, space_id) for symbol in transcript]
    X.append(mfcc)
    y.append(encoded)
    # Lengths are wrapped in one-element lists, giving one (1,)-shaped row per sample
    input_lengths.append([mfcc.shape[0]])
    label_lengths.append([len(encoded)])

if not X or not y:
    raise ValueError("No valid audio files were found. Please check the dataset and the paths.")

# Right-pad to rectangular arrays: features with 0.0, labels with -1
X = tf.keras.preprocessing.sequence.pad_sequences(X, dtype='float32', padding='post')
y = tf.keras.preprocessing.sequence.pad_sequences(y, value=-1, padding='post')

# Acoustic model: masked MFCC frames -> two stacked BiLSTMs -> per-frame softmax
# One output unit per char_map entry plus one extra class
num_classes = len(char_map) + 1

input_data = Input(shape=(None, 13), name='the_input')
# All-zero frames (the feature padding value) are masked out for the recurrent layers
masking_layer = Masking(mask_value=0.0)(input_data)
bilstm_layer_1 = Bidirectional(LSTM(units=128, return_sequences=True))(masking_layer)
bilstm_layer_2 = Bidirectional(LSTM(units=128, return_sequences=True))(bilstm_layer_1)
time_dense = TimeDistributed(Dense(units=num_classes))(bilstm_layer_2)
y_pred = Activation(activation='softmax', name='activation')(time_dense)

# Define the CTC loss function
def ctc_loss(y_true, y_pred):
    """CTC loss for targets packed into a single 3-channel tensor.

    Args:
        y_true: Tensor indexed as (batch, label_position, channel) where
            channel 0 holds the label ids, ``[:, 0, 1]`` holds each sample's
            input length and ``[:, 0, 2]`` holds each sample's label length.
        y_pred: Per-timestep class probabilities produced by the model.

    Returns:
        The result of ``tf.keras.backend.ctc_batch_cost`` for the batch.
    """
    labels = tf.cast(y_true[:, :, 0], tf.int32)
    # Slice with 1:2 / 2:3 (not a scalar index) so the lengths keep their
    # trailing axis: ctc_batch_cost expects (batch, 1) and squeezes that axis
    # itself, which fails on a rank-1 (batch,) tensor.
    input_length = tf.cast(y_true[:, 0, 1:2], tf.int32)
    label_length = tf.cast(y_true[:, 0, 2:3], tf.int32)
    return tf.keras.backend.ctc_batch_cost(labels, y_pred, input_length, label_length)

# Compile the model with CTC loss
# Extra inputs carry the targets into the graph so the loss is computed by a layer
labels = Input(name='the_labels', shape=[None], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')


def _ctc_lambda(args):
    """Return the per-sample CTC cost for a Lambda layer.

    A Lambda layer calls its function with ONE positional argument (here the
    list of four tensors), so the list is unpacked inside the function; a
    two-argument function raises "missing 1 required positional argument".
    """
    y_true_labels, y_hat, in_len, lab_len = args
    return tf.keras.backend.ctc_batch_cost(y_true_labels, y_hat, in_len, lab_len)


loss_out = Lambda(_ctc_lambda, output_shape=(1,), name='ctc')(
    [labels, y_pred, input_length, label_length]
)

model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
# The 'ctc' layer output already IS the loss, so the compiled loss just passes it through
model.compile(optimizer=tf.keras.optimizers.Adam(), loss={'ctc': lambda y_true, y_pred: y_pred})

# Split the data into training and validation sets
(
    X_train, X_val,
    y_train, y_val,
    input_length_train, input_length_val,
    label_length_train, label_length_val,
) = train_test_split(X, y, input_lengths, label_lengths, test_size=0.2, random_state=42)

# Data generator
def data_generator(X, y, input_lengths, label_lengths, batch_size=32):
    """Endlessly yield ``(inputs, y_true)`` batches in sequential order.

    Args:
        X: Sliceable collection of feature sequences.
        y: Sliceable collection of label sequences.
        input_lengths: Per-sample one-element sequences holding the feature length.
        label_lengths: Per-sample one-element sequences holding the label length.
        batch_size: Maximum number of samples per batch (default 32).

    Yields:
        A tuple ``(inputs, y_true)``. ``inputs`` is a dict with keys
        'the_input', 'the_labels', 'input_length' and 'label_length'.
        ``y_true`` has shape (samples_in_batch, max_label_len, 3): channel 0
        holds the labels, ``[:, 0, 1]`` the input length and ``[:, 0, 2]``
        the label length.

    Raises:
        ValueError: On the first ``next()`` if ``X`` is empty, because the
            loop would otherwise spin forever without yielding.
    """
    if len(X) == 0:
        raise ValueError("data_generator received no samples; it would never yield a batch.")

    while True:
        for start in range(0, len(X), batch_size):
            stop = start + batch_size
            X_batch = X[start:stop]
            y_batch = y[start:stop]
            input_lengths_batch = input_lengths[start:stop]
            label_lengths_batch = label_lengths[start:stop]

            # Size the target by the ACTUAL batch length: the last slice can be
            # shorter than batch_size and must stay aligned with the inputs.
            n_samples = len(y_batch)
            max_label_len = max(len(seq) for seq in y_batch)
            y_true = np.zeros((n_samples, max_label_len, 3))
            for j in range(n_samples):
                y_true[j, :len(y_batch[j]), 0] = y_batch[j]
                y_true[j, 0, 1] = input_lengths_batch[j][0]
                y_true[j, 0, 2] = label_lengths_batch[j][0]

            yield (
                {
                    'the_input': np.array(X_batch),
                    'the_labels': np.array(y_batch),
                    'input_length': np.array(input_lengths_batch),
                    'label_length': np.array(label_lengths_batch)
                },
                y_true
            )

# Single source of truth for the batch size used by both generators and the step counts
BATCH_SIZE = 32

train_gen = data_generator(X_train, y_train, input_length_train, label_length_train, batch_size=BATCH_SIZE)
val_gen = data_generator(X_val, y_val, input_length_val, label_length_val, batch_size=BATCH_SIZE)

# Round up: the generators also yield the final partial batch, and floor
# division would give 0 steps for a split smaller than one batch.
steps_per_epoch = (len(X_train) + BATCH_SIZE - 1) // BATCH_SIZE
validation_steps = (len(X_val) + BATCH_SIZE - 1) // BATCH_SIZE

# Train the model
model.fit(train_gen, steps_per_epoch=steps_per_epoch, epochs=10, validation_data=val_gen, validation_steps=validation_steps)

# Save the model
model.save('asr_model.keras')

# Build the inference model from the same input/output tensors as the training
# graph. It reuses the trained layers, so no weight copy is needed; slicing
# the weight list would drop the final Dense kernel and bias.
inference_model = Model(inputs=input_data, outputs=y_pred)

# Load a sample and predict
sample_index = 0
# X, input_lengths and df were built in the same row order, so one index
# addresses the features, the true frame count and the transcript alike.
sample_features = np.expand_dims(X[sample_index], axis=0)
# ctc_decode expects input_length with shape (samples,)
sample_input_length = np.array([input_lengths[sample_index][0]])

preds = inference_model.predict(sample_features)
decoded_pred = tf.keras.backend.ctc_decode(preds, input_length=sample_input_length)[0][0]
decoded_pred = tf.keras.backend.get_value(decoded_pred)

# decoded_pred is (samples, decoded_len); take the single sample's row, skip
# the -1 padding and any class id that has no entry in index_map.
predicted_text = ''.join(index_map.get(int(i), '') for i in decoded_pred[0] if i != -1)

actual_text = df.iloc[sample_index]['transcript']

print(f"Predicted text: {predicted_text}")
print(f"Actual text: {actual_text}")

TypeError: Exception encountered when calling layer "ctc" (type Lambda).

ctc_loss() missing 1 required positional argument: 'y_pred'

Call arguments received by layer "ctc" (type Lambda):
  • inputs=['tf.Tensor(shape=(None, None), dtype=float32)', 'tf.Tensor(shape=(None, None, 38), dtype=float32)', 'tf.Tensor(shape=(None, 1), dtype=int64)', 'tf.Tensor(shape=(None, 1), dtype=int64)']
  • mask=['None', 'tf.Tensor(shape=(None, None), dtype=bool)', 'None', 'None']
  • training=None