<a href="https://colab.research.google.com/github/mobinym/Projects/blob/main/ASR_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow keras librosa numpy

In [None]:
# Mount Google Drive at /content/drive so the dataset paths used below
# (under /content/drive/MyDrive) can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from concurrent.futures import ThreadPoolExecutor


In [None]:
# Feature extraction helper
def extract_features(file_name):
    """Summarise one audio file as a vector of time-averaged MFCCs.

    The file is loaded through librosa with a target sampling rate of
    22050 Hz, 13 MFCCs are computed, and every coefficient is averaged
    over all frames.

    Args:
        file_name: Path of the audio file to load.

    Returns:
        The mean of the MFCC matrix taken over its frame axis, i.e. one
        value per coefficient.
    """
    waveform, rate = librosa.load(file_name, sr=22050)
    coefficients = librosa.feature.mfcc(y=waveform, sr=rate, n_mfcc=13)
    # Transposing puts the frames on axis 0, so the mean collapses time.
    per_frame = np.transpose(coefficients)
    return np.mean(per_frame, axis=0)

In [None]:
# Dataset locations on the mounted Google Drive.
audio_path = '/content/drive/MyDrive/DataSets/myaudio'
text_path = '/content/drive/MyDrive/DataSets/myaudio_tiny.xlsx'  # Excel workbook (read with pd.read_excel) holding the audio file names ('audio') and their text ('text')

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read the spreadsheet that pairs each audio file name with its text.
df = pd.read_excel(text_path)

# Build the full path of every clip. The replace() collapses a duplicated
# "myaudio/" segment (presumably some spreadsheet entries already start with
# that folder name -- TODO confirm).
audio_files = [os.path.join(audio_path, f).replace('/myaudio/myaudio/', '/myaudio/') for f in df['audio']]
texts = df['text']

# Fail early with a readable message instead of an error raised from inside a
# worker thread half-way through feature extraction.
missing_files = [path for path in audio_files if not os.path.isfile(path)]
if missing_files:
    raise FileNotFoundError(
        f"{len(missing_files)} audio file(s) listed in the spreadsheet were not found, "
        f"e.g. {missing_files[:3]}"
    )

# Extract the features in parallel. executor.map yields results in the order
# of audio_files, so feature rows stay aligned with the labels.
with ThreadPoolExecutor() as executor:
    features = list(executor.map(extract_features, audio_files))

features = pd.DataFrame(features)
labels = texts.tolist()

# Summaries and a small sample only -- never dump every row / transcript.
features.info()
display(features.describe())
print("Features:", features.head())
print("Labels:", labels[:5])

# Character-level tokenisation of the texts, padded to the longest sequence.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(labels)
sequences = tokenizer.texts_to_sequences(labels)
max_sequence_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

print("Features shape: ", features.shape)
print('X_train', padded_sequences.shape)

# NOTE(review): argmax over axis 1 replaces every padded character sequence by
# the *position* of its largest token id, not by a character id. The resulting
# values lie in [0, max_sequence_length) and are unrelated to the vocabulary --
# confirm that this is really the intended training target.
X_train = np.argmax(padded_sequences, axis=1)
print("Labels shape: ", X_train.shape)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1082 entries, 0 to 1081
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       1082 non-null   float32
 1   1       1082 non-null   float32
 2   2       1082 non-null   float32
 3   3       1082 non-null   float32
 4   4       1082 non-null   float32
 5   5       1082 non-null   float32
 6   6       1082 non-null   float32
 7   7       1082 non-null   float32
 8   8       1082 non-null   float32
 9   9       1082 non-null   float32
 10  10      1082 non-null   float32
 11  11      1082 non-null   float32
 12  12      1082 non-null   float32
dtypes: float32(13)
memory usage: 55.1 KB
Features:               0           1          2          3          4          5   \
0    -231.467545   98.935799  -0.679422  45.096607 -25.350702   7.527362   
1    -181.159988  109.042076  -6.698942  53.532154 -32.726776   6.603848   
2    -238.634201   92.140923   5.017074  48.673630 -21.840

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding

# Reshape the per-clip feature vectors to (samples, 1 time step, coefficients)
# for the LSTM. Unlike expand_dims, this reshape leaves an already 3-D array
# unchanged, so re-running the cell does not keep adding axes.
features = np.asarray(features, dtype=np.float32).reshape(len(features), 1, -1)
if not np.isfinite(features).all():
    raise ValueError("features contain NaN or infinite values; clean them before training.")

# sparse_categorical_crossentropy requires 0 <= label < width of the softmax.
# X_train holds argmax positions inside the padded sequences, so its values can
# exceed the vocabulary size; such labels give NaN loss on GPU and leave the
# model predicting NaN. Size the output layer so that it covers every label.
vocab_size = len(tokenizer.word_index) + 1
num_classes = max(vocab_size, int(np.max(X_train)) + 1)
if num_classes > vocab_size:
    print(f"Warning: labels go up to {num_classes - 1} but the vocabulary has only "
          f"{vocab_size} ids; the output layer was widened to {num_classes} units.")

# Build the model: two stacked bidirectional LSTMs and a softmax classifier.
model = Sequential([
    Bidirectional(LSTM(128, return_sequences=True), input_shape=features.shape[1:]),
    Bidirectional(LSTM(128)),
    Dense(num_classes, activation='softmax'),
])

# Compile the model.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model; validation_split holds out a fraction of the samples.
model.fit(features, X_train, epochs=10, batch_size=32, validation_split=0.2)

#---------------------------------------------------------------------------------------------------------------


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d8ef5a96b00>

In [None]:
import numpy as np

def normalize_features(features):
    """Standardise features to zero mean and unit standard deviation along axis 0.

    Args:
        features: Array-like of numbers. For a 1-D input the statistics are
            taken over all elements; for a 2-D input they are taken per column.

    Returns:
        ``(features - mean) / std`` as a NumPy array. Where the standard
        deviation is zero (constant input) the divisor is replaced by 1, so
        those entries become 0 instead of NaN/inf.
    """
    values = np.asarray(features)
    mean = np.mean(values, axis=0)
    std = np.std(values, axis=0)
    # Guard against division by zero; ones_like keeps the dtype of std.
    safe_std = np.where(std == 0, np.ones_like(std), std)
    normalized_features = (values - mean) / safe_std
    return normalized_features

def predict_text(audio_file, normalize=False):
    """Run the trained model on one audio file and decode its prediction.

    Args:
        audio_file: Path of the audio file to transcribe.
        normalize: When True, standardise the feature vector with
            ``normalize_features`` before predicting. The default is False
            because the model in this notebook is fitted on the raw output of
            ``extract_features``; normalising only at inference time would
            feed it inputs on a different scale from the training data.

    Returns:
        The decoded text for the predicted class, or an empty string when the
        model output contains NaN values.
    """
    feature = extract_features(audio_file)
    if normalize:
        feature = normalize_features(feature)
    # Shape (coefficients,) -> (1 sample, 1 time step, coefficients), the same
    # layout the model is trained on.
    feature = np.asarray(feature, dtype=np.float32).reshape(1, 1, -1)

    # Show the audio features that are fed to the model.
    print("Extracted Features: ", feature)

    # Predict with the model.
    prediction = model.predict(feature)

    # Show the raw model output.
    print("Raw Prediction: ", prediction)

    # Bail out if the prediction contains NaN values.
    if np.isnan(prediction).any():
        print("Prediction contains NaN values. Please check the model and input features.")
        return ""

    # Convert the predicted class id to text; plain Python ints are passed so
    # the tokenizer's index lookup does not depend on NumPy scalar types.
    predicted_sequence = np.argmax(prediction, axis=1).tolist()
    predicted_text = tokenizer.sequences_to_texts([predicted_sequence])

    return predicted_text[0]

# Try the model on a single audio file.
# NOTE(review): this path is hard-coded and lies outside the mounted Drive
# dataset folder; presumably the file is uploaded to the Colab runtime by hand
# before this cell is run -- confirm.
test_audio_file = '/content/12560177.wav'
predicted_text = predict_text(test_audio_file)
print("Predicted Text: ", predicted_text)


Extracted Features:  [[[-3.1818354   1.4371285  -0.16468741  0.73870456  0.09384083
    0.34663373  0.04638166  0.1747061   0.0812481   0.1664555
    0.04077761  0.19929023  0.02135559]]]
Raw Prediction:  [[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan]]
Prediction contains NaN values. Please check the model and input features.
Predicted Text:  
