In [1]:
# Import necessary libraries
import joblib
import pandas as pd
import numpy as np
import librosa
from scipy.signal import stft, find_peaks
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
# Define a function to extract audio features
def extract_features(audio_file):
    """
    Load an audio file and extract features such as mean and standard deviation of MFCCs.

    Args:
        audio_file (str): Path to the audio file

    Returns:
        pd.Series: A pandas series containing the extracted features
    """
    # Load audio file
    y, sr = librosa.load(audio_file, sr=44100)

    # Extract some audio features (e.g., mean and standard deviation of MFCCs)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfccs_mean = mfccs.mean(axis=1)
    mfccs_std = mfccs.std(axis=1)

    _, _, sxx = stft(y, sr, nfft=512)

    sxx = np.abs(sxx)

    melsxx = librosa.feature.melspectrogram(y=y, sr=sr)

    # Extract MFCCs (Mel-Frequency Cepstral Coefficients)
    delta_mfccs = librosa.feature.delta(mfccs)
    delta2_mfccs = librosa.feature.delta(mfccs, order=2)

    combined_mfccs = np.concatenate((mfccs, delta_mfccs, delta2_mfccs))
    combined_mfccs1 = np.concatenate((mfccs, delta_mfccs))

    # Extract pitch
    pitches, magnitudes = librosa.core.piptrack(y=y, sr=sr)
    pitch = np.mean(pitches[magnitudes > np.median(magnitudes)])

    # Extract Zero Crossing Rate
    zcr = librosa.feature.zero_crossing_rate(y=y)

    # Extract spectral contrast
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)

    peaks, _ = find_peaks(sxx.flatten())

    # Extract Energy
    energy = librosa.feature.rms(y=y)

    # Calculate the spectral centroid
    spectral_centroid = librosa.feature.spectral_centroid(S=sxx)[0]

    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    spectral_flatness = librosa.feature.spectral_flatness(y=y)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr, n_fft=512)
    # chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr)
    # chroma_cqt = librosa.feature.chroma_cqt(y=y, sr=sr)

    # Compute the tempo (beats per minute)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)


    # Create a pandas series with the extracted features
    # columns = ["mfccs_mean_1", "mfccs_mean_2", "mfccs_std_1", "mfccs_std_2"]
    columns = [
        "stft",
        "melSpectro",
        "pitch",
        "zcr",
        "mfcc",
        "chroma",
        "spectral_contrast",
        "peak_mean",
        "energy"
]
    df = [sxx.flatten(), melsxx.flatten(), pitches.flatten(), zcr.flatten(), combined_mfccs,
     spectral_contrast.flatten(), peaks.flatten(), energy.flatten()]
    # df = np.concatenate([sxx.flatten(), melsxx.flatten(), pitches.flatten(), zcr.flatten(), combined_mfccs.flatten(),
        #  chroma.flatten(), spectral_contrast.flatten(), np.mean(peaks), energy.flatten()])
    
    # ---------------------------------------------
    # IMPORTNANT!!!\/\/
    p_features = zcr.flatten().tolist() + \
        mfccs.flatten().tolist() + \
        mfccs_mean.flatten().tolist() + \
        mfccs_std.flatten().tolist() + \
        pitch.flatten().tolist() + \
        energy.flatten().tolist() # + \
        # spectral_centroid.flatten().tolist() + \
        # tempo.flatten().tolist()
    
    w_features = sxx.flatten().tolist() + \
        melsxx.flatten().tolist() + \
        zcr.flatten().tolist() + \
        combined_mfccs.flatten().tolist() + \
        spectral_contrast.flatten().tolist() + \
        spectral_centroid.flatten().tolist() + \
        spectral_bandwidth.flatten().tolist() + \
        spectral_rolloff.flatten().tolist() + \
        spectral_flatness.flatten().tolist() + \
        chroma_stft.flatten().tolist() + \
        mfccs_mean.flatten().tolist() + \
        mfccs_std.flatten().tolist() + \
        pitch.flatten().tolist() + \
        energy.flatten().tolist() + \
        tempo.flatten().tolist()
        # chroma_cens.flatten().tolist() + \
    #     chroma_cqt.flatten().tolist()
    
    all_features = mfccs_mean.flatten().tolist()  + \
        mfccs_std.flatten().tolist() + \
        zcr.flatten().tolist() + \
        pitch.flatten().tolist() + \
        energy.flatten().tolist() + \
        tempo.flatten().tolist() + \
        sxx.flatten().tolist() + \
        melsxx.flatten().tolist() + \
        combined_mfccs.flatten().tolist() + \
        spectral_contrast.flatten().tolist() + \
        spectral_centroid.flatten().tolist() + \
        spectral_bandwidth.flatten().tolist() + \
        spectral_rolloff.flatten().tolist() + \
        spectral_flatness.flatten().tolist() + \
        chroma_stft.flatten().tolist() #+ \
        # chroma_cqt.flatten().tolist()
        # chroma_cens.flatten().tolist() #+ \
    
    # ---------------------------------------------
    # d = np.concatenate(melsxx.flatten())
    # + melsxx.flatten() + pitch.flatten() + \
        # chroma.flatten() + spectral_contrast.flatten() + \
        # energy.flatten() + zcr.flatten()
    data = {
        column: [value]
        for column, value in zip(columns, df)
    }
    # ---------------------------------------------
    return pd.DataFrame(p_features).T, pd.DataFrame(w_features).T
    # return pd.DataFrame(all_features).T
    # ---------------------------------------------
    # d_flat = np.ndarray(d)
    # d_flat.shape
    # features = pd.DataFrame(d_flat)
    # return features

In [3]:
p, w = extract_features(f"./recordz/Omar/d150.wav")

In [4]:
def create_df(length, person_labels, word_labels, sub_length):
    dfp = pd.DataFrame([])
    dfw = pd.DataFrame([])
    m = 1
    for p in person_labels:
        for i in range(m, sub_length + m):
            for w in word_labels:
                audio_file = f"./recordz/{p}/{w}{i}.wav"
                print(audio_file)
                person, word = extract_features(audio_file)
                person['person_label'] = p
                word['word_label'] = w
                # Create a DataFrame with a single row
                dfp = pd.concat([dfp, person])
                dfw = pd.concat([dfw, word])
        m += 15
        if p == "Alaa" or p == "Mahmoud":
            m += 15
    return dfp, dfw

In [5]:
def create_one_df(length, person_labels, word_labels, sub_length):
    df = pd.DataFrame([])
    # labeled_df = pd.DataFrame([])
    m = 1
    for p in person_labels:
        for i in range(m, sub_length + m):
            for w in word_labels:
                audio_file = f"./recordz/{p}/{w}{i}.wav"
                print(audio_file)
                d = extract_features(audio_file)
                # d['index'] = i
                d['person_label'] = p
                d['word_label'] = w
                # Create a DataFrame with a single row
                df = pd.concat([df, d])
                # x = pd.concat([x, d])
        m += 15
        if p == "Alaa" or p == "Mahmoud":
            m += 15
    return df

In [6]:
person_labels = ['Mahmoud', 
                #  'Ziyad',
                   'Amgad', 'Alaa',
                #  'Ibrahim',
                 'Ali', 'Marwan', 'Shawky', 'Abdallah', 'Omar',]
word_labels = ['d', 'g', 'm']
length = 150
sub_length = 15

dfp, dfw = create_df(length, person_labels, word_labels, sub_length)

./recordz/Mahmoud/d1.wav


./recordz/Mahmoud/g1.wav
./recordz/Mahmoud/m1.wav
./recordz/Mahmoud/d2.wav
./recordz/Mahmoud/g2.wav
./recordz/Mahmoud/m2.wav
./recordz/Mahmoud/d3.wav
./recordz/Mahmoud/g3.wav
./recordz/Mahmoud/m3.wav
./recordz/Mahmoud/d4.wav
./recordz/Mahmoud/g4.wav
./recordz/Mahmoud/m4.wav
./recordz/Mahmoud/d5.wav
./recordz/Mahmoud/g5.wav
./recordz/Mahmoud/m5.wav
./recordz/Mahmoud/d6.wav
./recordz/Mahmoud/g6.wav
./recordz/Mahmoud/m6.wav
./recordz/Mahmoud/d7.wav
./recordz/Mahmoud/g7.wav
./recordz/Mahmoud/m7.wav
./recordz/Mahmoud/d8.wav
./recordz/Mahmoud/g8.wav
./recordz/Mahmoud/m8.wav
./recordz/Mahmoud/d9.wav
./recordz/Mahmoud/g9.wav
./recordz/Mahmoud/m9.wav
./recordz/Mahmoud/d10.wav
./recordz/Mahmoud/g10.wav
./recordz/Mahmoud/m10.wav
./recordz/Mahmoud/d11.wav
./recordz/Mahmoud/g11.wav
./recordz/Mahmoud/m11.wav
./recordz/Mahmoud/d12.wav
./recordz/Mahmoud/g12.wav
./recordz/Mahmoud/m12.wav
./recordz/Mahmoud/d13.wav
./recordz/Mahmoud/g13.wav
./recordz/Mahmoud/m13.wav
./recordz/Mahmoud/d14.wav
./recordz/Ma

In [7]:
dfkp = pd.DataFrame([])
dfkw = pd.DataFrame([])
# dfk = pd.DataFrame([])
p = "Kamal"
for i in range(121, 136):
    for w in word_labels:
        audio_file = f"./recordz/{p}/{w}{i}.wav"
        print(audio_file)
        pf, wf = extract_features(audio_file)
        pf['person_label'] = p
        wf['word_label'] = w
        # dfk = pd.concat([dfk, f])
        # Create a DataFrame with a single row
        dfkp = pd.concat([dfkp, pf])
        dfkw = pd.concat([dfkw, wf])
dfp = pd.concat([dfp, dfkp])
dfw = pd.concat([dfw, dfkw])
# df = pd.concat([df, dfk])

./recordz/Kamal/d121.wav


./recordz/Kamal/g121.wav
./recordz/Kamal/m121.wav
./recordz/Kamal/d122.wav
./recordz/Kamal/g122.wav
./recordz/Kamal/m122.wav
./recordz/Kamal/d123.wav
./recordz/Kamal/g123.wav
./recordz/Kamal/m123.wav
./recordz/Kamal/d124.wav
./recordz/Kamal/g124.wav
./recordz/Kamal/m124.wav
./recordz/Kamal/d125.wav
./recordz/Kamal/g125.wav
./recordz/Kamal/m125.wav
./recordz/Kamal/d126.wav
./recordz/Kamal/g126.wav
./recordz/Kamal/m126.wav
./recordz/Kamal/d127.wav
./recordz/Kamal/g127.wav
./recordz/Kamal/m127.wav
./recordz/Kamal/d128.wav
./recordz/Kamal/g128.wav
./recordz/Kamal/m128.wav
./recordz/Kamal/d129.wav
./recordz/Kamal/g129.wav
./recordz/Kamal/m129.wav
./recordz/Kamal/d130.wav
./recordz/Kamal/g130.wav
./recordz/Kamal/m130.wav
./recordz/Kamal/d131.wav
./recordz/Kamal/g131.wav
./recordz/Kamal/m131.wav
./recordz/Kamal/d132.wav
./recordz/Kamal/g132.wav
./recordz/Kamal/m132.wav
./recordz/Kamal/d133.wav
./recordz/Kamal/g133.wav
./recordz/Kamal/m133.wav
./recordz/Kamal/d134.wav
./recordz/Kamal/g134.wav


In [8]:
# df.head(1)

In [9]:
dfwv = dfw.dropna(axis=0).dropna(axis=1)

In [10]:
dfp.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2553,2554,2555,2556,2557,2558,2559,2560,2561,person_label
0,0.089844,0.094238,0.112305,0.064941,0.058105,0.10791,0.128906,0.175781,0.195312,0.191406,...,0.013817,0.013135,0.010284,0.009722,0.008777,0.005822,0.005309,0.005881,0.005176,Mahmoud
0,0.089844,0.109863,0.138672,0.121582,0.143066,0.176758,0.1875,0.204102,0.195312,0.191406,...,0.00224,0.00202,0.002087,0.00199,0.001887,0.001993,0.001857,0.001768,0.001543,Mahmoud


In [11]:
y_p = dfp["person_label"]
y_w = dfw["word_label"]

pX = dfp.drop(['person_label'], axis=1)
wX = dfwv.drop(['word_label'], axis=1)
# # wX = dfw.drop(['word_label'], axis=1)
# X = dfv.drop(['person_label', 'word_label'], axis=1)

In [12]:
# from sklearn.manifold import TSNE

# # Assuming you have a high-dimensional dataset X
# # Specify the number of components (2 for 2D, 3 for 3D)
# tsne = TSNE(n_components=3, random_state=42)

# pX.columns = pX.columns.astype(str)
# wX.columns = wX.columns.astype(str)

# # Fit and transform the data to the lower-dimensional space
# pX_tsne = tsne.fit_transform(pX)
# wX_tsne = tsne.fit_transform(wX)

In [13]:
# print(pX_tsne.shape)
# pX.shape

In [14]:
# lens = [len(seq) for seq in df["energy"]]
# lens = [len(seq) for seq in df["energy"] if len(seq) != 169]
# lens

In [15]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [16]:
labelencoder = LabelEncoder()
# Assigning numerical values and storing in another column
y_p_le = labelencoder.fit_transform(y_p)
reshaped_y_p_le = np.reshape(y_p_le, -1)
# print(reshaped_y_p_le.tolist())

y_w_le = labelencoder.fit_transform(y_w)
reshaped_y_w_le = np.reshape(y_w_le, -1)
# reshaped_y_w_le.tolist()

labelencoder.classes_

array(['d', 'g', 'm'], dtype=object)

In [17]:
# enc = OneHotEncoder(handle_unknown='ignore')

# y_p_ohe = pd.DataFrame(enc.fit_transform(reshaped_y_p_le).toarray())
# merge with main df bridge_df on key values
# y_p_ohe

In [18]:
py = reshaped_y_p_le.ravel()
wy = reshaped_y_w_le.ravel()

pX = pX.values.reshape((len(py), len(pX.columns)))
wX = wX.values.reshape((len(wy), len(wX.columns)))
# X = X.values.reshape((len(wy), len(X.columns)))

In [19]:
pX_train, pX_test, py_train, py_test = train_test_split(pX, py, test_size=0.2, random_state=42)
wX_train, wX_test, wy_train, wy_test = train_test_split(wX, wy, test_size=0.2, random_state=42)

In [20]:
p_model_svc = SVC(kernel='linear', probability=True)
p_model_svc.fit(pX_train, py_train)

In [21]:
p_svc_proba_pred = p_model_svc.predict_proba(pX_test)
p_svc_pred = p_model_svc.predict(pX_test)

accuracy = accuracy_score(py_test, p_svc_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9135802469135802


In [22]:
# p_model_gb = GradientBoostingClassifier()
# p_model_gb.fit(pX_train, py_train)

In [23]:
# p_gb_proba_pred = p_model_gb.predict_proba(pX_test)
# p_gb_pred = p_model_gb.predict(pX_test)


# accuracy = accuracy_score(py_test, p_gb_pred)
# print(f"Accuracy: {accuracy}")

In [24]:
# p_model_rf = RandomForestClassifier(n_estimators=100)
# p_model_rf.fit(pX_train, py_train)

In [25]:
# p_rf_proba_pred = p_model_rf.predict_proba(pX_test)
# p_rf_pred = p_model_rf.predict(pX_test)

# accuracy = accuracy_score(py_test, p_rf_pred)
# print(f"Accuracy: {accuracy}")

In [26]:
w_model_svc = SVC(kernel='linear', probability=True)
w_model_svc.fit(wX_train, wy_train)

In [27]:
w_svc_proba_pred = w_model_svc.predict_proba(wX_test)
w_svc_pred = w_model_svc.predict(wX_test)

accuracy = accuracy_score(wy_test, w_svc_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8888888888888888


In [28]:
# # Use GridSearchCV for cross-validation
# grid_search = GridSearchCV(w_model_svc, param_grid, cv=5)
# grid_search.fit(wX, y_w)

# # Get the best parameter
# w_best_C = grid_search.best_params_['C']
# w_best_C

In [29]:
# w_model_gb = GradientBoostingClassifier()
# w_model_gb.fit(wX_train, wy_train)

In [30]:
# w_gb_proba_pred = w_model_gb.predict_proba(wX_test)
# w_gb_pred = w_model_gb.predict(wX_test)


# accuracy = accuracy_score(wy_test, w_gb_pred)
# print(f"Accuracy: {accuracy}")

In [31]:
# w_model_rf = RandomForestClassifier(n_estimators=100)
# w_model_rf.fit(wX_train, wy_train)

In [32]:
# w_rf_proba_pred = w_model_rf.predict_proba(wX_test)
# w_rf_pred = w_model_rf.predict(wX_test)

# accuracy = accuracy_score(wy_test, w_rf_pred)
# print(f"Accuracy: {accuracy}")

In [33]:
# from sklearn.model_selection import cross_val_score

# p_cv_scores = cross_val_score(p_model_svc, pX, y_p, cv=4)  # Adjust the number of folds (cv)

# # Print the cross-validation scores
# print("Cross-validation Scores:", p_cv_scores)
# print("Mean CV Score:", p_cv_scores.mean())

# # Adjust the number of folds (cv)
# w_cv_scores = cross_val_score(w_model_svc, wX, y_w, cv=4)

# # Print the cross-validation scores
# print("Cross-validation Scores:", w_cv_scores)
# print("Mean CV Score:", w_cv_scores.mean())

# Save


In [34]:
# Save the model
joblib.dump(p_model_svc, 'p_model_svc.pkl')
joblib.dump(w_model_svc, 'w_model_svc_z.pkl')

['w_model_svc.pkl']

In [35]:
# max(w_svc_proba_pred[52])

In [36]:
# vx = w_svc_pred[52:88]
# d = labelencoder.inverse_transform(vx)
# for i in range(len(d)):
#     print(d[i], "  ", vx[i])

In [37]:
# from pydub import AudioSegment
# from pydub.playback import play
# from pydub.effects import low_pass_filter

# # Load the audio file
# audio = AudioSegment.from_file(f"./recordz/Omar/d150.wav", format="wav")

# # Apply a low-pass filter (example values; you may need to adjust parameters)
# cutoff_frequency = 3000
# filtered_audio = low_pass_filter(audio, cutoff_frequency)

# play(audio)
# # Play the filtered audio
# play(filtered_audio)

# # Print information about the filtered audio
# print(filtered_audio)

spectral_centroid = librosa.feature.spectral_centroid(y, sr=sr)
spectral_bandwidth = librosa.feature.spectral_bandwidth(y, sr=sr)
spectral_contrast = librosa.feature.spectral_contrast(y, sr=sr)
spectral_rolloff = librosa.feature.spectral_rolloff(y, sr=sr)
spectral_flatness = librosa.feature.spectral_flatness(y, sr=sr)
chroma_stft = librosa.feature.chroma_stft(y, sr=sr)
chroma_cens = librosa.feature.chroma_cens(y, sr=sr)
chroma_cqt = librosa.feature.chroma_cqt(y, sr=sr)