In [1]:
# Audio processing
import librosa
import noisereduce
# Data cleaning
import os
import pandas as pd
import numpy as np
from numpy import mean, var
# Machine learning
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
# Advanced options
import warnings
warnings.filterwarnings("ignore")

In [2]:
FRAMESIZE = 1024
HOPLENGTH = 512
MELS = 13

In [3]:
x_features = np.array(pd.read_csv('..\\..\\data\\speaker_data.csv'))
y_features = np.array(pd.read_csv('..\\..\\data\\speaker_target.csv'))
x_features

array([[2.78430246e-02, 2.08723382e-03, 1.36342971e-02, ...,
        5.92851219e+01, 7.02330399e+01, 6.59210815e+01],
       [3.36477980e-02, 4.33203066e-03, 1.53903021e-02, ...,
        7.47276688e+01, 8.50247574e+01, 1.04147827e+02],
       [2.33854018e-02, 1.25041464e-03, 1.33162923e-02, ...,
        4.32921295e+01, 5.31233444e+01, 5.19314690e+01],
       ...,
       [5.10469340e-02, 5.12684276e-03, 1.93619933e-02, ...,
        1.90371382e+00, 2.08918190e+00, 1.83357382e+00],
       [3.97482328e-02, 4.21313103e-03, 1.57446042e-02, ...,
        3.57322426e+01, 2.41789112e+01, 4.22590942e+01],
       [2.74238139e-02, 9.07653593e-04, 1.06191849e-02, ...,
        4.48041677e-01, 3.65600407e-01, 3.30244064e-01]])

## Data Modeling

- We will model the data with those five classifiers:
    - Logistic Regression
    - XGBoost
    - SVM
    - Random Forest
- And we will model the Big-Chunk-Data vs Small-Chunk-Data and compare between them.

### Hyperparameters Tuning

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x_features, y_features, test_size=0.3)

1- Logistic Regression

In [5]:
# LR = LogisticRegression()
# LRparam_grid = {
#     'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
#     'penalty': ['l1', 'l2'],
#     # 'max_iter': list(range(100,800,100)),
#     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
# }
# LR_search = GridSearchCV(LR, param_grid=LRparam_grid, refit = True, verbose = 3, cv=5)
#
# # fitting the model for grid search
# LR_search.fit(x_train , y_train)
# LR_search.best_params_
# # summarize
# print('Mean Accuracy: %.3f' % LR_search.best_score_)
# print('Config: %s' % LR_search.best_params_)

In [6]:
# def random_search_tuning():
#     # define the train set and test set
#     x_train, x_val, y_train, y_val = train_test_split(x_features, y_features, test_size=0.05)
#     print("Shapes - X_train: ", x_train.shape,
#           ", X_val: ", x_val.shape, ", y_train: ",
#           y_train.shape, ", y_val: ", y_val.shape)
#
#     params = {'max_depth': [3, 6, 10, 15],
#               'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4],
#               'subsample': np.arange(0.5, 1.0, 0.1),
#               'colsample_bytree': np.arange(0.5, 1.0, 0.1),
#               'colsample_bylevel': np.arange(0.5, 1.0, 0.1),
#               'n_estimators': [100, 250, 500, 750],
#               'num_class': [10]
#               }
#
#     xgbclf = XGBClassifier(objective="multi:softmax", tree_method='hist', random_state=0)
#     clf = RandomizedSearchCV(estimator=xgbclf,
#                              param_distributions=params,
#                              scoring='accuracy',
#                              n_iter=25,
#                              n_jobs=4,
#                              verbose=1)
#
#     clf.fit(x_train, y_train)
#
#     best_combination = clf.best_params_
#
#     return best_combination
#
#
# best_params = random_search_tuning()
# print("Best hyperparameter combination: ", best_params)

In [7]:
# # Number of trees in random forest
# n_estimators = [int(x) for x in range(200,2000,200)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]
# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}
#
# # Use the random grid to search for best hyperparameters
# # First create the base model to tune
# rf = RandomForestClassifier()
# # Random search of parameters, using 3 fold cross validation,
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# # Fit the random search model
# rf_random.fit(x_train, y_train)
#
#
# def evaluate(model, test_features, test_labels):
#     y_pred = model.predict(test_features)
#     accuracy = accuracy_score(y_test, y_pred)
#     print (accuracy)
#     print(confusion_matrix(y_test,y_pred))
#
#
# best_random = rf_random.best_estimator_
# evaluate(best_random, x_test, y_test)
#
# print(rf_random.best_params_)

In [8]:
# # example of grid searching key hyperparametres for KNeighborsClassifier
# # define dataset
# X, y = make_blobs(n_samples=1000, centers=2, n_features=100, cluster_std=20)
# # define models and parameters
# model = KNeighborsClassifier()
# n_neighbors = range(1, 21, 2)
# weights = ['uniform', 'distance']
# metric = ['euclidean', 'manhattan', 'minkowski']
# # define grid search
# grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
# grid_result = grid_search.fit(X, y)
# # summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

### Model Selection

In [9]:
def train_model(model, x_features, y_features):

    x_train, x_test, y_train, y_test = train_test_split(x_features[:,:], y_features, test_size=0.3)
    sc = StandardScaler()

    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)

    model.fit(x_train, y_train.ravel())
    y_pred = model.predict(x_test)

    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, model.predict(x_test))
    score = cross_val_score(model, x_features, y_features, cv=KFold(n_splits=15))

    print(f'Model Accuracy = {accuracy}\n Confusion Matrix = {cm} \n Avg Cross Validation Score = {score.mean()}')

    # # Predicting the training set
    # # result through scatter plot
    # X_set, y_set = x_train[:,:], y_train[:,0]
    # X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1,
    #                      stop = X_set[:, 0].max() + 1, step = 0.01),
    #                      np.arange(start = X_set[:, 1].min() - 1,
    #                      stop = X_set[:, 1].max() + 1, step = 0.01))
    #
    # plt.contourf(X1, X2, model.predict(np.array([X1.ravel(),
    #              X2.ravel()]).T).reshape(X1.shape), alpha = 0.5,
    #              cmap = ListedColormap(('#b35c52', '#6a9681', '#6b7ca8', '#9483a0', '#adaa7b')))
    #
    # plt.xlim(X1.min(), X1.max())
    # plt.ylim(X2.min(), X2.max())
    #
    # for i, j in enumerate(np.unique(y_set)):
    #     plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
    #                 color = ListedColormap(('#9b2012', '#0f633b', '#234499', '#602d90', '#88842a'))(i), label = j)
    #
    # plt.title('Model (Training set)')
    # plt.xlabel('PCA1') # for Xlabel
    # plt.ylabel('PCA2') # for Ylabel
    # plt.legend() # to show legend
    #
    # # show scatter plot
    # plt.show()

    return model

In [10]:
lr = LogisticRegression(random_state = 0, C = 0.1, penalty = 'l2', solver = 'saga')
# svm = SVC(C=100, kernel='linear', random_state = 0)
xgb = XGBClassifier(objective="multi:softmax", tree_method='hist', random_state=0, subsample = 0.6, num_class = 10, n_estimators = 500, max_depth = 3, learning_rate = 0.4, colsample_bytree = 0.7, colsample_bylevel = 0.5)
rf = RandomForestClassifier(random_state = 0, n_estimators=200, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', max_depth=90, bootstrap=False)
knn = KNeighborsClassifier(n_neighbors=15, metric='euclidean', weights='uniform')

In [11]:
print(f"Logistic Regression\n")
lr_s = train_model(lr, x_features, y_features)

Logistic Regression

Model Accuracy = 0.8275862068965517
 Confusion Matrix = [[ 4  0  0  0  2]
 [ 1  7  0  2  0]
 [ 0  0 16  2  0]
 [ 0  0  1 12  1]
 [ 0  0  0  1  9]] 
 Avg Cross Validation Score = 0.08504273504273505


In [12]:
print(f"XGBoost\n")
train_model(xgb, x_features, y_features)

XGBoost

Model Accuracy = 0.7931034482758621
 Confusion Matrix = [[ 5  0  0  0  0]
 [ 0  7  1  0  0]
 [ 0  1 14  1  0]
 [ 0  1  2  8  0]
 [ 0  0  0  6 12]] 
 Avg Cross Validation Score = 0.8529914529914531


In [13]:
print(f"Random Forest\n")
train_model(rf, x_features, y_features)

Random Forest

Model Accuracy = 0.7931034482758621
 Confusion Matrix = [[ 5  0  0  0  1]
 [ 0 11  2  0  0]
 [ 0  0 15  2  3]
 [ 0  1  1  9  2]
 [ 0  0  0  0  6]] 
 Avg Cross Validation Score = 0.8525641025641026


In [14]:
print(f"KNN\n")
train_model(knn, x_features, y_features)

KNN

Model Accuracy = 0.7758620689655172
 Confusion Matrix = [[ 3  0  0  0  0]
 [ 0 11  3  1  1]
 [ 0  1 13  0  0]
 [ 0  1  3  6  3]
 [ 0  0  0  0 12]] 
 Avg Cross Validation Score = 0.19743589743589746


### Model Building

In [15]:
# performing preprocessing part
sc = StandardScaler()

x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [16]:
classifier = RandomForestClassifier(random_state = 0, n_estimators=200, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', max_depth=90, bootstrap=False)

classifier.fit(x_train, y_train)
classifier

### Model verification

In [17]:
# Functions we will use
def transform_audio(audio, FRAMESIZE, HOPLENGTH, MELS):

    audio_noised, sr = librosa.load(audio, duration=2)
    audio_array = noisereduce.reduce_noise(y=audio_noised, sr=sr)
    # audio_array, sr = librosa.load(audio, duration=2)

    log_mel_audio_list_mean = []
    log_mel_audio_list_var = []
    mfccs_audio_list_mean = []
    mfccs_audio_list_var = []
    cqt_audio_list_mean = []
    cqt_audio_list_var = []
    chromagram_audio_list_mean = []
    chromagram_audio_list_var = []
    tone_audio_list_mean = []
    tone_audio_list_var = []

    log_mel_audio = librosa.power_to_db(librosa.feature.melspectrogram(audio_array, sr=sr, n_fft=FRAMESIZE, hop_length=HOPLENGTH, n_mels=MELS))
    mfccs_audio = librosa.feature.mfcc(y=audio_array, n_mfcc=MELS, sr=sr, n_fft=FRAMESIZE, hop_length=HOPLENGTH)

    cqt_audio = np.abs(librosa.cqt(y=audio_array, sr=sr, hop_length=HOPLENGTH))
    chromagram_audio = librosa.feature.chroma_stft(audio_array, sr=sr, n_fft=FRAMESIZE, hop_length=HOPLENGTH)
    tone_audio = librosa.feature.tonnetz(y=audio_array, sr=sr)

    for i in range(len(log_mel_audio)):
         log_mel_audio_list_mean.append(log_mel_audio[i].mean())
         log_mel_audio_list_var.append(log_mel_audio[i].var())

    for i in range(len(mfccs_audio)):
         mfccs_audio_list_mean.append(mfccs_audio[i].mean())
         mfccs_audio_list_var.append(mfccs_audio[i].var())

    for i in range(len(cqt_audio)):
         cqt_audio_list_mean.append(cqt_audio[i].mean())
         cqt_audio_list_var.append(cqt_audio[i].var())

    for i in range(len(chromagram_audio)):
         chromagram_audio_list_mean.append(chromagram_audio[i].mean())
         chromagram_audio_list_var.append(chromagram_audio[i].var())

    for i in range(len(tone_audio)):
         tone_audio_list_mean.append(tone_audio[i].mean())
         tone_audio_list_var.append(tone_audio[i].var())

    sb_audio = librosa.feature.spectral_bandwidth(y=audio_array, sr=sr, n_fft=FRAMESIZE, hop_length=HOPLENGTH)

    ae_audio = fancy_amplitude_envelope(audio_array, FRAMESIZE, HOPLENGTH)
    rms_audio = librosa.feature.rms(audio_array, frame_length=FRAMESIZE, hop_length=HOPLENGTH)

    return np.hstack((mean(ae_audio), var(ae_audio), mean(rms_audio), var(rms_audio), mean(sb_audio), var(sb_audio), chromagram_audio_list_mean, chromagram_audio_list_var, tone_audio_list_mean, tone_audio_list_var, cqt_audio_list_mean, cqt_audio_list_var, mfccs_audio_list_mean, mfccs_audio_list_var, log_mel_audio_list_mean, log_mel_audio_list_var))


def fancy_amplitude_envelope(signal, framesize, hoplength):
    return np.array([max(signal[i:i+framesize]) for i in range(0, len(signal), hoplength)])

In [18]:
def test_classifier(path):
    file_paths = f"A:\\Professional\\Engineering CU\\DSP_Data_Verification\\{path}\\"
    for file in os.listdir(file_paths):
        audio = file_paths+file
        x_ver = transform_audio(audio, FRAMESIZE, HOPLENGTH, MELS)
        x_ver = sc.transform(x_ver.reshape(1,-1))

        print(file)
        print(classifier.predict(x_ver))

In [19]:
test_classifier('others')

adham_other13.wav
[1]
adham_other14.wav
[3]
ahmed_others6.wav
[3]
audio.wav
[3]
input1.wav
[0]
input2.wav
[3]
input3.wav
[3]
mahmoud_other13.wav
[2]
mahmoud_other14.wav
[2]


In [20]:
# dest = "..\\..\\models\\"
# picklefile = "rf_speaker_mod"
# pickle.dump(classifier,open(dest + picklefile+'.pkl','wb'))