In [None]:
import time, os, pickle
import joblib
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import missingno as msno
import plotly.graph_objects as go
from datetime import datetime
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import RFE


from sklearn.model_selection import KFold, train_test_split,GridSearchCV
from sklearn.preprocessing import MinMaxScaler,MaxAbsScaler,RobustScaler,StandardScaler
from sklearn.feature_selection import VarianceThreshold,mutual_info_classif, f_classif
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import f_classif, mutual_info_classif, VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from feature_engine.selection import (
    RecursiveFeatureElimination,
    DropConstantFeatures,
    DropDuplicateFeatures,
)
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from operator import itemgetter
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    classification_report,
    confusion_matrix
)

from sklearn.feature_selection import RFECV


#to ignore warnings
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Allow long lines

In [None]:
# Specify the path to the CSV file in the parent directory
file_path = "../Dataset/pd_speech_features/pd_speech_features.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path, sep=',',skiprows=1)
df.drop("id", axis=1, inplace=True)
X = df.iloc[:,0:753]  #independent columns
y = df.iloc[:,-1]    #target column i.e price range

In [None]:
X.head()

In [None]:
y.head()

# Remove constant features

The DropConstantFeatures class from Feature-engine finds and removes constant and quasi-constant features from a dataset. We can remove constant features by setting the parameter tol to 1, or quasi-constant with smaller values for tol.

In [None]:
sel = DropConstantFeatures(tol=1, variables=None, missing_values='raise')

sel.fit(X)

In [None]:
# list of constant features

sel.features_to_drop_

# Remove quasi-constant features

In [None]:
sel = DropConstantFeatures(tol=0.998, variables=None, missing_values='raise')

sel.fit(X)

In [None]:
# number of quasi-constant features

len(sel.features_to_drop_)

# Remove duplicated features

In [None]:
# set up the selector
sel = DropDuplicateFeatures(variables=None, missing_values='raise')

# find the duplicate features, this might take a while
sel.fit(X)

In [None]:
# these are the pairs of duplicated features
# each set are duplicates

sel.duplicated_feature_sets_

# Correlation Heatmap

In [None]:
# visualise correlated features:

# I will build a correlation matrix, which examines the 
# correlation of all features (that is, for all possible feature combinations)
# and then visualise the correlation matrix using a heatmap

# the default correlation method of pandas.corr is pearson
# I include it anyways for the demo
corrmat = X.corr(method='pearson')

# we can make a heatmap with the package seaborn
# and customise the colours of searborn's heatmap
cmap = sns.diverging_palette(220, 20, as_cmap=True)

# some more parameters for the figure
fig, ax = plt.subplots()
fig.set_size_inches(11,11)

# and now plot the correlation matrix
sns.heatmap(corrmat, cmap=cmap)

In the plot above, the dark orange squares correspond to highly correlated features (>0.8). Darker blue squares corresponds to negatively correlated features (<-0.8).

The diagonal represents the correlation of a feature with itself, therefore the value is 1.

We can see that there are a few features that are highly correlated.

# Remove Correlated

In [None]:
# with the following function we can select highly correlated features
# it will remove the first feature that is correlated with anything else
# without any further insight.

def correlation(dataset, threshold):
    
    # create a set where I will store the names of correlated columns
    col_corr = set()
    
    # create the correlation matrix
    corr_matrix = dataset.corr()
    
    # for each feature in the dataset (columns of the correlation matrix)
    for i in range(len(corr_matrix.columns)):
        
        # check with other features
        for j in range(i):
            
            # if the correlation is higher than a certain threshold
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                
                # print correlation, and variables examined
                # keep in mind that the columns and rows of the dataframe are identical
                # so we can identify the features being examned by looking for i,j
                # in the column names
                print(abs(corr_matrix.iloc[i, j]), corr_matrix.columns[i], corr_matrix.columns[j])
                
                # get the name of the correlated feature
                colname = corr_matrix.columns[j]
                
                # and add it to our correlated set
                col_corr.add(colname)
                
    return col_corr

In [None]:
corr_features = correlation(X, 0.8)
len(set(corr_features))

In [None]:
X.drop(labels=corr_features, axis=1, inplace=True)

X.shape

In [None]:
X.head()

# Train Test Split

In [16]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2, random_state=42)

# RFE

In [17]:
# Define your estimator and feature range
estimator = LogisticRegression()
#n_features_range = range(1, X_train.shape[1] + 1)
n_features_range = range(10,14)
print(n_features_range)

# Perform cross-validation for each number of features
best_score = -float("inf")
best_n_features = 1
for n_features in n_features_range:
    rfe = RFE(estimator, n_features_to_select=n_features)
    scores = cross_val_score(rfe, X_train, y_train, cv=5, scoring='accuracy')
    score = scores.mean()
    print(str(n_features) + " " + str(score) )
    if score > best_score:
        best_score = score
        best_n_features = n_features
        # Fit the RFE model
        rfe.fit(X_train, y_train)
        # Get the selected features
        selected_features = rfe.support_
        print("Selected features:", selected_features)

print("Best number of features:", best_n_features)

range(10, 14)
10 0.807947658402204
Selected features: [False False False False False False False False  True False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False  True False False  True  True  True False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
  True False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False Fal

In [18]:
print(score)

0.8161707988980715


In [19]:
type(X_train)

pandas.core.frame.DataFrame

In [20]:
type(selected_features)

numpy.ndarray

In [21]:
X_train_selected = X_train.loc[:,selected_features]
X_test_selected = X_test.loc[:,selected_features]

In [22]:
X_train_selected.head

<bound method NDFrame.head of      meanHarmToNoiseHarmonicity  mean_MFCC_2nd_coef  mean_MFCC_5th_coef  \
212                      22.202            2.721500            0.190180   
69                       17.065            1.109000           -0.133790   
329                      21.295            1.859800           -1.263200   
131                      25.236            3.486900           -2.326800   
44                       20.530            1.561000           -0.242750   
70                       17.636            2.261000            0.230460   
407                      27.794           -0.827940           -1.982100   
215                      22.208            1.933000           -2.030700   
409                      27.530           -0.156000            0.278260   
377                      23.627            1.360500           -1.963400   
135                      26.345           -1.138500           -0.509770   
244                      16.299            0.493680            0.37912

In [23]:
def evaluate_classifiers(X_train, y_train, X_test, y_test):
    classifiers = {
        'SVM': {
            'model': SVC(),
            'param_grid': {
                'C': [0.1, 1, 10],
                'kernel': ['linear', 'rbf', 'poly'],
                'gamma': ['scale', 'auto']
            }
        },
        'Random Forest': {
            'model': RandomForestClassifier(),
            'param_grid': {
                'n_estimators': [100, 200, 300],
                'criterion': ['gini', 'entropy'],
                'max_depth': [None, 5, 10]
            }
        },
        'KNN': {
            'model': KNeighborsClassifier(),
            'param_grid': {
                'n_neighbors': [3, 5, 7],
                'weights': ['uniform', 'distance'],
                'p': [1, 2]
            }
        },
        'Naive Bayes': {
            'model': GaussianNB(),
            'param_grid': {
                'priors': [None, [0.1, 0.9], [0.3, 0.7], [0.5, 0.5], [0.7, 0.3], [0.9, 0.1]]

            }
        },
        'Decision Tree': {
            'model': DecisionTreeClassifier(),
            'param_grid': {
                'criterion': ['gini', 'entropy'],
                'max_depth': [None, 5, 10]
            }
        },
        'XGBoost': {
            'model': XGBClassifier(),
            'param_grid': {
                'learning_rate': [0.1, 0.01],
                'max_depth': [3, 5, 7],
                'n_estimators': [100, 200, 300]
            }
        },
        'AdaBoost': {
            'model': AdaBoostClassifier(),
            'param_grid': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.1, 1, 10]
            }
        }
        
    }

    performance_metrics = {}

    for clf_name, clf_info in classifiers.items():
        clf = clf_info['model']
        param_grid = clf_info['param_grid']
        
        start = time.time()
        grid_search = GridSearchCV(clf, param_grid, cv=5)
        grid_search.fit(X_train, y_train)
        best_clf = grid_search.best_estimator_

        y_pred = best_clf.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_pred)
        time_execution = time.time() - start
        performance_metrics[clf_name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'auc':auc,
            'time_execution':time_execution
        }
        

    return performance_metrics

In [24]:
results = evaluate_classifiers(X_train_selected, y_train, X_test_selected, y_test)

for clf_name, metrics in results.items():
    print(clf_name + " Performance Metrics:")
    print("Accuracy:", metrics['accuracy'])
    print("Precision:", metrics['precision'])
    print("Recall:", metrics['recall'])
    print("F1 Score:", metrics['f1'])
    print("AUC-ROC:", metrics['auc'])
    print("Time:", metrics['time_execution'])
    print()

SVM Performance Metrics:
Accuracy: 0.8157894736842105
Precision: 0.864406779661017
Recall: 0.8947368421052632
F1 Score: 0.8793103448275862
AUC-ROC: 0.736842105263158
Time: 4159.760034799576

Random Forest Performance Metrics:
Accuracy: 0.8486842105263158
Precision: 0.8473282442748091
Recall: 0.9736842105263158
F1 Score: 0.9061224489795918
AUC-ROC: 0.7236842105263159
Time: 23.362380504608154

KNN Performance Metrics:
Accuracy: 0.8421052631578947
Precision: 0.835820895522388
Recall: 0.9824561403508771
F1 Score: 0.9032258064516129
AUC-ROC: 0.7017543859649122
Time: 0.2787141799926758

Naive Bayes Performance Metrics:
Accuracy: 0.7894736842105263
Precision: 0.8203125
Recall: 0.9210526315789473
F1 Score: 0.8677685950413222
AUC-ROC: 0.6578947368421053
Time: 0.07537436485290527

Decision Tree Performance Metrics:
Accuracy: 0.8421052631578947
Precision: 0.8688524590163934
Recall: 0.9298245614035088
F1 Score: 0.8983050847457625
AUC-ROC: 0.7543859649122807
Time: 0.1637859344482422

XGBoost Perfor