In [None]:
import pandas as pd
import seaborn as sns

# header=None tells pandas not to use the first row as column names, so the
# columns get integer labels (a later cell reads the target as df[45]).
df = pd.read_csv('./BP-MDD.csv', header=None)

In [None]:
# Preview the first rows of the loaded table as plain text
print(df.head())

In [None]:
# Checking the shape: (number of rows, number of columns) of df
df.shape

In [None]:
# Handling the target variable: column 45 is used as the class label.
# The membership guard keeps this cell re-runnable: once the column has been
# moved out of df, running the cell again no longer raises a KeyError.
if 45 in df.columns:
    # pop() removes the column from df in place and returns it as a Series,
    # so df is left holding only the feature columns.
    y = df.pop(45)
# Optional recoding that would map the labels 1/2 to 0/1 (currently disabled):
# y.replace({1: 0, 2: 1}, inplace=True)
# head must be *called*; printing the bare attribute only shows the bound method.
print(y.head())

In [None]:
# Scratch check of positional row selection on the first five columns,
# the same iloc pattern the cross-validation loop uses for its folds.
train_index = [2, 3, 5, 6]
feat = df.iloc[:, 0:5]
feat.iloc[train_index]

In [None]:
# Shared evaluation routine: repeated k-fold cross-validation on growing
# prefixes of the feature columns. See the evaluate_model docstring for the
# parameters and the structure of the returned dictionary.

# Importing libraries

import statistics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RepeatedKFold, cross_val_score, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is 0."""
    return float(numerator) / float(denominator) if denominator else 0.0


def evaluate_model(clf, X, y, max_features=44, n_splits=5, n_repeats=100, random_state=10):
    """Cross-validate ``clf`` on growing prefixes of the feature columns.

    For k = 1 .. max_features the classifier is evaluated on the first k
    columns of ``X`` (selected by position) with repeated k-fold
    cross-validation, and every metric is averaged over all splits.

    Parameters
    ----------
    clf : object
        Classifier exposing ``fit(X, y)`` and ``predict(X)``. It is refit in
        place on every split.
    X : pandas.DataFrame
        Feature table; columns are taken by position, left to right.
    y : pandas.Series or single-column pandas.DataFrame
        Target with exactly two distinct labels. After sorting, the larger
        label is treated as the positive class for every metric.
    max_features : int or None, default 44
        Largest prefix length to evaluate; ``None`` means all columns of ``X``.
        If ``X`` has fewer columns, the extra iterations simply reuse all of
        them (positional slices are clipped).
        NOTE(review): with the default, columns beyond the 44th are never
        used - confirm that this is intended for the data set.
    n_splits, n_repeats, random_state : int
        Forwarded to ``RepeatedKFold``.

    Returns
    -------
    dict
        Keys ``specificity``, ``sensitvity`` (spelling kept because callers
        index by it), ``f1_scores``, ``auc_scores``, ``accuracy_scores`` and
        ``precision_scores`` each map to a list with one mean value per
        prefix length. The same keys with a ``_std`` suffix hold the sample
        standard deviation over the splits.

    Raises
    ------
    ValueError
        If ``y`` does not contain exactly two distinct labels.
    """
    # Fix the label order once so that the confusion matrix is always 2x2 and
    # F1 uses the same positive class as sensitivity/specificity/precision.
    class_labels = sorted(set(y.to_numpy().ravel().tolist()))
    if len(class_labels) != 2:
        raise ValueError(
            "evaluate_model expects a binary target, got labels: %r" % (class_labels,)
        )
    negative_label, positive_label = class_labels

    if max_features is None:
        max_features = X.shape[1]

    # With an integer random_state every split() call yields the same folds,
    # so a single splitter can be shared by all prefix lengths.
    fold = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

    metric_names = (
        "specificity",
        "sensitvity",
        "f1_scores",
        "auc_scores",
        "accuracy_scores",
        "precision_scores",
    )
    means = {name: [] for name in metric_names}
    spreads = {name: [] for name in metric_names}

    for n_used in range(1, max_features + 1):
        # We generate the set of features: the first n_used columns
        feat = X.iloc[:, :n_used]
        target = y
        per_split = {name: [] for name in metric_names}

        for train_index, test_index in fold.split(feat):
            X_train = feat.iloc[train_index, :]
            X_test = feat.iloc[test_index, :]
            y_train = target.iloc[train_index]
            y_test = target.iloc[test_index]

            # Fitting the classifier
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            # Explicit labels keep the matrix 2x2 even if a class is missing
            # from this fold, so the four-way unpack cannot fail.
            cm = confusion_matrix(y_test, y_pred, labels=[negative_label, positive_label])
            tn, fp, fn, tp = cm.ravel()

            # Ratios with an empty denominator count as 0.0 instead of NaN so
            # that one degenerate split does not turn the whole average into NaN.
            per_split["sensitvity"].append(_safe_ratio(tp, tp + fn))
            per_split["specificity"].append(_safe_ratio(tn, tn + fp))
            per_split["precision_scores"].append(_safe_ratio(tp, tp + fp))

            per_split["accuracy_scores"].append(accuracy_score(y_test, y_pred))

            # AUC is computed from the hard class predictions, not from scores.
            per_split["auc_scores"].append(roc_auc_score(y_test, y_pred))

            per_split["f1_scores"].append(
                f1_score(y_test, y_pred, pos_label=positive_label)
            )

        # After completing all folds, record mean and spread for this prefix length
        for name in metric_names:
            values = per_split[name]
            means[name].append(sum(values) / len(values))
            spreads[name].append(statistics.stdev(values))

    results = {name: means[name] for name in metric_names}
    for name in metric_names:
        results[name + "_std"] = spreads[name]
    return results
        

In [None]:
import warnings
# Installs a filter that ignores every Python warning raised after this cell runs.
# NOTE(review): this also hides any warnings emitted while fitting or scoring
# the models below - confirm that silencing all of them is intended.
warnings.filterwarnings('ignore')

# Naive Bayes

## Implementation

In [None]:
# Evaluating the naive Bayes classifier
from sklearn.naive_bayes import GaussianNB

# Run the shared evaluation routine with a Gaussian naive Bayes model
gaussianScores = evaluate_model(clf=GaussianNB(), X=df, y=y)
print(gaussianScores)

## Metrics

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Plotting number of features vs the metrics
# One figure per (result key, axis label) pair, drawn in the order listed.
for metric_key, axis_label in [
    ("accuracy_scores", "Accuracy"),
    ("specificity", "Specificity"),
    ("sensitvity", "Sensitvity"),
    ("f1_scores", "F1_score"),
    ("auc_scores", "Auc"),
    ("precision_scores", "Precision_score"),
]:
    metric_values = np.array(gaussianScores[metric_key])
    data = pd.DataFrame({
        # One value per evaluated prefix length, so the x-axis is sized from
        # the results instead of being hard-coded to 44 points.
        "number_of_features": np.arange(1, len(metric_values) + 1),
        axis_label: metric_values,
    })
    sns.lineplot(x='number_of_features', y=axis_label, data=data)
    # Title the figure so it can be told apart from the other classifiers' plots
    plt.title("Naive Bayes: " + axis_label + " vs number of features")
    plt.show()

In [None]:
# Accumulator for the best-accuracy row of each classifier
maxValues = []

In [None]:
# Position (0-based) of the highest mean accuracy in the Naive Bayes results
maxIndex = np.asarray(gaussianScores['accuracy_scores']).argmax()

# Row layout: classifier name, best position, then each metric at that position
maxValues.append(
    ["Naive Bayes", maxIndex]
    + [
        gaussianScores[metric][maxIndex]
        for metric in (
            "accuracy_scores",
            "specificity",
            "sensitvity",
            "f1_scores",
            "auc_scores",
            "precision_scores",
        )
    ]
)

# 

# Decision Tree

## Implementation

In [None]:
# Evaluating the Decision Tree classifier
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the tree construction repeatable, so re-running
# the notebook reproduces the same scores (10 matches the seed used for the folds).
decisionScores = evaluate_model(DecisionTreeClassifier(random_state=10), df, y)
print(decisionScores)

## Metrics for Decision Tree

In [None]:
# Plotting number of features vs the metrics
# One figure per (result key, axis label) pair, drawn in the order listed.
for metric_key, axis_label in [
    ("accuracy_scores", "Accuracy"),
    ("specificity", "Specificity"),
    ("sensitvity", "Sensitvity"),
    ("f1_scores", "F1_score"),
    ("auc_scores", "Auc"),
    ("precision_scores", "Precision_score"),
]:
    metric_values = np.array(decisionScores[metric_key])
    data = pd.DataFrame({
        # One value per evaluated prefix length, so the x-axis is sized from
        # the results instead of being hard-coded to 44 points.
        "number_of_features": np.arange(1, len(metric_values) + 1),
        axis_label: metric_values,
    })
    sns.lineplot(x='number_of_features', y=axis_label, data=data)
    # Title the figure so it can be told apart from the other classifiers' plots
    plt.title("Decision Tree: " + axis_label + " vs number of features")
    plt.show()

In [None]:
# Position (0-based) of the highest mean accuracy in the Decision Tree results
maxIndex = np.asarray(decisionScores['accuracy_scores']).argmax()

# Row layout: classifier name, best position, then each metric at that position
maxValues.append(
    ["Decision Tree", maxIndex]
    + [
        decisionScores[metric][maxIndex]
        for metric in (
            "accuracy_scores",
            "specificity",
            "sensitvity",
            "f1_scores",
            "auc_scores",
            "precision_scores",
        )
    ]
)

# Random Forest

## Implementation

In [None]:
# Evaluating the Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the forest's bootstrap and feature sampling
# repeatable, so re-running the notebook reproduces the same scores
# (10 matches the seed used for the folds).
randomForestScores = evaluate_model(
    RandomForestClassifier(n_jobs=-1, max_depth=5, random_state=10), df, y
)
print(randomForestScores)

## Metrics

In [None]:
# Plotting number of features vs the metrics
# One figure per (result key, axis label) pair, drawn in the order listed.
for metric_key, axis_label in [
    ("accuracy_scores", "Accuracy"),
    ("specificity", "Specificity"),
    ("sensitvity", "Sensitvity"),
    ("f1_scores", "F1_score"),
    ("auc_scores", "Auc"),
    ("precision_scores", "Precision_score"),
]:
    metric_values = np.array(randomForestScores[metric_key])
    data = pd.DataFrame({
        # One value per evaluated prefix length, so the x-axis is sized from
        # the results instead of being hard-coded to 44 points.
        "number_of_features": np.arange(1, len(metric_values) + 1),
        axis_label: metric_values,
    })
    sns.lineplot(x='number_of_features', y=axis_label, data=data)
    # Title the figure so it can be told apart from the other classifiers' plots
    plt.title("Random Forest: " + axis_label + " vs number of features")
    plt.show()

In [None]:
# Getting index for the maximum values
maxIndex = np.argmax(np.array(randomForestScores['accuracy_scores']))

# Row layout must match the other classifiers' rows: name, best index, accuracy,
# specificity, sensitivity, F1, AUC, precision. The index and the accuracy were
# previously swapped for this classifier only.
maxValues.append(["Random Forest",
maxIndex,
randomForestScores["accuracy_scores"][maxIndex],
randomForestScores["specificity"][maxIndex],
randomForestScores["sensitvity"][maxIndex],
randomForestScores["f1_scores"][maxIndex],
randomForestScores["auc_scores"][maxIndex],
randomForestScores["precision_scores"][maxIndex]
])

# Linear Discriminant Analysis

## Implementation

In [None]:
# Evaluating the Linear Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Run the shared evaluation routine with a linear discriminant analysis model
linearScores = evaluate_model(clf=LinearDiscriminantAnalysis(), X=df, y=y)
print(linearScores)

## Metrics

In [None]:
# Plotting number of features vs the metrics
# One figure per (result key, axis label) pair, drawn in the order listed.
for metric_key, axis_label in [
    ("accuracy_scores", "Accuracy"),
    ("specificity", "Specificity"),
    ("sensitvity", "Sensitvity"),
    ("f1_scores", "F1_score"),
    ("auc_scores", "Auc"),
    ("precision_scores", "Precision_score"),
]:
    metric_values = np.array(linearScores[metric_key])
    data = pd.DataFrame({
        # One value per evaluated prefix length, so the x-axis is sized from
        # the results instead of being hard-coded to 44 points.
        "number_of_features": np.arange(1, len(metric_values) + 1),
        axis_label: metric_values,
    })
    sns.lineplot(x='number_of_features', y=axis_label, data=data)
    # Title the figure so it can be told apart from the other classifiers' plots
    plt.title("LDA: " + axis_label + " vs number of features")
    plt.show()

In [None]:
# Position (0-based) of the highest mean accuracy in the LDA results
maxIndex = np.asarray(linearScores['accuracy_scores']).argmax()

# Row layout: classifier name, best position, then each metric at that position
maxValues.append(
    ["LDA", maxIndex]
    + [
        linearScores[metric][maxIndex]
        for metric in (
            "accuracy_scores",
            "specificity",
            "sensitvity",
            "f1_scores",
            "auc_scores",
            "precision_scores",
        )
    ]
)

# Quadratic Discriminant Analysis

## Implementation

In [None]:
# Evaluating the Quadratic Discriminant Analysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Run the shared evaluation routine with a quadratic discriminant analysis model
quadraticScores = evaluate_model(clf=QuadraticDiscriminantAnalysis(), X=df, y=y)
print(quadraticScores)

## Metrics

In [None]:
# Plotting number of features vs the metrics
# One figure per (result key, axis label) pair, drawn in the order listed.
for metric_key, axis_label in [
    ("accuracy_scores", "Accuracy"),
    ("specificity", "Specificity"),
    ("sensitvity", "Sensitvity"),
    ("f1_scores", "F1_score"),
    ("auc_scores", "Auc"),
    ("precision_scores", "Precision_score"),
]:
    metric_values = np.array(quadraticScores[metric_key])
    data = pd.DataFrame({
        # One value per evaluated prefix length, so the x-axis is sized from
        # the results instead of being hard-coded to 44 points.
        "number_of_features": np.arange(1, len(metric_values) + 1),
        axis_label: metric_values,
    })
    sns.lineplot(x='number_of_features', y=axis_label, data=data)
    # Title the figure so it can be told apart from the other classifiers' plots
    plt.title("QDA: " + axis_label + " vs number of features")
    plt.show()

In [None]:
# Position (0-based) of the highest mean accuracy in the QDA results
maxIndex = np.asarray(quadraticScores['accuracy_scores']).argmax()

# Row layout: classifier name, best position, then each metric at that position
maxValues.append(
    ["QDA", maxIndex]
    + [
        quadraticScores[metric][maxIndex]
        for metric in (
            "accuracy_scores",
            "specificity",
            "sensitvity",
            "f1_scores",
            "auc_scores",
            "precision_scores",
        )
    ]
)

In [None]:
# Checking the maximum values: one list per classifier, appended by the
# per-classifier cells earlier in the notebook
print(maxValues)