In [46]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.model_selection import StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import IsolationForest
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from scipy.stats import multivariate_normal
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

In [48]:
# # Loading the dataset
df = pd.read_csv('data.csv')
df.head()



Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V14,V15,V16,V17,V18,V19,V20,V21,V22,Class
0,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,...,0.06545,0.02211,21.033,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654,2
1,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,0.626,...,0.09403,0.01929,19.085,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674,2
2,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,...,0.0827,0.01309,20.651,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634,2
3,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,0.517,...,0.08771,0.01353,20.644,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975,2
4,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,...,0.1047,0.01767,19.649,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335,2


# GDA

In [43]:
# Importing dataset
dataset = pd.read_csv('data.csv')
features = dataset.iloc[:, :-1].values
labels = dataset.iloc[:, -1].values

# Feature normalization
normalizer = StandardScaler()
features_normalized = normalizer.fit_transform(features)

# Preparing for accuracy metrics
acc_scores = np.zeros(10)
bal_acc_scores = np.zeros(10)

# Setting up K-Fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=8)

for iteration, (train_indices, test_indices) in enumerate(kfold.split(features_normalized, labels), start=1):
    # Splitting dataset into training and testing sets
    features_train, features_test = features_normalized[train_indices], features_normalized[test_indices]
    labels_train, labels_test = labels[train_indices], labels[test_indices]

    # Balancing dataset using SMOTE
    smote = SMOTE(random_state=42)
    features_balanced, labels_balanced = smote.fit_resample(features_train, labels_train)

    # Reducing dimensions using PCA
    pca = PCA(n_components=0.95)  # Keeping 95% of variance
    features_train_pca = pca.fit_transform(features_balanced)
    features_test_pca = pca.transform(features_test)

    # Grouping by class for training data
    class1_features = features_train_pca[labels_balanced == 1]
    class2_features = features_train_pca[labels_balanced == 2]

    # Computing class means
    mean1 = np.mean(class1_features, axis=0)
    mean2 = np.mean(class2_features, axis=0)

    # Shared covariance matrix calculation
    shared_cov = ((len(class1_features) - 1) * np.cov(class1_features.T) + (len(class2_features) - 1) * np.cov(class2_features.T)) / (len(class1_features) + len(class2_features) - 2)

    # Class prediction for test data
    prob_class1 = multivariate_normal(allow_singular=True, mean=mean1, cov=shared_cov).pdf(features_test_pca)
    prob_class2 = multivariate_normal(allow_singular=True, mean=mean2, cov=shared_cov).pdf(features_test_pca)
    predicted_labels = (prob_class2 > prob_class1).astype(int) + 1

    # Accuracy computation
    acc_scores[iteration-1] = accuracy_score(labels_test, predicted_labels)

    # Computing balanced accuracy
    tp = np.sum((predicted_labels == 2) & (labels_test == 2))
    tn = np.sum((predicted_labels == 1) & (labels_test == 1))
    fp = np.sum((predicted_labels == 2) & (labels_test == 1))
    fn = np.sum((predicted_labels == 1) & (labels_test == 2))

    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    bal_acc_scores[iteration-1] = (sensitivity + specificity) / 2

    # Displaying fold-wise accuracy metrics
    print(f'Iteration {iteration} Accuracy: {acc_scores[iteration-1]*100:.2f}%')
    print(f'Iteration {iteration} Balanced Accuracy: {bal_acc_scores[iteration-1]*100:.2f}%')

# Showing overall performance metrics
print(f'\nOverall Accuracy: {np.mean(acc_scores)*100:.2f}%')
print(f'Overall Balanced Accuracy: {np.mean(bal_acc_scores)*100:.2f}%')

Iteration 1 Accuracy: 80.00%
Iteration 1 Balanced Accuracy: 78.12%
Iteration 2 Accuracy: 70.00%
Iteration 2 Balanced Accuracy: 67.03%
Iteration 3 Accuracy: 80.00%
Iteration 3 Balanced Accuracy: 89.47%
Iteration 4 Accuracy: 80.00%
Iteration 4 Balanced Accuracy: 81.25%
Iteration 5 Accuracy: 80.00%
Iteration 5 Balanced Accuracy: 81.25%
Iteration 6 Accuracy: 94.74%
Iteration 6 Balanced Accuracy: 97.22%
Iteration 7 Accuracy: 78.95%
Iteration 7 Balanced Accuracy: 79.29%
Iteration 8 Accuracy: 84.21%
Iteration 8 Balanced Accuracy: 90.00%
Iteration 9 Accuracy: 63.16%
Iteration 9 Balanced Accuracy: 55.95%
Iteration 10 Accuracy: 68.42%
Iteration 10 Balanced Accuracy: 67.71%

Overall Accuracy: 77.95%
Overall Balanced Accuracy: 78.73%


# LDA and QDA

In [53]:
# Split the dataset into features (X) and the target variable (y)
X = df.drop('Class', axis=1)
y = df['Class']

# Feature Scaling using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Outlier Detection using Isolation Forest
iso_forest = IsolationForest(contamination=0.05)
outliers = iso_forest.fit_predict(X_scaled)
# Keep only the data points that are not outliers
X_cleaned = X_scaled[outliers == 1]
y_cleaned = y[outliers == 1]

# Feature Selection using SelectKBest with ANOVA F-value
select_k_best = SelectKBest(f_classif, k=10)
X_selected_features = select_k_best.fit_transform(X_cleaned, y_cleaned)

# Initialize ADASYN for resampling #SMOTE technique used = ADASYN
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_selected_features, y_cleaned)

# Initialize LDA and QDA
lda = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()

# Initialize 10-Fold Cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Lists to store results of each fold for LDA and QDA
results = []

# Perform 10-Fold Cross-validation
for train_index, test_index in kf.split(X_resampled, y_resampled):
    X_train, X_test = X_resampled[train_index], X_resampled[test_index]
    y_train, y_test = y_resampled[train_index], y_resampled[test_index]

    # Fit LDA on training data and predict on test data
    lda.fit(X_train, y_train)
    y_pred_lda = lda.predict(X_test)
    lda_accuracy = accuracy_score(y_test, y_pred_lda)
    lda_balanced_accuracy = balanced_accuracy_score(y_test, y_pred_lda)

    # Fit QDA on training data and predict on test data
    qda.fit(X_train, y_train)
    y_pred_qda = qda.predict(X_test)
    qda_accuracy = accuracy_score(y_test, y_pred_qda)
    qda_balanced_accuracy = balanced_accuracy_score(y_test, y_pred_qda)

    # Append to results list
    results.append({
        'Fold': len(results) + 1,
        'LDA Accuracy': lda_accuracy,
        'LDA Balanced Accuracy': lda_balanced_accuracy,
        'QDA Accuracy': qda_accuracy,
        'QDA Balanced Accuracy': qda_balanced_accuracy
    })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Calculate average accuracy and balanced accuracy for LDA and QDA
average_accuracy_lda = results_df['LDA Accuracy'].mean()
average_balanced_accuracy_lda = results_df['LDA Balanced Accuracy'].mean()
average_accuracy_qda = results_df['QDA Accuracy'].mean()
average_balanced_accuracy_qda = results_df['QDA Balanced Accuracy'].mean()

# Append average values as new rows in the DataFrame
results_df = results_df.append({
    'Fold': 'Average LDA',
    'LDA Accuracy': average_accuracy_lda,
    'LDA Balanced Accuracy': average_balanced_accuracy_lda,
    'QDA Accuracy': np.nan,  # NaN for QDA columns in LDA average row
    'QDA Balanced Accuracy': np.nan
}, ignore_index=True)
results_df = results_df.append({
    'Fold': 'Average QDA',
    'LDA Accuracy': np.nan,  # NaN for LDA columns in QDA average row
    'LDA Balanced Accuracy': np.nan,
    'QDA Accuracy': average_accuracy_qda,
    'QDA Balanced Accuracy': average_balanced_accuracy_qda
}, ignore_index=True)

# Display the DataFrame as a table
from google.colab import data_table
data_table.enable_dataframe_formatter()

# Display results in tabular format
display(results_df)

  results_df = results_df.append({
  results_df = results_df.append({


Unnamed: 0,Fold,LDA Accuracy,LDA Balanced Accuracy,QDA Accuracy,QDA Balanced Accuracy
0,1,0.862069,0.857143,0.862069,0.857143
1,2,0.785714,0.785714,0.892857,0.892857
2,3,0.821429,0.821429,0.821429,0.821429
3,4,0.964286,0.964286,0.821429,0.821429
4,5,0.857143,0.857143,0.785714,0.785714
5,6,0.857143,0.857143,0.821429,0.821429
6,7,0.821429,0.821429,0.928571,0.928571
7,8,0.857143,0.851282,0.785714,0.769231
8,9,0.714286,0.692308,0.75,0.735897
9,10,0.714286,0.707692,0.821429,0.828205
