In [120]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from matplotlib.colors import ListedColormap
from sklearn.decomposition import KernelPCA
from sklearn.metrics import accuracy_score, confusion_matrix

In [121]:
def load_data(path="Social_Network_Ads.csv"):
    """Read the CSV at ``path`` and return standardized train/test splits.

    The columns at positions 2 and 3 are taken as the two features and the
    column at position 4 as the target. Rows are split 80/20 with
    stratification on the target (``random_state=0``), and the features are
    standardized with a scaler fitted on the training rows only.

    Parameters
    ----------
    path : str
        Location of the CSV file handed to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; both feature arrays are
        already scaled.
    """
    frame = pd.read_csv(path)
    features = frame.iloc[:, [2, 3]].values
    target = frame.iloc[:, 4].values

    features_train, features_test, target_train, target_test = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=0,
        stratify=target,
    )

    # Fit on the training rows only so that no test-set statistics are used.
    scaler = StandardScaler().fit(features_train)

    return (
        scaler.transform(features_train),
        scaler.transform(features_test),
        target_train,
        target_test,
    )

In [122]:
def apply_kpca(X_train, X_test, n_components=2, kernel="rbf", random_state=0):
    """Project both splits with a Kernel PCA fitted on the training split.

    Parameters
    ----------
    X_train : array-like
        Training features; the projection is fitted on these rows only.
    X_test : array-like
        Held-out features, transformed with the already fitted projection.
    n_components : int, default 2
        Number of components to keep.
    kernel : str, default "rbf"
        Kernel name forwarded to ``KernelPCA``.
    random_state : int or None, default 0
        Seed forwarded to ``KernelPCA`` so repeated runs use the same solver
        initialisation (per the scikit-learn docs it is consulted by the
        iterative eigen-solvers). The default matches the ``random_state=0``
        used by the other components in this notebook.

    Returns
    -------
    tuple
        ``(X_train_kpca, X_test_kpca)``, the projected training and test
        features.
    """
    kpca = KernelPCA(
        n_components=n_components,
        kernel=kernel,
        random_state=random_state,
    )
    X_train_kpca = kpca.fit_transform(X_train)
    # The test split is only transformed, never fitted on.
    X_test_kpca = kpca.transform(X_test)
    return X_train_kpca, X_test_kpca

In [123]:
def train_and_eval(name, model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split, report test metrics, return accuracy.

    Parameters
    ----------
    name : str
        Label printed in front of the accuracy line.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Training features and targets.
    X_test, y_test
        Held-out features and targets used for scoring.

    Returns
    -------
    float
        Test-set accuracy. Only the accuracy is returned; the fitted model
        remains available to the caller through the ``model`` argument.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    matrix = confusion_matrix(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)

    print(f"\n{name} | Accuracy: {round(accuracy, 4)}")
    print("Confusion Matrix:\n", matrix)

    return accuracy

In [124]:
def visualize_results(X, y, classifier, title, xlabel="LD1", ylabel="LD2", step=0.01):
    """Plot a fitted classifier's decision regions with the samples on top.

    Parameters
    ----------
    X : ndarray
        Feature matrix; columns 0 and 1 are used as the plot axes, so it must
        have at least two columns and ``classifier`` must accept two features.
    y : ndarray
        Class label of each row of ``X``.
    classifier : estimator
        Fitted object exposing ``predict``. If it also exposes ``classes_``,
        those labels are included when assigning colours.
    title : str
        Figure title.
    xlabel, ylabel : str, default "LD1" / "LD2"
        Axis labels. The defaults keep the previous hard-coded text; pass
        other labels when the axes are not linear discriminants.
    step : float, default 0.01
        Spacing of the grid on which the classifier is evaluated.

    Notes
    -----
    Each class gets one colour, chosen by the position of its label in the
    sorted label set, and that same colour is used for the class's region and
    for its points. Previously a fixed three-colour map was stretched over
    the predicted values, so with two classes the second region was drawn in
    a different colour than its points.
    """
    palette = ('red', 'green', 'blue')

    # Sorted set of every label that may need a colour.
    labels = np.unique(y)
    fitted_labels = getattr(classifier, "classes_", None)
    if fitted_labels is not None:
        labels = np.union1d(labels, fitted_labels)
    # Cycle the palette if there are more classes than base colours.
    colors = [palette[k % len(palette)] for k in range(len(labels))]

    X1, X2 = np.meshgrid(
        np.arange(start=X[:, 0].min() - 1, stop=X[:, 0].max() + 1, step=step),
        np.arange(start=X[:, 1].min() - 1, stop=X[:, 1].max() + 1, step=step)
    )

    grid_points = np.array([X1.ravel(), X2.ravel()]).T
    predicted = classifier.predict(grid_points)
    # Replace each predicted label by its position in `labels` so the contour
    # levels below can be placed half-way between consecutive classes.
    region_index = np.searchsorted(labels, predicted).reshape(X1.shape)

    plt.contourf(
        X1, X2, region_index,
        levels=np.arange(len(labels) + 1) - 0.5,
        colors=colors, alpha=0.75
    )
    plt.xlim(X1.min(), X1.max())
    plt.ylim(X2.min(), X2.max())

    for label in np.unique(y):
        mask = y == label
        # `color=` (not `c=`) because a single colour is applied to all points.
        plt.scatter(
            X[mask, 0], X[mask, 1],
            color=colors[int(np.searchsorted(labels, label))], label=label
        )

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()

In [125]:
def run_pipeline(kda_list=None):
    """Evaluate several classifiers on Kernel-PCA projections of the data.

    For every component count in ``kda_list`` the data returned by
    ``load_data`` is projected with ``apply_kpca`` and each classifier is
    fitted and scored through ``train_and_eval``.

    Note: despite the "KDA" wording in the printed headers and in the result
    column, the only projection applied here is Kernel PCA; no LDA step is
    run.

    Parameters
    ----------
    kda_list : iterable of int, optional
        Component counts to try. Defaults to ``[1]`` when omitted.

    Returns
    -------
    pandas.DataFrame
        One row per (component count, model) pair with the columns
        ``"KDA"``, ``"Model"`` and ``"Accuracy"``.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if kda_list is None:
        kda_list = [1]

    # 1. Load dataset
    X_train, X_test, y_train, y_test = load_data()

    # 2. Define models
    models = [
        ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=0)),
        ("SVM (Linear)", SVC(kernel="linear", random_state=0)),
        ("KNN", KNeighborsClassifier(n_neighbors=5)),
        ("Decision Tree", DecisionTreeClassifier(random_state=0)),
        ("Random Forest", RandomForestClassifier(n_estimators=200, random_state=0)),
        ("Gaussian NB", GaussianNB()),
    ]

    results = []

    # 3. Loop over the requested component counts
    for n in kda_list:
        print(f"\n==== KDA Components = {n} ====")

        # Project both splits with Kernel PCA (fitted on the training split)
        X_train_kpca, X_test_kpca = apply_kpca(X_train, X_test, n_components=n)

        # Train and evaluate each model; the same estimator objects are
        # refitted on every pass of the outer loop.
        for name, clf in models:
            acc = train_and_eval(name, clf, X_train_kpca, y_train, X_test_kpca, y_test)
            results.append([n, name, acc])

    # 4. Build results DataFrame
    df = pd.DataFrame(results, columns=["KDA", "Model", "Accuracy"])
    print("\n===== FINAL RESULTS TABLE =====")
    print(df.pivot(index="Model", columns="KDA", values="Accuracy"))

    return df

In [126]:
# Run the full comparison for 1 and 2 projected components; the returned
# results DataFrame is stored in df_results.
df_results = run_pipeline(kda_list=[1, 2])


==== KDA Components = 1 ====

Logistic Regression | Accuracy: 0.85
Confusion Matrix:
 [[47  4]
 [ 8 21]]

SVM (Linear) | Accuracy: 0.875
Confusion Matrix:
 [[47  4]
 [ 6 23]]

KNN | Accuracy: 0.8
Confusion Matrix:
 [[44  7]
 [ 9 20]]

Decision Tree | Accuracy: 0.7375
Confusion Matrix:
 [[45  6]
 [15 14]]

Random Forest | Accuracy: 0.7375
Confusion Matrix:
 [[45  6]
 [15 14]]

Gaussian NB | Accuracy: 0.8875
Confusion Matrix:
 [[46  5]
 [ 4 25]]

==== KDA Components = 2 ====

Logistic Regression | Accuracy: 0.8625
Confusion Matrix:
 [[47  4]
 [ 7 22]]

SVM (Linear) | Accuracy: 0.8875
Confusion Matrix:
 [[47  4]
 [ 5 24]]

KNN | Accuracy: 0.875
Confusion Matrix:
 [[46  5]
 [ 5 24]]

Decision Tree | Accuracy: 0.775
Confusion Matrix:
 [[45  6]
 [12 17]]

Random Forest | Accuracy: 0.875
Confusion Matrix:
 [[46  5]
 [ 5 24]]

Gaussian NB | Accuracy: 0.9
Confusion Matrix:
 [[47  4]
 [ 4 25]]

===== FINAL RESULTS TABLE =====
KDA                       1       2
Model                            

### 🔎 Observations
- With **1 component**:  
  - Best: **Gaussian NB (88.8%)**, **SVM (87.5%)**.  
  - Logistic Regression was strong at 85%.  
  - Decision Tree and Random Forest weakest (~73.8%).  

- With **2 components**:  
  - Accuracy improved for all six models.  
  - **Gaussian NB (90%)**, **SVM (88.8%)**, **Random Forest (87.5%)**, and **KNN (87.5%)** performed the best.  
  - Logistic Regression improved slightly (86.3%).  
  - Decision Tree remained the lowest at ~77.5%.  

### ✅ Conclusion
Using **2 KDA components** is better (note: "KDA" here refers to Kernel PCA components; the code above applies no LDA step):  
- **Gaussian NB and SVM** are consistently strong.  
- **Random Forest and KNN** gained the most with 2 components.  
- **Decision Tree** stayed weakest overall.