In [35]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.colors import ListedColormap

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [36]:
def load_data(path="Wine.csv", test_size=0.2, random_state=0):
    """Read a CSV dataset and return a standardized, stratified train/test split.

    The first 13 columns are used as features and the 14th column as the
    class label.

    Parameters
    ----------
    path : str
        CSV file passed to ``pd.read_csv``.
    test_size : float
        Forwarded to ``train_test_split`` as ``test_size``. Defaults to the
        previously hard-coded 0.2.
    random_state : int
        Forwarded to ``train_test_split`` as ``random_state``. Defaults to
        the previously hard-coded 0.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where both feature arrays have
        been passed through the same ``StandardScaler``.
    """
    dataset = pd.read_csv(path)
    X = dataset.iloc[:, 0:13].values
    y = dataset.iloc[:, 13].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # The scaler is fitted on the training split only and then re-used for the
    # test split, so no test-set statistics influence the scaling.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    return X_train, X_test, y_train, y_test

In [37]:
def apply_lda(X_train, X_test, y_train, n_components):
    """Project the train and test features with an LDA fitted on the training data.

    Parameters
    ----------
    X_train, X_test : feature arrays for the two splits.
    y_train : labels for ``X_train``, used to fit the LDA.
    n_components : value passed to ``LDA(n_components=...)``.

    Returns
    -------
    tuple
        ``(X_train_lda, X_test_lda, lda)``: the projected training features,
        the projected test features, and the fitted LDA object.
    """
    reducer = LDA(n_components=n_components)
    # Fit on the training split only; the test split is merely transformed.
    train_projected = reducer.fit_transform(X_train, y_train)
    return train_projected, reducer.transform(X_test), reducer

In [38]:
def train_and_eval(name, model, X_train, y_train, X_test, y_test):
    """Fit ``model``, print its test-set metrics, and return the accuracy.

    Parameters
    ----------
    name : label used in the printed report.
    model : object exposing ``fit(X, y)`` and ``predict(X)``; ``fit`` is
        called on it directly.
    X_train, y_train : data passed to ``model.fit``.
    X_test, y_test : data used for prediction and scoring.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)``. Only the accuracy
    is returned; the fitted model is not part of the return value.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)

    # Report: accuracy rounded to 4 decimals, then the raw confusion matrix.
    print(f"\n{name} | Accuracy: {round(accuracy, 4)}")
    print("Confusion Matrix:\n", matrix)

    return accuracy

In [39]:
def visualize_results(X, y, classifier, title, step=0.01):
    """Plot a classifier's decision regions over two features with the samples overlaid.

    Parameters
    ----------
    X : 2-D array; columns 0 and 1 are used as the horizontal and vertical
        axes (labelled "LD1" and "LD2").
    y : array of class labels aligned with the rows of ``X``.
    classifier : object exposing ``predict`` that accepts an
        ``(n_points, 2)`` array.
    title : figure title.
    step : spacing of the evaluation grid. Defaults to the previously
        hard-coded 0.01.
    """
    # Build the colormap once and share it between regions and points.
    palette = ListedColormap(('red', 'green', 'blue'))

    # Grid covering the data range padded by 1 unit on each side.
    X1, X2 = np.meshgrid(
        np.arange(start=X[:, 0].min() - 1, stop=X[:, 0].max() + 1, step=step),
        np.arange(start=X[:, 1].min() - 1, stop=X[:, 1].max() + 1, step=step)
    )

    grid_points = np.array([X1.ravel(), X2.ravel()]).T
    regions = classifier.predict(grid_points).reshape(X1.shape)

    plt.contourf(X1, X2, regions, alpha=0.75, cmap=palette)
    plt.xlim(X1.min(), X1.max())
    plt.ylim(X2.min(), X2.max())

    for i, j in enumerate(np.unique(y)):
        # Use `color=` rather than `c=`: a single RGBA tuple passed as `c` is
        # ambiguous with an array of values to colormap (matplotlib warns, and
        # mis-colors the points when a class has exactly 4 samples).
        plt.scatter(
            X[y == j, 0], X[y == j, 1],
            color=palette(i), label=j
        )

    plt.title(title)
    plt.xlabel("LD1")
    plt.ylabel("LD2")
    plt.legend()
    plt.show()

In [40]:
def run_pipeline(lda_list=None):
    """Evaluate a fixed set of classifiers on LDA-reduced data.

    For every value in ``lda_list`` the data returned by ``load_data()`` is
    projected with ``apply_lda`` and each classifier is trained and scored
    with ``train_and_eval``. A Model x LDA accuracy table is printed at the
    end.

    Parameters
    ----------
    lda_list : iterable of int, optional
        ``n_components`` values to try, in the order given. Defaults to
        ``[1, 2]``. Values should be unique: the final ``DataFrame.pivot``
        needs unique (Model, LDA) pairs.

    Returns
    -------
    pandas.DataFrame
        One row per (LDA, Model) run with columns ``"LDA"``, ``"Model"``
        and ``"Accuracy"``.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if lda_list is None:
        lda_list = [1, 2]

    # 1. Load dataset
    X_train, X_test, y_train, y_test = load_data()

    # 2. Define models (the same estimator objects are re-fitted for each
    #    LDA setting inside train_and_eval)
    models = [
        ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=0)),
        ("SVM (Linear)", SVC(kernel="linear", random_state=0)),
        ("KNN", KNeighborsClassifier(n_neighbors=5)),
        ("Decision Tree", DecisionTreeClassifier(random_state=0)),
        ("Random Forest", RandomForestClassifier(n_estimators=200, random_state=0)),
        ("Gaussian NB", GaussianNB()),
    ]

    results = []

    # 3. Loop over LDA components
    for n in lda_list:
        print(f"\n==== LDA Components = {n} ====")

        # Apply LDA (the fitted LDA object itself is not needed here)
        X_train_lda, X_test_lda, _ = apply_lda(X_train, X_test, y_train, n_components=n)

        # Train and evaluate each model
        for name, clf in models:
            acc = train_and_eval(name, clf, X_train_lda, y_train, X_test_lda, y_test)
            results.append([n, name, acc])

    # 4. Build results DataFrame
    df = pd.DataFrame(results, columns=["LDA", "Model", "Accuracy"])
    print("\n===== FINAL RESULTS TABLE =====")
    print(df.pivot(index="Model", columns="LDA", values="Accuracy"))

    return df


In [41]:
# Run only with 1 component. Stored under its own name so the second run
# below does not silently discard these results.
df_results_lda1 = run_pipeline(lda_list=[1])

# Run with both 1 and 2 components (evaluated in the order given: 2, then 1)
df_results = run_pipeline(lda_list=[2, 1])


==== LDA Components = 1 ====

Logistic Regression | Accuracy: 0.9167
Confusion Matrix:
 [[11  1  0]
 [ 2 12  0]
 [ 0  0 10]]

SVM (Linear) | Accuracy: 0.9167
Confusion Matrix:
 [[11  1  0]
 [ 2 12  0]
 [ 0  0 10]]

KNN | Accuracy: 0.9167
Confusion Matrix:
 [[11  1  0]
 [ 2 12  0]
 [ 0  0 10]]

Decision Tree | Accuracy: 0.9444
Confusion Matrix:
 [[12  0  0]
 [ 2 12  0]
 [ 0  0 10]]

Random Forest | Accuracy: 0.9444
Confusion Matrix:
 [[12  0  0]
 [ 2 12  0]
 [ 0  0 10]]

Gaussian NB | Accuracy: 0.9167
Confusion Matrix:
 [[11  1  0]
 [ 2 12  0]
 [ 0  0 10]]

===== FINAL RESULTS TABLE =====
LDA                         1
Model                        
Decision Tree        0.944444
Gaussian NB          0.916667
KNN                  0.916667
Logistic Regression  0.916667
Random Forest        0.944444
SVM (Linear)         0.916667

==== LDA Components = 2 ====

Logistic Regression | Accuracy: 1.0
Confusion Matrix:
 [[12  0  0]
 [ 0 14  0]
 [ 0  0 10]]

SVM (Linear) | Accuracy: 1.0
Confusion M

# ✅ Conclusion

From the results:

- With **1 LDA component**, the best models were **Decision Tree** and **Random Forest** (~94% accuracy).  
- With **2 LDA components**, almost all models (Logistic Regression, SVM, KNN, Gaussian NB, Random Forest) reached **100% accuracy**.  
- **Decision Tree** also improved but was slightly lower (~97%).  

### 🏆 Best Model:
With **2 LDA components**, the top performers are:  
**Logistic Regression, SVM, KNN, Gaussian NB, and Random Forest** — all achieving **perfect accuracy (100%)**.  