<a href="https://colab.research.google.com/github/natsakh/IAD/blob/main/Pr_3/3_5_PCA_vs_noPCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import time
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Data: load the Wine dataset and hold out a stratified 25% test split
X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Baseline model: kNN without PCA (features are still standardized)
pipe_knn = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=5)),
    ]
)

# Model with PCA(0.95) + kNN: same scaler and classifier, with a PCA step in between
pipe_pca_knn = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=0.95, svd_solver="full", random_state=42)),
        ("knn", KNeighborsClassifier(n_neighbors=5)),
    ]
)

# Cross-validation for the comparison: both pipelines are scored with the
# same seeded, shuffled 5-fold stratified splitter on the training split only
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_knn, cv_pca_knn = (
    cross_val_score(candidate, X_train, y_train, cv=cv, scoring="accuracy")
    for candidate in (pipe_knn, pipe_pca_knn)
)

# Report mean and standard deviation of the per-fold accuracies
print("CV accuracy (mean±std)")
print(f"  kNN (no PCA):      {np.mean(cv_knn):.4f} ± {np.std(cv_knn):.4f}")
print(f"  PCA(0.95)+kNN:     {np.mean(cv_pca_knn):.4f} ± {np.std(cv_pca_knn):.4f}")


CV accuracy (mean±std)
  kNN (no PCA):      0.9553 ± 0.0431
  PCA(0.95)+kNN:     0.9627 ± 0.0331


In [3]:
# Fit on the train split, evaluate on the test split, and time both phases
def fit_eval(pipeline, X_train, y_train, X_test, y_test, label="model"):
    """Fit ``pipeline`` on the training data, score it on the test data, and print a report.

    The pipeline is fitted in place, so it remains fitted after the call.
    The printed report contains the label, PCA details (only when the
    pipeline has a step named ``"pca"``), test accuracy, wall-clock fit and
    predict times, the confusion matrix, and the classification report.

    Parameters
    ----------
    pipeline : estimator exposing ``fit``, ``predict`` and ``named_steps``
        The model to train and evaluate.
    X_train, y_train : training features and labels passed to ``pipeline.fit``.
    X_test, y_test : held-out features and labels used for prediction and scoring.
    label : str, default "model"
        Heading printed above the metrics.

    Returns
    -------
    None
        Results are printed only.
    """
    t0 = time.perf_counter()
    pipeline.fit(X_train, y_train)
    t1 = time.perf_counter()
    y_pred = pipeline.predict(X_test)
    t2 = time.perf_counter()
    acc = accuracy_score(y_test, y_pred)
    print(f"\n{label}")
    if "pca" in pipeline.named_steps:
        pca = pipeline.named_steps["pca"]
        # Take the feature count from the data this function was given rather
        # than from a notebook-level global, so the function is self-contained.
        n_features = np.shape(X_train)[1]
        print(f"  PCA kept components: {pca.n_components_} (of {n_features})")
        print(f"  Explained variance sum: {pca.explained_variance_ratio_.sum():.3f}")
    print(f"  Test accuracy: {acc:.4f}")
    print(f"  Fit time: {t1 - t0:.4f}s | Predict time: {t2 - t1:.6f}s")
    print("  Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    print("  Classification report:\n", classification_report(y_test, y_pred, digits=4))

# Train and evaluate the baseline, then the PCA variant, on the same splits
fit_eval(
    pipeline=pipe_knn,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    label="kNN (no PCA)",
)
fit_eval(
    pipeline=pipe_pca_knn,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    label="PCA(0.95)+kNN",
)


kNN (no PCA)
  Test accuracy: 0.9333
  Fit time: 0.0078s | Predict time: 0.004992s
  Confusion matrix:
 [[15  0  0]
 [ 0 16  2]
 [ 0  1 11]]
  Classification report:
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        15
           1     0.9412    0.8889    0.9143        18
           2     0.8462    0.9167    0.8800        12

    accuracy                         0.9333        45
   macro avg     0.9291    0.9352    0.9314        45
weighted avg     0.9354    0.9333    0.9337        45


PCA(0.95)+kNN
  PCA kept components: 10 (of 13)
  Explained variance sum: 0.964
  Test accuracy: 0.9556
  Fit time: 0.0043s | Predict time: 0.003951s
  Confusion matrix:
 [[15  0  0]
 [ 0 16  2]
 [ 0  0 12]]
  Classification report:
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        15
           1     1.0000    0.8889    0.9412        18
           2     0.8571    1.0000    0.9231        12