In [None]:
# Load the preprocessed splits (already Z-score normalised and train/test split).
import joblib

# NOTE(review): joblib.load unpickles the file contents, so only point this at
# .pkl files produced by your own preprocessing step.
X_train, X_test, y_train, y_test = map(
    joblib.load,
    ('X_train.pkl', 'X_test.pkl', 'y_train.pkl', 'y_test.pkl'),
)

# Report the dimensions of every split as a quick sanity check.
for caption, split in (
    ("Train features:", X_train),
    ("Test features:", X_test),
    ("Train labels:  ", y_train),
    ("Test labels:   ", y_test),
):
    print(caption, split.shape)


In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler

# Read the raw dataset so the effect of normalisation can be shown side by side.
wine = pd.read_csv('winequality-red.csv')
X_original = wine.drop(columns='quality')

# Re-create the Z-score step for illustration only: the scaler is fitted on the
# full feature table here, independently of the saved train/test splits.
sc = StandardScaler()
X_scaled = sc.fit(X_original).transform(X_original)
X_scaled_df = pd.DataFrame(X_scaled, columns=X_original.columns)

# Show the first five rows before and after scaling, in that order.
for heading, frame in (
    ("📋 Before Z-Score Normalization:", X_original),
    ("\n📋 After Z-Score Normalization:", X_scaled_df),
):
    print(heading)
    display(frame.head())


In [None]:
import joblib
import numpy as np  # needed below; previously only imported in a later cell
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from scipy.spatial import distance

# Load preprocessed and scaled data from .pkl files.
# NOTE(review): joblib.load unpickles the file contents; only load trusted files.
X_train = joblib.load('X_train.pkl')
X_test = joblib.load('X_test.pkl')
y_train = joblib.load('y_train.pkl')
y_test = joblib.load('y_test.pkl')

# Example: train KNN with k=1 using the Euclidean metric.
knn = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Reproduce the nearest-neighbour search by hand for the first test sample.
print("🔢 Manual Euclidean Distance Calculation (first test sample):")

# Coerce to plain arrays so positional row access works whether the splits were
# saved as NumPy arrays or as pandas objects (iterating a DataFrame would yield
# column names, and X_test[0] would be a column lookup).
train_matrix = np.asarray(X_train, dtype=float)
first_test_row = np.asarray(X_test, dtype=float)[0]
train_labels = np.asarray(y_train)

# Distance from the first test row to every training row: sqrt(sum((a - b)^2)).
dists = np.sqrt(np.sum((train_matrix - first_test_row) ** 2, axis=1))
nearest_index = int(np.argmin(dists))
print(f"Closest training sample index: {nearest_index}, Label: {train_labels[nearest_index]}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score

# Candidate neighbourhood sizes (odd values, as in the original sweep).
k_values = [1, 3, 5, 7, 9]
val_scores = []

# Number of cross-validation folds used to estimate validation accuracy.
CV_FOLDS = 5

# Score each k with cross-validation on the TRAINING split only. Selecting k by
# accuracy on X_test would leak the test set into model selection and make the
# final test-set report optimistic; the test split stays untouched until the
# final evaluation.
for k in k_values:
    model = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
    fold_scores = cross_val_score(model, X_train, y_train, cv=CV_FOLDS, scoring='accuracy')
    acc = fold_scores.mean()
    val_scores.append(acc)
    print(f"K = {k} → Accuracy = {acc:.4f}")

# Plot mean validation accuracy per k.
plt.figure(figsize=(8,5))
plt.plot(k_values, val_scores, marker='o', color='blue')
plt.title("Validation Accuracy vs K")
plt.xlabel("K Value")
plt.ylabel("Accuracy")
plt.grid(True)
plt.show()

# Pick the k with the highest mean validation accuracy (first one wins on ties,
# i.e. the smallest such k).
best_k = k_values[np.argmax(val_scores)]
print(f"\n✅ Best K = {best_k} with Accuracy = {max(val_scores)*100:.2f}%")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Fit the final KNN classifier with the selected neighbourhood size and predict
# labels for the held-out test split.
final_knn = KNeighborsClassifier(metric='euclidean', n_neighbors=best_k)
final_knn.fit(X_train, y_train)
y_pred = final_knn.predict(X_test)

# Print each evaluation artefact under its own banner, in a fixed order.
report_sections = (
    ("=== Classification Report ===", classification_report),
    ("=== Confusion Matrix ===", confusion_matrix),
)
for banner, metric_fn in report_sections:
    print(banner)
    print(metric_fn(y_test, y_pred))


In [None]:
from sklearn.decomposition import PCA
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np

# Reduce features to 2D using PCA for visualization. The projection is fitted on
# the training split and then applied to the test split.
pca = PCA(n_components=2)
X_train_2D = pca.fit_transform(X_train)
X_test_2D = pca.transform(X_test)

# Train KNN on the reduced 2D data (this model is for plotting only).
knn_2D = KNeighborsClassifier(n_neighbors=best_k, metric='euclidean')
knn_2D.fit(X_train_2D, y_train)

# Build a dense grid covering the projected training points plus a margin of 1.
h = 0.02  # grid step; smaller values give smoother boundaries but more points
x_min, x_max = X_train_2D[:, 0].min() - 1, X_train_2D[:, 0].max() + 1
y_min, y_max = X_train_2D[:, 1].min() - 1, X_train_2D[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Plain-array view of the labels so boolean masks index X_train_2D positionally.
train_labels = np.asarray(y_train)
classes = np.unique(train_labels)  # sorted unique labels
n_classes = len(classes)

# Predict for each point in the mesh, then map labels to contiguous indices
# 0..n_classes-1 so the shading does not depend on the raw label values.
Z = knn_2D.predict(np.c_[xx.ravel(), yy.ravel()])
Z = np.searchsorted(classes, Z).reshape(xx.shape)

# Choose one colour per class. The original red/blue scheme is kept for the
# binary case; otherwise colours are drawn from a qualitative palette so any
# number of classes can be plotted without running out of colours.
if n_classes == 2:
    cmap_bold = ['red', 'blue']
    cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])  # class 0: red, class 1: blue
else:
    palette = plt.get_cmap('tab10' if n_classes <= 10 else 'tab20')
    cmap_bold = [palette(i % palette.N) for i in range(n_classes)]
    cmap_light = ListedColormap(cmap_bold)

# Plot decision regions: one contour band per class index, with band edges at
# the half-integers so each region gets exactly its class colour.
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z,
             levels=np.arange(n_classes + 1) - 0.5,
             cmap=cmap_light,
             alpha=0.4)

# Overlay the training points, coloured by their true class.
for idx, cls in enumerate(classes):
    in_class = train_labels == cls
    plt.scatter(X_train_2D[in_class, 0],
                X_train_2D[in_class, 1],
                color=cmap_bold[idx],
                label=f'Class {cls}',
                edgecolor='k')

plt.title(f"KNN Decision Boundary (k = {best_k}, 2D PCA projection)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()