<a href="https://colab.research.google.com/github/msivakumar47/SIVAKUMAR-M/blob/main/k_means_project18.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the Iris dataset: feature matrix X and class labels y
data = load_iris()
X = data.data
y = data.target

# Split into train/test sets (80-20), stratified by label distribution.
# The split happens BEFORE scaling so that the held-out rows take no part in
# preprocessing; random_state is fixed so the split is reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# KNN is distance-based, so features are standardized to a comparable scale.
# The scaler is fitted on the training split only: fitting it on the full
# dataset would leak test-set statistics (mean/variance) into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with any code that reads X_scaled: the full
# dataset transformed with the training-set statistics.
X_scaled = scaler.transform(X)

# Odd K values to evaluate: 1, 3, ..., 15
k_values = list(range(1, 16, 2))

# Per-K results, index-aligned with k_values
train_accuracies = []
test_accuracies = []
elapsed_times = []

print("KNN Performance Metrics:")
print("-" * 30)

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)

    # The timed region covers fitting plus prediction on both splits.
    # perf_counter() is used instead of time() because it is a monotonic,
    # high-resolution clock meant for measuring short durations; time() can
    # be too coarse to resolve a run this small and may jump if the system
    # clock is adjusted.
    start = time.perf_counter()
    knn.fit(X_train, y_train)
    train_pred = knn.predict(X_train)
    test_pred = knn.predict(X_test)
    end = time.perf_counter()

    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)
    elapsed = end - start

    train_accuracies.append(train_acc)
    test_accuracies.append(test_acc)
    elapsed_times.append(elapsed)

    print(f"K={k:<2} | Train Acc={train_acc:.3f} | Test Acc={test_acc:.3f} | Time={elapsed:.4f}s")

# Draw training and testing accuracy against K on a single set of axes so the
# two curves can be compared directly.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, train_accuracies, marker='o', label="Training Accuracy")
ax.plot(k_values, test_accuracies, marker='x', label="Testing Accuracy")
ax.set_title("KNN Accuracy vs Number of Neighbors (K)")
ax.set_xlabel("Number of Neighbors (K)")
ax.set_ylabel("Accuracy")
ax.set_xticks(k_values)  # one tick per evaluated K
ax.legend()
ax.grid(True)
plt.show()

# Pick the K with the highest test accuracy. np.argmax returns the first
# maximum, so ties resolve to the smallest K in k_values.
# NOTE(review): K is selected on the test split itself, so the reported
# accuracy is an optimistic estimate; a validation split or cross-validation
# on the training data would give an unbiased choice.
best_index = int(np.argmax(test_accuracies))
optimal_k = k_values[best_index]
optimal_test_acc = test_accuracies[best_index]

print(f"\nOptimal K: {optimal_k} with Test Accuracy: {optimal_test_acc:.3f}\n")

# Narrative summary for the project report; only the optimal K and its test
# accuracy are filled in from the results, the rest is fixed text.
analysis = f"""
Analysis:
- Small K values (like K=1) have very high training accuracy, indicating the model fits closely to training data (potential overfitting).
- As K increases, training accuracy tends to decrease slightly while testing accuracy stabilizes or improves, indicating reduced variance.
- Large K values smooth decision boundaries, increasing bias but helping reduce overfitting.
- The best K value balances this trade-off; here, optimal K = {optimal_k} achieved highest test accuracy of {optimal_test_acc:.3f}.
- This illustrates the classical bias-variance trade-off where K controls model complexity.
"""
print(analysis)
