# Step 1: Read the data

In [2]:
import pandas as pd

# Load the Iris table, taking the first CSV column as the row index,
# then preview the first five rows.
df = pd.read_csv(filepath_or_buffer="Iris.csv", index_col=0)
df.head(n=5)

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


# Step 2: Define the input and target features

In [3]:
# Target: the species label. Features: every remaining column.
# `columns=` already names the axis to drop from, so no `axis` argument is needed.
y = df["Species"]
X = df.drop(columns="Species")

In [4]:
# Preview the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,5.1,3.5,1.4,0.2
2,4.9,3.0,1.4,0.2
3,4.7,3.2,1.3,0.2
4,4.6,3.1,1.5,0.2
5,5.0,3.6,1.4,0.2


In [5]:
# Preview the first five entries of the target
y.head(n=5)

Id
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
5    Iris-setosa
Name: Species, dtype: object

# Step 3: Split the data into train (80%) and test (20%)

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and keep the remaining 80% for training.
# Stratifying on y preserves the class proportions in both partitions, and the
# fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


# Step 4: Build a 3-NN model on the train dataset and use it to make predictions on the test dataset. Report the accuracy.

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Standardize the features: the scaler learns its statistics from the
# training split only and is then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a 3-nearest-neighbour classifier on the scaled training data
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)

# Predict the held-out samples
y_pred = knn.predict(X_test_scaled)

# Report overall accuracy, per-class metrics and the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Accuracy: 0.93
Classification Report:
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.83      1.00      0.91        10
 Iris-virginica       1.00      0.80      0.89        10

       accuracy                           0.93        30
      macro avg       0.94      0.93      0.93        30
   weighted avg       0.94      0.93      0.93        30

Confusion Matrix:
[[10  0  0]
 [ 0 10  0]
 [ 0  2  8]]


# Step 5: Try different values for k

In [9]:
# Repeat the Step 4 experiment for every neighbourhood size from 2 to 20,
# reusing the already-scaled train/test splits.
# NOTE(review): every k is scored on the test split, so picking k from these
# numbers tends to give an optimistic estimate — confirm the choice with
# cross-validation.
for k in range(2, 21):
    # Build and fit the k-NN classifier for the current k
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)

    # Predict the test split and score the predictions
    y_pred = knn.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for k={k}: {accuracy:.2f}")

Accuracy for k=2: 0.93
Accuracy for k=3: 0.93
Accuracy for k=4: 0.93
Accuracy for k=5: 0.93
Accuracy for k=6: 0.93
Accuracy for k=7: 0.97
Accuracy for k=8: 0.93
Accuracy for k=9: 0.97
Accuracy for k=10: 0.97
Accuracy for k=11: 0.97
Accuracy for k=12: 0.97
Accuracy for k=13: 0.97
Accuracy for k=14: 0.93
Accuracy for k=15: 0.97
Accuracy for k=16: 0.97
Accuracy for k=17: 0.97
Accuracy for k=18: 0.97
Accuracy for k=19: 0.97
Accuracy for k=20: 0.97


# Step 6: Using the results from Step 5, run a 5-fold cross validation

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Defining the cross-validation strategy: 5 stratified folds, shuffled with a
# fixed seed so the fold assignment is repeatable
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Defining the k-NN classifier (k=7 is the smallest k that reached 0.97 in Step 5)
knn = KNeighborsClassifier(n_neighbors=7)

# Bundling the scaler with the classifier. Fitting the scaler on the whole of X
# before cross-validating would let every held-out fold influence the mean/std
# used to scale its own training data (data leakage). Inside a pipeline the
# scaler is re-fitted on the training portion of each fold only.
scaled_knn = make_pipeline(StandardScaler(), knn)

# Running cross-validation on the raw (unscaled) features; cross_val_score
# clones the pipeline for each fold, so `knn` itself is left unfitted
knn_cv = cross_val_score(scaled_knn, X, y, cv=skf, scoring='accuracy', n_jobs=-1)

# Displaying the cross-validation results (mean ± standard deviation over folds)
print(f"Cross-validation accuracy: {knn_cv.mean():.2f} ± {knn_cv.std():.2f}")

Cross-validation accuracy: 0.96 ± 0.03


In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified splitter; shuffling with a fixed seed keeps the folds repeatable
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Scaling and the 7-NN classifier chained into a single estimator, so the
# scaler is fitted anew on the training portion of each fold
knn = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=7),
)

# Score the pipeline on the unscaled features: one accuracy value per fold
knn_cv = cross_val_score(
    estimator=knn,
    X=X,
    y=y,
    scoring='accuracy',
    cv=skf,
    n_jobs=-1,
)

# Summarize the fold accuracies as mean ± standard deviation
print(f"Cross-validation accuracy: {knn_cv.mean():.2f} ± {knn_cv.std():.2f}")

Cross-validation accuracy: 0.96 ± 0.03
