In [2]:
# Step 1: Import required libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [3]:
# ques 1 
#part[i]
# Step 2: Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Convert to DataFrame for clarity
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y

df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [4]:
# Step 3: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [5]:
# Step 4: Implement Gaussian Naïve Bayes manually

class GaussianNaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y) #Finds all classes present in training labels and stores them
        self.mean = {}
        self.var = {}
        self.prior = {}

        for c in self.classes:
            X_c = X[y == c]
            self.mean[c] = X_c.mean(axis=0)
            self.var[c] = X_c.var(axis=0)
            self.prior[c] = X_c.shape[0] / X.shape[0]

    def calculate_likelihood(self, mean, var, x):
        eps = 1e-6  # small number to prevent divide by zero
        coeff = 1.0 / np.sqrt(2.0 * np.pi * var + eps)
        exponent = np.exp(-(x - mean) ** 2 / (2 * var + eps))
        return coeff * exponent

    def calculate_posterior(self, x):
        posteriors = []
        for c in self.classes:
            prior = np.log(self.prior[c])
            likelihood = np.sum(np.log(self.calculate_likelihood(self.mean[c], self.var[c], x)))
            posterior = prior + likelihood
            posteriors.append(posterior)
        return self.classes[np.argmax(posteriors)]

    def predict(self, X):
        return np.array([self.calculate_posterior(x) for x in X])


In [6]:
# Step 5: Train and test manual Gaussian Naïve Bayes model
gnb_manual = GaussianNaiveBayes()
gnb_manual.fit(X_train, y_train)
y_pred_manual = gnb_manual.predict(X_test)

print("Accuracy (Manual):", accuracy_score(y_test, y_pred_manual))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_manual))
print("\nClassification Report:\n", classification_report(y_test, y_pred_manual))


Accuracy (Manual): 0.9777777777777777

Confusion Matrix:
 [[19  0  0]
 [ 0 12  1]
 [ 0  0 13]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      0.92      0.96        13
           2       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.97        45
weighted avg       0.98      0.98      0.98        45



In [8]:
# part[ii]

from sklearn.naive_bayes import GaussianNB

# Step 1: Create model
gnb = GaussianNB()

# Step 2: Train model
gnb.fit(X_train, y_train)

# Step 3: Predict
y_pred = gnb.predict(X_test)

# Step 4: Evaluate
print("Accuracy (In-built):", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy (In-built): 1.0

Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [7]:
# ques 2:

# Step 1: Import required libraries
from sklearn.datasets import load_iris          # to load the Iris dataset
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Step 2: Load the dataset
iris = load_iris()
X = iris.data        # features (sepal length, width, etc.)
y = iris.target      # labels (0 = Setosa, 1 = Versicolor, 2 = Virginica)

# Step 3: Split dataset into training and testing data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 4: Define the model (KNN classifier)
knn = KNeighborsClassifier()

# Step 5: Define the parameter grid — values of K to try
param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15]}

# Step 6: Create the GridSearchCV object
# cv=5 means 5-fold cross-validation
grid = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')

# Step 7: Fit (train) the GridSearchCV on the training data
grid.fit(X_train, y_train)

# Step 8: Print the best parameter and corresponding accuracy
print("Best K value:", grid.best_params_)
print("Best cross-validation accuracy:", grid.best_score_)

# Step 9: Evaluate the best model on the test set
best_knn = grid.best_estimator_          # get the best model found by GridSearchCV
y_pred = best_knn.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))


Best K value: {'n_neighbors': 3}
Best cross-validation accuracy: 0.9583333333333334
Test accuracy: 1.0
