In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
# Load the labelled iris measurements, then split them into the four feature
# columns (X) and the class-label column (Y).
df = pd.read_csv("iris_dataset.csv")
X = df[
    [
        "sepal length (cm)",
        "sepal width (cm)",
        "petal length (cm)",
        "petal width (cm)",
    ]
]
Y = df["target"]
classes = np.unique(Y)  # sorted array of the distinct labels found in Y

# stats[label][feature] holds descriptive statistics of that feature computed
# only over the rows of X whose label equals `label`.
stats = {}
for label in classes:
    rows = X[Y == label]
    per_feature = {}
    for feature in X.columns:
        values = rows[feature]
        lowest = values.min()
        highest = values.max()
        per_feature[feature] = {
            "mean": values.mean(),
            "var": values.var(),  # pandas default: sample variance (ddof=1)
            "max": highest,
            "min": lowest,
            "range": highest - lowest,
        }
    stats[label] = per_feature
def gaussian_prob(x, mean, var):
    """Return the normal probability density N(mean, var) evaluated at x.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        var: Variance of the distribution; must be strictly positive.

    Returns:
        The density value. It underflows to 0.0 when x lies extremely far
        from the mean.

    Raises:
        ValueError: If var is negative (raised by math.sqrt).
        ZeroDivisionError: If var is zero.
    """
    sigma = math.sqrt(var)
    # Computed first so that a zero variance fails here with ZeroDivisionError.
    norm = 1 / (math.sqrt(2 * math.pi) * sigma)
    z = (x - mean) / sigma
    # z * z saturates to inf for huge |z| (so exp(-inf) == 0.0), whereas
    # z ** 2 on Python floats raises OverflowError instead.
    return norm * math.exp(-0.5 * (z * z))
# Scatter colour used for each class label.
colors = {"setosa": "blue", "versicolor": "black", "virginica": "red"}

# Assign every row of X to the class whose Gaussian (built from that class's
# petal-length mean and variance) gives the row's petal length the highest
# density.
petal_length = X["petal length (cm)"]
predictions = []
for x_val in petal_length:
    probs = {}
    for label in classes:
        length_stats = stats[label]["petal length (cm)"]
        probs[label] = gaussian_prob(
            x_val, length_stats["mean"], length_stats["var"]
        )
    # On ties, max() keeps the first class in `classes` order.
    predictions.append(max(probs, key=probs.get))

# One-dimensional scatter: petal length on the x axis, every point at y = 0,
# coloured by the predicted class.
predicted = np.array(predictions)
plt.figure(figsize=(8, 4))
for label in classes:
    x_plot = petal_length[predicted == label]
    plt.scatter(x_plot, [0] * len(x_plot), c=colors[label], label=label)
plt.xlabel("Petal Length (cm)")
plt.title("Classification using Petal Length only")
plt.legend()
plt.show()

# "precision" here is the percentage of rows whose predicted label equals the
# true label in Y.
correct = sum(guess == truth for guess, truth in zip(predictions, Y))
precision = correct / len(Y) * 100
print("Precision using petal length only:", precision)
# Load the separate test samples with the same feature/label layout as the
# main dataset.
test_df = pd.read_csv("iris_test_samples.csv")
X_test = test_df[
    [
        "sepal length (cm)",
        "sepal width (cm)",
        "petal length (cm)",
        "petal width (cm)",
    ]
]
Y_test = test_df["target"]

# Classify each test row from petal length alone, reusing the per-class
# mean/variance already stored in `stats`.
predictions_test = []
for x_val in X_test["petal length (cm)"]:
    probs = {}
    for label in classes:
        length_stats = stats[label]["petal length (cm)"]
        probs[label] = gaussian_prob(
            x_val, length_stats["mean"], length_stats["var"]
        )
    predictions_test.append(max(probs, key=probs.get))

# Percentage of test rows whose predicted label equals the true label.
correct_test = sum(
    guess == truth for guess, truth in zip(predictions_test, Y_test)
)
precision_test = correct_test / len(Y_test) * 100
print("Precision on test data using petal length only:", precision_test)
# Classify each test row from petal length and petal width together. The
# score of a class is the arithmetic mean of the two per-feature Gaussian
# densities (not their product).
predictions_test_2feat = []
for length, width in zip(
    X_test["petal length (cm)"], X_test["petal width (cm)"]
):
    probs = {}
    for label in classes:
        length_stats = stats[label]["petal length (cm)"]
        width_stats = stats[label]["petal width (cm)"]
        p_length = gaussian_prob(
            length, length_stats["mean"], length_stats["var"]
        )
        p_width = gaussian_prob(
            width, width_stats["mean"], width_stats["var"]
        )
        probs[label] = (p_length + p_width) / 2
    predictions_test_2feat.append(max(probs, key=probs.get))

# Percentage of test rows whose predicted label equals the true label.
correct_test_2feat = sum(
    guess == truth for guess, truth in zip(predictions_test_2feat, Y_test)
)
precision_test_2feat = correct_test_2feat / len(Y_test) * 100
print("Precision on test data using petal length & width:", precision_test_2feat)
# Classify each test row using all four features. For every class the
# Gaussian density of each feature is evaluated, and the class score is the
# mean of the two largest densities.
predictions_test_all = []
for _, sample in X_test.iterrows():
    probs_all = {}
    for label in classes:
        densities = sorted(
            (
                gaussian_prob(
                    sample[feature],
                    stats[label][feature]["mean"],
                    stats[label][feature]["var"],
                )
                for feature in X_test.columns
            ),
            reverse=True,
        )
        probs_all[label] = sum(densities[:2]) / 2
    predictions_test_all.append(max(probs_all, key=probs_all.get))

# Percentage of test rows whose predicted label equals the true label.
correct_test_all = sum(
    guess == truth for guess, truth in zip(predictions_test_all, Y_test)
)
precision_test_all = correct_test_all / len(Y_test) * 100
print("Precision on test data using 4 features (top 2 probs):", precision_test_all)