In [170]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.preprocessing import StandardScaler


In [171]:
def load_data(file_path):
    """Load a CSV and return standardized features plus a capped ``disease`` target.

    Every column is coerced to numeric (entries that cannot be parsed become
    NaN), the ``num`` column is turned into the ``disease`` target by capping
    it at 1, rows containing any NaN are dropped, and the remaining feature
    columns are standardized with ``StandardScaler``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file. It must contain a ``num`` column.

    Returns
    -------
    X : numpy.ndarray
        Standardized feature matrix, one row per retained record.
    y : numpy.ndarray
        ``disease`` labels aligned with the rows of ``X``.

    Notes
    -----
    The scaler is fitted on every retained row, so any later train/test
    split of ``X`` shares scaling statistics between both sides.
    """
    data = pd.read_csv(file_path)

    # Coerce first: deriving the target from a raw text column would raise a
    # TypeError as soon as one `num` entry is not a number.
    data = data.apply(pd.to_numeric, errors='coerce')

    # Values above 1 are capped at 1, everything else is unchanged. NaN stays
    # NaN and is removed together with the other incomplete rows below.
    data['disease'] = data['num'].clip(upper=1)
    data = data.drop('num', axis=1).dropna()

    X = data.drop('disease', axis=1)
    y = data['disease']

    # asarray guarantees a plain ndarray without wrapping the scaled values
    # in a DataFrame just to unwrap them again.
    X_scaled = np.asarray(StandardScaler().fit_transform(X))

    return X_scaled, y.to_numpy()


In [172]:
# Load the dataset once: X is the standardized feature array, y the `disease` labels (`num` capped at 1).
X, y = load_data('data/cleveland.csv')

In [173]:
class kNN(BaseEstimator, ClassifierMixin):
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Parameters
    ----------
    k : int, default=3
        Number of nearest training samples that vote on each prediction.
        If ``k`` exceeds the number of training samples, all of them vote.

    Attributes
    ----------
    X_ : ndarray
        Training samples stored by ``fit`` (after ``check_X_y`` validation).
    y_ : ndarray
        Training labels stored by ``fit``.
    classes_ : ndarray
        Sorted unique labels seen during ``fit``.
    """

    def __init__(self, k=3):
        # Stored untouched so the get_params/set_params inherited from
        # BaseEstimator keep working; validation happens in fit/predict.
        self.k = k

    def _check_k(self):
        """Raise ValueError unless ``k`` is a positive integer."""
        if not isinstance(self.k, (int, np.integer)) or self.k < 1:
            raise ValueError(f"k must be a positive integer, got {self.k!r}")

    def fit(self, X, y):
        """Validate the input and memorise the training set.

        Parameters
        ----------
        X : array-like
            Training samples.
        y : array-like
            Training labels.

        Returns
        -------
        self : kNN
            The fitted estimator.

        Raises
        ------
        ValueError
            If ``k`` is not a positive integer, or if ``check_X_y`` rejects
            the input.
        """
        self._check_k()
        X, y = check_X_y(X, y)
        self.X_ = X
        self.y_ = y
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        """Predict one label per row of ``X`` by majority vote of the k nearest samples.

        Parameters
        ----------
        X : array-like
            Samples to classify.

        Returns
        -------
        numpy.ndarray
            Predicted labels, one per row of ``X``.

        Raises
        ------
        ValueError
            If ``k`` is not a positive integer (it may have been changed
            after ``fit``).
        """
        check_is_fitted(self)
        # Re-check here because k can be changed via set_params after fit.
        self._check_k()
        X = check_array(X)
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        """Return the most frequent label among the k training samples closest to ``x``."""
        # One vectorised pass over the training matrix instead of a Python
        # loop over its rows.
        distances = np.sqrt(((self.X_ - x) ** 2).sum(axis=1))
        # A stable sort breaks equal-distance ties by training order, which
        # keeps predictions deterministic.
        k_indices = np.argsort(distances, kind='stable')[:self.k]
        # Counter keeps first-seen order, so a vote tie goes to the label of
        # the nearest neighbour among the tied classes.
        most_common = Counter(self.y_[k_indices]).most_common(1)
        return most_common[0][0]

In [174]:
def cross_validate(X, y, model, n_splits=10, random_state=42):
    """Evaluate ``model`` with shuffled K-fold cross-validation.

    For every fold the model is fitted on the training part and scored on the
    held-out part with precision, recall and F1 (each called with its default
    ``average`` setting and ``zero_division=0``). The per-fold scores are
    printed as they are produced.

    Parameters
    ----------
    X : array-like
        Feature matrix. pandas objects are converted to NumPy arrays so the
        positional fold indices can be applied.
    y : array-like
        Labels aligned with the rows of ``X``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place and is left fitted on the last fold's training data.
    n_splits : int, default=10
        Number of folds passed to ``KFold``.
    random_state : int, default=42
        Seed for the ``KFold`` shuffle.

    Returns
    -------
    dict
        Keys ``'precision'``, ``'recall'`` and ``'f1'``, each mapping to a
        ``(mean, std)`` tuple over the folds (``np.mean`` / ``np.std``).
    """
    # KFold yields positional indices; X[idx] on a DataFrame would be treated
    # as a column lookup, so convert pandas input up front.
    if hasattr(X, "to_numpy"):
        X = X.to_numpy()
    if hasattr(y, "to_numpy"):
        y = y.to_numpy()

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    precisions = []
    recalls = []
    f1_scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # Score each fold exactly once so the printed values and the stored
        # values agree; zero_division=0 avoids a warning on undefined metrics.
        precision = precision_score(y_test, predictions, zero_division=0)
        recall = recall_score(y_test, predictions, zero_division=0)
        f1 = f1_score(y_test, predictions, zero_division=0)

        print("Precision:", precision, "Recall:", recall, "F1:", f1)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

    return {
        'precision': (np.mean(precisions), np.std(precisions)),
        'recall': (np.mean(recalls), np.std(recalls)),
        'f1': (np.mean(f1_scores), np.std(f1_scores))
    }

In [175]:
# Cross-validate a 3-nearest-neighbour classifier on the loaded data.
knn = kNN(k=3)
results = cross_validate(X, y, knn)

# Each results entry is a (mean, std) pair; emit the summary in one print
# call with one line per metric.
print(
    "\nMean Results for kNN:",
    f"Precision: {results['precision'][0]:.3f} (+/- {results['precision'][1]:.3f})",
    f"Recall: {results['recall'][0]:.3f} (+/- {results['recall'][1]:.3f})",
    f"F1 Score: {results['f1'][0]:.3f} (+/- {results['f1'][1]:.3f})",
    sep="\n",
)

Precision: 0.8333333333333334 Recall: 0.9090909090909091 F1: 0.8695652173913043
Precision: 1.0 Recall: 0.8461538461538461 F1: 0.9166666666666666
Precision: 0.9375 Recall: 0.8823529411764706 F1: 0.9090909090909091
Precision: 0.7894736842105263 Recall: 0.8333333333333334 F1: 0.8108108108108109
Precision: 0.8095238095238095 Recall: 0.85 F1: 0.8292682926829268
Precision: 0.8571428571428571 Recall: 0.9230769230769231 F1: 0.8888888888888888
Precision: 0.7272727272727273 Recall: 0.6666666666666666 F1: 0.6956521739130435
Precision: 0.7142857142857143 Recall: 0.7692307692307693 F1: 0.7407407407407407
Precision: 0.75 Recall: 0.8181818181818182 F1: 0.782608695652174
Precision: 0.5833333333333334 Recall: 0.7777777777777778 F1: 0.6666666666666666

Mean Results for kNN:
Precision: 0.800 (+/- 0.112)
Recall: 0.828 (+/- 0.072)
F1 Score: 0.811 (+/- 0.084)


In [176]:
def feature_importance(X, y, feature_names=None):
    """Rank features by the importance scores of a fitted decision tree.

    Parameters
    ----------
    X : array-like
        Feature matrix used to fit the tree.
    y : array-like
        Target labels aligned with the rows of ``X``.
    feature_names : sequence, optional
        One label per column of ``X``, used as the index of the result.
        When omitted, the index is the positional column number.

    Returns
    -------
    pandas.Series
        ``feature_importances_`` of a ``DecisionTreeClassifier`` fitted with
        ``random_state=42``, sorted from most to least important.

    Raises
    ------
    ValueError
        If ``feature_names`` does not have one entry per feature (raised by
        ``pandas.Series``).
    """
    dt = DecisionTreeClassifier(random_state=42)
    dt.fit(X, y)
    # index=None falls back to the default positional index, so callers that
    # pass no names get the same result as before.
    importances = pd.Series(dt.feature_importances_, index=feature_names)
    return importances.sort_values(ascending=False)

# Fit a decision tree on the full X, y and rank the features by its importance scores.
importance_results = feature_importance(X, y)

# The index printed next to each score is the feature's column position in X.
print(importance_results)

12    0.279725
2     0.151193
11    0.108062
7     0.083773
4     0.079608
9     0.076073
0     0.072169
3     0.050094
10    0.031960
1     0.028341
6     0.016259
8     0.012420
5     0.010323
dtype: float64
