In [2]:
from google.colab import files
uploaded = files.upload()

Saving heart-diseases.csv to heart-diseases.csv


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, mean_squared_error

In [4]:
# Load the dataset
df = pd.read_csv('heart-diseases.csv')

In [5]:
# Check for missing values
print(df.isnull().sum()) # should print all zeros

HeartDiseaseorAttack    0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
Diabetes                0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64


In [6]:
# Split the dataset into features and target
X = df.drop('HeartDiseaseorAttack', axis=1)
y = df['HeartDiseaseorAttack']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Define the KNN classifier
knn = KNeighborsClassifier()

# Define the hyperparameter grid for grid search
param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}

# Perform grid search using cross-validation
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and score
print('Best hyperparameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

Best hyperparameters: {'n_neighbors': 9}
Best score: 0.9019680305801984


In [8]:
# Evaluate the model on the test set
y_pred = grid_search.predict(X_test)

In [9]:
# Print the confusion matrix
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))

# Print the accuracy
print('Accuracy:', accuracy_score(y_test, y_pred))

# Print the precision
print('Precision:', precision_score(y_test, y_pred))

# Print the recall
print('Recall:', recall_score(y_test, y_pred))

# Print the F1 score
print('F1 score:', f1_score(y_test, y_pred))

# Print the mean squared error
print('Mean squared error:', mean_squared_error(y_test, y_pred))

Confusion matrix:
 [[45482   486]
 [ 4453   315]]
Accuracy: 0.9026529485966572
Precision: 0.39325842696629215
Recall: 0.06606543624161074
F1 score: 0.1131262345124798
Mean squared error: 0.0973470514033428
