In [1]:
from google.colab import files
uploaded = files.upload()

Saving cleanheart_dataset.csv to cleanheart_dataset.csv


In [2]:
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, mean_squared_error
import pandas as pd
import numpy as np

In [3]:
# Load the dataset
df = pd.read_csv("cleanheart_dataset.csv")

In [4]:
# Separate the features and target variable
X = df.drop("HeartDiseaseorAttack", axis=1)
y = df["HeartDiseaseorAttack"]

In [5]:
# Perform oversampling using SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X, y)

In [6]:
X_train_res.shape[0] #new row count

412128

In [7]:
y_train_res.shape[0] #new row count

412128

In [9]:
y_train_res.value_counts() #new balance ratio in target variable

0.0    206064
1.0    206064
Name: HeartDiseaseorAttack, dtype: int64

In [10]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train_res, y_train_res, test_size=0.3, random_state=42)

In [11]:
# Feature Scaling : Minmax Normalization
# Assuming X_train_selected and X_test_selected are defined
scaler = MinMaxScaler()

# fit the scaler on the training data
scaler.fit(X_train)

# transform the training and test data using the fitted scaler
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

In [12]:
# Define the KNN classifier
knn = KNeighborsClassifier()

# Define the hyperparameter grid for grid search
param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}

# Perform grid search using cross-validation
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train_norm, y_train)

# Print the best hyperparameters and score
print('Best hyperparameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

Best hyperparameters: {'n_neighbors': 1}
Best score: 0.9136570180932804


In [13]:
# Evaluate the model on the test set
y_pred = grid_search.predict(X_test_norm)

In [14]:
# Print the confusion matrix
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))

# Print the accuracy
print('Accuracy:', accuracy_score(y_test, y_pred))

# Print the precision
print('Precision:', precision_score(y_test, y_pred))

# Print the recall
print('Recall:', recall_score(y_test, y_pred))

# Print the F1 score
print('F1 score:', f1_score(y_test, y_pred))

# Print the mean squared error
print('Mean squared error:', mean_squared_error(y_test, y_pred))

Confusion matrix:
 [[55348  6658]
 [ 2517 59116]]
Accuracy: 0.925792023552439
Precision: 0.8987745917839876
Recall: 0.9591614881638083
F1 score: 0.9279866883295265
Mean squared error: 0.07420797644756104
