In [2]:
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, mean_squared_error
import pandas as pd
import numpy as np

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.10.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
# Load the dataset
df = pd.read_csv("cleanheart_dataset.csv")

In [5]:
# Separate the features and target variable
X = df.drop("HeartDiseaseorAttack", axis=1)
y = df["HeartDiseaseorAttack"]

In [6]:
# Perform oversampling using SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X, y)

In [7]:
X_train_res.shape[0] #new row count

412128

In [8]:
y_train_res.shape[0] #new row count

412128

In [9]:
y_train_res.value_counts() #new balance ratio in target variable

0.0    206064
1.0    206064
Name: HeartDiseaseorAttack, dtype: int64

In [10]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train_res, y_train_res, test_size=0.3, random_state=42)

In [14]:
X_train.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
345981,1.0,1.0,1.0,32.374432,0.0,0.0,2.0,1.0,1.0,0.374432,...,1.0,0.0,3.625568,0.0,1.625568,0.625568,1.0,11.0,6.0,8.0
310659,1.0,1.0,1.0,25.0,1.0,0.462995,0.0,1.0,0.462995,0.462995,...,1.0,0.0,4.0,0.0,0.0,0.0,1.0,9.462995,6.0,7.0
77985,1.0,1.0,1.0,27.0,0.0,0.0,2.0,0.0,0.0,1.0,...,1.0,1.0,4.0,0.0,7.0,1.0,0.0,7.0,5.0,7.0
10475,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,6.0,6.0,8.0
290531,0.0,1.0,1.0,23.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,1.286744,0.0,0.0,0.0,1.0,9.0,5.286744,7.0


In [15]:
X_test.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
133782,1.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,3.0,2.0,0.0,0.0,1.0,7.0,4.0,7.0
126825,1.0,1.0,1.0,33.0,1.0,0.0,2.0,1.0,1.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,0.0,9.0,4.0,7.0
283831,0.483122,1.0,1.0,30.483122,0.0,0.0,2.0,1.0,0.483122,1.0,...,1.0,0.0,3.516878,0.0,0.0,0.0,0.0,10.0,4.516878,1.0
89269,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,8.0,4.0,8.0
215748,0.0,0.0,1.0,23.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,1.0,4.0,6.0,8.0


In [18]:
# choosing features based on feature selection : Correlation
X_train_selected = X_train[["GenHlth", "Age","DiffWalk", "HighBP", "Stroke", "HighChol", "Diabetes", "PhysHlth", "Smoker", "Sex", "MentHlth","CholCheck"]]
X_test_selected = X_test[["GenHlth", "Age","DiffWalk", "HighBP", "Stroke", "HighChol", "Diabetes", "PhysHlth", "Smoker", "Sex", "MentHlth","CholCheck"]]

In [20]:
# Feature Scaling : Minmax Normalization
# Assuming X_train_selected and X_test_selected are defined
scaler = MinMaxScaler()

# fit the scaler on the training data
scaler.fit(X_train_selected)

# transform the training and test data using the fitted scaler
X_train_norm = scaler.transform(X_train_selected)
X_test_norm = scaler.transform(X_test_selected)

In [22]:
# Define the KNN classifier
knn = KNeighborsClassifier()

# Define the hyperparameter grid for grid search
param_grid = {'n_neighbors': [ 3, 5, 7, 9]}

# Perform grid search using cross-validation
grid_search = GridSearchCV(knn, param_grid, cv=4)
grid_search.fit(X_train_norm, y_train)

# Print the best hyperparameters and score
print('Best hyperparameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Best hyperparameters: {'n_neighbors': 3}
Best score: 0.86889968453052


In [24]:
# Evaluate the model on the test set
y_pred = grid_search.predict(X_test_norm)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [25]:
# Print the confusion matrix
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))

# Print the accuracy
print('Accuracy:', accuracy_score(y_test, y_pred))

# Print the precision
print('Precision:', precision_score(y_test, y_pred))

# Print the recall
print('Recall:', recall_score(y_test, y_pred))

# Print the F1 score
print('F1 score:', f1_score(y_test, y_pred))

# Print the mean squared error
print('Mean squared error:', mean_squared_error(y_test, y_pred))

Confusion matrix:
 [[56180  5826]
 [ 9076 52557]]
Accuracy: 0.8794716877360703
Precision: 0.900210677765788
Recall: 0.8527412262911103
F1 score: 0.8758332222370351
Mean squared error: 0.12052831226392967
