In [39]:
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, mean_squared_error
import pandas as pd
import numpy as np

In [40]:
# Load the dataset
df = pd.read_csv("cleanheart_dataset.csv")

In [41]:
# Separate the features and target variable
X = df.drop("HeartDiseaseorAttack", axis=1)
y = df["HeartDiseaseorAttack"]

In [42]:
# Perform oversampling using SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X, y)

In [43]:
X_train_res.shape[0] #new row count

412128

In [44]:
y_train_res.shape[0] #new row count

412128

In [45]:
y_train_res.value_counts() #new balance ratio in target variable

0.0    206064
1.0    206064
Name: HeartDiseaseorAttack, dtype: int64

In [46]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train_res, y_train_res, test_size=0.3, random_state=42)

In [47]:
# Feature Selection technique : Mutual Info Classifier
selector = SelectKBest(mutual_info_classif, k=15)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

In [48]:
#To understand which features were selected

selected_mask = selector.get_support()

# get the names of the selected features
feature_names = np.array(X_train.columns)
selected_features = feature_names[selected_mask]

print("Selected features:", selected_features)

Selected features: ['HighBP' 'HighChol' 'BMI' 'Smoker' 'Diabetes' 'PhysActivity' 'Fruits'
 'Veggies' 'GenHlth' 'PhysHlth' 'DiffWalk' 'Sex' 'Age' 'Education'
 'Income']


In [49]:
# Feature Scaling : Minmax Normalization
# Assuming X_train_selected and X_test_selected are defined
scaler = MinMaxScaler()

# fit the scaler on the training data
scaler.fit(X_train_selected)

# transform the training and test data using the fitted scaler
X_train_norm = scaler.transform(X_train_selected)
X_test_norm = scaler.transform(X_test_selected)

In [50]:
# Define the KNN classifier
knn = KNeighborsClassifier()

# Define the hyperparameter grid for grid search
param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}

# Perform grid search using cross-validation
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train_norm, y_train)

# Print the best hyperparameters and score
print('Best hyperparameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

Best hyperparameters: {'n_neighbors': 1}
Best score: 0.9071715024931561


In [51]:
# Evaluate the model on the test set
y_pred = grid_search.predict(X_test_norm)

In [52]:
# Print the confusion matrix
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))

# Print the accuracy
print('Accuracy:', accuracy_score(y_test, y_pred))

# Print the precision
print('Precision:', precision_score(y_test, y_pred))

# Print the recall
print('Recall:', recall_score(y_test, y_pred))

# Print the F1 score
print('F1 score:', f1_score(y_test, y_pred))

# Print the mean squared error
print('Mean squared error:', mean_squared_error(y_test, y_pred))

Confusion matrix:
 [[55559  6447]
 [ 3465 58168]]
Accuracy: 0.9198311212481498
Precision: 0.9002244060976553
Recall: 0.9437801177940389
F1 score: 0.9214878651542994
Mean squared error: 0.08016887875185015
