# import libaries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from imblearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from collections import Counter

# import the dataset 

In [2]:
df=pd.read_csv('../heart_disease/heart_disease.csv')
#df.isnull().sum() #check for null values 
df.columns

Index(['HeartDiseaseorAttack', 'HighBP', 'HighChol', 'CholCheck', 'BMI',
       'Smoker', 'Stroke', 'Diabetes', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [3]:
X = df.drop (labels = ['HeartDiseaseorAttack'], axis =1)
y = df['HeartDiseaseorAttack']

# Feature Selection

In [4]:
from sklearn.feature_selection import RFECV

#balanced_accuracy, or f1_weighted
rfecv = RFECV(estimator= DecisionTreeClassifier(), step = 1, cv = 5, scoring="balanced_accuracy", n_jobs=-1)
rfecv = rfecv.fit(X, y)

print("The optimal number of features:", rfecv.n_features_)
print("Best features:", X.columns[rfecv.support_])


X_new = rfecv.transform(X)

The optimal number of features: 20
Best features: Index(['HighBP', 'HighChol', 'BMI', 'Smoker', 'Stroke', 'Diabetes',
       'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump',
       'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth',
       'DiffWalk', 'Sex', 'Age', 'Education', 'Income'],
      dtype='object')


# Resampling the data

In [5]:
X_train, X_test, y_train , y_test = train_test_split(X_new,y,test_size=0.3, random_state=1234) 

over=  RandomOverSampler(sampling_strategy=0.11)
under= RepeatedEditedNearestNeighbours(sampling_strategy='majority', max_iter=100,n_neighbors=7, kind_sel='all', n_jobs=-1)

X_balanced, y_balanced = over.fit_resample(X_train, y_train)
X_balanced, y_balanced = under.fit_resample(X_balanced, y_balanced)

print(f'Y balanced {Counter(y_balanced)}')

Y balanced Counter({0.0: 96091, 1.0: 17692})


# Scaling the data

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_balanced)

scaleddata=scaler.transform(X_balanced)

In [7]:
# calculate heuristic class weighting
from sklearn.utils.class_weight import compute_class_weight

weighting = compute_class_weight(class_weight='balanced', classes=[0,1], y=y)
print(weighting)

[0.55198945 5.30866781]


# Logistic regression

In [15]:
from sklearn.linear_model import LogisticRegression

#hyperparameter tunning
#grid search 

from sklearn.model_selection import GridSearchCV

params = {'penalty':["none", "l2"],
          'C':[0.001, 0.01, 0.1, 1, 10],
          'class_weight':[{0:0.55,1:5.3},{0:5.3,1:0.55}, {0:5,1:1}, {0:1,1:1}, {0:1,1:5}]}

model=LogisticRegression()

#f1_weighted, recall, balanced_accuracy
param_search=GridSearchCV(estimator=model, param_grid=params, cv=5, scoring="recall", n_jobs=-1) 

best_model = param_search.fit(scaleddata,y_balanced)

print('Best Parameters:',best_model.best_params_,end='\n\n')
print('Best Score:',best_model.best_score_)

Best Parameters: {'C': 0.001, 'class_weight': {0: 0.55, 1: 5.3}, 'penalty': 'l2'}

Best Score: 0.9507572447225197


# Decision Tree 

In [24]:
from sklearn.tree import DecisionTreeClassifier
#hyperparameter tunning
#grid search 

from sklearn.model_selection import GridSearchCV

params = {'max_depth':[15,20,25,30,35],
          'criterion':['gini','entropy', 'log_loss'],
          'class_weight':[{0:0.55,1:5.3},{0:5.3,1:0.55}, {0:5,1:1}, {0:1,1:1}, {0:1,1:5}]}

model=DecisionTreeClassifier()

#f1_weighted, recall, balanced_accuracy
param_search=GridSearchCV(estimator=model, param_grid=params, cv=5, scoring="balanced_accuracy", n_jobs=-1)

#not scaled data
best_model = param_search.fit(X_balanced, y_balanced)

print('Best Parameters:',best_model.best_params_,end='\n\n')
print('Best Score:',best_model.best_score_)

Best Parameters: {'class_weight': {0: 0.55, 1: 5.3}, 'criterion': 'entropy', 'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 4}

Best Score: 0.9409134561065271


# SVM  

In [10]:
from sklearn.svm import SVC

#hyperparameter tunning
#grid search 

from sklearn.model_selection import GridSearchCV

params = {'C': [0.01, 0.1, 1, 10], 
          'gamma': [0.01, 0.1, 1, 10]
          #'kernel': ['rbf'],
          #'class_weight':[{0:0.55,1:5.3}, {0:1,1:1}]
         } 

model=SVC()

#f1_weighted, recall, balanced_accuracy
param_search=GridSearchCV(estimator=model, param_grid=params, cv=5, scoring="balanced_accuracy", n_jobs=-1)

best_model = param_search.fit(scaleddata, y_balanced)

print('Best Parameters:',best_model.best_params_,end='\n\n')
print('Best Score:',best_model.best_score_)

Best Parameters: {'C': 1, 'gamma': 0.1}

Best Score: 0.9762674226666912
