# Import libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from imblearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# import the dataset

In [2]:
# Read the heart-disease CSV into a DataFrame (the path is relative to the
# notebook's working directory).
df = pd.read_csv(
    '../heart_disease/heart_disease.csv',
)
# Optional sanity checks while exploring (left disabled):
#   df.isnull().sum()   # number of null values per column
#   df.columns          # column names present in the file

In [3]:
# Target column: 'HeartDiseaseorAttack'. Every other column is a candidate feature.
y = df['HeartDiseaseorAttack']
X = df.drop(columns=['HeartDiseaseorAttack'])

# Define Resampling algorithms

In [4]:
# Resampling steps shared by the model pipelines further down.
#
# over:  randomly duplicates minority-class rows until the minority/majority
#        ratio reaches 0.2. The draw is random, so it is seeded; without a
#        seed every run resamples different rows and the reported metrics
#        cannot be reproduced.
# under: repeatedly applies edited-nearest-neighbours cleaning to the majority
#        class (up to 100 passes, 7 neighbours, 'all' selection rule).
over = RandomOverSampler(sampling_strategy=0.2, random_state=0)
under = RepeatedEditedNearestNeighbours(
    sampling_strategy='majority',
    max_iter=100,
    n_neighbors=7,
    kind_sel='all',
    n_jobs=-1,
)

# Feature Selection

In [5]:
# Recursive feature elimination with cross-validation (RFECV).
# Scoring options considered: balanced_accuracy, or f1_weighted.
#
# The selector is fitted on the training rows only. Fitting it on the whole
# dataset would let the held-out rows influence which features are kept, which
# leaks test information and makes the test reports in later cells optimistic.
#
# The split below uses the same test_size and random_state as the later
# train_test_split calls on X_new. Without stratification the shuffle depends
# only on the number of rows and the seed, so the same rows end up in the
# training partition here and downstream.
X_fs_train, _X_fs_test, y_fs_train, _y_fs_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

rfecv = RFECV(
    # Seeded tree: an unseeded tree can break ties differently on each run,
    # which can change the feature ranking between executions.
    estimator=DecisionTreeClassifier(random_state=0),
    step=1,
    cv=5,
    scoring="balanced_accuracy",
    n_jobs=-1,
)
rfecv = rfecv.fit(X_fs_train, y_fs_train)

print("The optimal number of features:", rfecv.n_features_)
print("Best features:", X.columns[rfecv.support_])

# Apply the fitted column mask to every row; the following cells split X_new
# into train/test themselves.
X_new = rfecv.transform(X)

The optimal number of features: 20
Best features: Index(['HighBP', 'HighChol', 'BMI', 'Smoker', 'Stroke', 'Diabetes',
       'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump',
       'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth',
       'DiffWalk', 'Sex', 'Age', 'Education', 'Income'],
      dtype='object')


In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.3,
    random_state=0,
)
# Last expression of the cell: displays the shape of the training matrix.
X_train.shape

(177576, 20)

# Train the model and predict (original data)

In [15]:
# Baseline: a decision tree trained on the selected features with no
# resampling, to show how the classifier behaves on the imbalanced data as-is.
# The tree is seeded because scikit-learn permutes features at each split even
# with the default splitter, so an unseeded tree can differ between runs.
model = DecisionTreeClassifier(random_state=0)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Per-class precision / recall / specificity / f1 / geometric mean / IBA.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.92      0.91      0.28      0.92      0.50      0.27     68840
        1.0       0.24      0.28      0.91      0.26      0.50      0.23      7264

avg / total       0.86      0.85      0.34      0.85      0.50      0.26     76104



# Logistic regression

In [12]:
# Re-create the 70/30 split (same arguments and seed as the earlier split).
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.3, random_state=0
)

# Logistic regression with library defaults.
# Alternative configuration left by the author:
#   LogisticRegression(penalty="l2", C=0.001, class_weight={0:5.3, 1:0.5}, solver="lbfgs")
model2 = LogisticRegression()

# Order of steps: scale -> oversample minority -> clean majority -> classify.
pipeline = Pipeline(
    steps=[
        ('StandardScaler', StandardScaler()),
        ('over', over),
        ('under', under),
        ('model2', model2),
    ]
)

# fit() returns the pipeline itself, so fitting and predicting are chained.
y_hat = pipeline.fit(X_train, y_train).predict(X_test)

# Author's notes from earlier runs (metric not stated): 87 / 40;
# with penalty and C set: 91 / 41.
print(classification_report_imbalanced(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.98      0.71      0.83      0.82      0.77      0.58     68840
        1.0       0.23      0.83      0.71      0.36      0.77      0.59      7264

avg / total       0.90      0.72      0.82      0.78      0.77      0.58     76104



# Decision Tree

In [7]:
# Re-create the 70/30 split (same arguments and seed as the earlier split).
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=0)

# Decision tree with default hyper-parameters, seeded so that repeated runs
# build the same tree (an unseeded tree may break split ties differently).
# Alternative settings left by the author:
#   criterion="entropy", class_weight={0:5.3, 1:0.55}, max_depth=30
model2 = DecisionTreeClassifier(random_state=0)

# No scaler here: oversample minority -> clean majority -> classify.
pipeline = Pipeline([('over', over), ('under', under), ('model2', model2)])
pipeline.fit(X_train, y_train)

y_hat = pipeline.predict(X_test)
# Author's notes from earlier runs (metric not stated): 84 / 25;
# with scaler: 87 / 35; with class weight: 89 / 34.
print(classification_report_imbalanced(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.96      0.73      0.73      0.83      0.73      0.53     68840
        1.0       0.22      0.73      0.73      0.34      0.73      0.53      7264

avg / total       0.89      0.73      0.73      0.78      0.73      0.53     76104



# SVM

In [16]:
# Re-create the 70/30 split (same arguments and seed as the earlier split).
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.3, random_state=0
)

# Support vector classifier with library defaults.
# Alternative settings left by the author:
#   C=1, gamma=0.1, kernel='rbf', class_weight={0:5.3,1:0.55}
model2 = SVC()

# Order of steps: scale -> oversample minority -> clean majority -> classify.
svm_steps = [
    ('StandardScaler', StandardScaler()),
    ('over', over),
    ('under', under),
    ('model2', model2),
]
pipeline = Pipeline(svm_steps)

# fit() returns the pipeline itself, so fitting and predicting are chained.
y_hat = pipeline.fit(X_train, y_train).predict(X_test)

# Author's note from an earlier run (metric not stated): without class weight 82, 36.
print(classification_report_imbalanced(y_test, y_hat))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.97      0.72      0.82      0.83      0.76      0.58     68840
        1.0       0.23      0.82      0.72      0.36      0.76      0.59      7264

avg / total       0.90      0.73      0.81      0.78      0.76      0.58     76104

