In [332]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve
import time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.model_selection import KFold
from scikeras.wrappers import KerasRegressor
import tensorflow
from keras.models import Sequential
from keras.layers import Dense
import random


In [316]:
pd.set_option('display.max_colwidth', None)  # For Pandas version < 1.0.0, use -1 instead of None

In [317]:
df =pd.read_csv('classification_cleaned_data.csv')

In [318]:
df = df[df.columns[1:]]

In [319]:
df.head(3)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,chol_range,thalach_range,oldpeak_range,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,200,127,1,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,200,0,1,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,200,127,2,1


In [320]:
seed=1

In [321]:
def createModel():
    model = Sequential()
    model.add(Dense(5, activation='relu', input_shape=(16,)))
    model.add(Dense(1, activation='sigmoid')) 

    return model

In [322]:
#Models
rf = {"name":"Random Forest",
      "object": RandomForestClassifier(criterion='gini',
                               min_samples_leaf=2,
                               bootstrap=True,
                               oob_score=False,
                               random_state=seed,
                               verbose=0),
      "paramsGrid": {'max_depth': range(1, 8),'min_samples_split': range(5, 10, 2),'n_estimators': range(2,5,1)},
      "requires_feature_selection": False,
      "requires_scalling":False,
      "scoring":"roc_auc"
}

svc = {"name":"SVC",
      "object": SVC(kernel='linear'),
      "paramsGrid": {'C': [.2,.5,1],'gamma': [00000.8,00000.9,0.0001]},
      "requires_feature_selection": False,
      "requires_scalling":True,
      "scoring":"roc_auc"
}

boost = {"name":"XGboost",
      "object": xgb.XGBClassifier(objective='binary:logistic',
                            seed=seed,
                            subsample=0.9,
                            colsample_bytree=0.5
                            ),
      "paramsGrid": {'max_depth':[2,3,4],'gamma':[0,2,0.3,0.5],'reg_lambda':[10,20,100],'scale_pos_weight':[3,4,5]},
      "requires_feature_selection": False,
      "requires_scalling":False,
      "scoring": "roc_auc"
}


nngrid = dict(optimizer__learning_rate=[0.1, 0.05, 0.01], optimizer__momentum=[0.8, 0.6, 0.4])
nn = { "name": "Neural Net",
    "object": KerasRegressor(model=createModel, loss='binary_crossentropy',
                        optimizer=tensorflow.keras.optimizers.legacy.SGD,
                          epochs=200, batch_size=10, verbose=0),
    "paramsGrid": nngrid,
    "requires_feature_selection": False,
    "requires_scalling": True,
    "scoring":"roc_auc"
}

models = [rf,svc,boost,nn]

In [323]:
def modeling(df,modelDict,seed=1,threshold_for_selection=.3,test_size=.2,cv_splits=5):
    
    #Feature selection
    if modelDict["requires_feature_selection"] == True:
        corr = df.corr()["num"].sort_values(ascending=False)
        #Creates a new dataframe with the selected columns
        df = df[corr[corr > threshold_for_selection].index]
    

    X = df.drop('num', axis=1)
    Y = df['num']

   

    #Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=test_size,stratify=Y,random_state=seed)

    #Scalling
    if modelDict["requires_scalling"] == True:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    #Kfold object
    kfold = KFold(n_splits=cv_splits, random_state=seed, shuffle=True)

    #Grid Search
    gridSearch = GridSearchCV(estimator=modelDict["object"],param_grid=modelDict["paramsGrid"],cv=kfold,scoring= modelDict["scoring"]) #If grid search is empty it doesn't do CV, just kfolds
    gridSearch.fit(X_train, y_train)

    #Getting best model
    bestModel = gridSearch.best_estimator_

    #Predictions 
    y_pred = bestModel.predict(X_test)

    if modelDict["name"] == "Neural Net":
        y_pred = (bestModel.predict(X_test) > .5).astype(int)

    return [modelDict['name'],accuracy_score(y_test, y_pred),precision_score(y_test, y_pred),recall_score(y_test, y_pred),roc_auc_score(y_test, y_pred),gridSearch.best_params_,bestModel]

In [324]:
#Main

results = pd.DataFrame(columns=['model','accuracy', 'precision', 'recall','roc_auc','hiperparameters','modelObject'])
times = []
for model in models:
    print(f'Making {model["name"]} model...')
    start = time.time()
    row = modeling(df,modelDict=model)

    end = time.time()
    delta = round(end-start,3)
    times.append(delta)

    results.loc[len(results)] = row

results["duration"] = times

results[[col for col in results.columns if col != "modelObject"]].sort_values(by='roc_auc',ascending=False)

Making Random Forest model...
Making SVC model...
Making XGboost model...
Making Neural Net model...


Unnamed: 0,model,accuracy,precision,recall,roc_auc,hiperparameters,duration
0,Random Forest,0.819672,0.793103,0.821429,0.819805,"{'max_depth': 5, 'min_samples_split': 9, 'n_estimators': 4}",0.873
1,SVC,0.803279,0.785714,0.785714,0.801948,"{'C': 0.5, 'gamma': 0.8}",0.067
3,Neural Net,0.786885,0.758621,0.785714,0.786797,"{'optimizer__learning_rate': 0.01, 'optimizer__momentum': 0.4}",55.81
2,XGboost,0.754098,0.685714,0.857143,0.761905,"{'gamma': 0.5, 'max_depth': 3, 'reg_lambda': 20, 'scale_pos_weight': 4}",14.34


### Rf was the best model so we'll predict with it

In [325]:
#best model
model = results.loc[results.model=="Random Forest","modelObject"].values[0]

In [347]:
X = df.drop('num', axis=1)

In [352]:
n_rows=14
newX = pd.DataFrame(np.random.randint(X.min(),
                                     X.max()+1,
                                     size=(n_rows,len(X.columns))),columns=X.columns)

newX


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,chol_range,thalach_range,oldpeak_range
0,75,0,4,109,176,0,0,121,0,2,1,1,6,381,87,0
1,47,1,2,145,389,0,2,167,0,2,2,3,4,140,64,0
2,66,0,4,192,553,0,0,177,1,3,3,3,7,226,27,3
3,45,1,3,160,419,1,1,76,1,6,3,3,5,200,3,1
4,69,1,4,170,432,1,0,104,1,1,3,3,7,406,25,4
5,49,1,2,102,526,0,2,101,0,6,3,0,5,408,78,1
6,48,1,1,144,363,0,0,171,0,5,1,1,5,441,27,4
7,55,1,4,108,352,0,2,153,1,0,3,2,3,339,47,0
8,30,1,3,163,200,1,1,188,0,6,3,3,4,280,76,0
9,48,1,4,174,207,0,0,177,1,5,2,2,5,442,38,2


In [353]:
y_pred = model.predict(newX)

In [354]:
y_pred

array([1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1])

In [361]:
pd.concat([newX,pd.Series(y_pred,name="has_disease")],axis=1)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,chol_range,thalach_range,oldpeak_range,has_disease
0,75,0,4,109,176,0,0,121,0,2,1,1,6,381,87,0,1
1,47,1,2,145,389,0,2,167,0,2,2,3,4,140,64,0,0
2,66,0,4,192,553,0,0,177,1,3,3,3,7,226,27,3,1
3,45,1,3,160,419,1,1,76,1,6,3,3,5,200,3,1,1
4,69,1,4,170,432,1,0,104,1,1,3,3,7,406,25,4,1
5,49,1,2,102,526,0,2,101,0,6,3,0,5,408,78,1,1
6,48,1,1,144,363,0,0,171,0,5,1,1,5,441,27,4,1
7,55,1,4,108,352,0,2,153,1,0,3,2,3,339,47,0,1
8,30,1,3,163,200,1,1,188,0,6,3,3,4,280,76,0,0
9,48,1,4,174,207,0,0,177,1,5,2,2,5,442,38,2,1
