In [178]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve
import time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.model_selection import KFold
from scikeras.wrappers import KerasRegressor
import tensorflow
from keras.models import Sequential
from keras.layers import Dense


In [179]:
pd.set_option('display.max_colwidth', None)  # For Pandas version < 1.0.0, use -1 instead of None

In [180]:
df =pd.read_csv('classification_cleaned_data.csv')

In [181]:
df = df[df.columns[1:]]

In [182]:
df.head(3)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,chol_range,thalach_range,oldpeak_range,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,200,127,1,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,200,0,1,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,200,127,2,1


In [183]:
seed=1

In [190]:
def createModel():
    model = Sequential()
    model.add(Dense(5, activation='relu', input_shape=(16,)))
    model.add(Dense(1, activation='sigmoid')) 

    return model

In [191]:
#Models
rf = {"name":"Random Forest",
      "object": RandomForestClassifier(criterion='gini',
                               min_samples_leaf=2,
                               bootstrap=True,
                               oob_score=False,
                               random_state=seed,
                               verbose=0),
      "paramsGrid": {'max_depth': range(1, 8),'min_samples_split': range(5, 10, 2),'n_estimators': range(2,5,1)},
      "requires_feature_selection": False,
      "requires_scalling":False,
      "scoring":"roc_auc"
}

svc = {"name":"SVC",
      "object": SVC(kernel='linear'),
      "paramsGrid": {'C': [.2,.5,1],'gamma': [00000.8,00000.9,0.0001]},
      "requires_feature_selection": False,
      "requires_scalling":True,
      "scoring":"roc_auc"
}

boost = {"name":"XGboost",
      "object": xgb.XGBClassifier(objective='binary:logistic',
                            seed=seed,
                            subsample=0.9,
                            colsample_bytree=0.5
                            ),
      "paramsGrid": {'max_depth':[2,3,4],'gamma':[0,2,0.3,0.5],'reg_lambda':[10,20,100],'scale_pos_weight':[3,4,5]},
      "requires_feature_selection": False,
      "requires_scalling":False,
      "scoring": "roc_auc"
}


nngrid = dict(optimizer__learning_rate=[0.1, 0.05, 0.01], optimizer__momentum=[0.8, 0.6, 0.4])
nn = { "name": "Neural Net",
    "object": KerasRegressor(model=createModel, loss='binary_crossentropy',
                        optimizer=tensorflow.keras.optimizers.legacy.SGD,
                          epochs=200, batch_size=10, verbose=0),
    "paramsGrid": nngrid,
    "requires_feature_selection": False,
    "requires_scalling": True,
    "scoring":"roc_auc"
}

models = [nn]

In [192]:
def modeling(df,modelDict,seed=1,threshold_for_selection=.3,test_size=.2,cv_splits=5):
    
    #Feature selection
    if modelDict["requires_feature_selection"] == True:
        corr = df.corr()["num"].sort_values(ascending=False)
        #Creates a new dataframe with the selected columns
        df = df[corr[corr > threshold_for_selection].index]
    

    X = df.drop('num', axis=1)
    Y = df['num']

   

    #Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=test_size,stratify=Y,random_state=seed)

    #Scalling
    if modelDict["requires_scalling"] == True:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    #Kfold object
    kfold = KFold(n_splits=cv_splits, random_state=seed, shuffle=True)

    #Grid Search
    gridSearch = GridSearchCV(estimator=modelDict["object"],param_grid=modelDict["paramsGrid"],cv=kfold,scoring= modelDict["scoring"]) #If grid search is empty it doesn't do CV, just kfolds
    gridSearch.fit(X_train, y_train)

    #Getting best model
    bestModel = gridSearch.best_estimator_

    #Predictions 
    y_pred = bestModel.predict(X_test)

    return [modelDict['name'],accuracy_score(y_test, y_pred),precision_score(y_test, y_pred),recall_score(y_test, y_pred),roc_auc_score(y_test, y_pred),gridSearch.best_params_,bestModel]

In [193]:
#Main

results = pd.DataFrame(columns=['model','accuracy', 'precision', 'recall','roc_auc','hiperparameters','modelObject'])
times = []
for model in models:
    print(f'Making {model["name"]} model...')
    start = time.time()
    row = modeling(df,modelDict=model)

    end = time.time()
    delta = round(end-start,3)
    times.append(delta)

    results.loc[len(results)] = row

results["duration"] = times

results[[col for col in results.columns if col != "modelObject"]].sort_values(by='roc_auc',ascending=False)

Making Neural Net model...


ValueError: Classification metrics can't handle a mix of binary and continuous targets