In [73]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import pandas as pd
import numpy as np

In [19]:
# Random Forest Classifier
# LDA
# Logistic Regression
# AdaBoosting
# GradientBoosting
# CATBoosting

In [20]:
def data_preprocessing(path):
    df = pd.read_csv(path)
    X = df.iloc[:,:-1]
    y = df.iloc[:,-1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    return X_train, X_test, y_train, y_test

In [21]:
X_train, X_test, y_train, y_test = data_preprocessing("heart.csv")

In [41]:
X_train

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
74,43,0,2,122,213,0,1,165,0,0.2,1,0,2
153,66,0,2,146,278,0,0,152,0,0.0,1,1,2
64,58,1,2,140,211,1,0,165,0,0.0,2,0,2
296,63,0,0,124,197,0,1,136,1,0.0,1,0,2
287,57,1,1,154,232,0,0,164,0,0.0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,43,1,0,132,247,1,0,143,1,0.1,1,4,3
192,54,1,0,120,188,0,1,113,0,1.4,1,1,3
117,56,1,3,120,193,0,0,162,0,1.9,1,0,3
47,47,1,2,138,257,0,0,156,0,0.0,2,0,2


In [22]:
#cat_cols = ['sex','exng','caa','cp','fbs','restecg','slp','thall']
num_cols = ["age","trtbps","chol","thalachh","oldpeak"]

In [42]:
cols = ["age","trtbps","chol","thalachh","oldpeak", 'sex','exng','caa','cp','fbs','restecg','slp','thall']

In [23]:
numerical_transformer = StandardScaler()

In [46]:
ct = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, num_cols)
    ], remainder='passthrough')

In [47]:
X_train_scaled = ct.fit_transform(X_train)
X_test_scaled = ct.transform(X_test)

In [48]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns=cols)

In [63]:
X_test_scaled = pd.DataFrame(X_test_scaled, columns=cols)

In [64]:
X_test_scaled

Unnamed: 0,age,trtbps,chol,thalachh,oldpeak,sex,exng,caa,cp,fbs,restecg,slp,thall
0,1.697165,0.772516,-1.363541,-1.073753,1.290005,1.0,0.0,0.0,1.0,1.0,0.0,0.0,3.0
1,1.024965,2.236257,-0.370312,0.223758,-0.388840,1.0,3.0,0.0,0.0,0.0,1.0,0.0,3.0
2,0.464799,2.236257,0.772838,0.396759,-0.724609,1.0,3.0,0.0,0.0,0.0,1.0,0.0,3.0
3,0.576832,-0.398476,0.210633,-0.381747,1.457889,1.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0
4,0.800899,-0.105728,-0.295351,-0.165495,0.618467,1.0,2.0,0.0,1.0,0.0,1.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,-1.215700,-0.808324,-0.089209,-0.035744,-0.640667,0.0,2.0,0.0,1.0,0.0,1.0,1.0,2.0
57,0.240733,-0.105728,-0.201650,1.045515,-0.892493,0.0,1.0,0.0,0.0,0.0,1.0,1.0,2.0
58,0.464799,1.065265,-0.651414,0.310259,0.450583,1.0,2.0,1.0,1.0,0.0,2.0,0.0,2.0
59,-0.543500,-0.691224,-0.051729,0.526511,0.030871,0.0,1.0,0.0,1.0,0.0,2.0,0.0,2.0


In [50]:
clf1 = SVC()
clf2 = RandomForestClassifier()
clf3 = LogisticRegression()
clf4 = GradientBoostingClassifier()

In [51]:
pipe = Pipeline([('preprocessor', ct), ('classifier', clf1)])

In [11]:
# params1 = {}
# params1['classifier__kernel'] = ['rbf', 'poly', 'linear', 'sigmoid']
# params1['classifier__C'] = [0.001, 0.01, 0.1, 1, 10, 100]
# params1['classifier'] = [clf1]

In [12]:
# params2 = {}
# params2['classifier__criterion'] = ['gini', 'entropy']
# params2['classifier__n_estimators'] = [1, 40, 80, 100, 120, 160, 200, 250, 300]
# params2['classifier__max_depth'] = [1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
# params2['classifier'] = [clf2]

In [52]:
# params1 = {'classifier__kernel': ['rbf', 'poly', 'linear', 'sigmoid'],
#           'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
#            'classifier': [clf1]
#          }

params2 = {'classifier__criterion':['gini', 'entropy'],
              'classifier__n_estimators':[x for x in range(1,100,10)],
           'classifier__max_depth': [x for x in range(1,10,1)],
            'classifier': [clf2]
             }

params3 =    {'classifier__penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'classifier__max_iter' : [100, 1000,2500, 5000],
                 'classifier': [cl3]
    }

params4 = {
    "classifier__n_estimators":[5,50,250,500],
    "classifier__max_depth":[1,3,5,7,9],
    "classifier__learning_rate":[0.01,0.1,1,10,100]
    'classifier': [cl4]
}



In [53]:
params_grid = [params2]

In [54]:
grid = GridSearchCV(pipe, params_grid, scoring='accuracy', cv=5, verbose=1)

In [55]:
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['age',
                                                                          'trtbps',
                                                                          'chol',
                                                                          'thalachh',
                                                                          'oldpeak'])])),
                                       ('classifier', SVC())]),
             param_grid=[{'classifier': [RandomForestClassifier(criterion='entropy',
                                                                max_depth=3,
                      

In [None]:
grid.cv_results_

In [72]:
grid.param_grid

[{'classifier__criterion': ['gini', 'entropy'],
  'classifier__n_estimators': [1, 11, 21, 31, 41, 51, 61, 71, 81, 91],
  'classifier__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],
  'classifier': [RandomForestClassifier(criterion='entropy', max_depth=3, n_estimators=71)]}]

In [None]:
# def data_preprocessing(path):
#     df = pd.read_csv(path)
#     X = df.iloc[:,:-1]
#     y = df.iloc[:,-1]
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
#     return X_train, X_test, y_train, y_test

In [58]:
grid.best_score_

0.851530612244898

In [60]:
model = RandomForestClassifier(n_estimators=71, criterion='entropy', max_depth=3)

In [61]:
model.fit(X_train_scaled, y_train)

RandomForestClassifier(criterion='entropy', max_depth=3, n_estimators=71)

In [66]:
pred = model.predict(X_test_scaled)

In [67]:
(y_test == pred).mean()

0.8852459016393442