In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.svm import SVC
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def data_preprocessing(path):
    df = pd.read_csv(path)
    X = df.iloc[:,:-1]
    y = df.iloc[:,-1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    return X_train, X_test, y_train, y_test

In [28]:
df_heart = pd.read_csv("heart.csv")

In [29]:
df_heart.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
X_train, X_test, y_train, y_test = data_preprocessing("heart.csv")

In [4]:
X_train

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall
74,43,0,2,122,213,0,1,165,0,0.2,1,0,2
153,66,0,2,146,278,0,0,152,0,0.0,1,1,2
64,58,1,2,140,211,1,0,165,0,0.0,2,0,2
296,63,0,0,124,197,0,1,136,1,0.0,1,0,2
287,57,1,1,154,232,0,0,164,0,0.0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,43,1,0,132,247,1,0,143,1,0.1,1,4,3
192,54,1,0,120,188,0,1,113,0,1.4,1,1,3
117,56,1,3,120,193,0,0,162,0,1.9,1,0,3
47,47,1,2,138,257,0,0,156,0,0.0,2,0,2


In [5]:
def column_transformer(x, y):
    cat_cols = ['sex','exng','caa','cp','fbs','restecg','slp','thall']
    num_cols = ["age","trtbps","chol","thalachh","oldpeak"]
    cols = ["age","trtbps","chol","thalachh","oldpeak", 'sex','exng','caa','cp','fbs','restecg','slp','thall']
    numerical_transformer = StandardScaler()
    ct = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, num_cols)
    ], remainder='passthrough')
    X_train_scaled = ct.fit_transform(x)
    X_test_scaled = ct.transform(y)
    X_train_scaled = pd.DataFrame(X_train_scaled, columns=cols)
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=cols)
    return X_train_scaled, X_test_scaled, ct

In [6]:
X_train_scaled, X_test_scaled, ct = column_transformer(X_train, X_test)

In [13]:
clf1 = SVC()
clf2 = RandomForestClassifier()
clf3 = LogisticRegression()
clf4 = GradientBoostingClassifier()
clf5 = AdaBoostClassifier()

In [14]:
pipe = Pipeline([('preprocessor', ct), ('classifier', clf1)])

In [15]:
params1 = {'classifier__kernel': ['rbf','linear', 'sigmoid'],
          'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
           'classifier__gamma': ['auto', 'scale'],
           'classifier': [clf1]
         }

params2 = {'classifier__criterion':['gini', 'entropy'],
              'classifier__n_estimators':[x for x in range(1,100,10)],
           'classifier__max_depth': [x for x in range(1,10,1)],
            'classifier': [clf2]
             }

params3 =    {'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__solver' : ['lbfgs', 'liblinear','sag'],
    'classifier__max_iter' : [100, 500, 1000],
                 'classifier': [clf3]
    }

params4 = {
    "classifier__n_estimators":[5,50,250,500],
    "classifier__max_depth":[1,3,5,7,9],
    "classifier__learning_rate":[0.01,0.1,1,10,100],
    'classifier': [clf4]
}

params5 = {'classifier__n_estimators': [10, 50, 100, 500],
        'classifier__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
           'classifier': [clf5]
          }

In [16]:
params_grid = [params1, params2, params3, params4, params5]

In [17]:
best_parameter = []
for best_para in params_grid:
    grid = GridSearchCV(pipe, best_para, scoring='accuracy', cv=7, verbose=1)
    grid.fit(X_train_scaled, y_train)
    best_parameter.append(grid.best_params_)

Fitting 7 folds for each of 36 candidates, totalling 252 fits
Fitting 7 folds for each of 180 candidates, totalling 1260 fits
Fitting 7 folds for each of 108 candidates, totalling 756 fits


252 fits failed out of a total of 756.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
126 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Ellefson\miniconda3\envs\strive-ai\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Ellefson\miniconda3\envs\strive-ai\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Ellefson\miniconda3\envs\strive-ai\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Ellefson\miniconda

Fitting 7 folds for each of 100 candidates, totalling 700 fits
Fitting 7 folds for each of 20 candidates, totalling 140 fits


In [18]:
best_parameter

[{'classifier': SVC(C=1, gamma='auto', kernel='linear'),
  'classifier__C': 1,
  'classifier__gamma': 'auto',
  'classifier__kernel': 'linear'},
 {'classifier': RandomForestClassifier(criterion='entropy', max_depth=9, n_estimators=71),
  'classifier__criterion': 'entropy',
  'classifier__max_depth': 9,
  'classifier__n_estimators': 71},
 {'classifier': LogisticRegression(C=1),
  'classifier__C': 1,
  'classifier__max_iter': 100,
  'classifier__penalty': 'l2',
  'classifier__solver': 'lbfgs'},
 {'classifier': GradientBoostingClassifier(max_depth=1, n_estimators=250),
  'classifier__learning_rate': 0.1,
  'classifier__max_depth': 1,
  'classifier__n_estimators': 250},
 {'classifier': AdaBoostClassifier(learning_rate=0.01, n_estimators=500),
  'classifier__learning_rate': 0.01,
  'classifier__n_estimators': 500}]

In [9]:
clf_trees = {
    'SVC': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(),
    'GradientBoosting': GradientBoostingClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier()
}

In [22]:
clf_tuned_trees = {
    'SVC': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(),
    'GradientBoosting': GradientBoostingClassifier(max_depth=1, n_estimators=250),
    'AdaBoostClassifier': AdaBoostClassifier()
}

In [23]:
def get_scores(x):
    results = []
    for name, model in x.items():
        model.fit(X_train_scaled, y_train)
        train_score = model.score(X_train_scaled, y_train)
        prediction = model.predict(X_test_scaled)
        test_score = model.score(X_test_scaled, y_test)
        accuracy = (y_test == prediction).mean()*100
        results.append({
        'ModelName': name,
        'Accuracy': accuracy,
        'Train Score': train_score,
        'Test Score': test_score
        })
    return pd.DataFrame(results)         

In [11]:
get_scores(clf_trees)

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


Unnamed: 0,ModelName,Accuracy,Train Score,Test Score
0,SVC,86.885246,0.876033,0.868852
1,Random Forest,85.245902,1.0,0.852459
2,Logistic Regression,85.245902,0.838843,0.852459
3,GradientBoosting,80.327869,1.0,0.803279
4,AdaBoostClassifier,90.163934,0.921488,0.901639


In [25]:
tuned_scores_no_aug = get_scores(clf_tuned_trees)

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
