In [1]:
# Import libraries
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import register_matplotlib_converters  #Register pandas formatters and converters with matplotlib.
from pylab import rcParams      # for customizing matplotlib graphs.
sns.set_style("whitegrid")    

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline  
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV


In [3]:
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier
from sklearn.neighbors     import KNeighborsClassifier
from sklearn.linear_model  import LogisticRegression
from sklearn.svm           import SVC

In [4]:
df = pd.read_csv('heart.csv')
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace = True)

In [5]:
## chainging numerical col 'chol' into categorical column:
df['chol']= pd.cut(df['chol'], bins=[0,200,239,564], labels = [0,1,2]) #0-High, 1-Borderline high, 2-High
df['trtbps']= pd.cut(df['trtbps'], bins=[90,120,139,200], labels = [0,1,2])
df['age'] = pd.cut(df['age'], bins=[25,53,80], labels = [0, 1])
df = df.astype({'trtbps':'float64','chol':'float64', 'age':'float64'})

In [6]:
df['chol_bps'] = df['chol']+df['trtbps']
df.drop(['chol','trtbps'], axis=1, inplace=True)
df

Unnamed: 0,age,sex,cp,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,chol_bps
0,1.0,1,3,1,0,150,0,2.3,0,0,1,1,3.0
1,0.0,1,2,0,1,187,0,3.5,0,0,2,1,3.0
2,0.0,0,1,0,0,172,0,1.4,2,0,2,1,2.0
3,1.0,1,1,0,1,178,0,0.8,2,0,2,1,1.0
4,1.0,0,0,0,1,163,1,0.6,2,0,2,1,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,1.0,0,0,0,1,123,1,0.2,1,0,3,0,4.0
298,0.0,1,3,0,1,132,0,1.2,1,0,3,0,2.0
299,1.0,1,0,1,1,141,0,3.4,1,2,3,0,2.0
300,1.0,1,0,0,1,115,1,1.2,1,1,3,0,1.0


In [7]:
# data agumentation
new_df = df.copy()
def get_data(data):
    gen_data = data
    for restecg_values in data['restecg'].unique():
        new_data = gen_data[gen_data['restecg']== restecg_values]
        thalachh_std = new_data['thalachh'].std()
        oldpeak_std = new_data['oldpeak'].std()

        for i in gen_data[gen_data['restecg']== restecg_values].index:
            if np.random.randint(2)==1:
                gen_data['thalachh'].values[i] += thalachh_std/10   
            else:
                gen_data['thalachh'].values[i] -= thalachh_std/10
            if np.random.randint(2)==1:
                gen_data['oldpeak'].values[i] += oldpeak_std/10
            else:
                gen_data['oldpeak'].values[i] -= oldpeak_std/10
    return gen_data
print(df.head())  
std_data = get_data(new_df)
std_data.head()

   age  sex  cp  fbs  restecg  thalachh  exng  oldpeak  slp  caa  thall  \
0  1.0    1   3    1        0       150     0      2.3    0    0      1   
1  0.0    1   2    0        1       187     0      3.5    0    0      2   
2  0.0    0   1    0        0       172     0      1.4    2    0      2   
3  1.0    1   1    0        1       178     0      0.8    2    0      2   
4  1.0    0   0    0        1       163     1      0.6    2    0      2   

   output  chol_bps  
0       1       3.0  
1       1       3.0  
2       1       2.0  
3       1       1.0  
4       1       2.0  


Unnamed: 0,age,sex,cp,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,chol_bps
0,1.0,1,3,1,0,152,0,2.178532,0,0,1,1,3.0
1,0.0,1,2,0,1,184,0,3.394439,0,0,2,1,3.0
2,0.0,0,1,0,0,169,0,1.521468,2,0,2,1,2.0
3,1.0,1,1,0,1,175,0,0.905561,2,0,2,1,1.0
4,1.0,0,0,0,1,165,1,0.705561,2,0,2,1,2.0


In [8]:
x= df.drop(['output'], axis=1)
y = df.output
cat_fe = [ 'sex','cp','fbs','restecg','exng','slp','caa','thall','age']
num_fe = ['thalachh','oldpeak','chol_bps']

x_train, x_test, y_train,y_test = train_test_split(x,y, test_size=0.2, random_state=0, stratify=y)

extra_sample = std_data.sample(std_data.shape[0]//3)
x_train = pd.concat([x_train, extra_sample.drop(['output'], axis=1)])
y_train = pd.concat([y_train, extra_sample['output']])


In [9]:
s_col = ['oldpeak', 'chol_bps']
scaler = MinMaxScaler()
for d in s_col:
    train_array = x_train[d].to_numpy()
    test_array = x_test[d].to_numpy()
    train_array = train_array.reshape(-1,1)
    test_array = test_array.reshape(-1,1)

    scaler.fit(train_array)
    x_train[d] = scaler.transform(train_array)
    x_test[d] = scaler.transform(test_array)

x_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[d] = scaler.transform(test_array)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[d] = scaler.transform(test_array)


Unnamed: 0,age,sex,cp,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,chol_bps
268,1.0,1,0,1,0,103,1,0.295428,0,0,3,0.75
190,1.0,1,0,0,0,131,1,0.398396,1,3,3,0.5
15,0.0,0,2,0,1,158,0,0.295428,1,0,2,0.25
223,1.0,1,0,0,1,126,1,0.501365,1,1,3,0.25
249,0.0,1,0,0,1,122,1,0.741625,1,3,3,1.0


In [10]:
pipe_lr = Pipeline([('LR', LogisticRegression(random_state=0))])
pipe_svm = Pipeline([('SVM', SVC(random_state=0))])
pipe_catboost = Pipeline([('CatBoost', CatBoostClassifier(random_state=0))])

lr_param_grid = [{'LR__penalty': ['l1', 'l2'],
                   'LR__C': [1.0, 0.5, 0.1],
                   'LR__solver': ['liblinear']}]

svm_param_grid = [{'SVM__kernel': ['linear', 'rbf','sigmoid','poly'],
                    "SVM__decision_function_shape": ["ovo", "ovr"],
                    'SVM__C': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
                    "SVM__gamma": ["scale", "auto"]}]

catboost_param_grid = [{'CatBoost__depth'         : [4,5,6,7,8,9, 10],
                        'CatBoost__learning_rate' : [0.01,0.02,0.03,0.04],
                        'CatBoost__iterations'    : [10, 20,30,40,50,60,70,80,90, 100]}]
                        
lr_grid_search = GridSearchCV(estimator=pipe_lr,
        param_grid=lr_param_grid,
        scoring='accuracy',
        cv=3)   
svm_grid_search = GridSearchCV(estimator=pipe_svm,
        param_grid=svm_param_grid,
        scoring='accuracy',
        cv=3)
catboost_grid_search = GridSearchCV(estimator=pipe_catboost,
        param_grid=catboost_param_grid,
        scoring='accuracy',
        cv=3)
grids = [svm_grid_search]
for pipe in grids:
    pipe.fit(x_train, y_train)

grid_dict = {0:'svm'}

for i, model in enumerate(grids):
    print('{} Test Accuracy: {}'.format(grid_dict[i],
    model.score(x_test,y_test)))
    print('{} Best Params: {}'.format(grid_dict[i], model.best_params_))


svm Test Accuracy: 0.8688524590163934
svm Best Params: {'SVM__C': 6.0, 'SVM__decision_function_shape': 'ovo', 'SVM__gamma': 'auto', 'SVM__kernel': 'poly'}


In [11]:
SVC.get_params('C').keys()

AttributeError: 'str' object has no attribute '_get_param_names'

In [None]:
# num_4_classifiers = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value=-9999))
# ])
# cat_4_classifiers = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value=-9999))
# ])
# classifier_prepro = ColumnTransformer(transformers=[
#     ('num', num_4_classifiers, num_fe),
#     ('cat',cat_4_classifiers, cat_fe)
# ], remainder='drop')

In [None]:
# tree_classifiers = {
#   "KNN": KNeighborsClassifier(),
#   "svm": SVC(kernel='linear'),
#   "Logistic": LogisticRegression(),
#   "Decision Tree": DecisionTreeClassifier(),
#   "Extra Trees": ExtraTreesClassifier(n_estimators=100),
#   "Random Forest": RandomForestClassifier(n_estimators=100),
#   "AdaBoost": AdaBoostClassifier(n_estimators=100),
#   "Skl GBM": GradientBoostingClassifier(n_estimators=100),
#   "Skl HistGBM": HistGradientBoostingClassifier(max_iter=100),
#   "XGBoost": XGBClassifier(n_estimators=100),
#   "LightGBM": LGBMClassifier(n_estimators=100),
#   "CatBoost":  CatBoostClassifier(n_estimators=100),
# }
# tree_classifiers= {name:make_pipeline(classifier_prepro, model) for name, model in tree_classifiers.items()}
# tree_classifiers["AdaBoost"]

In [None]:
# results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

# for model_name, model in tree_classifiers.items():
#     start_time = time.time()
#     model.fit(x_train, y_train)
#     total_time = time.time()-start_time
#     pred = model.predict(x_test)
#     results = results.append({"Model":    model_name,
#                             "Accuracy": accuracy_score(y_test, pred)*100,
#                             "Bal Acc.": balanced_accuracy_score(y_test, pred)*100,
#                             "Time":     total_time},
#                             ignore_index=True)

# results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
# results_ord.index += 1 
# results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')


    