In [1]:
# Import libraries
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import register_matplotlib_converters  #Register pandas formatters and converters with matplotlib.
from pylab import rcParams      # for customizing matplotlib graphs.
sns.set_style("whitegrid")    

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline  
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2


In [3]:
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier
from sklearn.neighbors     import KNeighborsClassifier
from sklearn.linear_model  import LogisticRegression
from sklearn.svm           import SVC

In [4]:
df = pd.read_csv('heart.csv')
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace = True)

In [5]:
#df.iloc[72]

In [6]:
## chainging numerical col 'chol' into categorical column:
df['chol']= pd.cut(df['chol'], bins=[0,200,239,564], labels = [0,1,2]) #0-High, 1-Borderline high, 2-High
df['trtbps']= pd.cut(df['trtbps'], bins=[90,120,139,200], labels = [0,1,2])
df['age'] = pd.cut(df['age'], bins=[25,53,80], labels = [0, 1])
df = df.astype({'trtbps':'float64','chol':'float64', 'age':'float64'})

In [7]:
df['chol_bps'] = df['chol']+df['trtbps']
df.drop(['chol','trtbps'], axis=1, inplace=True)
df

Unnamed: 0,age,sex,cp,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,chol_bps
0,1.0,1,3,1,0,150,0,2.3,0,0,1,1,3.0
1,0.0,1,2,0,1,187,0,3.5,0,0,2,1,3.0
2,0.0,0,1,0,0,172,0,1.4,2,0,2,1,2.0
3,1.0,1,1,0,1,178,0,0.8,2,0,2,1,1.0
4,1.0,0,0,0,1,163,1,0.6,2,0,2,1,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,1.0,0,0,0,1,123,1,0.2,1,0,3,0,4.0
298,0.0,1,3,0,1,132,0,1.2,1,0,3,0,2.0
299,1.0,1,0,1,1,141,0,3.4,1,2,3,0,2.0
300,1.0,1,0,0,1,115,1,1.2,1,1,3,0,1.0


In [8]:
rows_with_nan = [index for index, row in df.iterrows() if row.isnull().any()]

print(rows_with_nan)

[]


In [9]:
# data agumentation
new_df = df.copy()
def get_data(data):
    gen_data = data
    for restecg_values in data['restecg'].unique():
        new_data = gen_data[gen_data['restecg']== restecg_values]
        thalachh_std = new_data['thalachh'].std()
        oldpeak_std = new_data['oldpeak'].std()

        for i in gen_data[gen_data['restecg']== restecg_values].index:
            if np.random.randint(2)==1:
                gen_data['thalachh'].values[i] += thalachh_std/10   
            else:
                gen_data['thalachh'].values[i] -= thalachh_std/10
            if np.random.randint(2)==1:
                gen_data['oldpeak'].values[i] += oldpeak_std/10
            else:
                gen_data['oldpeak'].values[i] -= oldpeak_std/10
    return gen_data
print(df.head())  
std_data = get_data(new_df)
std_data.head()

   age  sex  cp  fbs  restecg  thalachh  exng  oldpeak  slp  caa  thall  \
0  1.0    1   3    1        0       150     0      2.3    0    0      1   
1  0.0    1   2    0        1       187     0      3.5    0    0      2   
2  0.0    0   1    0        0       172     0      1.4    2    0      2   
3  1.0    1   1    0        1       178     0      0.8    2    0      2   
4  1.0    0   0    0        1       163     1      0.6    2    0      2   

   output  chol_bps  
0       1       3.0  
1       1       3.0  
2       1       2.0  
3       1       1.0  
4       1       2.0  


Unnamed: 0,age,sex,cp,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,chol_bps
0,1.0,1,3,1,0,152,0,2.421468,0,0,1,1,3.0
1,0.0,1,2,0,1,189,0,3.605561,0,0,2,1,3.0
2,0.0,0,1,0,0,169,0,1.521468,2,0,2,1,2.0
3,1.0,1,1,0,1,180,0,0.694439,2,0,2,1,1.0
4,1.0,0,0,0,1,160,1,0.705561,2,0,2,1,2.0


In [10]:
# x= df.drop(['output'], axis=1)
# y = df.output
# x_new = SelectKBest(chi2, k=8).fit_transform(x,y)
# print(x_new[0])
# print(x_new[1])
# print(x_new[2])


In [11]:
x= df.drop(['output'], axis=1)
y = df.output
cat_fe = [ 'age','cp','exng','slp','caa']
num_fe = ['thalachh','oldpeak','chol_bps']

x_train, x_test, y_train,y_test = train_test_split(x,y, test_size=0.2, random_state=0, stratify=y)

extra_sample = std_data.sample(std_data.shape[0]//3)
x_train = pd.concat([x_train, extra_sample.drop(['output'], axis=1)])
y_train = pd.concat([y_train, extra_sample['output']])


In [12]:
s_col = ['oldpeak', 'chol_bps']
scaler = MinMaxScaler()
for d in s_col:
    train_array = x_train[d].to_numpy()
    test_array = x_test[d].to_numpy()
    train_array = train_array.reshape(-1,1)
    test_array = test_array.reshape(-1,1)

    scaler.fit(train_array)
    x_train[d] = scaler.transform(train_array)
    x_test[d] = scaler.transform(test_array)

x_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[d] = scaler.transform(test_array)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[d] = scaler.transform(test_array)


Unnamed: 0,age,sex,cp,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,chol_bps
268,1.0,1,0,1,0,103,1,0.300879,0,0,3,0.75
190,1.0,1,0,0,0,131,1,0.405747,1,3,3,0.5
15,0.0,0,2,0,1,158,0,0.300879,1,0,2,0.25
223,1.0,1,0,0,1,126,1,0.510615,1,1,3,0.25
249,0.0,1,0,0,1,122,1,0.755308,1,3,3,1.0


In [13]:
num_4_classifiers = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-9999))
])
cat_4_classifiers = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-9999))
])
classifier_prepro = ColumnTransformer(transformers=[
    ('num', num_4_classifiers, num_fe),
    ('cat',cat_4_classifiers, cat_fe)
])

In [14]:
tree_classifiers = {
  "KNN": KNeighborsClassifier(),
  "svm": SVC(kernel='linear'),
  "Logistic": LogisticRegression(),
  "Decision Tree": DecisionTreeClassifier(),
  "Extra Trees": ExtraTreesClassifier(n_estimators=100),
  "Random Forest": RandomForestClassifier(n_estimators=100),
  "AdaBoost": AdaBoostClassifier(n_estimators=100),
  "Skl GBM": GradientBoostingClassifier(n_estimators=100),
  "Skl HistGBM": HistGradientBoostingClassifier(max_iter=100),
  "XGBoost": XGBClassifier(n_estimators=100),
  "LightGBM": LGBMClassifier(n_estimators=100),
  "CatBoost":  CatBoostClassifier(n_estimators=100),
}
tree_classifiers= {name:make_pipeline(classifier_prepro, model) for name, model in tree_classifiers.items()}
tree_classifiers["AdaBoost"]

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value=-9999,
                                                                                 strategy='constant'))]),
                                                  ['thalachh', 'oldpeak',
                                                   'chol_bps']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value=-9999,
                                                                                 strategy='constant'))]),
                                                  ['age', 'cp', 'exng', 'slp',
                                                   'caa'])])),
 

In [15]:
### With out cross validation
results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

for model_name, model in tree_classifiers.items():
    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time()-start_time
    pred = model.predict(x_test)
    results = results.append({"Model":    model_name,
                            "Accuracy": accuracy_score(y_test, pred)*100,
                            "Bal Acc.": balanced_accuracy_score(y_test, pred)*100,
                            "Time":     total_time},
                            ignore_index=True)

results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')


    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Learning rate set to 0.053755
0:	learn: 0.6786045	total: 143ms	remaining: 14.2s
1:	learn: 0.6649951	total: 146ms	remaining: 7.15s
2:	learn: 0.6503591	total: 149ms	remaining: 4.82s
3:	learn: 0.6365908	total: 152ms	remaining: 3.64s
4:	learn: 0.6244249	total: 155ms	remaining: 2.95s
5:	learn: 0.6128625	total: 158ms	remaining: 2.48s
6:	learn: 0.6003449	total: 161ms	remaining: 2.14s
7:	learn: 0.5910654	total: 164ms	remaining: 1.89s
8:	learn: 0.5804197	total: 166ms	remaining: 1.68s
9:	learn: 0.5706946	total: 169ms	remaining: 1.52s
10:	learn: 0.5619353	total: 172ms	remaining: 1.39s
11:	learn: 0.5549551	total: 175ms	remaining: 1.28s
12:	learn: 0.5473013	total: 178ms	remaining: 1.19s
13:	learn: 0.5408124	total: 181ms	remaining: 1.11s
14:	learn: 0.5328687	total: 184ms	remaining: 1.04s
15:	learn: 0.5261978	total: 186ms	remaining: 978ms
16:	learn: 0.5193471	total: 188ms	remaining: 919ms
17:	learn: 0.5135265	total: 190ms	remaining: 866ms
18:	learn: 0.5068298	total: 192ms	remaining: 818ms
19:	learn: 

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Extra Trees,90.163934,89.82684,0.186501
2,Random Forest,86.885246,86.255411,0.151595
3,CatBoost,85.245902,84.469697,0.36211
4,Skl HistGBM,81.967213,81.439394,0.348037
5,Decision Tree,80.327869,80.465368,0.012965
6,LightGBM,80.327869,79.65368,0.07879
7,Skl GBM,78.688525,78.679654,0.073838
8,XGBoost,78.688525,78.679654,0.116688
9,svm,77.04918,76.623377,0.070844
10,Logistic,77.04918,76.623377,0.053886


In [16]:
# skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

# for model_name, model in tree_classifiers.items():
#     start_time = time.time()
        
#     # TRAIN AND GET PREDICTIONS USING cross_val_predict() and x,y
#     pred = cross_val_predict(model, x_train,y_train, cv=skf)

#     total_time = time.time() - start_time

#     results = results.append({"Model":    model_name,
#                               "Accuracy": accuracy_score(y_train, pred)*100,
#                               "Bal Acc.": balanced_accuracy_score(y_train, pred)*100,
#                               "Time":     total_time},
#                               ignore_index=True)
                              
                              
# results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
# results_ord.index += 1 
# results_ord.style.bar(subset=['Accuracy','Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')