In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# models
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier 
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier

# NN models
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping, ModelCheckpoint

# model tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

ModuleNotFoundError: No module named 'sklearn'

## Read Data

In [184]:
data = pd.read_csv("cardio_train.csv", sep=";")

## Data Cleaning

In [185]:
# FE - thanks to: https://www.kaggle.com/benanakca/comparison-of-classification-disease-prediction
data.drop("id",axis=1,inplace=True)
data.drop_duplicates(inplace=True)
data["bmi"] = data["weight"] / (data["height"]/100)**2
out_filter = ((data["ap_hi"]>250) | (data["ap_lo"]>200))
data = data[~out_filter]
len(data)

68983

In [186]:
out_filter2 = ((data["ap_hi"] < 0) | (data["ap_lo"] < 0))
data = data[~out_filter2]

In [187]:
data.head(5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,bmi
0,18393,2,168,62.0,110,80,1,1,0,0,1,0,21.96712
1,20228,1,156,85.0,140,90,3,1,0,0,1,1,34.927679
2,18857,1,165,64.0,130,70,3,1,0,0,0,1,23.507805
3,17623,2,169,82.0,150,100,1,1,0,0,1,1,28.710479
4,17474,1,156,56.0,100,60,1,1,0,0,0,0,23.011177


In [188]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68975 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          68975 non-null  int64  
 1   gender       68975 non-null  int64  
 2   height       68975 non-null  int64  
 3   weight       68975 non-null  float64
 4   ap_hi        68975 non-null  int64  
 5   ap_lo        68975 non-null  int64  
 6   cholesterol  68975 non-null  int64  
 7   gluc         68975 non-null  int64  
 8   smoke        68975 non-null  int64  
 9   alco         68975 non-null  int64  
 10  active       68975 non-null  int64  
 11  cardio       68975 non-null  int64  
 12  bmi          68975 non-null  float64
dtypes: float64(2), int64(11)
memory usage: 7.4 MB


In [189]:
# import pandas_profiling as profile
# pp.ProfileReport(data)

In [190]:
# data.describe()

In [191]:
# ap_list = ["ap_hi", "ap_lo"]
# boundary = pd.DataFrame(index=["lower_bound","upper_bound"]) # We created an empty dataframe
# for each in ap_list:
#     Q1 = data[each].quantile(0.25)
#     Q3 = data[each].quantile(0.75)
#     IQR = Q3 - Q1

#     lower_bound = Q1- 1.5*IQR
#     upper_bound = Q3 + 1.5*IQR
#     boundary[each] = [lower_bound, upper_bound ]
# boundary

In [192]:
# ap_hi_filter = (data["ap_hi"] > boundary["ap_hi"][1])
# ap_lo_filter = (data["ap_lo"] > boundary["ap_lo"][1])                                                           
# outlier_filter = (ap_hi_filter | ap_lo_filter)
# data_outliers = data[outlier_filter]
# data_outliers["cardio"].value_counts()

In [193]:
# data = data[~outlier_filter]
# data.info()

In [194]:
# data.describe()

## Preparing the Training and Test Sets

In [195]:
target_name = 'cardio'
data_target = data[target_name]
data = data.drop([target_name], axis=1)
# data = data.drop(['alco','gender','height','smoke','active'], axis=1)

In [196]:
# train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])

In [197]:
train, test, target, target_test = train_test_split(data, data_target, test_size=0.2, random_state=22)

In [198]:
test.head(5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,bmi
2317,19045,2,174,82.0,120,80,1,1,1,0,1,27.084159
69880,16000,1,158,65.0,110,70,1,1,0,0,1,26.037494
28024,23517,1,187,84.0,12,80,1,1,0,0,1,24.021276
69319,14738,1,156,53.0,90,60,1,1,0,0,0,21.778435
29872,18139,1,155,94.0,140,90,1,2,0,0,1,39.125911


In [201]:

testCopy = test.copy(deep=True)

testCopy = testCopy.to_csv ('test.csv', index = None, header=True) 


from sklearn import preprocessing
train = preprocessing.scale(train)
test = preprocessing.scale(test)

train.info()

test.head(5)

In [None]:
#%% split training set to validation set
# Xtrain, Xval, Ztrain, Zval = train_test_split(train, target, test_size=0.2, random_state=22)

Xval.head(5)

type(Xval)
# type(Zval)

## Data Normalization

from sklearn.preprocessing import normalize
train = normalize(train)
test = normalize(test)
data = normalize(data)

test.head(5)

# export Xval CSV
Xval = Xval.to_csv ('Xval.csv', index = None, header=True) 

In [202]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier

# dec = DecisionTreeClassifier()
# ran = RandomForestClassifier(n_estimators=100)
Ada = AdaBoostClassifier()
# knn = KNeighborsClassifier(n_neighbors=100)
# svm = SVC(random_state=1)
# naive = GaussianNB()
XGB = XGBClassifier()

models = {
#         "Decision tree" : dec,
#           "Random forest" : ran,
#           "KNN" : knn,
#           "SVM" : svm,
#           "Naive bayes" : naive,
          "XGBoost" : XGB,
          "AdaBoost" : Ada}
scores= { }

for key, value in models.items():    
    model = value
    model.fit(train, target)
    scores[key] = model.score(test, target_test)

In [203]:
scores_frame = pd.DataFrame(scores, index=["Accuracy Score"]).T
scores_frame.sort_values(by=["Accuracy Score"], axis=0 ,ascending=False, inplace=True)
scores_frame

Unnamed: 0,Accuracy Score
XGBoost,0.727727
AdaBoost,0.723161


In [204]:
from sklearn.externals import joblib 

# Save the model as a pickle in a file 
joblib.dump(XGB, 'XGBCardioModel.pkl') 

['XGBCardioModel.pkl']

In [253]:
test_data = pd.read_csv("test.csv", sep=",")
test_data.head(5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,bmi
0,19045,2,174,82.0,120,80,1,1,1,0,1,27.084159
1,16000,1,158,65.0,110,70,1,1,0,0,1,26.037494
2,23517,1,187,84.0,12,80,1,1,0,0,1,24.021276
3,14738,1,156,53.0,90,60,1,1,0,0,0,21.778435
4,18139,1,155,94.0,140,90,1,2,0,0,1,39.125911


In [254]:
test_data = preprocessing.scale(test_data)

In [255]:
# Load the model from the file 
XGB_from_joblib = joblib.load('XGBCardioModel.pkl')  

# Use the loaded model to make predictions 
XGB_from_joblib.predict(test_data) 

array([0, 0, 1, ..., 1, 1, 1], dtype=int64)