In [1]:


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px



In [3]:
df = pd.read_csv('cardio_train.csv', sep=';')

In [4]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [5]:
#Retreive categorical features
df['gender'].replace([1,2], value=['female', 'male'], inplace=True)
df[['cholesterol','gluc']] = df[['cholesterol','gluc']].replace([1,2,3], ['normal', 'above normal', 'well above normal'])
df[['smoke', 'cardio','alco','active']] = df[['smoke', 'cardio','alco','active']].replace([0,1], ['No', 'Yes'])

In [6]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,male,168,62.0,110,80,normal,normal,No,No,Yes,No
1,1,20228,female,156,85.0,140,90,well above normal,normal,No,No,Yes,Yes
2,2,18857,female,165,64.0,130,70,well above normal,normal,No,No,No,Yes
3,3,17623,male,169,82.0,150,100,normal,normal,No,No,Yes,Yes
4,4,17474,female,156,56.0,100,60,normal,normal,No,No,No,No


In [7]:
df.isna().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [8]:
df.duplicated().sum()

0

In [9]:
#Some instances have corrupted values for systolic and diastolic blood pressure

#Dropping corrupted instances (systolic or diastolic blood pressures are out of possible range)
df.drop(df[(df['ap_lo'] > 400) | (df['ap_lo'] < 40)].index,inplace=True)
df.drop(df[(df['ap_hi'] > 400) | (df['ap_hi'] < 40)].index,inplace=True)



In [10]:
df

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,male,168,62.0,110,80,normal,normal,No,No,Yes,No
1,1,20228,female,156,85.0,140,90,well above normal,normal,No,No,Yes,Yes
2,2,18857,female,165,64.0,130,70,well above normal,normal,No,No,No,Yes
3,3,17623,male,169,82.0,150,100,normal,normal,No,No,Yes,Yes
4,4,17474,female,156,56.0,100,60,normal,normal,No,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,male,168,76.0,120,80,normal,normal,Yes,No,Yes,No
69996,99995,22601,female,158,126.0,140,90,above normal,above normal,No,No,Yes,Yes
69997,99996,19066,male,183,105.0,180,90,well above normal,normal,No,Yes,No,Yes
69998,99998,22431,female,163,72.0,135,80,normal,above normal,No,No,No,Yes


In [11]:
#Converting the age from days to years
df['age_yrs']=df['age']/365



In [12]:
#Adding BMI feature
df['bmi'] = df['weight']/((df['height']/100.0)**2)

In [13]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_yrs,bmi
0,0,18393,male,168,62.0,110,80,normal,normal,No,No,Yes,No,50.391781,21.96712
1,1,20228,female,156,85.0,140,90,well above normal,normal,No,No,Yes,Yes,55.419178,34.927679
2,2,18857,female,165,64.0,130,70,well above normal,normal,No,No,No,Yes,51.663014,23.507805
3,3,17623,male,169,82.0,150,100,normal,normal,No,No,Yes,Yes,48.282192,28.710479
4,4,17474,female,156,56.0,100,60,normal,normal,No,No,No,No,47.873973,23.011177


In [14]:
#Adding blood pressure categories as a feature
conditions = [(df['ap_hi']<90) | (df['ap_lo']<60), 
              ((df['ap_hi']>=90)&(df['ap_hi']<120)) & ((df['ap_lo']>=60)&(df['ap_lo']<80)),
              ((df['ap_hi']>=120)&(df['ap_hi']<140)) | ((df['ap_lo']>=80)&(df['ap_lo']<90)),
              (df['ap_hi']>=140) | (df['ap_lo']>=90)]

choices = ['low', 'normal', 'preHigh', 'High']
df['blood_pressure'] = np.select(conditions, choices)

In [15]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_yrs,bmi,blood_pressure
0,0,18393,male,168,62.0,110,80,normal,normal,No,No,Yes,No,50.391781,21.96712,preHigh
1,1,20228,female,156,85.0,140,90,well above normal,normal,No,No,Yes,Yes,55.419178,34.927679,High
2,2,18857,female,165,64.0,130,70,well above normal,normal,No,No,No,Yes,51.663014,23.507805,preHigh
3,3,17623,male,169,82.0,150,100,normal,normal,No,No,Yes,Yes,48.282192,28.710479,High
4,4,17474,female,156,56.0,100,60,normal,normal,No,No,No,No,47.873973,23.011177,normal


In [16]:
df['blood_pressure'].value_counts()

preHigh    43965
High       15161
normal      9417
low          232
Name: blood_pressure, dtype: int64

In [17]:
#Binnig BMI feature
conds = [df['bmi']< 18.5 , df['bmi'] > 30, ((df['bmi']>18.5) & (df['bmi']<25)), ((df['bmi']>=25) & (df['bmi']<=30))]
choices = ['Underweight', 'Obseity','Normal','Overweight']

df['BMI_status']= (np.select(conds, choices))

In [18]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_yrs,bmi,blood_pressure,BMI_status
0,0,18393,male,168,62.0,110,80,normal,normal,No,No,Yes,No,50.391781,21.96712,preHigh,Normal
1,1,20228,female,156,85.0,140,90,well above normal,normal,No,No,Yes,Yes,55.419178,34.927679,High,Obseity
2,2,18857,female,165,64.0,130,70,well above normal,normal,No,No,No,Yes,51.663014,23.507805,preHigh,Normal
3,3,17623,male,169,82.0,150,100,normal,normal,No,No,Yes,Yes,48.282192,28.710479,High,Overweight
4,4,17474,female,156,56.0,100,60,normal,normal,No,No,No,No,47.873973,23.011177,normal,Normal


In [19]:
#Encoding Categorical Features

from sklearn.preprocessing import OrdinalEncoder

df.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 68775 entries, 0 to 69999
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              68775 non-null  int64  
 1   age             68775 non-null  int64  
 2   gender          68775 non-null  object 
 3   height          68775 non-null  int64  
 4   weight          68775 non-null  float64
 5   ap_hi           68775 non-null  int64  
 6   ap_lo           68775 non-null  int64  
 7   cholesterol     68775 non-null  object 
 8   gluc            68775 non-null  object 
 9   smoke           68775 non-null  object 
 10  alco            68775 non-null  object 
 11  active          68775 non-null  object 
 12  cardio          68775 non-null  object 
 13  age_yrs         68775 non-null  float64
 14  bmi             68775 non-null  float64
 15  blood_pressure  68775 non-null  object 
 16  BMI_status      68775 non-null  object 
dtypes: float64(3), int64(5), object

In [20]:
enc = OrdinalEncoder(categories=[['normal', 'above normal','well above normal'], ['normal', 'above normal','well above normal'],
                                 ['No', 'Yes'],['No', 'Yes'],['No', 'Yes'],['No', 'Yes'], ['male','female'],
                                ['low', 'normal', 'preHigh', 'High'], ['Underweight', 'Obseity','Normal','Overweight']])
encoded_features = enc.fit_transform(df[['cholesterol','gluc','smoke','alco','active','cardio', 'gender', 'blood_pressure', 'BMI_status']])
enc.categories_

[array(['normal', 'above normal', 'well above normal'], dtype=object),
 array(['normal', 'above normal', 'well above normal'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['male', 'female'], dtype=object),
 array(['low', 'normal', 'preHigh', 'High'], dtype=object),
 array(['Underweight', 'Obseity', 'Normal', 'Overweight'], dtype=object)]

In [21]:
df[['cholesterol_enc','gluc_enc','smoke_enc','alco_enc','active_enc','cardio_enc', 'gender_enc','blood_pressure_enc','BMI_status_enc']] = encoded_features.astype('int')

df



Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,...,BMI_status,cholesterol_enc,gluc_enc,smoke_enc,alco_enc,active_enc,cardio_enc,gender_enc,blood_pressure_enc,BMI_status_enc
0,0,18393,male,168,62.0,110,80,normal,normal,No,...,Normal,0,0,0,0,1,0,0,2,2
1,1,20228,female,156,85.0,140,90,well above normal,normal,No,...,Obseity,2,0,0,0,1,1,1,3,1
2,2,18857,female,165,64.0,130,70,well above normal,normal,No,...,Normal,2,0,0,0,0,1,1,2,2
3,3,17623,male,169,82.0,150,100,normal,normal,No,...,Overweight,0,0,0,0,1,1,0,3,3
4,4,17474,female,156,56.0,100,60,normal,normal,No,...,Normal,0,0,0,0,0,0,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,male,168,76.0,120,80,normal,normal,Yes,...,Overweight,0,0,1,0,1,0,0,2,3
69996,99995,22601,female,158,126.0,140,90,above normal,above normal,No,...,Obseity,1,1,0,0,1,1,1,3,1
69997,99996,19066,male,183,105.0,180,90,well above normal,normal,No,...,Obseity,2,0,0,1,0,1,0,3,1
69998,99998,22431,female,163,72.0,135,80,normal,above normal,No,...,Overweight,0,1,0,0,0,1,1,2,3


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68775 entries, 0 to 69999
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  68775 non-null  int64  
 1   age                 68775 non-null  int64  
 2   gender              68775 non-null  object 
 3   height              68775 non-null  int64  
 4   weight              68775 non-null  float64
 5   ap_hi               68775 non-null  int64  
 6   ap_lo               68775 non-null  int64  
 7   cholesterol         68775 non-null  object 
 8   gluc                68775 non-null  object 
 9   smoke               68775 non-null  object 
 10  alco                68775 non-null  object 
 11  active              68775 non-null  object 
 12  cardio              68775 non-null  object 
 13  age_yrs             68775 non-null  float64
 14  bmi                 68775 non-null  float64
 15  blood_pressure      68775 non-null  object 
 16  BMI_

**Preparing dataframes for the dashboard** 

In [23]:
#Blood pressure categories

BP_df = df.groupby(['cardio','blood_pressure'])[['cardio_enc']].count().reset_index().rename(columns={'cardio_enc':'count'})
BP_df



Unnamed: 0,cardio,blood_pressure,count
0,No,High,2580
1,No,low,153
2,No,normal,7330
3,No,preHigh,24675
4,Yes,High,12581
5,Yes,low,79
6,Yes,normal,2087
7,Yes,preHigh,19290


In [24]:
#BMI Categories

bmi_df = df.groupby(['cardio','BMI_status'])[['cardio_enc']].count().reset_index()
bmi_df



Unnamed: 0,cardio,BMI_status,cardio_enc
0,No,Normal,15320
1,No,Obseity,6753
2,No,Overweight,12199
3,No,Underweight,466
4,Yes,Normal,10133
5,Yes,Obseity,11254
6,Yes,Overweight,12473
7,Yes,Underweight,177


In [25]:
#Cholesterol

cholesterol_df = df.groupby(['cardio','cholesterol'])[['cardio_enc']].count().reset_index().rename(columns={'cardio_enc':'count'})
cholesterol_df



Unnamed: 0,cardio,cholesterol,count
0,No,above normal,3758
1,No,normal,29110
2,No,well above normal,1870
3,Yes,above normal,5555
4,Yes,normal,22466
5,Yes,well above normal,6016


In [26]:
#Glucose

gluc_df = df.groupby(['cardio','gluc'])[['cardio_enc']].count().reset_index().rename(columns={'cardio_enc':'count'})
gluc_df



Unnamed: 0,cardio,gluc,count
0,No,above normal,2087
1,No,normal,30655
2,No,well above normal,1996
3,Yes,above normal,2987
4,Yes,normal,27811
5,Yes,well above normal,3239


In [27]:
#Gender

gender_df = df.groupby(['cardio','gender'])[['cardio_enc']].count().reset_index().rename(columns={'cardio_enc':'count'})
gender_df



Unnamed: 0,cardio,gender,count
0,No,female,22742
1,No,male,11996
2,Yes,female,22049
3,Yes,male,11988


In [28]:
#Active

active_df = df.groupby(['cardio','active'])[['cardio_enc']].count().reset_index().rename(columns={'cardio_enc':'count'})
active_df



Unnamed: 0,cardio,active,count
0,No,No,6322
1,No,Yes,28416
2,Yes,No,7201
3,Yes,Yes,26836


In [29]:
#Smoke

smoke_df = df.groupby(['cardio','smoke'])[['cardio_enc']].count().reset_index().rename(columns={'cardio_enc':'count'})
smoke_df



Unnamed: 0,cardio,smoke,count
0,No,No,31522
1,No,Yes,3216
2,Yes,No,31200
3,Yes,Yes,2837


In [30]:
#Alcohol

alcohol_df = df.groupby(['cardio','alco'])[['cardio_enc']].count().reset_index().rename(columns={'cardio_enc':'count'})
alcohol_df



Unnamed: 0,cardio,alco,count
0,No,No,32811
1,No,Yes,1927
2,Yes,No,32275
3,Yes,Yes,1762


In [31]:
#Model requirements for the dashboard

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import cross_validate

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier



In [32]:
testSize = 0.2
available_features = ['gender_enc', 'height', 'weight', 'ap_hi', 'ap_lo','age_yrs','bmi', 'cholesterol_enc',
                     'gluc_enc', 'smoke_enc', 'alco_enc','active_enc','blood_pressure_enc', 'BMI_status_enc']



X_train, X_test, y_train, y_test = train_test_split(df[available_features], df['cardio_enc'], test_size= testSize, random_state=0)

 



In [36]:
!pip install jupyter_dash

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jupyter_dash
  Downloading jupyter_dash-0.4.2-py3-none-any.whl (23 kB)
Collecting dash
  Downloading dash-2.5.1-py3-none-any.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 6.7 MB/s 
[?25hCollecting retrying
  Downloading retrying-1.3.3.tar.gz (10 kB)
Collecting ansi2html
  Downloading ansi2html-1.7.0-py3-none-any.whl (15 kB)
Collecting flask-compress
  Downloading Flask_Compress-1.12-py3-none-any.whl (7.9 kB)
Collecting dash-table==5.0.0
  Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Collecting dash-html-components==2.0.0
  Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Collecting dash-core-components==2.0.0
  Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Collecting brotli
  Downloading Brotli-1.0.9-cp37-cp37m-manylinux1_x86_64.whl (357 kB)
[K     |████████████████████████████████| 357 kB 38.3 MB/s 
Building wheel

In [38]:
import dash
from dash import html
from dash import dcc
import plotly.figure_factory as ff



In [39]:
#Feature EDA related

features_eda_figs = {'height':px.box(df, y='height', x='cardio'),
                    'weight':px.box(df, y='weight', x='cardio'),
                     'ap_hi':px.box(df, y='ap_hi', x='cardio'),
                     'ap_lo':px.box(df, y='ap_lo', x='cardio'),
                     'blood_pressure_enc':px.bar(BP_df, x='cardio', y='count', color='blood_pressure',
                                                category_orders={'blood_pressure':['High', 'preHigh', 'normal','low']}),
                     'age_yrs':px.box(df, y='age_yrs', x='cardio'),
                     'bmi':px.box(df, y='bmi', x='cardio'),
                     'BMI_status_enc':px.bar(bmi_df, x='cardio', y='cardio_enc', color='BMI_status', labels={'BMI_status':'bmi'},
                                         category_orders={'BMI_status':['Underweight', 'Normal','Overweight','Obesity']}),
                     'cholesterol_enc':df.groupby(['cardio','cholesterol'])[['cardio_enc']].count().reset_index().rename(columns={'cardio_enc':'count'}),
                     'smoke_enc':px.bar(smoke_df, x='cardio', y='count', color='smoke'),
                     'gluc_enc':px.bar(gluc_df, x='cardio', y='count', color='gluc', labels={'gluc':'glucose'},
                                    category_orders={'gluc':['normal', 'above normal', 'well above normal']}),
                     'alco_enc':px.bar(alcohol_df, x='cardio', y='count', color='alco', labels={'alco':'Alcohol'}),
                     'active_enc':px.bar(active_df, x='cardio', y='count', color='active') ,
                     'gender_enc':px.bar(gender_df, x='cardio', y='count', color='gender')
                    }


feature_eda_dropdownOpts = [
    {'label':'Gender', 'value':'gender_enc'},
    {'label': 'Height', 'value':'height'},
    {'label': 'weight', 'value':'weight'},
    {'label': 'Systolic blood pressure', 'value':'ap_hi'},
    {'label': 'Diastolic blood pressure', 'value':'ap_lo'},
    {'label': 'Blood pressure', 'value':'blood_pressure_enc'},
    {'label': 'Age', 'value':'age_yrs'},
    {'label': 'BMI', 'value':'bmi'},
    {'label': 'BMI categories', 'value':'BMI_status_enc'},
    {'label': 'Cholesterol', 'value':'cholesterol_enc'},
    {'label': 'Smoking', 'value':'smoke_enc'},
    {'label': 'Glucose', 'value':'gluc_enc'},
    {'label': 'Alcohol', 'value':'alco_enc'},
    {'label': 'Activity', 'value':'active_enc'}
]

In [40]:
# ROC related
roc_dropdownOpts = [{'label': 'AdaBoost', 'value': 'AdaBoost'},
                   {'label': 'Random Forest', 'value': 'Random Forest'},
                   {'label': 'k-NN', 'value': 'k-NN'},
                   {'label': 'MLP', 'value': 'MLP'}]

In [41]:
#Accuracy related
def create_scores_df(selected_features, selected_pipeline, selected_model):
    res = cross_validate(selected_pipeline, X_train[selected_features], y_train, scoring=['accuracy'], cv=5, return_train_score=True)
    test_values = res['test_accuracy']
    train_values = res['train_accuracy']
    train_names = '{}_train'.format(selected_model)
    test_names = '{}_test'.format(selected_model)
    test_eval_df = pd.DataFrame([test_names, test_values]).T.explode(1).rename(columns={0:'classifier', 1:'accuracy'})
    train_eval_df = pd.DataFrame([train_names, train_values]).T.explode(1).rename(columns={0:'classifier', 1:'accuracy'})
    final_df = pd.concat([train_eval_df, test_eval_df], axis=0)
    return final_df

In [44]:
pip install dash-bootstrap-components

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dash-bootstrap-components
  Downloading dash_bootstrap_components-1.2.0-py3-none-any.whl (216 kB)
[K     |████████████████████████████████| 216 kB 5.2 MB/s 
Installing collected packages: dash-bootstrap-components
Successfully installed dash-bootstrap-components-1.2.0


In [45]:
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output, State

app = dash.Dash(external_stylesheets=[dbc.themes.BOOTSTRAP])


optimizerOpts = [{'label':'SGD', 'value':'sgd'},
                {'label':'Adam', 'value':'adam'}]
activationOpts = [{'label':'No activation function', 'value':'identity'},
                 {'label':'Logistic', 'value':'logistic'},
                 {'label':'tanh', 'value':'tanh'},
                 {'label':'ReLU', 'value':'relu'}]

hyperparam_mlpTab = html.Div([dbc.Row(html.Label('Number of neurons')),
                              dbc.Row(dcc.Input(id='mlp_noNeurons',type='number')),
                              dbc.Row(html.Label('Activation function')),
                              dcc.Dropdown(id='mlp_activation',
                                      options=activationOpts, 
                                      placeholder='Select activation functionn'),
                              dbc.Row(html.Label('Optimizer')),
                              dcc.Dropdown(id='mlp_optimizer',
                                      options=optimizerOpts,
                                      placeholder='Select optimizer'),
                              dbc.Row(html.Label('Learning rate')),
                              dbc.Row(dcc.Input(id='mlp_learningRate',type='number')),
                              dbc.Row(html.Label('Max number of iterations')),
                              dbc.Row(dcc.Input(id='mlp_maxIter',type='number'))
                            ],
                            style={'margin-left': '70px'})


distMetricsOpts = [{'label':'Euclidean', 'value':'euclidean'}, 
                   {'label':'Manhattan', 'value':'manhattan'}, 
                   {'label':'Hamming', 'value':'hamming'}]
hyperparam_knnTab = html.Div([dbc.Row(html.Label('Number of nearest neighbors')),
                             dbc.Row(dcc.Input(id='k_neighbors', type='number')),
                             html.Label('Distance metric'),
                             dcc.Dropdown(id='dist_metric_dropdown',
                                                 options=distMetricsOpts,
                                                 value='euclidean')],
                            style={'margin-left': '70px'})


hyperparam_rfTab = html.Div([dbc.Row(html.Label('Number of estimators')),
                            dbc.Row(dcc.Input(id='rf_noEstimators',
                                     type='number')),
                             dbc.Row(html.Label('Max depth')),
                            dbc.Row(dcc.Input(id='rf_maxDepth',
                                     type='number')),
                             dbc.Row(html.Label('Min # samples required to split')),
                            dbc.Row(dcc.Input(id='rf_minSamplesSplit',
                                             type='number'))],
                           style={'margin-left': '70px'})

hyperparam_adaTab = html.Div([dbc.Row(html.Label('Number of estimators')),
                            dbc.Row(dcc.Input(id='ada_noEstimators',
                                     type='number')),
                             dbc.Row(html.Label('Learning rate')),
                            dbc.Row(dcc.Input(id='ada_learningRate',
                                     type='number'))],
                            style={'margin-left': '70px'})


app.layout = html.Div(children=[
    html.H1('Cardiovascular Disease Classification',
           style={'color': '#566163', 'fontSize': 40, 'font-weight':'bold','textAlign': 'center'}),
    
    dbc.Row([dbc.Col([dbc.Row([dbc.Col(id='features_eda',
                                       children=[html.Label('Explore the features', 
                                                            style={'color': '#3d717d', 'fontSize': 20,
                                                                   'font-weight':'bold',
                                                                  'margin-left': '10px'}),
                                                dcc.Graph(id='feature_eda_fig',
                                                         style={'height': '300px'}),
                                                html.Div(dcc.Dropdown(id='features_eda_dropdown',
                                                             placeholder='Select a feature',
                                                            options=feature_eda_dropdownOpts),
                                                        style={'width':'250px', 'margin-left': '45px'})],
                                      width={'size':5}),
                               
                               dbc.Col(id='accuracy',
                                       children=[html.Label('Select a model',
                                                           style={'color': '#3d717d', 'fontSize': 20,
                                                                 'font-weight':'bold', 'margin-left': '10px'}),
                                                html.Div(dcc.Dropdown(id='model-name',
                                                           options=roc_dropdownOpts,
                                                           value='AdaBoost',
                                                           clearable=False),
                                                        style={'width':'300px', "margin-left": "65px"}),
                                                dcc.Graph(id="box_plot",
                                                         style={'height': '350px', 'width':'500px'})],
                                      width={'size':7})]),
                      
                      dbc.Row([dbc.Col(id='feature_selection',
                                       children=[html.Label('Select features for training',
                                                           style={'color': '#3d717d', 'fontSize': 20,
                                                                 'font-weight':'bold', 'margin-left': '10px'}),
                                                html.Div(dcc.Dropdown(id='feature_selection_dropdown',
                                                            options=feature_eda_dropdownOpts,
                                                            multi=True),
                                                        style={'width':'250px', "margin-left": "45px"})],
                                      width={'size':5}),
                               
                              dbc.Col(id='auc_roc_curve',
                                      children=[dcc.Graph(id="roc_graph",
                                                        style={'height': '300px', 'width':'500px'})],
                                     width={'size':7})
                              ])
                     ], width={'size':8}),
             
             
            dbc.Col(id='hyperparameters',
                    children=[html.Label('Set the hyperparameters',
                                        style={'color': '#3d717d', 'fontSize': 20,
                                              'font-weight':'bold', 'margin-left': '10px'}),
                              dbc.Row(html.Div(id='tab_content', children=hyperparam_mlpTab)),
                              html.Button('Train', id='train_button', 
                                          style={'margin-top':'40px', 'margin-left':'90px', 
                                                 'width':'70px', 'height':'40px',
                                                'fontSize':20, 'font-weight':'bold',
                                                'backgroundColor':'#bbf2b1'}),
                              dbc.Row(dcc.Graph(id='confusion_matrix',
                                               style={'height': '400px', 'width':'400px'}))
                             ],
                   width={'size':4})])
])





#render hyperparameters tabs
@app.callback(
    Output("tab_content", "children"), 
    [Input('model-name', "value")])
def render_content(tab):
    if tab=='MLP':
        children = [hyperparam_mlpTab,
           html.Div([hyperparam_rfTab], style={'display': 'none'}),
           html.Div([hyperparam_adaTab], style={'display': 'none'}),
           html.Div([hyperparam_knnTab], style={'display': 'none'})]
        return children

    elif tab=='k-NN':
        children = [hyperparam_knnTab,
                   html.Div([hyperparam_rfTab], style={'display': 'none'}),
                   html.Div([hyperparam_adaTab], style={'display': 'none'}),
                   html.Div([hyperparam_mlpTab], style={'display': 'none'})]
        return children
    
    elif tab=='Random Forest':
        children = [hyperparam_rfTab,
                   html.Div([hyperparam_knnTab], style={'display': 'none'}),
                   html.Div([hyperparam_adaTab], style={'display': 'none'}),
                   html.Div([hyperparam_mlpTab], style={'display': 'none'})]
        return children

    else: #adaboost
        children = [hyperparam_adaTab,
                   html.Div([hyperparam_knnTab], style={'display': 'none'}),
                   html.Div([hyperparam_rfTab], style={'display': 'none'}),
                   html.Div([hyperparam_mlpTab], style={'display': 'none'})]
        return children



#Feature EDA callback
@app.callback(
    Output("feature_eda_fig", "figure"), 
    [Input('features_eda_dropdown', "value")])

def display_eda(feat):
    features_eda_figs[feat].update_layout(
    xaxis_title='Cardiovascular Disease',
    margin={'t':10})
    return features_eda_figs[feat]




#Feature Selection Callback 
@app.callback(
    [Output("roc_graph", "figure"), Output("box_plot", "figure"), Output('confusion_matrix', 'figure')],
    Input('train_button','n_clicks'),
    [State('feature_selection_dropdown', "value"),
     State('model-name', "value"),
     State('k_neighbors', "value"), 
     State('dist_metric_dropdown', 'value'),
     State('rf_noEstimators', 'value'),
     State('rf_maxDepth', 'value'),
     State('rf_minSamplesSplit', 'value'),
     State('ada_noEstimators', 'value'),
     State('ada_learningRate', 'value'),
     State('mlp_noNeurons', 'value'),
     State('mlp_optimizer', 'value'),
     State('mlp_activation', 'value'),
     State('mlp_learningRate', 'value'),
     State('mlp_maxIter', 'value')])

def feature_selection(clk, selected_features, selected_model, k_neighbors, knn_distMetric,
                      rf_noEstimators, rf_maxDepth, rf_minSamplesSplit,
                     ada_nEstimators, ada_learningRate,
                     mlp_noNeurons, mlp_optimizer, mlp_activation, mlp_learningRate, mlp_maxIter):
    
    
    if selected_model=='k-NN':
        pipeline = Pipeline(steps=[('knn',KNeighborsClassifier(n_neighbors=k_neighbors, metric=knn_distMetric)) ])

       
    elif selected_model=='Random Forest':
        pipeline = Pipeline(steps=[('rf',RandomForestClassifier(max_depth=rf_maxDepth, n_estimators=rf_noEstimators,
                                                                min_samples_split=rf_minSamplesSplit,random_state=0, n_jobs=-1)) ])     
        
    elif selected_model=='AdaBoost':
        pipeline = Pipeline(steps=[('ada',AdaBoostClassifier(n_estimators=ada_nEstimators, learning_rate=ada_learningRate, random_state=0)) ])

        
    else: #mlp
        pipeline = Pipeline(steps=[('mlp',MLPClassifier(hidden_layer_sizes=(mlp_noNeurons,2), solver=mlp_optimizer,
                                                       activation=mlp_activation, learning_rate_init=mlp_learningRate,
                                                       max_iter=mlp_maxIter)) ])


    scores_df = create_scores_df(selected_features, pipeline, selected_model)
    model = pipeline.fit(X_train[selected_features], y_train)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, model.predict_proba(X_test[selected_features])[:,1])
    score = metrics.auc(fpr, tpr)
    
    y_pred = model.predict(X_test[selected_features])
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
    


        
    #----------ROC Plot-----------

    fig = px.area(
        x=fpr, y=tpr,
        labels=dict(
            x='False Positive Rate', 
            y='True Positive Rate'))
    
    fig.update_layout(
    title={
        'text':f'ROC Curve for {selected_model} (AUC={score:.4f})',
        'xanchor': 'center',
        'y':0.92,
        'x':0.5,
    })
    
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1)
    
    
    #---------Accuracy Plots---------
    #scores_df = create_scores_df(selected_features, selected_model)
    accuracy_fig = px.box(scores_df, x='classifier', y='accuracy')
    accuracy_fig.update_layout(
        title={
        'text':'Train and test accuracy for 10-fold CV',
        'xanchor': 'center',
        
        'y':0.95,
        'x':0.5,
    }
    )
    
    #----------- Confusion Matrix ------------
    confusionMatrix_fig = ff.create_annotated_heatmap(confusion_matrix, x = ['0','1'], y=['0','1'])
    
    confusionMatrix_fig['layout']['xaxis']['side'] = 'bottom'
    confusionMatrix_fig.update_layout(
    title={
        'text':f'Confusion Matrix for {selected_model}',
        'xanchor': 'center',
        
        'y':0.90,
        'x':0.5,
    },
    xaxis_title="Predicted",
    yaxis_title="Actual"
    )

    return fig, accuracy_fig, confusionMatrix_fig


In [None]:
app.run_server()

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
