# Heart Disease Risk Analysis Data - Predicting Heart Disease

## Importing Libraries
- Import the required libraries
- Load the pickled model
- Load the pickled DictVectorizer

In [1]:
import pickle
import numpy as np
import pandas as pd
import sklearn

# Record the installed scikit-learn version in the notebook output.
# NOTE(review): presumably logged because the pickled artifacts loaded below
# should be read with the same library version they were saved with — confirm.
print(sklearn.__version__)

1.2.2


In [3]:
# Paths of the pickled artifacts: the trained model and the DictVectorizer
# (both relative to the notebook's working directory).
model_filename = './bin/hd_xgboost_model.pkl.bin'
dv_filename = './bin/hd_dictvectorizer.pkl.bin'

# Deserialize both artifacts; each `with` block closes its file after reading.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# these files must come from a trusted source — confirm their provenance.
with open(model_filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

with open(dv_filename, 'rb') as dv_file:
    loaded_dv = pickle.load(dv_file)


In [6]:
def probability_label(probability):
    """Map a predicted probability to a risk label.

    Bands (upper bounds are exclusive):
        probability < 0.3    -> 'none'
        probability < 0.50   -> 'low'
        probability < 0.75   -> 'medium'
        probability >= 0.75  -> 'high'

    A value that satisfies none of the comparisons (e.g. NaN) yields
    'unknown'.

    Args:
        probability: numeric score to classify.

    Returns:
        str: one of 'none', 'low', 'medium', 'high' or 'unknown'.
    """
    # Exclusive upper bound of each band, checked in ascending order.
    bands = ((0.3, 'none'), (0.50, 'low'), (0.75, 'medium'))

    for upper_bound, band_name in bands:
        if probability < upper_bound:
            return band_name

    # Explicit comparison (not a bare fall-through) so NaN stays 'unknown'.
    if probability >= 0.75:
        return 'high'

    return 'unknown'

def predict(data):
    """Score the given records with the loaded vectorizer and model.

    The input is passed through ``loaded_dv.transform`` and the result is
    handed to ``loaded_model.predict_proba``.

    Args:
        data: input accepted by ``loaded_dv.transform``; the notebook passes
            a list of per-row dicts.

    Returns:
        Column index 1 of the ``predict_proba`` output, one value per input
        row (presumably the probability of the positive / heart-disease
        class — confirm against the training notebook).
    """
    # Vectorize the raw records into the model's feature matrix
    feature_matrix = loaded_dv.transform(data)

    # Per-class probabilities; keep only the second column
    class_probabilities = loaded_model.predict_proba(feature_matrix)
    return class_probabilities[:, 1]


## Run risk test cases

- Load the models (xgboost and dictvectorizer)
  - Load the ./bin/hd_xgboost_model.pkl.bin
  - Load the ./bin/hd_dictvectorizer.pkl.bin
- Load data/test_cases.csv 
  - Call `predict()` to score each test case
  - Map the score to a risk label


In [7]:
# Read the test cases into a DataFrame. The file is comma-separated with
# double-quote quoting, which are pandas' defaults for read_csv.
test_cases_path = './data/test_cases.csv'
df = pd.read_csv(test_cases_path)

# Preview the first rows
df.head()

Unnamed: 0,bmi,smoking,alcoholdrinking,stroke,physicalhealth,mentalhealth,diffwalking,sex,agecategory,race,diabetic,physicalactivity,genhealth,sleeptime,asthma,kidneydisease,skincancer
0,40,0,0,0,0,0,1,Male,65-69,White,No,1,Good,10,0,0,0
1,34,1,0,0,30,0,1,Male,60-64,White,Yes,0,Poor,15,1,0,0
2,28,1,0,0,0,0,0,Female,55-59,White,No,1,Very good,5,0,0,0


In [8]:
# Turn every row into a feature dict and score all test cases in one call
test_case_records = df.to_dict(orient='records')
df['risk_score'] = predict(test_case_records)

# Translate each score into its risk band
df['risk_label'] = df['risk_score'].apply(probability_label)

# Show only the score and label columns
df[['risk_score', 'risk_label']].head()


Unnamed: 0,risk_score,risk_label
0,0.106512,none
1,0.364203,low
2,0.050394,none


In [1]:
# Export this notebook (data_predict.ipynb) as a plain Python script using
# nbconvert; the output is written to data_predict.py.
!jupyter nbconvert --to script data_predict.ipynb


  from pkg_resources import load_entry_point
[NbConvertApp] Converting notebook data_predict.ipynb to script
[NbConvertApp] Writing 2013 bytes to data_predict.py
