# The purpose of this short notebook is to familiarize myself with importing models. I will:
1. Initialize new data (as could be collected from a user if I designed an interface)
2. Transform that data into the same normalized, regularized, one-hot-encoded format as the training data
3. Import the models
4. Make predictions on the new data

In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import joblib
import tensorflow as tf
import os

In [18]:
df = pd.read_csv("heart_failure_data.csv")
df = df.drop(["HeartDisease"], axis=1)

### Initialize new data

In [19]:
new_data = pd.Series({
    "Age": 45,
    "Sex": "M",
    "ChestPainType": "TA",
    "RestingBP": 110,
    "Cholesterol": 264,
    "FastingBS": 0,
    "RestingECG": "Normal",
    "MaxHR": 132,
    "ExerciseAngina": "N",
    "Oldpeak": 1.2,
    "ST_Slope": "Flat"
})

new_data = pd.Series({
    "Age": 45,
    "RestingBP": 110,
    "Cholesterol": 264,
    "MaxHR": 132,
    "Oldpeak": 1.2,
    "Sex": "M",
    "ChestPainType": "TA",
    "FastingBS": 0,
    "ExerciseAngina": "N",
    "ST_Slope": "Flat"
})

### Add new data to dataframe

In [20]:
# add new data to dataframe
display(df.tail())
# df = pd.concat([new_data, df], ignore_index=True)
df.loc[len(df)] = new_data
display(df.tail())

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat
917,38,M,NAP,138,175,0,Normal,173,N,0.0,Up


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat
917,38,M,NAP,138,175,0,Normal,173,N,0.0,Up
918,45,M,TA,110,264,0,,132,N,1.2,Flat


### Augment features the same way I did on training data

In [21]:
numerical_features = df.select_dtypes(include=[np.number])
numerical_features = numerical_features.drop(["FastingBS"], axis=1)
continuous_feature_names = numerical_features.columns.tolist()

categorical_features = df.select_dtypes(include=[object])
categorical_feature_names = categorical_features.columns.to_list() + ["FastingBS"]

In [22]:
df2 = df.copy(deep=True)  # make a copy of the original data which we will modify

# Initialize the scalers
min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()  # not clear this was required for 'Age', 'RestingBP', or, 'MaxHR' because those were already looking pretty close to Gaussian. Further normalization here is unlikely to hurt, however. A further investigation into normality with QQ-plots and the shapiro wilk test could be a future direction and dictate whether those features get StandardScaler applied to them

# Apply both scalers to each continuous variable
for feature in continuous_feature_names:
    # Perform MinMax scaling
    min_max_scaled_data = min_max_scaler.fit_transform(df2[[feature]])

    # Perform Standard scaling on the MinMax scaled data
    min_max_standard_scaled_data = standard_scaler.fit_transform(min_max_scaled_data)

    # Update the original DataFrame with the scaled data
    df2[feature] = min_max_standard_scaled_data.flatten()

display(df2.tail())

# one hot encoding of categorical variables
df2 = pd.get_dummies(df2, columns=categorical_feature_names, dtype=int)
display(df2.tail())

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
914,1.53804,M,ASY,0.628236,-0.053717,1,Normal,0.164976,N,2.357948,Flat
915,0.37112,M,ASY,-0.128163,-0.621035,0,Normal,-0.857313,Y,0.293109,Flat
916,0.37112,F,ATA,-0.128163,0.339745,0,LVH,1.462498,N,-0.833166,Flat
917,-1.64447,M,NAP,0.304065,-0.218422,0,Normal,1.423179,N,-0.833166,Up
918,-0.901884,M,TA,-1.208735,0.595953,0,,-0.188893,N,0.293109,Flat


Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,...,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,FastingBS_0,FastingBS_1
914,1.53804,0.628236,-0.053717,0.164976,2.357948,0,1,1,0,0,...,0,1,0,1,0,0,1,0,0,1
915,0.37112,-0.128163,-0.621035,-0.857313,0.293109,0,1,1,0,0,...,0,1,0,0,1,0,1,0,1,0
916,0.37112,-0.128163,0.339745,1.462498,-0.833166,1,0,0,1,0,...,1,0,0,1,0,0,1,0,1,0
917,-1.64447,0.304065,-0.218422,1.423179,-0.833166,0,1,0,0,1,...,0,1,0,1,0,0,0,1,1,0
918,-0.901884,-1.208735,0.595953,-0.188893,0.293109,0,1,0,0,0,...,0,0,0,1,0,0,1,0,1,0


### Extract row I added

In [23]:
to_predict = df2.tail(1)  # get last row, keep as dataframe structure
display(to_predict)

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,...,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,FastingBS_0,FastingBS_1
918,-0.901884,-1.208735,0.595953,-0.188893,0.293109,0,1,0,0,0,...,0,0,0,1,0,0,1,0,1,0


### Import models

In [36]:
# Load the models
# gnb_model1 = joblib.load("saved models/gnb_model1.pkl")
# logistic_regressor1 = joblib.load("saved models/logistic_regressor1.pkl")
random_forest_classifier = joblib.load("saved models/random_forest_classifier.pkl")
# Repeat for each model

In [46]:
random_forest_classifier_pred = random_forest_classifier.predict(to_predict)
random_forest_classifier_pred_prob = random_forest_classifier.predict_proba(to_predict)

print(f"Random forest prediction {random_forest_classifier_pred}\n"
      f"With probability {random_forest_classifier_pred_prob}")

Random forest prediction [1]
With probability [[0.29 0.71]]


# trying with tensorflow model

In [26]:
print(os.listdir(os.path.join(os.getcwd(), "saved models")))

['.DS_Store', 'random_forest_classifier.pkl', 'gnb_model1.pkl', 'deep_learning_classifier', 'random_forest1.pkl', 'svm_model1.pkl', 'logistic_regressor1.pkl']


In [44]:
loaded_model = tf.keras.models.load_model(os.path.join(os.getcwd(), "saved models/deep_learning_classifier"))
tf_predictions = loaded_model.predict(to_predict)
print(f"Deep learning prediction: {np.round(tf_predictions).astype(int)[0]}\n"
      f"With probability: {tf_predictions}")

Tensorflow prediction: [1]
With probability: [[0.7069615]]


# Conclusion:
- Great! this went well, now I can implement a separate interface to prompt a user for their data and have my model predict their state!