## Heart Disease Classification

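This notebook covers only the inference stage of the heart disease classification solution: it loads a sample of data, applies the same pre-processing used during training, loads the saved model, and scores its predictions.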

### Import Modules

In [20]:
import joblib
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

### Get Inference Data

In [79]:
# In a real-time use case, replace this function with one that reads from the live data stream.
def get_inference_data(path="Data/heart.csv", n_samples=20, random_state=2):
    '''
    Load the heart-disease CSV and return a small shuffled sample for inference.

    Parameters
    ----------
    path : str
        Location of the CSV file. It must contain a 'target' column.
    n_samples : int
        Number of rows to keep, taken from the end of the shuffled frame.
    random_state : int
        Seed for the shuffle, so the same rows are selected on every run.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The feature columns (everything except 'target') and the matching
        'target' values, in the same row order and with the same index.

    Raises
    ------
    KeyError
        If the file has no 'target' column.
    '''
    # Exact duplicate rows are removed before sampling.
    data = pd.read_csv(path).drop_duplicates()
    # frac=1 shuffles the whole frame; the fixed seed keeps the selection reproducible.
    inference_df = data.sample(frac=1, random_state=random_state).tail(n_samples)
    return inference_df.drop(columns='target'), inference_df['target']

# Unpack the sampled feature frame and its matching 'target' labels.
inference_data, labels = get_inference_data()

In [84]:
# List the feature columns; 'target' was split off into `labels`.
inference_data.columns

Index(['age', 'sex', 'chest_pain_type', 'resting_bp', 'cholestoral',
       'fasting_blood_sugar', 'restecg', 'max_hr', 'exang', 'oldpeak', 'slope',
       'num_major_vessels', 'thal'],
      dtype='object')

In [85]:
# Preview the first sampled rows; the row labels are the original (shuffled) index values.
inference_data.head()

Unnamed: 0,age,sex,chest_pain_type,resting_bp,cholestoral,fasting_blood_sugar,restecg,max_hr,exang,oldpeak,slope,num_major_vessels,thal
51,66,1,0,120,302,0,0,151,0,0.4,1,0,2
261,52,1,0,112,230,0,1,160,0,0.0,2,1,2
102,63,0,1,140,195,0,1,179,0,0.0,2,2,2
196,46,1,2,150,231,0,1,147,0,3.6,1,0,2
168,63,1,0,130,254,0,0,147,0,1.4,1,1,3


### Apply Same Pre-processing

In [80]:
# apply same pre-processing and feature engineering techniques as applied during the training process
def encode_features(df, features, expected_columns=None):
    '''
    One-hot encode the selected categorical fields and align the result to a
    fixed column layout, to prevent dimension mismatch during inference.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame. It is not modified.
    features : list of str
        Names of the categorical columns to one-hot encode. Each encoded
        column is named '<feature>_<value>'.
    expected_columns : list of str, optional
        Exact columns (and order) of the returned frame. Defaults to the
        23-column layout this notebook uses.

    Returns
    -------
    pandas.DataFrame
        A frame indexed like ``df`` with exactly ``expected_columns``.
        Columns absent from the input (for example a category value that does
        not occur in this batch) are filled with 0, as are any nulls.
        If any requested feature is missing from ``df``, 'Feature not found'
        is printed and ``df`` itself is returned unchanged.
    '''
    if expected_columns is None:
        expected_columns = [
            'age', 'sex', 'resting_bp', 'cholestoral', 'fasting_blood_sugar',
            'max_hr', 'exang', 'oldpeak', 'num_major_vessels',
            'thal_0', 'thal_1', 'thal_2', 'thal_3',
            'slope_0', 'slope_1', 'slope_2',
            'chest_pain_type_0', 'chest_pain_type_1', 'chest_pain_type_2',
            'chest_pain_type_3',
            'restecg_0', 'restecg_1', 'restecg_2',
        ]

    # Validate up front so no encoding work is done for a request that cannot succeed.
    if any(f not in df.columns for f in features):
        print('Feature not found')
        return df

    # Encode each categorical feature once, then concatenate in a single call
    # instead of growing a frame inside the loop.
    dummies = [pd.get_dummies(df[f]).add_prefix(f + '_') for f in features]
    combined = pd.concat([df] + dummies, axis=1)
    # If a name occurs twice, keep the last occurrence so an encoded column
    # takes precedence over a raw one; reindex requires unique labels.
    combined = combined.loc[:, ~combined.columns.duplicated(keep='last')]

    # reindex drops the raw categorical columns, fixes the column order and
    # adds all-zero columns for categories missing from this batch.
    return combined.reindex(columns=expected_columns, fill_value=0).fillna(0)

def normalize_data(df, scaler=None):
    '''
    Min-max scale every column of ``df`` and return the result as a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler, ideally the one fitted on the training
        data. When given, only ``transform`` is called. When omitted, a fresh
        MinMaxScaler is fitted on ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        The scaled values with a default positional index and positional
        column labels; the labels of ``df`` are not carried over.
    '''
    values = df.values
    if scaler is None:
        # NOTE(review): fitting on the inference batch makes the scaling depend
        # on whichever rows happen to be in it, so the same record can be
        # scaled differently from one batch to the next. Pass the scaler fitted
        # during training to avoid this.
        norm_values = preprocessing.MinMaxScaler().fit_transform(values)
    else:
        norm_values = scaler.transform(values)

    return pd.DataFrame(norm_values)

def apply_pre_processing(data, scaler=None):
    '''
    Run the full inference pre-processing: one-hot encode the categorical
    features, then min-max scale the encoded frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw inference features.
    scaler : object with a ``transform`` method, optional
        Fitted scaler forwarded to ``normalize_data``. When omitted, the
        scaler is fitted on ``data`` itself.

    Returns
    -------
    pandas.DataFrame
        The encoded and scaled feature frame.
    '''
    features_to_encode = ['thal', 'slope', 'chest_pain_type', 'restecg']
    encoded = encode_features(data, features_to_encode)
    # Please note this is fabricated inference data, so just taking a small sample size.
    return normalize_data(encoded, scaler=scaler)

# Run the sampled rows through one-hot encoding and min-max scaling.
processed_inference_data = apply_pre_processing(inference_data)
# Bare expression: display the processed frame (its column labels are positional, not feature names).
processed_inference_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,1.0,1.0,0.464286,0.814607,0.0,0.457447,0.0,0.111111,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.621622,1.0,0.321429,0.410112,0.0,0.553191,0.0,0.0,0.5,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.918919,0.0,0.821429,0.213483,0.0,0.755319,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.459459,1.0,1.0,0.41573,0.0,0.414894,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.918919,1.0,0.642857,0.544944,0.0,0.414894,0.0,0.388889,0.5,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0.324324,1.0,0.642857,0.320225,0.0,0.638298,0.0,0.555556,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
6,0.27027,0.0,0.0,0.235955,0.0,0.755319,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7,0.513514,1.0,0.642857,0.55618,1.0,0.446809,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
8,0.756757,1.0,0.285714,1.0,0.0,0.37234,1.0,0.833333,0.5,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
9,0.486486,1.0,0.785714,0.561798,0.0,0.510638,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


### Load Saved Model

In [81]:
# joblib.load unpickles the file, which can execute arbitrary code:
# only load model files that come from a trusted source.
model = joblib.load('aditya_model1_adaboost.joblib')
# Bare expression: display the estimator's repr to confirm what was loaded.
model

AdaBoostClassifier(algorithm='SAMME', learning_rate=0.1, n_estimators=40,
                   random_state=7)

### Prediction on inference data

In [82]:
# Predict a class label for each pre-processed inference row.
model.predict(processed_inference_data)

array([1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1.,
       1., 1., 0.])

### Scoring check on prediction

In [83]:
# Compute the predictions once, then score them against the true labels.
# `labels` already holds exactly the rows that were pre-processed, so it is
# used as-is instead of being re-sliced with a hard-coded length.
predictions = model.predict(processed_inference_data)
accuracy_score(labels, predictions)

0.85