### Testing a KNN Machine Learning Model's PKL File Load and Run

In [65]:
# test the pkl file

In [66]:
import pandas as pd
import numpy as np
from time import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, make_scorer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.inspection import DecisionBoundaryDisplay
import joblib


Note: ensure that the feature names, the number of features, and the order of the features are consistent with the data the model was trained on; only then can the saved model be used to make accurate predictions.

In [67]:
# Read the raw hypertension records from the CSV file in the local data folder.
health1 = pd.read_csv(filepath_or_buffer='data/hypertension_data.csv')

In [68]:
# Work on an independent copy so that the cleaning steps applied to `health2`
# never modify the raw frame `health1`. A plain `health2 = health1` would only
# bind a second name to the same object, and later in-place column assignments
# would silently change `health1` as well.
health2 = health1.copy()

In [69]:
# Store the age column as 64-bit integers.
health2['age'] = health2['age'].astype(np.int64)

In [70]:
# Keep only the rows whose 'sex' value is present (drops rows where it is missing).
health2 = health2[health2['sex'].notna()]

In [71]:
# Renumber the rows 0..n-1 after the row filtering above; drop=True discards the
# old index labels instead of keeping them as a new column.
health2 = health2.reset_index(drop=True)

In [72]:
# Store the 'sex' column as 64-bit integers; all other columns keep their dtype.
health2 = health2.astype({'sex': 'int64'})

In [73]:
# Give the abbreviated source columns more descriptive names.
# Columns not listed here keep their original names.
health3 = health2.rename(
    columns=dict(
        cp='chest_pain',
        trestbps='rest_bp',
        fbs='fast_bs',
        restecg='rest_ecg',
        thalach='max_hr',
        exang='exercise_ang',
        oldpeak='st_depressed',
        slope='st_slope',
        ca='art_color',
        thal='dis_blood',
        target='hyper',
    )
)

In [74]:
# Display the renamed frame to confirm the new column names.
health3

Unnamed: 0,age,sex,chest_pain,rest_bp,chol,fast_bs,rest_ecg,max_hr,exercise_ang,st_depressed,st_slope,art_color,dis_blood,hyper
0,57,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,64,0,2,130,250,0,1,187,0,3.5,0,0,2,1
2,52,1,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,0,1,120,236,0,1,178,0,0.8,2,0,2,1
4,66,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26053,72,0,0,138,294,1,1,106,0,1.9,1,3,2,0
26054,60,1,0,144,200,0,0,126,1,0.9,1,0,3,0
26055,68,1,0,100,234,0,1,156,0,0.1,2,1,3,0
26056,67,1,1,154,232,0,0,164,0,0.0,2,1,2,0


In [75]:
# Split the frame into the label column (y) and the remaining feature columns (X).
y = health3.loc[:, 'hyper']
X = health3.drop(columns=['hyper'])

In [76]:
# Display the feature frame (every column except 'hyper').
X

Unnamed: 0,age,sex,chest_pain,rest_bp,chol,fast_bs,rest_ecg,max_hr,exercise_ang,st_depressed,st_slope,art_color,dis_blood
0,57,1,3,145,233,1,0,150,0,2.3,0,0,1
1,64,0,2,130,250,0,1,187,0,3.5,0,0,2
2,52,1,1,130,204,0,0,172,0,1.4,2,0,2
3,56,0,1,120,236,0,1,178,0,0.8,2,0,2
4,66,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26053,72,0,0,138,294,1,1,106,0,1.9,1,3,2
26054,60,1,0,144,200,0,0,126,1,0.9,1,0,3
26055,68,1,0,100,234,0,1,156,0,0.1,2,1,3
26056,67,1,1,154,232,0,0,164,0,0.0,2,1,2


In [77]:
# Show the Python type of the extracted label column `y`.
print(type(y))

<class 'pandas.core.series.Series'>


In [78]:
# File name of the saved KNN model to load.
pickle_file = 'knn_model.pkl'

# For demonstration the full feature frame X is used as the model input;
# replace it with a new data set (same feature columns, same order) later.
dataset_toinput = X

In [79]:
import joblib  # already imported in the import cell; repeated here so this cell runs on its own

# Load the model from the .pkl file using joblib.
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
loaded_model = joblib.load(pickle_file)

# When the estimator recorded the feature names it was fitted with, and the input
# carries column names, verify that both agree in name, count and order before
# predicting. Inputs or models without names skip this check.
expected_features = getattr(loaded_model, 'feature_names_in_', None)
input_columns = getattr(dataset_toinput, 'columns', None)
if expected_features is not None and input_columns is not None:
    expected_features = list(expected_features)
    input_columns = list(input_columns)
    if input_columns != expected_features:
        raise ValueError(
            'Input features do not match the features the model was trained on.\n'
            f'Expected: {expected_features}\n'
            f'Received: {input_columns}'
        )

# NOTE(review): this notebook imports MinMaxScaler/StandardScaler. If the model was
# trained on scaled features and the pickle does not bundle the scaler (e.g. as a
# Pipeline), the same fitted transform must be applied to the input here.
# TODO confirm against the training notebook.

# Use the machine learning model to make predictions
hypertension_predictions = loaded_model.predict(dataset_toinput)

In [80]:
# Create a pandas DataFrame with one feature named 'prediction'.
# Reuse the input's row index when it has one, so that label-aligned operations
# such as pd.concat(..., axis=1) with the true labels pair every prediction with
# its own row even if the input does not carry a default 0..n-1 index.
# Inputs without an index (e.g. NumPy arrays) fall back to the default index.
hypertension_df = pd.DataFrame(
    hypertension_predictions,
    index=getattr(dataset_toinput, 'index', None),
    columns=['prediction'],
)

hypertension_df # display results

Unnamed: 0,prediction
0,1
1,1
2,1
3,1
4,1
...,...
26053,1
26054,1
26055,1
26056,1


In [81]:
# Convert the label Series into a single-column DataFrame.
y_df = y.to_frame()

In [82]:
# Show the Python type of `y_df` after the conversion.
print(type(y_df))

<class 'pandas.core.frame.DataFrame'>


In [83]:
# Display the true labels as a DataFrame.
y_df

Unnamed: 0,hyper
0,1
1,1
2,1
3,1
4,1
...,...
26053,0
26054,0
26055,0
26056,0


In [84]:
# Place the predictions and the true labels side by side, matched on row index.
combined_df = pd.concat(
    [hypertension_df, y_df],
    axis='columns',
)

In [85]:
# Display the first 100 rows of predictions next to the true labels.
combined_df.head(100)

Unnamed: 0,prediction,hyper
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
...,...,...
95,1,1
96,1,1
97,1,1
98,1,1


In [86]:
# Count the rows where the predicted label differs from the true label.
predicted_labels = combined_df['prediction']
true_labels = combined_df['hyper']
mismatch_count = predicted_labels.ne(true_labels).sum()

print(f'Total mismatches: {mismatch_count}')

Total mismatches: 11832


Note: I used the entire original data set as the test data set when running the PKL file.