In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
import pickle
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
#Best parameters extraction
# Best model prediction
# How to get accuracy & confusion matrix

In [2]:
dt = DecisionTreeClassifier(random_state=42)

param_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [None, 3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [3]:
grid = GridSearchCV(estimator=dt,
                    param_grid=param_grid,
                    cv=5,
                    scoring='accuracy',
                    n_jobs=-1)

In [4]:
myDataset = pd.read_csv("Health_Risk_Prediction.csv")
df = myDataset.copy() 
onehot_cols = ['profession','smoking','alcohol','married','health_risk']
df_onehot = pd.get_dummies(df[onehot_cols], drop_first=True)
df_onehot.replace({True: 1, False: 0}, inplace=True)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

label_cols = ['exercise', 'sugar_intake']

df_label = df[label_cols].apply(le.fit_transform)
df_numeric = df.select_dtypes(include=['int64', 'float64'])
myNewDataset = pd.concat([df_numeric, df_label,df_onehot], axis=1)
indep_X=myNewDataset.drop('health_risk_low', axis=1)
dep_Y=myNewDataset['health_risk_low']

  df_onehot.replace({True: 1, False: 0}, inplace=True)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size = 0.30, random_state = 0)
grid.fit(X_train, y_train)     # <--- REQUIRED

print(grid.best_params_)       # <--- Works now
print(grid.best_score_)        # optional
best_model = grid.best_estimator_

{'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.9917142857142858


In [9]:
print("The best Params is:",grid.best_params_)       # <--- Works now
print("The Best score is:",grid.best_score_) 
best_model

The best Params is: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
The Best score is: 0.9917142857142858


In [15]:
#HOW TO MAKE PREDICTION

In [6]:
y_pred = best_model.predict(X_test)


In [7]:
y_pred

array([0, 0, 0, ..., 1, 1, 1])

In [8]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9933333333333333
Confusion Matrix:
 [[1043    5]
 [   5  447]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1048
           1       0.99      0.99      0.99       452

    accuracy                           0.99      1500
   macro avg       0.99      0.99      0.99      1500
weighted avg       0.99      0.99      0.99      1500



In [11]:
#model = DecisionTreeClassifier().fit(X_train, y_train)

# Save
import joblib
joblib.dump(best_model, "decision_tree.pkl")

# Load
loaded_model = joblib.load("decision_tree.pkl")

# Predict
prediction = loaded_model.predict(X_test)

In [9]:
myNewDataset.columns

Index(['age', 'weight', 'height', 'sleep', 'bmi', 'exercise', 'sugar_intake',
       'profession_doctor', 'profession_driver', 'profession_engineer',
       'profession_farmer', 'profession_office_worker', 'profession_student',
       'profession_teacher', 'smoking_yes', 'alcohol_yes', 'married_yes',
       'health_risk_low'],
      dtype='object')

In [12]:
prediction

array([0, 0, 0, ..., 1, 1, 1])

In [13]:
myNewDataset.shape

(5000, 18)

In [17]:
## Provide Input and Predict
import numpy as np

# Load your saved model
loaded_model = joblib.load("decision_tree.pkl")

# Enter the input values - Features 
a = float(input("Enter value for age: "))
b = float(input("Enter value for weight: "))
c = float(input("Enter value for height: "))
d = float(input("Enter value for sleep: "))
e = float(input("Enter value for bmi: "))
f = float(input("Enter value for excersie: "))
g = float(input("Enter value for sugar_intake: "))
h = float(input("Enter value for profession_doctor: "))
i = float(input("Enter value for profession_driver: "))
j = float(input("Enter value for profession_engineer: "))
k = float(input("Enter value for profession_farmer: "))
l = float(input("Enter value for profession_office_worker: "))
m = float(input("Enter value for profession_student: "))
n = float(input("Enter value for profession_teacher: "))
o = float(input("Enter value for smoking_yes: "))
p = float(input("Enter value for alcohol_yes: "))
q = float(input("Enter value for married_yes: "))

# Combine input into array
user_data = np.array([[a, b, c,d,e,f,g,h,i,j,k,l,m,n,o,p,q]])

# Predict
prediction = loaded_model.predict(user_data)

print("The Health Risk is (0=low, 1=high):", prediction[0])

Enter value for age:  35
Enter value for weight:  106
Enter value for height:  146
Enter value for sleep:  8
Enter value for bmi:  50.2
Enter value for excersie:  2
Enter value for sugar_intake:  2
Enter value for profession_doctor:  1
Enter value for profession_driver:  0
Enter value for profession_engineer:  0
Enter value for profession_farmer:  0
Enter value for profession_office_worker:  0
Enter value for profession_student:  0
Enter value for profession_teacher:  0
Enter value for smoking_yes:  0
Enter value for alcohol_yes:  0
Enter value for married_yes:  1


The Health Risk is (0=low, 1=high): 1




In [31]:
X_test

Unnamed: 0,age,weight,height,sleep,bmi,exercise,sugar_intake,profession_doctor,profession_driver,profession_engineer,profession_farmer,profession_office_worker,profession_student,profession_teacher,smoking_yes,alcohol_yes,married_yes
398,65,78,146,4.4,36.6,1,2,0,0,0,1,0,0,0,1,1,1
3833,68,86,156,5.5,35.3,2,2,0,0,0,0,0,0,0,0,0,1
4836,67,66,194,7.2,17.5,1,2,0,0,0,0,0,0,0,0,0,0
4572,57,61,185,6.8,17.8,1,2,0,0,0,0,0,0,1,1,0,0
636,31,73,192,4.6,19.8,1,2,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4554,35,107,146,8.0,50.2,2,2,1,0,0,0,0,0,0,0,0,1
4807,58,75,167,7.1,26.9,0,1,0,0,1,0,0,0,0,0,0,1
1073,40,57,191,5.9,15.6,2,2,0,0,0,0,0,0,0,0,0,0
2906,25,76,179,9.1,23.7,0,2,0,0,1,0,0,0,0,1,0,0


In [13]:
y_test

398     0
3833    0
4836    0
4572    0
636     1
       ..
4554    1
4807    0
1073    1
2906    1
1357    1
Name: health_risk_low, Length: 1500, dtype: int64

In [20]:
import pickle  #library to save the model
file_name = "DT_Class_FinalModel.sav"
pickle.dump(grid,open(file_name,'wb'))

In [14]:
import pickle

load_model = pickle.load(open("DT_Class_FinalModel.sav",'rb'))  #Read Binary from the file  

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [15]:
#Predict the output. Enter the input.
R_Output =  load_model.predict([[65,78,146,4.4,36.6,1,2,0,0,0,1,0,0,0,1,1,1]])
R_Output



array([0])

In [27]:
health_risk_value = df.loc[(df['height'] == 146) & (df['age'] == 35), 'health_risk'].values[0]
print(health_risk_value)

high


In [31]:
health_risk_value = df.loc[(df['height'] == 146) & (df['age'] == 65), 'health_risk'].values[0]
print(health_risk_value)

high


In [32]:
# Load your saved model

# Enter the input values - Features 
a = float(input("Enter value for age: "))
b = float(input("Enter value for weight: "))
c = float(input("Enter value for height: "))
d = float(input("Enter value for sleep: "))
e = float(input("Enter value for bmi: "))
f = float(input("Enter value for excersie: "))
g = float(input("Enter value for sugar_intake: "))
h = float(input("Enter value for profession_doctor: "))
i = float(input("Enter value for profession_driver: "))
j = float(input("Enter value for profession_engineer: "))
k = float(input("Enter value for profession_farmer: "))
l = float(input("Enter value for profession_office_worker: "))
m = float(input("Enter value for profession_student: "))
n = float(input("Enter value for profession_teacher: "))
o = float(input("Enter value for smoking_yes: "))
p = float(input("Enter value for alcohol_yes: "))
q = float(input("Enter value for married_yes: "))

# Combine input into array
user_data = np.array([[a, b, c,d,e,f,g,h,i,j,k,l,m,n,o,p,q]])

# Predict
prediction = loaded_model.predict(user_data)

print("The Health Risk is (0=low, 1=high):", prediction[0])

Enter value for age:  65
Enter value for weight:  78
Enter value for height:  146
Enter value for sleep:  4.4
Enter value for bmi:  36.6
Enter value for excersie:  1
Enter value for sugar_intake:  2
Enter value for profession_doctor:  0
Enter value for profession_driver:  0
Enter value for profession_engineer:  0
Enter value for profession_farmer:  1
Enter value for profession_office_worker:  0
Enter value for profession_student:  0
Enter value for profession_teacher:  0
Enter value for smoking_yes:  1
Enter value for alcohol_yes:  1
Enter value for married_yes:  1


The Health Risk is (0=low, 1=high): 0


