In [11]:
#!pip install tpot

Collecting tpot
  Downloading TPOT-0.12.2-py3-none-any.whl.metadata (2.0 kB)
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.4.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting xgboost>=1.1.0 (from tpot)
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading TPOT-0.12.2-py3-none-any.whl (87 kB)
   ---------------------------------------- 0.0/87.4 kB ? eta -:--:--
   ---------------------------- ----------- 61.4/87.4 kB 1.7 MB/s eta 0:00:01
   ---------------------------------------- 87.4/87.4 kB 1.2 MB/s eta 0:00:00
Downloading deap-1.4.2-cp312-cp312-win_amd64.whl (109 kB)
   ---------------------------------------- 0.0/109.9 kB ? eta -:--:--
   ------------

In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from tpot import TPOTClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import joblib
%matplotlib inline



## Loading and Preprocessing Stroke Data

In [3]:
# Loading data
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [6]:
df.shape

(5110, 12)

In [8]:
df_cleaned = df.dropna(subset=['bmi', 'smoking_status'])

In [10]:
df_cleaned.shape

(4909, 12)

In [12]:
df_cleaned.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [14]:
df_cleaned['smoking_status'].value_counts()

smoking_status
never smoked       1852
Unknown            1483
formerly smoked     837
smokes              737
Name: count, dtype: int64

In [16]:
df_cleaned = pd.get_dummies(df_cleaned)

In [18]:
df_cleaned.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,228.69,36.6,1,False,True,False,...,False,True,False,False,False,True,False,True,False,False
2,31112,80.0,0,1,105.92,32.5,1,False,True,False,...,False,True,False,False,True,False,False,False,True,False
3,60182,49.0,0,0,171.23,34.4,1,True,False,False,...,False,True,False,False,False,True,False,False,False,True
4,1665,79.0,1,0,174.12,24.0,1,True,False,False,...,False,False,True,False,True,False,False,False,True,False
5,56669,81.0,0,0,186.21,29.0,1,False,True,False,...,False,True,False,False,False,True,False,True,False,False


In [20]:
df_cleaned.columns

Index(['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level',
       'bmi', 'stroke', 'gender_Female', 'gender_Male', 'gender_Other',
       'ever_married_No', 'ever_married_Yes', 'work_type_Govt_job',
       'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children', 'Residence_type_Rural',
       'Residence_type_Urban', 'smoking_status_Unknown',
       'smoking_status_formerly smoked', 'smoking_status_never smoked',
       'smoking_status_smokes'],
      dtype='object')

In [22]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4909 entries, 0 to 5109
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              4909 non-null   int64  
 1   age                             4909 non-null   float64
 2   hypertension                    4909 non-null   int64  
 3   heart_disease                   4909 non-null   int64  
 4   avg_glucose_level               4909 non-null   float64
 5   bmi                             4909 non-null   float64
 6   stroke                          4909 non-null   int64  
 7   gender_Female                   4909 non-null   bool   
 8   gender_Male                     4909 non-null   bool   
 9   gender_Other                    4909 non-null   bool   
 10  ever_married_No                 4909 non-null   bool   
 11  ever_married_Yes                4909 non-null   bool   
 12  work_type_Govt_job              4909 no

In [24]:
# Define features set
X = df_cleaned.copy()
X.drop(columns=["stroke","id"], axis=1, inplace=True)
X.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,False,True,False,False,True,...,False,True,False,False,False,True,False,True,False,False
2,80.0,0,1,105.92,32.5,False,True,False,False,True,...,False,True,False,False,True,False,False,False,True,False
3,49.0,0,0,171.23,34.4,True,False,False,False,True,...,False,True,False,False,False,True,False,False,False,True
4,79.0,1,0,174.12,24.0,True,False,False,False,True,...,False,False,True,False,True,False,False,False,True,False
5,81.0,0,0,186.21,29.0,False,True,False,False,True,...,False,True,False,False,False,True,False,True,False,False


In [26]:
# Define target vector
y = df_cleaned["stroke"]
y[:5]

0    1
2    1
3    1
4    1
5    1
Name: stroke, dtype: int64

In [28]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [29]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,test_size=0.2, random_state=42)

In [30]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [34]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [36]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## FITTING THE RANDOM FOREST MODEL

In [206]:
# Create a random forest classifier
# model_rf_new = RandomForestClassifier(bootstrap=False, criterion='entropy', max_features=0.5, min_samples_leaf=2, min_samples_split=7, random_state=42)
# Fitting the model
# model_rf_new = model_rf_new.fit(X_train_scaled, y_train)

In [107]:
# Create a random forest classifier
#model_rf_new = RandomForestClassifier(n_estimators=500, random_state=78)
model_rf_new = RandomForestClassifier(
    n_estimators=500,  # number of trees
    max_depth=10,  # limiting tree depth to avoid overfitting
    min_samples_split=10,  # allowing splits even for smaller nodes
    min_samples_leaf=4,  # minimum samples in leaf node
    max_features='sqrt',  # use square root of total features for each split
    random_state=42,  # for reproducibility
    bootstrap=True,  # using bootstrap sampling
    #class_weight='balanced'  # if data is imbalanced
    class_weight={0: 2, 1: 5} 
)
# Fitting the model
model_rf_new = model_rf_new.fit(X_train_scaled, y_train)

In [109]:
# Making predictions using the testing data
predictions = model_rf_new.predict(X_test_scaled)

In [111]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [113]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,848,74
Actual 1,28,930


Accuracy Score : 0.9457446808510638
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.92      0.94       922
           1       0.93      0.97      0.95       958

    accuracy                           0.95      1880
   macro avg       0.95      0.95      0.95      1880
weighted avg       0.95      0.95      0.95      1880



In [216]:
# Save the model RF
joblib.dump(model_rf_new, 'model/model_rf_new.pkl')

['model/model_rf_new.pkl']

## SVM MODEL

In [219]:
from sklearn.svm import SVC
# Train the SVM model
model_SVM = SVC(kernel='rbf')  # You can change the kernel (linear, rbf, etc.)
model_SVM.fit(X_train_scaled, y_train)

In [221]:
# Make predictions
y_pred = model_SVM.predict(X_test_scaled)

In [223]:
# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96       922
           1       1.00      0.93      0.96       958

    accuracy                           0.96      1880
   macro avg       0.96      0.96      0.96      1880
weighted avg       0.96      0.96      0.96      1880



In [225]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[919   3]
 [ 67 891]]

Accuracy Score:
0.9627659574468085

Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       922
           1       1.00      0.93      0.96       958

    accuracy                           0.96      1880
   macro avg       0.96      0.96      0.96      1880
weighted avg       0.96      0.96      0.96      1880



In [227]:
# Save the model SVM
joblib.dump(model_SVM, 'model/model_SVM.pkl')

['model/model_SVM.pkl']

## LOGISTIC REGRESSION MODEL

In [230]:
# Create and train the Logistic Regression model with class_weight='balanced' to handle class imbalance
log_reg = LogisticRegression(class_weight='balanced', solver='liblinear', max_iter=200, random_state=42)

In [232]:
# Train the model
log_reg.fit(X_train, y_train)

# Predict on the test data
y_pred = log_reg.predict(X_test)

In [234]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[920   2]
 [ 74 884]]

Accuracy Score:
0.9595744680851064

Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       922
           1       1.00      0.92      0.96       958

    accuracy                           0.96      1880
   macro avg       0.96      0.96      0.96      1880
weighted avg       0.96      0.96      0.96      1880



In [236]:
# Save the model LR
joblib.dump(log_reg, 'model/stroke_logistic_reg_model.pkl')

['model/stroke_logistic_reg_model.pkl']

## VOTING CLASSIFIER: COMBINING LOGISTIC REGRESSION & RANDOM FOREST

In [148]:
from sklearn.ensemble import VotingClassifier

# Instantiate individual classifiers
#lr_classifier = LogisticRegression(solver="lbfgs", max_iter=300, random_state=42)
lr_classifier = LogisticRegression(class_weight='balanced', solver='liblinear', max_iter=200, random_state=42)

#rf_classifier = RandomForestClassifier(n_estimators=500, class_weight='balanced', random_state=42)
rf_classifier = RandomForestClassifier(
    n_estimators=500,  # number of trees
    max_depth=10,  # limiting tree depth to avoid overfitting
    min_samples_split=10,  # allowing splits even for smaller nodes
    min_samples_leaf=4,  # minimum samples in leaf node
    max_features='sqrt',  # use square root of total features for each split
    random_state=42,  # for reproducibility
    bootstrap=True,  # using bootstrap sampling
    #class_weight='balanced'  # if data is imbalanced
    class_weight={0: 2, 1: 12} 
)
# Combine them into a VotingClassifier
voting_clf = VotingClassifier(estimators=[('lr', lr_classifier), ('rf', rf_classifier)], voting='soft')

voting_clf.fit(X_train_scaled, y_train)
y_pred = voting_clf.predict(X_test_scaled)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       922
           1       0.96      0.96      0.96       958

    accuracy                           0.96      1880
   macro avg       0.96      0.96      0.96      1880
weighted avg       0.96      0.96      0.96      1880



In [150]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[887  35]
 [ 38 920]]

Accuracy Score:
0.9611702127659575

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       922
           1       0.96      0.96      0.96       958

    accuracy                           0.96      1880
   macro avg       0.96      0.96      0.96      1880
weighted avg       0.96      0.96      0.96      1880



## XGBoost

In [79]:
from xgboost import XGBClassifier

model_xgb = XGBClassifier(scale_pos_weight=5, random_state=78)  # You can adjust scale_pos_weight for Class 1
model_xgb.fit(X_train_scaled, y_train)
y_pred = model_xgb.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.97      0.97       922
           1       0.97      0.96      0.97       958

    accuracy                           0.97      1880
   macro avg       0.97      0.97      0.97      1880
weighted avg       0.97      0.97      0.97      1880



In [81]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[898  24]
 [ 35 923]]

Accuracy Score:
0.9686170212765958

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       922
           1       0.97      0.96      0.97       958

    accuracy                           0.97      1880
   macro avg       0.97      0.97      0.97      1880
weighted avg       0.97      0.97      0.97      1880



## XGBOOST WITH RF, LR, SVM

In [165]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(class_weight='balanced', n_estimators=500, random_state=78)
lr_model = LogisticRegression(class_weight={0: 1, 1: 5}, random_state=78)
svm_model = SVC(class_weight={0: 1, 1: 5}, probability=True, random_state=78)

voting_model = VotingClassifier(estimators=[
    ('random_forest', rf_model),
    ('logistic_regression', lr_model),
    ('svm', svm_model)
], voting='soft')  # Soft voting uses probabilities

voting_model.fit(X_train_scaled, y_train)
y_pred = voting_model.predict(X_test_scaled)


In [167]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[896  26]
 [ 43 915]]

Accuracy Score:
0.9632978723404255

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       922
           1       0.97      0.96      0.96       958

    accuracy                           0.96      1880
   macro avg       0.96      0.96      0.96      1880
weighted avg       0.96      0.96      0.96      1880



In [173]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Define the base models
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
lr_model = LogisticRegression(class_weight={0: 1, 1: 3}, random_state=78)  # Example with class weight adjustment
svm_model = SVC(class_weight='balanced', probability=True, random_state=78)

# Create a voting classifier using different algorithms
voting_model = VotingClassifier(estimators=[
    ('random_forest', rf_model),
    ('logistic_regression', lr_model),
    ('svm', svm_model)
], voting='soft')  # Soft voting uses probabilities

# Train the voting model
voting_model.fit(X_train_scaled, y_train)

# Get the predicted probabilities
probabilities = voting_model.predict_proba(X_test_scaled)[:, 1]

# Adjust threshold (e.g., setting it to 0.4 to reduce false negatives)
y_pred = (probabilities > 0.35).astype(int)

# Evaluate the new predictions with adjusted threshold
print("Classification Report with Adjusted Threshold (0.4):")
print(classification_report(y_test, y_pred))


Classification Report with Adjusted Threshold (0.4):
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       922
           1       0.97      0.96      0.97       958

    accuracy                           0.97      1880
   macro avg       0.97      0.97      0.97      1880
weighted avg       0.97      0.97      0.97      1880



In [175]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[897  25]
 [ 35 923]]

Accuracy Score:
0.9680851063829787

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       922
           1       0.97      0.96      0.97       958

    accuracy                           0.97      1880
   macro avg       0.97      0.97      0.97      1880
weighted avg       0.97      0.97      0.97      1880



In [179]:
# Save the model LR
joblib.dump(voting_model, 'model/VotingClassifier_LR_RF_SVM_model.pkl')

['model/VotingClassifier_LR_RF_SVM_model.pkl']

## TPOT FOR CLASSIFICATION

In [69]:
# Train the TPOT Classifier
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, random_state=42)
model_tpot = tpot.fit(X_train_scaled, y_train)

Version 0.12.2 of tpot is outdated. Version 1.0.0 was released 7 days ago.


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]



TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=3, p=1, weights=distance)


In [70]:
# Evaluate the Model
print(f"Accuracy: {tpot.score(X_test, y_test)}")



Accuracy: 0.5191489361702127


In [90]:
# Export the Best Model
tpot.export('best_model.py')

## PREDICTION VERIFICATION

In [238]:
import joblib

# Load the trained model
model = joblib.load("model/stroke_rf_model.pkl")

feature_names = X_train.columns.tolist()  # If 'X_train' is the DataFrame used for training
print("Feature names:", feature_names)

Feature names: ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'gender_Female', 'gender_Male', 'gender_Other', 'ever_married_No', 'ever_married_Yes', 'work_type_Govt_job', 'work_type_Never_worked', 'work_type_Private', 'work_type_Self-employed', 'work_type_children', 'Residence_type_Rural', 'Residence_type_Urban', 'smoking_status_Unknown', 'smoking_status_formerly smoked', 'smoking_status_never smoked', 'smoking_status_smokes']


In [240]:
import joblib

# Load the trained model
model_lr = joblib.load("model/stroke_logistic_reg_model.pkl")

# Print the feature names expected by the model
print(model_lr.feature_names_in_)

['age' 'hypertension' 'heart_disease' 'avg_glucose_level' 'bmi'
 'gender_Female' 'gender_Male' 'gender_Other' 'ever_married_No'
 'ever_married_Yes' 'work_type_Govt_job' 'work_type_Never_worked'
 'work_type_Private' 'work_type_Self-employed' 'work_type_children'
 'Residence_type_Rural' 'Residence_type_Urban' 'smoking_status_Unknown'
 'smoking_status_formerly smoked' 'smoking_status_never smoked'
 'smoking_status_smokes']


## LOGISTIC REGRESSION PREDICTION VERIFICATION

In [251]:
import joblib
import pandas as pd

# Load the trained model
model_lr = joblib.load("model/stroke_logistic_reg_model.pkl")

# Create a sample input with all the required features
sample_data = {
    'age': [51],
    'hypertension': [0],  # 1 for Yes, 0 for No
    'heart_disease': [0],  # 0 for No, 1 for Yes
    'avg_glucose_level': [166.29],
    'bmi': [25.6],
    'gender_Female': [0],
    'gender_Male': [1],
    'gender_Other': [0],
    'ever_married_No': [0],
    'ever_married_Yes': [1],
    'work_type_Govt_job': [0],
    'work_type_Never_worked': [0],
    'work_type_Private': [1],
    'work_type_Self-employed': [0],
    'work_type_children': [0],
    'Residence_type_Rural': [1],
    'Residence_type_Urban': [0],
    'smoking_status_Unknown': [0],
    'smoking_status_formerly smoked': [1],
    'smoking_status_never smoked': [0],
    'smoking_status_smokes': [0]
}

# Convert the sample data into a pandas DataFrame
sample_df = pd.DataFrame(sample_data)

# Use the trained model to make a prediction
prediction = model_lr.predict(sample_df)

# Output the prediction
print("Prediction:", prediction)


Prediction: [0]


## SVM PREDICTION VERIFICATION

In [254]:
import joblib
import pandas as pd

# Load the trained model
model = joblib.load("model/model_SVM.pkl")

# Create a sample input with all the required features
sample_data = {
    'age': [51],
    'hypertension': [0],  # 1 for Yes, 0 for No
    'heart_disease': [0],  # 0 for No, 1 for Yes
    'avg_glucose_level': [166.29],
    'bmi': [25.6],
    'gender_Female': [0],
    'gender_Male': [1],
    'gender_Other': [0],
    'ever_married_No': [0],
    'ever_married_Yes': [1],
    'work_type_Govt_job': [0],
    'work_type_Never_worked': [0],
    'work_type_Private': [1],
    'work_type_Self-employed': [0],
    'work_type_children': [0],
    'Residence_type_Rural': [1],
    'Residence_type_Urban': [0],
    'smoking_status_Unknown': [0],
    'smoking_status_formerly smoked': [1],
    'smoking_status_never smoked': [0],
    'smoking_status_smokes': [0]
}

# Convert the sample data into a pandas DataFrame
sample_df = pd.DataFrame(sample_data)

# Use the trained model to make a prediction
prediction = model.predict(sample_df.values)

# Output the prediction
print("Prediction:", prediction)

Prediction: [0]


## RANDOM FOREST PREDICTION VERIFICATION

In [257]:
import joblib
import pandas as pd

# Load the trained model
model = joblib.load("model/model_rf_new.pkl")

# Create a sample input with all the required features
sample_data = {
    'age': [51],
    'hypertension': [0],  # 1 for Yes, 0 for No
    'heart_disease': [0],  # 0 for No, 1 for Yes
    'avg_glucose_level': [166.29],
    'bmi': [25.6],
    'gender_Female': [0],
    'gender_Male': [1],
    'gender_Other': [0],
    'ever_married_No': [0],
    'ever_married_Yes': [1],
    'work_type_Govt_job': [0],
    'work_type_Never_worked': [0],
    'work_type_Private': [1],
    'work_type_Self-employed': [0],
    'work_type_children': [0],
    'Residence_type_Rural': [1],
    'Residence_type_Urban': [0],
    'smoking_status_Unknown': [0],
    'smoking_status_formerly smoked': [1],
    'smoking_status_never smoked': [0],
    'smoking_status_smokes': [0]
}

# Convert the sample data into a pandas DataFrame
sample_df = pd.DataFrame(sample_data)

# Use the trained model to make a prediction
prediction = model.predict(sample_df.values)

# Output the prediction
print("Prediction:", prediction)

Prediction: [1]
