In [1]:
import warnings
warnings.filterwarnings('ignore')

# Import Necessary Libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.ensemble import RandomForestClassifier

# Display floats with two decimal places in DataFrame output
pd.options.display.float_format = "{:.2f}".format

# Load the dataset
file_path = 'diabetes_prediction_dataset.csv'  # Adjust the file path as needed
df = pd.read_csv(file_path)

# Split data into features and target variable
X = df.drop('diabetes', axis=1)
y = df['diabetes']

# Hold out 20% of the rows for testing. Stratify on the target so the train
# and test sets keep the same class ratio: the resampling steps used later in
# this notebook indicate the classes are imbalanced, and an unstratified
# split can leave the test set with a skewed share of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Column-wise preprocessing:
#   * 'num' standardises the numeric columns (the 0/1 flags hypertension and
#     heart_disease are scaled along with the continuous measurements).
#   * 'cat' one-hot encodes the string-valued columns.
# handle_unknown='ignore' makes the encoder emit an all-zero group for a
# category it did not see during fit instead of raising, so the saved encoder
# stays usable on new data that contains an unexpected label.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'hypertension', 'heart_disease']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['gender', 'smoking_history'])
    ]
)

# Fit on the training split only (no information from the test split leaks in),
# then persist the two fitted sub-transformers as separate artifacts.
preprocessor.fit(X_train)
joblib.dump(preprocessor.named_transformers_['num'], 'scaler.joblib')
joblib.dump(preprocessor.named_transformers_['cat'], 'encoder.joblib')

# Seed shared by every stochastic step below so that re-running the notebook
# reproduces the same resampled training set and the same forest.
SEED = 42

# Pipeline: preprocess -> oversample the minority class with SMOTE ->
# undersample the majority class -> random forest.
# The sampling_strategy values are the target minority/majority ratios after
# each resampling step (0.1 after SMOTE, 0.5 after undersampling).
clf = imbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('over', SMOTE(sampling_strategy=0.1, random_state=SEED)),
    ('under', RandomUnderSampler(sampling_strategy=0.5, random_state=SEED)),
    ('classifier', RandomForestClassifier(
        max_depth=10,
        min_samples_leaf=2,
        min_samples_split=5,
        n_estimators=200,
        random_state=SEED,
    ))
])

# NOTE(review): the classifier hyperparameters above are fixed. They presumably
# came from an earlier GridSearchCV(clf, param_grid, cv=5) run over
#   classifier__n_estimators      in [50, 100, 200]
#   classifier__max_depth         in [None, 10, 20]
#   classifier__min_samples_split in [2, 5, 10]
#   classifier__min_samples_leaf  in [1, 2, 4]
# -- confirm. No search is performed here; the pipeline is fitted directly.
clf.fit(X_train, y_train)



In [13]:
# Persist the fitted classifier on its own. It was trained on the output of
# the pipeline's preprocessing step, so it expects already-transformed
# features, not the raw dataset columns.
joblib.dump(clf['classifier'], 'diabetes_prediction_model.joblib')

# Also persist the complete fitted pipeline. Loading this artifact lets a
# consumer call predict() directly on raw feature rows without having to
# re-assemble the scaler, encoder and column order by hand.
joblib.dump(clf, 'diabetes_prediction_pipeline.joblib')

# NOTE(review): df is the dataset exactly as loaded from disk -- no scaling or
# encoding has been applied to it -- so despite its name this file contains
# the raw data. Kept as-is for anything that already reads it; confirm intent.
df.to_csv('preprocessed_data.csv', index=False)

In [5]:
# Sanity check: dimensions of the training features after scaling and
# one-hot encoding (rows, output columns).
transformed_train = preprocessor.transform(X_train)
transformed_train.shape

(80000, 15)

In [17]:
# Number of input features the fitted pipeline reports. The recorded value (8)
# matches the eight raw columns named in the preprocessor, i.e. the count
# before one-hot expansion.
clf.n_features_in_

8

In [16]:
from joblib import load

# Round-trip check: read the saved model artifact back from disk and report
# how many input features it was fitted on.
reloaded_model = load('diabetes_prediction_model.joblib')
reloaded_model.n_features_in_

8