<a href="https://colab.research.google.com/github/nikatnguyen/Project4/blob/main/anika_optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
#Import dependencies
import pickle

import pandas as pd
import sklearn as skl
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sqlalchemy import create_engine

In [46]:
# Load the obesity dataset from the local SQLite database file.
# pandas.read_sql treats a bare string as a table name when such a table
# exists and as a SQL query otherwise; presumably this is a table name.
# `index_col` is left at its default (None).
engine = create_engine('sqlite:///obesity_dummies.db')
df = pd.read_sql(sql='obesity_dummies_df.sql', con=engine)
df.head()

Unnamed: 0,Age,Height,Weight,FCVC,CH2O,FAF,NObeyesdad,Gender_Male,family_history_with_overweight_yes,FAVC_yes,...,CAEC_Sometimes,CAEC_no,SMOKE_yes,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,21.0,1.62,64.0,2.0,2.0,0.0,Normal_Weight,False,True,False,...,True,False,False,False,False,True,False,False,True,False
1,21.0,1.52,56.0,3.0,3.0,3.0,Normal_Weight,False,True,False,...,True,False,True,False,True,False,False,False,True,False
2,23.0,1.8,77.0,2.0,2.0,2.0,Normal_Weight,True,True,False,...,True,False,False,True,False,False,False,False,True,False
3,27.0,1.8,87.0,3.0,2.0,2.0,Overweight_Level_I,True,False,False,...,True,False,False,True,False,False,False,False,False,True
4,22.0,1.78,89.8,2.0,2.0,0.0,Overweight_Level_II,True,False,False,...,True,False,False,False,True,False,False,False,True,False


In [47]:
# List the distinct values of the target column, in order of first appearance.
y_check = pd.unique(df['NObeyesdad'])
y_check

array(['Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II',
       'Obesity_Type_I', 'Insufficient_Weight', 'Obesity_Type_II',
       'Obesity_Type_III'], dtype=object)

In [48]:
#Split data set into features and targets
# LabelEncoder maps each class name to an integer 0..n_classes-1, assigned in
# sorted order of the names. `label_encoder` is kept so encoded predictions
# can be decoded back to class names later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['NObeyesdad'])
X = df.drop(columns=['NObeyesdad'])

In [49]:
# Show the class name behind each encoded label: position i of this array is
# the name that integer label i decodes to. Reading `classes_` avoids
# hard-coding how many classes there are.
label_encoder.classes_

array(['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I',
       'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I',
       'Overweight_Level_II'], dtype=object)

In [50]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=48)

# Standardise using statistics learned from the training rows only, so that
# nothing about the test rows influences the transform.
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
# NOTE(review): the random-forest cells that follow fit and predict on the
# unscaled X_train / X_test; these scaled arrays are not used by them.

In [51]:
X_train.head()

Unnamed: 0,Age,Height,Weight,FCVC,CH2O,FAF,Gender_Male,family_history_with_overweight_yes,FAVC_yes,CAEC_Frequently,CAEC_Sometimes,CAEC_no,SMOKE_yes,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
1312,31.641081,1.676595,89.993812,2.934671,2.041462,0.578074,True,True,True,False,True,False,False,False,True,False,False,False,False,False
962,19.621545,1.566524,61.616,2.908757,1.622827,0.539952,False,False,True,False,True,False,False,False,True,False,False,False,True,False
465,18.0,1.6,51.0,2.0,1.0,1.0,False,True,True,True,False,False,False,False,True,False,False,False,True,False
37,21.0,1.6,48.0,2.0,1.0,1.0,False,False,True,False,True,False,False,False,True,False,False,False,True,False
1363,31.335093,1.665798,89.738596,2.274164,1.358172,1.482411,True,True,True,False,True,False,False,False,True,False,False,False,False,False


In [52]:
# Baseline model: a 50-tree random forest with a fixed seed, fitted on the
# unscaled training split.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=50)
rf_classifier.fit(X=X_train, y=y_train)


In [53]:
# Predict encoded class labels for the held-out test rows with the baseline forest.
y_pred = rf_classifier.predict(X=X_test)

In [54]:
# Score the baseline model on the test split: overall accuracy first,
# then per-class precision / recall / F1.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)

Accuracy: 0.95
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.94      0.96        52
           1       0.84      0.96      0.90        51
           2       0.98      0.99      0.98        83
           3       1.00      0.96      0.98        56
           4       1.00      1.00      1.00        69
           5       0.86      0.91      0.88        54
           6       1.00      0.88      0.94        58

    accuracy                           0.95       423
   macro avg       0.95      0.95      0.95       423
weighted avg       0.96      0.95      0.95       423



In [55]:
# Exhaustive cross-validated search (cv=5) over forest size and tree depth
# for the random forest classifier.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
)

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Parameter combination that scored best during the search.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'max_depth': 20, 'n_estimators': 200}


In [56]:
#Implementing the best model
# Unpack every tuned hyperparameter from the search result instead of copying
# them one by one, so a parameter added to `param_grid` is not silently
# dropped here. The seed matches the one used inside the search.
final_model = RandomForestClassifier(**best_params, random_state=42)

final_model.fit(X_train, y_train)

In [57]:
# Evaluate the tuned model on the held-out test split.
final_predictions = final_model.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=final_predictions)

print(f"Accuracy of the final_model: {accuracy:.2f}")

Accuracy of the final_model: 0.96


In [59]:
# Refit the scaler on the full feature matrix for deployment. This rebinds
# `X_scaler`, replacing the train-only scaler fitted earlier in the notebook.
X_scaler = StandardScaler()
X_scaled = X_scaler.fit_transform(X)

# Save the fitted scaler (not the model) so that inputs at inference time can
# be transformed the same way as the data the deployment model is trained on.
with open('scaler.pkl', 'wb') as f:
    pickle.dump(X_scaler, f)


In [60]:
# Deployment model: reuse the hyperparameters chosen by the grid search rather
# than a hand-copied snapshot of them, and train on every scaled row.
# This rebinds `final_model`, replacing the model evaluated on the test split.
final_model = RandomForestClassifier(**best_params, random_state=42)
final_model.fit(X_scaled, y)

In [61]:
# Sanity check: (rows, columns) of the scaled feature matrix.
X_scaled.shape

(2111, 20)

In [62]:
# Feature column names of X, in column order.
X.columns

Index(['Age', 'Height', 'Weight', 'FCVC', 'CH2O', 'FAF', 'Gender_Male',
       'family_history_with_overweight_yes', 'FAVC_yes', 'CAEC_Frequently',
       'CAEC_Sometimes', 'CAEC_no', 'SMOKE_yes', 'CALC_Frequently',
       'CALC_Sometimes', 'CALC_no', 'MTRANS_Bike', 'MTRANS_Motorbike',
       'MTRANS_Public_Transportation', 'MTRANS_Walking'],
      dtype='object')

In [63]:
#Save model for deployment
# Serialise the current `final_model` to disk with pickle.
with open('final_model.pkl', 'wb') as f:
    pickle.dump(final_model, f)