<a href="https://colab.research.google.com/github/nikatnguyen/Project4/blob/main/anika_optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Import dependencies
import pandas as pd
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
#Import dataset
df = pd.read_csv('https://raw.githubusercontent.com/nikatnguyen/Project4/main/Resources/obesity_dummies_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Height,Weight,FCVC,CH2O,FAF,NObeyesdad,Gender_Male,family_history_with_overweight_yes,...,CAEC_Sometimes,CAEC_no,SMOKE_yes,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,0,21.0,1.62,64.0,2.0,2.0,0.0,Normal_Weight,0,1,...,1,0,0,0,0,1,0,0,1,0
1,1,21.0,1.52,56.0,3.0,3.0,3.0,Normal_Weight,0,1,...,1,0,1,0,1,0,0,0,1,0
2,2,23.0,1.8,77.0,2.0,2.0,2.0,Normal_Weight,1,1,...,1,0,0,1,0,0,0,0,1,0
3,3,27.0,1.8,87.0,3.0,2.0,2.0,Overweight_Level_I,1,0,...,1,0,0,1,0,0,0,0,0,1
4,4,22.0,1.78,89.8,2.0,2.0,0.0,Overweight_Level_II,1,0,...,1,0,0,0,1,0,0,0,1,0


In [3]:
#Split data set into features and targets
from sklearn.preprocessing import LabelEncoder
y = df['NObeyesdad']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
X = df.drop(columns=['NObeyesdad'])

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=48)

# Create scaler instance
X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [5]:
#Using RandomForestClassifier for machine learning model
rf_classifier = RandomForestClassifier(n_estimators=50, random_state=42)
rf_classifier.fit(X_train, y_train)


In [6]:
#Make predictions
y_pred = rf_classifier.predict(X_test)

In [7]:
#Accuracy of classifier model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.95
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96        52
           1       0.80      0.96      0.88        51
           2       0.98      0.98      0.98        83
           3       1.00      0.96      0.98        56
           4       1.00      1.00      1.00        69
           5       0.91      0.89      0.90        54
           6       0.95      0.90      0.92        58

    accuracy                           0.95       423
   macro avg       0.95      0.94      0.94       423
weighted avg       0.95      0.95      0.95       423



In [8]:
#Using GridSearch to find the best parameters for random forest classifier model
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    # Add other parameters as needed
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'max_depth': 20, 'n_estimators': 100}


In [9]:
#Implementing the best model
final_model = RandomForestClassifier(n_estimators=best_params['n_estimators'],
                                        max_depth=best_params['max_depth'],
                                        random_state=42)

final_model.fit(X_train, y_train)

In [10]:
# Testing accuracy of final model
final_predictions = final_model.predict(X_test)

accuracy = accuracy_score(y_test, final_predictions)
print(f"Accuracy of the final_model: {accuracy:.2f}")

Accuracy of the final_model: 0.95


In [11]:
#Save model for deployment
import pickle
with open('final_model.pkl', 'wb') as f:
    pickle.dump(final_model, f)