In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pickle

In [None]:
# Read the source CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path -- confirm it exists
# on the machine running the notebook.
DATASET_CSV = "/home/ubuntu/upload/ObesityDataSet2.csv"
df = pd.read_csv(DATASET_CSV)

In [None]:
# Summarise the frame (column dtypes and non-null counts), then show the first rows.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly -- wrapping it in print() would emit a stray "None" line.
df.info()
print(df.head())

In [None]:
# Missing-value handling: discard every row that contains at least one NaN.
# axis and how are spelled out for the reader; they match dropna's defaults.
# The frame is modified in place, so `df` keeps referring to the same object.
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Preprocessing: replace every object-dtype column with integer codes.
categorical_cols = df.select_dtypes(include='object').columns

# One independent LabelEncoder per categorical column, keyed by column name.
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for name, encoder in label_encoders.items():
    # fit_transform learns the column's classes and returns their integer codes,
    # which overwrite the original column in df.
    df[name] = encoder.fit_transform(df[name])

# Keep a direct handle on the target's encoder so encoded predictions can be
# mapped back to the original labels later (inverse transformation).
target_label_encoder = label_encoders["NObeyesdad"]

In [None]:
# Separate the label column from the predictors.
TARGET_COLUMN = "NObeyesdad"
y = df[TARGET_COLUMN]                  # target: the NObeyesdad column
X = df.drop(columns=[TARGET_COLUMN])   # features: every remaining column

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardise the features (optional for some models, but good practice).
# The scaler is fitted on the training split only, and that same fitted scaler
# is then applied to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Model 1: Random Forest Classifier, fitted on the scaled training split.
# fit() returns the estimator itself, so construction and fitting are chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Predict the held-out (scaled) test split.
y_pred_rf = rf_model.predict(X_test_scaled)

# Evaluate: overall accuracy plus the full classification report.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print(
    "Random Forest Classifier Performance:",
    f"Accuracy: {rf_accuracy:.4f}",
    "Classification Report:",
    rf_report,
    sep="\n",
)

In [None]:
# Model 2: Gradient Boosting Classifier, fitted on the scaled training split.
# fit() returns the estimator itself, so construction and fitting are chained.
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Predict the held-out (scaled) test split.
y_pred_gb = gb_model.predict(X_test_scaled)

# Evaluate: overall accuracy plus the full classification report.
gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_report = classification_report(y_test, y_pred_gb)
print(
    "\nGradient Boosting Classifier Performance:",
    f"Accuracy: {gb_accuracy:.4f}",
    "Classification Report:",
    gb_report,
    sep="\n",
)

In [None]:
# Model selection: keep whichever classifier has the higher test-set accuracy.
# The comparison is strict, so a tie selects Gradient Boosting.
# This is a deliberate simplification -- in a real scenario, cross-validation
# and other metrics would be considered as well.
rf_test_accuracy = accuracy_score(y_test, y_pred_rf)
gb_test_accuracy = accuracy_score(y_test, y_pred_gb)

best_model, model_name = (
    (rf_model, "Random Forest")
    if rf_test_accuracy > gb_test_accuracy
    else (gb_model, "Gradient Boosting")
)

print(f"\nBest model selected: {model_name}")

In [None]:
# Persist everything needed to preprocess and score new data consistently:
#   * the selected model,
#   * the fitted scaler,
#   * the target label encoder (maps predicted codes back to class labels),
#   * the per-feature label encoders.
# The model was trained on label-encoded categorical features, so without the
# feature encoders the same string -> integer mapping cannot be reproduced when
# new raw data arrives.
# NOTE: pickle files can execute arbitrary code on load -- only load these
# artifacts from a trusted location.

model_filename = "best_obesity_model.pkl"
scaler_filename = "scaler.pkl"
target_encoder_filename = "target_label_encoder.pkl"
feature_encoders_filename = "feature_label_encoders.pkl"

# Every fitted encoder except the target's, keyed by feature column name.
feature_label_encoders = {
    col_name: fitted_encoder
    for col_name, fitted_encoder in label_encoders.items()
    if col_name != "NObeyesdad"
}

with open(model_filename, 'wb') as file:
    pickle.dump(best_model, file)

with open(scaler_filename, 'wb') as file:
    pickle.dump(scaler, file)

with open(target_encoder_filename, 'wb') as file:
    pickle.dump(target_label_encoder, file)

with open(feature_encoders_filename, 'wb') as file:
    pickle.dump(feature_label_encoders, file)

print(f"Best model saved as {model_filename}")
print(f"Scaler saved as {scaler_filename}")
print(f"Target Label Encoder saved as {target_encoder_filename}")
print(f"Feature Label Encoders saved as {feature_encoders_filename}")