### Obesity Level Prediction using Logistic Regression
In this notebook, logistic regression is used to predict obesity levels based on lifestyle and physical features.

In [3]:
# Import libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
# Install a global "ignore" filter: warnings of every category raised in this
# kernel process from here on are suppressed.
# NOTE(review): this also hides scikit-learn convergence and deprecation warnings
# from the model fitting below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Locate the pre-split train/test tables, stored as Feather files one directory up.
processed_dir = os.path.join("..", "processed_data")
train_path = os.path.join(processed_dir, "train_data.feather")
test_path = os.path.join(processed_dir, "test_data.feather")

# Read both tables (train first, then test).
# pd.read_feather relies on the optional `pyarrow` package; without it pandas
# raises "ImportError: Missing optional dependency 'pyarrow'".
train_df, test_df = (pd.read_feather(path) for path in (train_path, test_path))

# Separate the label column ("obesity_level") from the feature columns.
X_train, y_train = train_df.drop(columns=["obesity_level"]), train_df["obesity_level"]
X_test, y_test = test_df.drop(columns=["obesity_level"]), test_df["obesity_level"]

# Numerical features: every column of the training frame stored as float64.
# NOTE(review): numeric columns stored with any other dtype (e.g. int64) are not
# selected here — confirm none are expected.
# NOTE(review): if a column named in `ordinal_cols` below is stored as float64, it is
# also selected here and would be listed in both groups — verify the dtypes.
numerical_cols = list(X_train.select_dtypes(include=["float64"]).columns)

# Categorical features without an inherent order.
nominal_cols = [
    "gender",
    "family_history_overweight",
    "smokes",
    "calorie_tracking",
    "transport_mode",
]

# Categorical features treated as ordered levels.
ordinal_cols = [
    "high_caloric_food_freq",
    "vegetables_freq",
    "main_meal_count",
    "snacking_freq",
    "water_intake",
    "physical_activity_freq",
    "screen_time_hours",
    "alcohol_consumption_freq",
]

# Build one transformer per feature group, then combine them column-wise.
numeric_step = StandardScaler()
# Categories not seen during fit are ignored instead of raising an error.
nominal_step = OneHotEncoder(handle_unknown='ignore')
# Categories not seen during fit are encoded as -1.
# NOTE(review): no explicit `categories=` order is supplied, so the integer codes
# follow the encoder's default ordering — confirm that matches the intended
# low-to-high order of each feature.
ordinal_step = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_step, numerical_cols),
        ("nom", nominal_step, nominal_cols),
        ("ord", ordinal_step, ordinal_cols),
    ]
)

# Convert the class labels to integer ids for the classifier.
# The label -> id mapping is learned from the training labels only and then
# applied unchanged to both splits.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Create pipeline: the shared preprocessing step followed by a logistic-regression classifier.
# `multi_class` is deliberately not passed: scikit-learn deprecated that parameter in
# version 1.5, and with its default setting the 'saga' solver already fits a multinomial
# model whenever the target has three or more classes. The previous
# multi_class='multinomial' behaviour is therefore kept without relying on a parameter
# that is scheduled for removal.
# NOTE(review): assumes `obesity_level` has three or more classes — confirm.
pipeline_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, solver='saga', random_state=42))
])

# Define parameter grid for Logistic Regression
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10],  # Regularization strength (inverse)
    'classifier__penalty': ['l2']          # L2 regularization
}

# Perform grid search with cross-validation: 5 folds, accuracy scoring, all CPU cores.
# The whole pipeline is searched, so the preprocessing is fitted inside each CV split
# rather than once on the full training set.
grid_search = GridSearchCV(pipeline_lr, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train_encoded)

# Get the best model (the pipeline refit with the best-scoring parameters)
best_lr = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Evaluate on the held-out test set
y_pred = best_lr.predict(X_test)
test_accuracy = accuracy_score(y_test_encoded, y_pred)
print("\nTest Set Performance:")
print(f"Accuracy: {test_accuracy:.4f}")

# Detailed classification report.
# `labels` is passed explicitly (one id per class known to the label encoder) so the
# report always has exactly one row per entry of `target_names`. Without it the label
# set is inferred from y_test_encoded / y_pred, and the call fails with a size mismatch
# if any class is absent from both.
class_ids = np.arange(len(label_encoder.classes_))
print("\nClassification Report:")
print(classification_report(y_test_encoded, y_pred, labels=class_ids, target_names=label_encoder.classes_))

# Confusion matrix (rows = true class, columns = predicted class).
# `labels` pins the matrix to one row/column per class known to the label encoder, so
# the tick labels stay aligned even if some class never appears in y_test_encoded or
# y_pred.
class_names = label_encoder.classes_
conf_matrix = confusion_matrix(y_test_encoded, y_pred, labels=np.arange(len(class_names)))

# Draw on an explicit Axes so the labels and title are attached to this figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix for Logistic Regression')
plt.show()


ImportError: Missing optional dependency 'pyarrow'.  Use pip or conda to install pyarrow.