In [21]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Load the preprocessed dataset
df = pd.read_csv('./preprocessed_obesity.csv')

# Separate the features from the target variable
target_column = 'nobeyesdad'
X = df.drop(target_column, axis=1)
y = df[target_column]

# Derive the column groups from the feature frame (not from df), so the target
# can never leak into either list, whatever dtype it happens to have.
# Columns whose dtype matches neither selector are dropped by ColumnTransformer
# (its default is remainder='drop').
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing: one-hot encode the categorical columns, standardise the numeric ones.
# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero row instead of raising at predict time; the encoder is fitted on the
# training split only, so a rare level may legitimately appear only in the test split.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('num', StandardScaler(), numeric_columns),
    ])

# Chain preprocessing and the classifier so both are fitted on the training data only.
# max_iter is raised above the library default to give the solver more iterations.
log_reg_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('classifier', LogisticRegression(max_iter=2000))])

# Split the data into training (70%) and testing (30%) sets with a fixed seed
# NOTE(review): the split is not stratified; consider stratify=y to keep class
# proportions equal across splits (this would change the reported metrics).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the Logistic Regression model
log_reg_pipeline.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred_log_reg = log_reg_pipeline.predict(X_test)

# Evaluate the model
log_reg_acc = accuracy_score(y_test, y_pred_log_reg)
log_reg_report = classification_report(y_test, y_pred_log_reg)

# Output accuracy and classification report
print(f"Logistic Regression model accuracy: {log_reg_acc * 100:.2f}%")
print("Classification Report:\n", log_reg_report)


Logistic Regression model accuracy: 85.01%
Classification Report:
                      precision    recall  f1-score   support

insufficient_weight       0.87      0.95      0.91        87
      normal_weight       0.87      0.65      0.75        95
     obesity_type_i       0.93      0.88      0.90       105
    obesity_type_ii       0.97      0.99      0.98        95
   obesity_type_iii       0.98      1.00      0.99        91
 overweight_level_i       0.65      0.65      0.65        80
overweight_level_ii       0.64      0.80      0.71        74

           accuracy                           0.85       627
          macro avg       0.85      0.85      0.84       627
       weighted avg       0.86      0.85      0.85       627

