In [1]:
import os, seaborn, sklearn, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from scipy import sparse
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import make_scorer, balanced_accuracy_score, f1_score
from sklearn.metrics import classification_report

In [8]:
file = Path("ObesityDataSet_raw_and_data_sinthetic.csv")

df = pd.read_csv(file)


df = df.dropna()
df = df.drop(columns=["Height"])
df = df.drop(columns=["Weight"])

df = df.rename(columns={"NObeyesdad":"Obesity_levels"})
df.head()

Unnamed: 0,Gender,Age,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity_levels
0,Female,21.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [9]:
X = df.drop(columns=["Obesity_levels"])
y = df["Obesity_levels"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

cat_cols = [
    "Gender",
    "family_history_with_overweight",
    "FAVC",
    "CAEC",
    "SMOKE",
    "SCC",
    "CALC",
    "MTRANS"
]

num_cols = [
    "Age",
    "FCVC",
    "NCP",
    "CH2O",
    "FAF",
    "TUE"
]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols),
        ("num", StandardScaler(with_mean=False), num_cols),
    ],
    remainder="drop",
    sparse_threshold=0.3, 
)

In [13]:
#baseline model
from sklearn.dummy import DummyClassifier

#majority vote strategy
clf = LogisticRegression( solver="lbfgs", max_iter = 5000, class_weight = "balanced")

dummy = Pipeline (steps=[ ("preprocess", preprocess), ("dummy", DummyClassifier(strategy="most_frequent", random_state=42))]
)



In [14]:

clf = LogisticRegression( solver="lbfgs", max_iter = 5000, class_weight = "balanced")

pipe = Pipeline (steps=[ ("preprocess", preprocess), ("model", clf)]
)

pipe.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,False
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,5000


In [19]:



y_pred = pipe.predict(X_test)


scoring = {
    'f1_macro': 'f1_macro',
    'balanced_accuracy': make_scorer(balanced_accuracy_score)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(pipe, X, y, cv=cv, scoring=scoring, n_jobs=-1)

dummyresults=cross_validate(dummy, X, y, cv=cv, scoring = scoring, n_jobs=1)


print(f"Dummy(most_frequent)  F1-macro: {dummyresults['test_f1_macro'].mean():.3f} ± {dummyresults['test_f1_macro'].std():.3f} | "
      f"BalAcc: {dummyresults['test_balanced_accuracy'].mean():.3f} ± {dummyresults['test_balanced_accuracy'].std():.3f}")
print("##################")
print(f"Model F1-macro: {cv_results['test_f1_macro'].mean():.3f} ± {cv_results['test_f1_macro'].std():.3f}")
print(f"Model Balanced Accuracy: {cv_results['test_balanced_accuracy'].mean():.3f} ± {cv_results['test_balanced_accuracy'].std():.3f}")

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred, digits=3))





Dummy(most_frequent)  F1-macro: 0.041 ± 0.000 | BalAcc: 0.143 ± 0.000
##################
Model F1-macro: 0.589 ± 0.018
Model Balanced Accuracy: 0.607 ± 0.017
                     precision    recall  f1-score   support

Insufficient_Weight      0.607     0.794     0.688        68
      Normal_Weight      0.588     0.417     0.488        72
     Obesity_Type_I      0.547     0.591     0.568        88
    Obesity_Type_II      0.552     0.865     0.674        74
   Obesity_Type_III      0.909     0.988     0.947        81
 Overweight_Level_I      0.643     0.493     0.558        73
Overweight_Level_II      0.424     0.194     0.267        72

           accuracy                          0.625       528
          macro avg      0.610     0.620     0.598       528
       weighted avg      0.613     0.625     0.603       528



SyntaxError: invalid syntax (2505283722.py, line 1)