In [1]:
import pandas as pd

# Location of the raw CSV inside the Kaggle input directory.
RAW = '/kaggle/input/obesity-levels/ObesityDataSet_raw_and_data_sinthetic.csv'
# Label column: used to colour the plots and as the model target.
TARGET = 'NObeyesdad'
# Text columns that each get a boolean companion column named <column>_bool.
YES_NO_COLUMNS = ('FAVC', 'SCC', 'SMOKE', 'family_history_with_overweight')

df = pd.read_csv(RAW)
# Each companion column is True exactly where the source text equals 'yes'.
df = df.assign(**{name + '_bool': df[name].eq('yes') for name in YES_NO_COLUMNS})
# BMI = Weight / Height^2, appended after the boolean columns.
df = df.assign(BMI=df['Weight'].div(df['Height'].mul(df['Height'])))
df.head()

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,...,FAF,TUE,CAEC,MTRANS,NObeyesdad,FAVC_bool,SCC_bool,SMOKE_bool,family_history_with_overweight_bool,BMI
0,21.0,Female,1.62,64.0,no,no,2.0,3.0,no,no,...,0.0,1.0,Sometimes,Public_Transportation,Normal_Weight,False,False,False,True,24.386526
1,21.0,Female,1.52,56.0,Sometimes,no,3.0,3.0,yes,yes,...,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight,False,True,True,True,24.238227
2,23.0,Male,1.8,77.0,Frequently,no,2.0,3.0,no,no,...,2.0,1.0,Sometimes,Public_Transportation,Normal_Weight,False,False,False,True,23.765432
3,27.0,Male,1.8,87.0,Frequently,no,3.0,3.0,no,no,...,2.0,0.0,Sometimes,Walking,Overweight_Level_I,False,False,False,False,26.851852
4,22.0,Male,1.78,89.8,Sometimes,no,2.0,1.0,no,no,...,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II,False,False,False,False,28.342381


In [2]:
# Count the distinct values in each column.
df.nunique()

Age                                    1402
Gender                                    2
Height                                 1574
Weight                                 1525
CALC                                      4
FAVC                                      2
FCVC                                    810
NCP                                     635
SCC                                       2
SMOKE                                     2
CH2O                                   1268
family_history_with_overweight            2
FAF                                    1190
TUE                                    1129
CAEC                                      4
MTRANS                                    5
NObeyesdad                                7
FAVC_bool                                 2
SCC_bool                                  2
SMOKE_bool                                2
family_history_with_overweight_bool       2
BMI                                    1968
dtype: int64

In [3]:
import warnings
from plotly import express

# Suppress FutureWarning messages from here on.
warnings.filterwarnings('ignore', category=FutureWarning)

# Age distribution, coloured by the target class.
age_histogram = express.histogram(df, x='Age', color=TARGET)
age_histogram

In [4]:
# Row count per target class.
target_histogram = express.histogram(df, x=TARGET)
target_histogram

Our classes are nearly balanced.

In [5]:
# BMI distribution in 200 bins, coloured by the target class.
bmi_histogram = express.histogram(df, x='BMI', color=TARGET, nbins=200)
bmi_histogram

BMI does such a good job of predicting obesity by itself that it seems unsporting to use it in the model, so let's leave it out. (Height and Weight, which BMI is computed from, stay in the feature set.)

In [6]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 22 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Age                                  2111 non-null   float64
 1   Gender                               2111 non-null   object 
 2   Height                               2111 non-null   float64
 3   Weight                               2111 non-null   float64
 4   CALC                                 2111 non-null   object 
 5   FAVC                                 2111 non-null   object 
 6   FCVC                                 2111 non-null   float64
 7   NCP                                  2111 non-null   float64
 8   SCC                                  2111 non-null   object 
 9   SMOKE                                2111 non-null   object 
 10  CH2O                                 2111 non-null   float64
 11  family_history_with_overweight

We have no nulls, so let's build a simple model based on just the numerical columns and the boolean flags we derived above.

In [7]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Feature set: eight numeric columns plus every derived column whose name ends
# in 'bool'. BMI is not listed, so it stays out of the model.
COLUMNS = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
COLUMNS += [name for name in df.columns if name.endswith('bool')]

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df[COLUMNS],
    df[TARGET],
    test_size=0.20,
    random_state=2024,
    stratify=df[TARGET],
)

time_start = arrow.now()
regression = LogisticRegression(max_iter=100000, tol=1e-4)
regression.fit(X_train, y_train)
print(f'model fit in {regression.n_iter_[0]} iterations took {arrow.now() - time_start}')

# Accuracy on the held-out split, then total elapsed time including scoring.
test_accuracy = accuracy_score(y_test, regression.predict(X_test))
print(f'accuracy: {test_accuracy:5.4f}')
print(f'model done in {arrow.now() - time_start}')

model fit in 11454 iterations took 0:00:07.393282
accuracy: 0.7683
model done in 0:00:07.397983


In [8]:
# Plot one coefficient per feature as a bar chart. A histogram would aggregate
# y per x bin and label the axis as a sum, which misdescribes these values.
# coef_ holds one row per class in regression.classes_ order, so row 0 covers
# only the first class; the title names that class so the plot is not read as
# overall feature importance.
express.bar(
    x=COLUMNS,
    y=regression.coef_[0],
    labels={'x': 'feature', 'y': 'coefficient'},
    title='Logistic regression coefficients for class {}'.format(regression.classes_[0]),
).show()

In [9]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out split.
test_report = classification_report(y_test, regression.predict(X_test))
print(test_report)

                     precision    recall  f1-score   support

Insufficient_Weight       0.76      0.94      0.84        54
      Normal_Weight       0.76      0.55      0.64        58
     Obesity_Type_I       0.78      0.81      0.80        70
    Obesity_Type_II       0.83      0.88      0.85        60
   Obesity_Type_III       0.93      1.00      0.96        65
 Overweight_Level_I       0.65      0.69      0.67        58
Overweight_Level_II       0.60      0.47      0.52        58

           accuracy                           0.77       423
          macro avg       0.76      0.76      0.76       423
       weighted avg       0.76      0.77      0.76       423

