In [4]:
import pandas as pd
from category_encoders import MEstimateEncoder
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, InputLayer, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import L2
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras import Sequential
from sklearn.metrics import classification_report, accuracy_score 
import numpy as np

In [5]:
df = pd.read_csv("ObesityDataSet_raw_and_data_sinthetic.csv")

In [8]:
df.shape

(2111, 17)

In [10]:
df.NObeyesdad.value_counts()

NObeyesdad
Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: count, dtype: int64

In [12]:
df.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [14]:
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [16]:
df.CAEC.value_counts()

CAEC
Sometimes     1765
Frequently     242
Always          53
no              51
Name: count, dtype: int64

In [18]:
df.FAVC.value_counts()

FAVC
yes    1866
no      245
Name: count, dtype: int64

In [20]:
df.MTRANS.value_counts()

MTRANS
Public_Transportation    1580
Automobile                457
Walking                    56
Motorbike                  11
Bike                        7
Name: count, dtype: int64

In [22]:
Frequency_enc = {"Always" : 3, "Frequently" : 3, "Sometimes" : 3, "no" : 3,}
Binary_enc = {"yes": 1, "no":0}
Gender_enc = {"Female":0, "Male":1}
MTRANS_enc = {"Public_Transportation":4,"Automobile":3,"Walking":2,
              "Motorbike":1,"Bike":0,}

In [24]:
df.NObeyesdad.value_counts()

NObeyesdad
Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: count, dtype: int64

In [26]:
X = df.copy()
y = X.pop("NObeyesdad")

In [28]:
X["CAEC"] = X["CAEC"].map(Frequency_enc)
X["CALC"] = X["CALC"].map(Frequency_enc)
X["FAVC"] = X["FAVC"].map(Binary_enc)
X["family_history_with_overweight"] = X["family_history_with_overweight"].map(Binary_enc)
X["SMOKE"] = X["SMOKE"].map(Binary_enc)
X["SCC"] = X["SCC"].map(Binary_enc)
X["MTRANS"] = X["MTRANS"].map(MTRANS_enc)
X["Gender"] = X["Gender"].map(Gender_enc)

In [30]:
X.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,0,21,1.62,64.0,1,0,2.0,3.0,3,0,2.0,0,0.0,1.0,3,4
1,0,21,1.52,56.0,1,0,3.0,3.0,3,1,3.0,1,3.0,0.0,3,4
2,1,23,1.8,77.0,1,0,2.0,3.0,3,0,2.0,0,2.0,1.0,3,4
3,1,27,1.8,87.0,0,0,3.0,3.0,3,0,2.0,0,2.0,0.0,3,2
4,1,22,1.78,89.8,0,0,2.0,1.0,3,0,2.0,0,0.0,0.0,3,4


In [32]:
X.isna().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
dtype: int64

In [34]:
encoder_le = LabelEncoder()
y_enc = encoder_le.fit_transform(y)
X_train, X_test, y_train, y_test =  train_test_split(X, y_enc, test_size = 0.20, random_state=42, stratify=y_enc)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
scaler =  MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [36]:
pd.DataFrame(y_enc, columns=['classes'])

Unnamed: 0,classes
0,1
1,1
2,1
3,5
4,6
...,...
2106,4
2107,4
2108,4
2109,4


In [58]:
model = Sequential()
model.add(InputLayer(shape=(X_train_scaled.shape[1],)))
model.add(Dense(64, activation='relu', bias_regularizer=L2(1)))
model.add(Dropout(0.3))
model.add(Dense(7, activation ='softmax'))
model.compile(optimizer=Adam(learning_rate = 3e-3), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [60]:
model.fit(X_train_scaled, y_train, epochs=150, validation_data = [X_val_scaled, y_val], shuffle=True)

Epoch 1/150
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.2196 - loss: 1.9153 - val_accuracy: 0.4645 - val_loss: 1.7168
Epoch 2/150
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4117 - loss: 1.6712 - val_accuracy: 0.4822 - val_loss: 1.5093
Epoch 3/150
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4533 - loss: 1.4953 - val_accuracy: 0.5621 - val_loss: 1.3181
Epoch 4/150
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5012 - loss: 1.3464 - val_accuracy: 0.5799 - val_loss: 1.2041
Epoch 5/150
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5270 - loss: 1.2536 - val_accuracy: 0.6124 - val_loss: 1.1111
Epoch 6/150
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5566 - loss: 1.1322 - val_accuracy: 0.6065 - val_loss: 1.0508
Epoch 7/150
[1m43/43[0m [32m━━━

<keras.src.callbacks.history.History at 0x20487862e70>

In [62]:
y_pred = model.predict(X_test_scaled)
predictions = encoder_le.inverse_transform(np.argmax(y_pred, axis=1))
classif = classification_report(np.argmax(y_pred, axis=1), y_test)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


In [64]:
pd.DataFrame(classification_report(np.argmax(y_pred, axis=1), y_test, output_dict=True)).T

Unnamed: 0,precision,recall,f1-score,support
0,1.0,1.0,1.0,54.0
1,0.931034,0.981818,0.955752,55.0
2,1.0,0.958904,0.979021,73.0
3,0.983333,0.983333,0.983333,60.0
4,0.984615,1.0,0.992248,64.0
5,0.965517,0.811594,0.88189,69.0
6,0.810345,0.979167,0.886792,48.0
accuracy,0.955083,0.955083,0.955083,0.955083
macro avg,0.953549,0.959259,0.954148,423.0
weighted avg,0.959195,0.955083,0.954977,423.0


In [66]:
accuracy_score(np.argmax(y_pred, axis=1), y_test)

0.9550827423167849