In [22]:
import pandas as pd
from sklearn.preprocessing import PowerTransformer, LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.utils import class_weight
import pickle
import numpy as np

In [23]:
# Load the raw obesity dataset (relative path into the project's Data directory).
df = pd.read_csv("../Data/ObesityDataSet_raw_and_data_sinthetic.csv")

In [24]:
# (rows, columns) of the raw frame.
df.shape

(2111, 17)

In [25]:
# Class distribution of the target column.
df.NObeyesdad.value_counts()

NObeyesdad
Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: count, dtype: int64

In [26]:
# Column labels available in the raw frame (features plus the NObeyesdad target).
df.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [27]:
# Preview the first rows to see which columns are text and which are numeric.
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [28]:
# Frequency of each CAEC category.
df.CAEC.value_counts()

CAEC
Sometimes     1765
Frequently     242
Always          53
no              51
Name: count, dtype: int64

In [29]:
# Frequency of each FAVC category.
df.FAVC.value_counts()

FAVC
yes    1866
no      245
Name: count, dtype: int64

In [30]:
# Frequency of each MTRANS category.
df.MTRANS.value_counts()

MTRANS
Public_Transportation    1580
Automobile                457
Walking                    56
Motorbike                  11
Bike                        7
Name: count, dtype: int64

In [31]:
# Mapping of column name -> dtype for the raw frame; handed to cat_ord_sep in the next cell.
df_dtypes = dict(df.dtypes)

In [32]:
def cat_ord_sep(df_dict, df):
    df_cat = pd.DataFrame([])
    df_ord = pd.DataFrame([])
    for cols in df:
        if df[cols].dtype == 'O':
            df_cat[cols] = df[cols]
        elif df[cols].dtype == 'int64':
            df_ord[cols] = df[cols]
        else:
            df_ord[cols] = df[cols]
    
    return df_cat, df_ord

# Build a feature-only frame without touching `df`, and keep the target aside.
target = df["NObeyesdad"].copy()
df_copy = df.drop(columns="NObeyesdad")

# Separate text columns from the remaining ones and remember both name lists;
# later cells use these lists to pick columns for encoding and scaling.
df_cat_final, df_ord_final = cat_ord_sep(df_dtypes, df_copy)
df_cat_columns = df_cat_final.columns.tolist()
df_ord_columns = df_ord_final.columns.tolist()

In [33]:
# Show the target class counts again; `df` still holds NObeyesdad because the
# pop in the previous cell was applied to a copy.
df.NObeyesdad.value_counts()

NObeyesdad
Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: count, dtype: int64

In [34]:
# Feature matrix and target vector, both derived from `df` without modifying it.
y = df["NObeyesdad"].copy()
X = df.drop(columns="NObeyesdad")

In [35]:
# Map the class names in `y` to integer ids.
encoder_le = LabelEncoder()
y_enc = encoder_le.fit_transform(y)

# Two stratified splits with a fixed seed: 20% of all rows become the test
# set, then 20% of the remainder becomes the validation set
# (64% train / 16% validation / 20% test overall).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.20, random_state=42, stratify=y_enc
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Target-encode the text columns. The encoder is fitted on the training split
# only; validation and test are transformed with the fitted mapping.
# NOTE(review): y_train holds the LabelEncoder integer ids, so the encoder is
# given a numeric target — confirm that is the intended treatment for a
# multi-class label.
Targ_enc = TargetEncoder(cols=df_cat_columns)
X_train_te = pd.DataFrame(
    Targ_enc.fit_transform(X_train[df_cat_columns], y_train),
    columns=df_cat_columns,
)
X_val_te = pd.DataFrame(
    Targ_enc.transform(X_val[df_cat_columns]),
    columns=df_cat_columns,
)
X_test_te = pd.DataFrame(
    Targ_enc.transform(X_test[df_cat_columns]),
    columns=df_cat_columns,
)

# Put the encoded text columns first and the remaining columns after them.
# This column order is what the scaler (and every scaled array) sees.
X_train_final = pd.concat([X_train_te, X_train[df_ord_columns]], axis=1)
X_val_final = pd.concat([X_val_te, X_val[df_ord_columns]], axis=1)
X_test_final = pd.concat([X_test_te, X_test[df_ord_columns]], axis=1)

# Standardise using statistics learned from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_final)
X_val_scaled = scaler.transform(X_val_final)
X_test_scaled = scaler.transform(X_test_final)

In [36]:
scaler_file = "../tools/scaler.sav"
targ_enc_file = "../tools/targ_enc.sav"

# Persist the fitted scaler and target encoder. Context managers make sure
# each file is flushed and closed as soon as the dump finishes.
with open(scaler_file, 'wb') as scaler_fh:
    pickle.dump(scaler, scaler_fh)
with open(targ_enc_file, 'wb') as targ_enc_fh:
    pickle.dump(Targ_enc, targ_enc_fh)

# The scaled arrays were produced from X_*_final, whose columns are the encoded
# text columns followed by the remaining columns. That order differs from
# X.columns, so the labels must come from X_train_final to match the data.
scaled_columns = list(X_train_final.columns)
train_df_scaled = pd.DataFrame(X_train_scaled, columns=scaled_columns)
test_df_scaled = pd.DataFrame(X_test_scaled, columns=scaled_columns)
val_df_scaled = pd.DataFrame(X_val_scaled, columns=scaled_columns)

# Encoded targets as single-column frames so they can be joined to the features.
train_y_series = pd.DataFrame(y_train, columns=["NObeyesdad"])
test_y_series = pd.DataFrame(y_test, columns=["NObeyesdad"])
val_y_series = pd.DataFrame(y_val, columns=["NObeyesdad"])

# Features and target side by side; both sides use a fresh 0..n-1 index, so
# rows line up by position.
Train = pd.concat([train_df_scaled, train_y_series], axis=1)
Test = pd.concat([test_df_scaled, test_y_series], axis=1)
Val = pd.concat([val_df_scaled, val_y_series], axis=1)

# Write the three splits; the row index is written as the first, unnamed column.
Train.to_csv("../Data/Train.csv")
Test.to_csv("../Data/Test.csv")
Val.to_csv("../Data/Val.csv")


In [37]:
# Balanced class weights computed from the training labels only.
classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
# Key each weight by its actual class id rather than by position, so the
# mapping stays correct even if the ids are not exactly 0..n-1.
class_weight_dict = dict(zip(classes.tolist(), class_weights))

weights_file = "../tools/class_weights.sav"
labels_file = "../tools/labels.sav"
# Persist the weights and the fitted label encoder; context managers close
# each file as soon as the dump finishes.
with open(weights_file, 'wb') as weights_fh:
    pickle.dump(class_weight_dict, weights_fh)
with open(labels_file, 'wb') as labels_fh:
    pickle.dump(encoder_le, labels_fh)
