In [1]:
# Import the main Libraries
import pandas as pd
import warnings
import joblib
warnings.filterwarnings("ignore")
%reload_ext nb_black


##  preprocessing
from sklearn.model_selection import train_test_split , cross_val_score, StratifiedKFold,cross_val_predict
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import KNNImputer ,SimpleImputer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn_features.transformers import DataFrameSelector
from imblearn.over_sampling import SMOTE
import joblib

## Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## Evaluation Metric
from sklearn.metrics import f1_score , make_scorer, accuracy_score

<IPython.core.display.Javascript object>

In [2]:
# Data path
path_df_nature = r"..\data\processed/Nature data.pkl"

<IPython.core.display.Javascript object>

In [3]:
## Load data and print first 5 sample
df_nature = pd.read_pickle(path_df_nature)
df_nature.head()
data=[2.0,3.0,2.0,0.0,1.0,24.4,'Sometimes','no','Normal_Weight','yes','no','no']

Unnamed: 0,gender,age,height,weight,family_history_with_overweight,favc,fcvc,ncp,caec,smoke,ch2o,scc,faf,tue,calc,mtrans,nobeyesdad,bmi
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight,24.4
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight,24.2
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight,23.8
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I,26.9
5,Male,29.0,1.62,53.0,no,yes,2.0,3.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Automobile,Normal_Weight,20.2


<IPython.core.display.Javascript object>

In [4]:
# Split dataset into X & Y
# Drop specified columns ('nobeyesdad', 'gender', 'age', 'height', 'weight', 'smoke') to create the feature matrix (X)
# 'x' will contain the features, and 'y' will contain the target variable
x = df_nature.drop(columns=['nobeyesdad', 'gender', 'age', 'height', 'weight', 'smoke'])

# 'y' will be the target variable, which is 'nobeyesdad' in this case
y = df_nature['nobeyesdad']

# Split dataset into training and testing sets
# 'x_train' and 'y_train' will contain the training data and labels
# 'x_test' and 'y_test' will contain the testing data and labels
# The data will be split into a 75% training set and a 25% testing set, with shuffling and a random seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, shuffle=True, random_state=42, stratify=y)

# Check the shape of the training and testing datasets
print("x_train_shape", x_train.shape)
print("y_train_shape", y_train.shape)
print("*" * 50)
print("x_test_shape", x_test.shape)
print("y_test_shape", y_test.shape)


x_train_shape (1041, 12)
y_train_shape (1041,)
**************************************************
x_test_shape (347, 12)
y_test_shape (347,)


<IPython.core.display.Javascript object>

In [5]:
# Get the names of numerical columns (exclude "object" dtype columns)
numircal_col = x_train.select_dtypes(exclude="object").columns.tolist()

# Get the names of ordinal categorical columns
catego_col_ordinal = x_train[["caec", "calc", "mtrans"]].columns.tolist()

# Get the names of nominal categorical columns
catego_col_nominal = x_train[["family_history_with_overweight", "favc", "scc"]].columns.tolist()

# Print the names of the identified columns
print("Name of Numircal column:", numircal_col)
print("Name of Ordinal Categorical column:", catego_col_ordinal)
print("Name of Nominal Categorical column:", catego_col_nominal)


Name of Numircal column: ['fcvc', 'ncp', 'ch2o', 'faf', 'tue', 'bmi']
Name of Ordinal Categorical column: ['caec', 'calc', 'mtrans']
Name of Nominal Categorical column: ['family_history_with_overweight', 'favc', 'scc']


<IPython.core.display.Javascript object>

In [6]:
## Create PipeLine

#### Numrical Pipeline
num_pipe = Pipeline(
    steps=[
        ("selector", DataFrameSelector(numircal_col)),
        ("imputer", KNNImputer()),
        ("box-cox", PowerTransformer(method="yeo-johnson", standardize=False)),
        ("normalization", MinMaxScaler(feature_range=(-1, 1))),
    ]
)

## Categorical(ordinal) Pipline
ordinal_pipe = Pipeline(
    steps=[
        ("selector", DataFrameSelector(catego_col_ordinal)),
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoding", OrdinalEncoder()),
    ]
)

## Categorical(nominal) Pipline
nominal_pipe = Pipeline(
    steps=[
        ("selector", DataFrameSelector(catego_col_nominal)),
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoding", OneHotEncoder(drop="first", sparse_output=False)),
    ]
)

## Combine all
all_pipe = FeatureUnion(
    transformer_list=[
        ("Numrical", num_pipe),
        ("ordinal", ordinal_pipe),
        ("nominal", nominal_pipe),
    ]
)

## Apply
x_train_finall = all_pipe.fit_transform(x_train)
x_test_final = all_pipe.transform(x_test)

<IPython.core.display.Javascript object>

In [26]:
x_train.iloc[0:1, :]

Unnamed: 0,family_history_with_overweight,favc,fcvc,ncp,caec,ch2o,scc,faf,tue,calc,mtrans,bmi
1713,yes,yes,1.92,3.0,Sometimes,2.01,no,0.18,0.51,Sometimes,Automobile,36.3


<IPython.core.display.Javascript object>

In [24]:
all_pipe.transform(x_train.iloc[0:1, :])

array([[-0.33276166, -0.26793478, -0.05368913, -0.79041125, -0.2400954 ,
         0.27550601,  2.        ,  1.        ,  0.        ,  1.        ,
         1.        ,  0.        ]])

<IPython.core.display.Javascript object>

In [7]:
# Target transformation: Map target categories to numerical values
# Define a mapping dictionary to map category names to numerical values
map_target={'Insufficient_Weight':0
           ,'Normal_Weight':1
           ,'Overweight_Level_I':2
           ,'Overweight_Level_II':3
           ,'Obesity_Type_I':4
           ,'Obesity_Type_II':5
           ,'Obesity_Type_III':6}

# Apply the mapping to the target variable (y_train) in the training set
y_train = y_train.map(map_target)

# Apply the mapping to the target variable (y_test) in the test set
y_test= y_test.map(map_target)



<IPython.core.display.Javascript object>

In [8]:
## USING SMOTE
over = SMOTE()
x_train_resample, y_train_resample = over.fit_resample(x_train_finall, y_train)

##### check shape
print("Before", y_train_resample.value_counts())
print("*" * 30)
print("After", y_train.value_counts())

Before nobeyesdad
5    242
6    242
3    242
0    242
4    242
2    242
1    242
Name: count, dtype: int64
******************************
After nobeyesdad
6    242
5    162
1    147
4    146
3    128
2    111
0    105
Name: count, dtype: int64


<IPython.core.display.Javascript object>

# `Logistic Regression`

In [9]:
##w SOMTE
log_clf = LogisticRegression(
    penalty="l2",
    tol=0.01,
    C=1000,
    fit_intercept=True,
    random_state=42,
    max_iter=1000,
)

log_clf.fit(x_train_resample, y_train_resample)  ### Learn

## Predict and Evaluation model
y_pred_log_train = log_clf.predict(x_train_finall)
y_pred_log_test = log_clf.predict(x_test_final)

## Evaluation model
f1_train_log = f1_score(y_train, y_pred_log_train, average="micro")
f1_test_log = f1_score(y_test, y_pred_log_test, average="micro")

print(
    f"F1 Score for Training Dataset using Logisitic with class_weights {f1_train_log * 100 :.3f}"
)
print(
    f"F1 Score for Testing Dataset using Logisitic after class_weights {f1_test_log * 100 :.3f}"
)

F1 Score for Training Dataset using Logisitic with class_weights 98.175
F1 Score for Testing Dataset using Logisitic after class_weights 96.254


<IPython.core.display.Javascript object>

# `RandomForestClassifier`

In [10]:
### Bagging------> RandomForestClassifier
rf_clf = RandomForestClassifier(
    n_estimators=100,
    criterion="gini",
    max_depth=10,
    max_leaf_nodes=4,
    oob_score=True,
    min_impurity_decrease=0.01,
    random_state=42,
    class_weight="balanced",
    max_samples=0.8,
    max_features=0.6,
)
rf_clf.fit(x_train_resample, y_train_resample)  ## using SMOTE

## Pridect and Evaluation
y_pred_train_rf = rf_clf.predict(x_train_finall)
y_pred_test_rf = rf_clf.predict(x_test_final)


# Evaluation
f1_rf_train = f1_score(y_train, y_pred_train_rf, average="micro")
f1_rf_test = f1_score(y_test, y_pred_test_rf, average="micro")

print(f"F1 Score for Training Dataset using RandomForest  {f1_rf_train * 100 :.3f}")
print(f"F1Score for Testing Dataset using RandomForest  {f1_rf_test * 100 :.3f}")

F1 Score for Training Dataset using RandomForest  94.813
F1Score for Testing Dataset using RandomForest  95.965


<IPython.core.display.Javascript object>

# `XGBoost`

In [11]:
#### XGBoost
xgboost = XGBClassifier(
    objective="binary:logistic",
    n_estimators=100,
    max_depth=4,
    learning_rate=0.9,
    subsample=0.7,
    reg_lambda=100,
    colsample_bytree=0.8,
    random_state=41,
)

xgboost.fit(x_train_resample, y_train_resample)  ## using SMOTE

## Pridect and Evaluation
y_pred_train_xgboost = xgboost.predict(x_train_finall)
y_pred_test_xgboost = xgboost.predict(x_test_final)


# Evaluation
f1_xgboost_train = f1_score(y_train, y_pred_train_xgboost, average="micro")
f1_xgboost_test = f1_score(y_test, y_pred_test_xgboost, average="micro")

print(
    f"F1 Score for Training Dataset using DecisionTree with SMOTE {f1_xgboost_train * 100 :.3f}"
)
print(
    f"F1 Score for Testing Dataset using DecisionTree after SMOTE {f1_xgboost_test * 100 :.3f}"
)

F1 Score for Training Dataset using DecisionTree with SMOTE 99.520
F1 Score for Testing Dataset using DecisionTree after SMOTE 98.559


<IPython.core.display.Javascript object>

In [12]:
### save moddel
joblib.dump(xgboost, "xgboost.pkl")

['xgboost.pkl']

<IPython.core.display.Javascript object>