In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e2/sample_submission.csv
/kaggle/input/playground-series-s4e2/train.csv
/kaggle/input/playground-series-s4e2/test.csv


# Train Df Setup

In [3]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the data
train_df = pd.read_csv("/kaggle/input/playground-series-s4e2/train.csv")

# Define features and target variable
features = train_df.columns.drop('NObeyesdad')

X = train_df[features]
y = train_df["NObeyesdad"]

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
print("Train set:", X_train.shape, y_train.shape)
print("Test set:", X_test.shape, y_test.shape)



Train set: (16606, 17) (16606,)
Test set: (4152, 17) (4152,)


## Label Decoder Chart

In [4]:
# Get the mapping between encoded labels and categories
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))

# Print the mapping
print("Label Mapping:")
for category, label in label_mapping.items():
    print(f"Category: {category} - Label: {label}")

Label Mapping:
Category: Insufficient_Weight - Label: 0
Category: Normal_Weight - Label: 1
Category: Obesity_Type_I - Label: 2
Category: Obesity_Type_II - Label: 3
Category: Obesity_Type_III - Label: 4
Category: Overweight_Level_I - Label: 5
Category: Overweight_Level_II - Label: 6


# Test DF Setup

In [5]:
numerical_columns = train_df.select_dtypes(include=['float64', 'int64']).columns

numerical_columns

Index(['id', 'Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE'], dtype='object')

In [6]:
#test df
from sklearn.preprocessing import MinMaxScaler
test_df = pd.read_csv("/kaggle/input/playground-series-s4e2/test.csv")

basic_features_copy = test_df.columns
# Using pandas get_dummies for one-hot encoding
test_df = pd.get_dummies(test_df, columns=['Gender'])


# Select numerical columns
numerical_columns = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the numerical columns
test_df[numerical_columns] = scaler.fit_transform(test_df[numerical_columns])
print("Scaled DataFrame:\n", test_df)

# One-hot encoding for 'MTRANS'
test_df = pd.get_dummies(test_df, columns=['MTRANS'])

# One-hot encoding for 'CALC'
test_df = pd.get_dummies(test_df, columns=['CALC'])

# One-hot encoding for 'family_history_with_overweight'
test_df = pd.get_dummies(test_df, columns=['family_history_with_overweight'])

# One-hot encoding for 'MTRANS'
test_df = pd.get_dummies(test_df, columns=['SMOKE'])

# One-hot encoding for 'CALC'
test_df = pd.get_dummies(test_df, columns=['FAVC'])

# One-hot encoding for 'family_history_with_overweight'
test_df = pd.get_dummies(test_df, columns=['CAEC'])

test_df = pd.get_dummies(test_df, columns=['SCC'])

features = test_df.columns
X_2 = test_df[features]





Scaled DataFrame:
           id       Age    Height    Weight family_history_with_overweight  \
0      20758  0.274466  0.751498  0.647675                            yes   
1      20759  0.148936  0.283019  0.214188                            yes   
2      20760  0.255319  0.364821  0.575933                            yes   
3      20761  0.148495  0.194579  0.513014                            yes   
4      20762  0.255319  0.334709  0.522265                            yes   
...      ...       ...       ...       ...                            ...   
13835  34593  0.198465  0.512045  0.309624                            yes   
13836  34594  0.319149  0.264151  0.182457                             no   
13837  34595  0.190119  0.255749  0.042652                             no   
13838  34596  0.148936  0.320755  0.111061                            yes   
13839  34597  0.265764  0.683508  0.650343                            yes   

      FAVC      FCVC       NCP        CAEC SMOKE      CH

## Comparison of Old and New Test_DF

In [7]:
#OLD FEATURES
print(f'Test Dataframe Old Features: \n')
for v,a in enumerate(basic_features_copy):
    print(f'index: {v}, \t Feature: {a}')

Test Dataframe Old Features: 

index: 0, 	 Feature: id
index: 1, 	 Feature: Gender
index: 2, 	 Feature: Age
index: 3, 	 Feature: Height
index: 4, 	 Feature: Weight
index: 5, 	 Feature: family_history_with_overweight
index: 6, 	 Feature: FAVC
index: 7, 	 Feature: FCVC
index: 8, 	 Feature: NCP
index: 9, 	 Feature: CAEC
index: 10, 	 Feature: SMOKE
index: 11, 	 Feature: CH2O
index: 12, 	 Feature: SCC
index: 13, 	 Feature: FAF
index: 14, 	 Feature: TUE
index: 15, 	 Feature: CALC
index: 16, 	 Feature: MTRANS


In [8]:
#NEW FEATURES
print(f'Test Dataframe New Features: \n')
for n,i in enumerate(features):
    print(f'index: {n}, \t Feature: {i}')

Test Dataframe New Features: 

index: 0, 	 Feature: id
index: 1, 	 Feature: Age
index: 2, 	 Feature: Height
index: 3, 	 Feature: Weight
index: 4, 	 Feature: FCVC
index: 5, 	 Feature: NCP
index: 6, 	 Feature: CH2O
index: 7, 	 Feature: FAF
index: 8, 	 Feature: TUE
index: 9, 	 Feature: Gender_Female
index: 10, 	 Feature: Gender_Male
index: 11, 	 Feature: MTRANS_Automobile
index: 12, 	 Feature: MTRANS_Bike
index: 13, 	 Feature: MTRANS_Motorbike
index: 14, 	 Feature: MTRANS_Public_Transportation
index: 15, 	 Feature: MTRANS_Walking
index: 16, 	 Feature: CALC_Always
index: 17, 	 Feature: CALC_Frequently
index: 18, 	 Feature: CALC_Sometimes
index: 19, 	 Feature: CALC_no
index: 20, 	 Feature: family_history_with_overweight_no
index: 21, 	 Feature: family_history_with_overweight_yes
index: 22, 	 Feature: SMOKE_no
index: 23, 	 Feature: SMOKE_yes
index: 24, 	 Feature: FAVC_no
index: 25, 	 Feature: FAVC_yes
index: 26, 	 Feature: CAEC_Always
index: 27, 	 Feature: CAEC_Frequently
index: 28, 	 Featur

In [9]:
#ONLY NEW CREATED FEATURES BY ONE-HOT
print(f'New Features Created by One-Hot: \n')
for o in features:
    if o not in basic_features_copy:
        print(o)

New Features Created by One-Hot: 

Gender_Female
Gender_Male
MTRANS_Automobile
MTRANS_Bike
MTRANS_Motorbike
MTRANS_Public_Transportation
MTRANS_Walking
CALC_Always
CALC_Frequently
CALC_Sometimes
CALC_no
family_history_with_overweight_no
family_history_with_overweight_yes
SMOKE_no
SMOKE_yes
FAVC_no
FAVC_yes
CAEC_Always
CAEC_Frequently
CAEC_Sometimes
CAEC_no
SCC_no
SCC_yes


### Numerical Values Min Max Scaler

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Select numerical columns
numerical_columns = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

# Using pandas get_dummies for one-hot encoding
train_df = pd.get_dummies(train_df, columns=['Gender'])

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the numerical columns
train_df[numerical_columns] = scaler.fit_transform(train_df[numerical_columns])


### Dealing with Categorical values

In [11]:
# One-hot encoding for 'MTRANS'
train_df = pd.get_dummies(train_df, columns=['MTRANS'])

# One-hot encoding for 'CALC'
train_df = pd.get_dummies(train_df, columns=['CALC'])

# One-hot encoding for 'family_history_with_overweight'
train_df = pd.get_dummies(train_df, columns=['family_history_with_overweight'])

# One-hot encoding for 'MTRANS'
train_df = pd.get_dummies(train_df, columns=['SMOKE'])

# One-hot encoding for 'CALC'
train_df = pd.get_dummies(train_df, columns=['FAVC'])

# One-hot encoding for 'family_history_with_overweight'
train_df = pd.get_dummies(train_df, columns=['CAEC'])

train_df = pd.get_dummies(train_df, columns=['SCC'])



# Changed Train DF Update

In [12]:
features = train_df.drop(columns=['NObeyesdad', 'id']).columns
X = train_df[features]
y = train_df["NObeyesdad"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
print("Train set:", X_train.shape, y_train.shape)
print("Test set:", X_test.shape, y_test.shape)

Train set: (16606, 30) (16606,)
Test set: (4152, 30) (4152,)


In [13]:
X_train

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,...,SMOKE_no,SMOKE_yes,FAVC_no,FAVC_yes,CAEC_Always,CAEC_Frequently,CAEC_Sometimes,CAEC_no,SCC_no,SCC_yes
9958,0.063830,0.608755,0.460108,1.000000,0.666667,0.500000,1.000000,0.500000,False,True,...,True,False,False,True,True,False,False,False,True,False
7841,0.184417,0.577155,0.125952,0.500000,1.000000,0.500000,0.666667,0.500000,False,True,...,True,False,False,True,False,False,True,False,True,False
9293,0.166773,0.703620,0.658629,1.000000,0.626939,0.322669,0.246627,0.000000,False,True,...,True,False,False,True,False,False,True,False,True,False
15209,0.574468,0.247307,0.325249,0.500000,0.666667,0.000000,0.000000,0.000000,True,False,...,True,False,False,True,False,False,True,False,True,False
16515,0.191489,0.665826,0.444243,1.000000,0.666667,1.000000,0.666667,0.500000,False,True,...,True,False,True,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,0.085106,0.518319,0.328761,0.814396,0.520965,0.422323,0.096011,0.361138,True,False,...,True,False,False,True,False,False,True,False,True,False
11964,0.170213,0.570708,0.444243,1.000000,0.666667,0.500000,1.000000,0.000000,False,True,...,True,False,True,False,False,False,True,False,True,False
5390,0.148936,0.323401,0.230054,0.500000,0.666667,1.000000,0.666667,0.000000,False,True,...,True,False,False,True,True,False,False,False,True,False
860,0.255319,0.380710,0.578623,1.000000,0.666667,0.885366,0.000000,0.118653,True,False,...,True,False,False,True,False,False,True,False,True,False


# MODELS

In [14]:
#impors
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.fixes import loguniform

### XGBClassifier

In [15]:
# Create XGBoost classifier
model = XGBClassifier(objective="multi:softmax",learning_rate=0.2)

model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9007707129094412

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.93      0.94       524
           1       0.88      0.88      0.88       626
           2       0.88      0.86      0.87       543
           3       0.98      0.97      0.97       657
           4       1.00      1.00      1.00       804
           5       0.78      0.78      0.78       484
           6       0.78      0.81      0.79       514

    accuracy                           0.90      4152
   macro avg       0.89      0.89      0.89      4152
weighted avg       0.90      0.90      0.90      4152



### XGBClassifier with param_grid, RandomizedSearchCV, random_search

In [16]:
# Create XGBoost classifier
model = XGBClassifier(objective="multi:softmax", num_class=len(np.unique(y_encoded)), random_state=42)

# Define the parameter grid with reduced size
param_grid = {
    'learning_rate': loguniform(0.03, 0.3),
    'n_estimators': [100, 200],
    'max_depth': [3, 4],
    'min_child_weight': [1, 2],
    'gamma': [0, 0.1],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [0, 0.1]
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=50, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Fit the best model with early stopping
best_model.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_test, y_test)], verbose=False)

# Make predictions
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))




Accuracy: 0.9072736030828517

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.94      0.94       524
           1       0.88      0.89      0.88       626
           2       0.90      0.87      0.88       543
           3       0.98      0.97      0.98       657
           4       1.00      1.00      1.00       804
           5       0.79      0.79      0.79       484
           6       0.80      0.82      0.81       514

    accuracy                           0.91      4152
   macro avg       0.90      0.90      0.90      4152
weighted avg       0.91      0.91      0.91      4152



# Test Prediction

In [17]:
X_2

Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,...,SMOKE_no,SMOKE_yes,FAVC_no,FAVC_yes,CAEC_Always,CAEC_Frequently,CAEC_Sometimes,CAEC_no,SCC_no,SCC_yes
0,20758,0.274466,0.751498,0.647675,0.969308,0.666667,0.912815,0.285133,0.000000,False,...,True,False,False,True,False,False,True,False,True,False
1,20759,0.148936,0.283019,0.214188,0.500000,0.000000,1.000000,0.333333,0.000000,True,...,True,False,False,True,False,False,True,False,True,False
2,20760,0.255319,0.364821,0.575933,1.000000,0.666667,0.810939,0.000000,0.125251,True,...,True,False,False,True,False,False,True,False,True,False
3,20761,0.148495,0.194579,0.513014,0.500000,0.659303,0.893209,0.031617,0.000000,False,...,True,False,False,True,False,False,True,False,True,False
4,20762,0.255319,0.334709,0.522265,1.000000,0.666667,0.826766,0.000000,0.370534,True,...,True,False,False,True,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13835,34593,0.198465,0.512045,0.309624,0.906617,0.666667,0.000000,0.269025,0.389316,False,...,True,False,True,False,False,False,True,False,True,False
13836,34594,0.319149,0.264151,0.182457,1.000000,0.666667,0.500000,0.000000,0.000000,True,...,True,False,False,True,False,False,True,False,True,False
13837,34595,0.190119,0.255749,0.042652,1.000000,0.424580,0.500000,0.649947,0.500000,True,...,True,False,False,True,False,True,False,False,True,False
13838,34596,0.148936,0.320755,0.111061,0.500000,0.666667,0.500000,1.000000,1.000000,False,...,True,False,False,True,False,False,True,False,True,False


In [None]:
#TEST PREDICTION
X_2['CALC'] = np.where(X_2['CALC'] == 'Always', 'Frequently', X_2['CALC'])

y_pred_2 = best_model.predict(X_2)

y_pred_2

In [None]:
# Define the mapping dictionary
label_to_category = {
    0: "Insufficient_Weight",
    1: "Normal_Weight",
    2: "Obesity_Type_I",
    3: "Obesity_Type_II",
    4: "Obesity_Type_III",
    5: "Overweight_Level_I",
    6: "Overweight_Level_II"
}


# Convert numerical labels to categories
predicted_categories = [label_to_category[label] for label in y_pred_2]




## Submission export

In [None]:
submission_df = pd.DataFrame({
    "id": test_df["id"],  # Assuming test_df contains the test set with 'id' column
    "NObeyesdad": predicted_categories  # Assuming y_pred contains the predicted class labels
})

# Assuming submission_df contains the DataFrame with predictions
submission_df.to_csv("submission.csv", index=False)
