In [4]:
import pandas as pd
import numpy as np


In [5]:
import sys, os

# # Go one folder up, then into Src
project_root = os.path.abspath("..")
sys.path.append(project_root + "/src")

from data_cleaning import clean_dataset

df = pd.read_csv("../app/data/generated_diet_data.csv")
df = clean_dataset(df)


In [6]:
df.head()

Unnamed: 0,age,gender,height_cm,weight_kg,bmi,activity_level,goal,medical_condition,diet_preference,water_intake_liters,sleep_hours,hemoglobin,sugar_level,cholesterol_level,recommended_diet
0,36,Male,177.6,69.9,34.6,Light,Lose Weight,Unknown,Non-Veg,1.7,7.5,15.5,Slightly High,Borderline,Low-Carb Diet
1,30,Male,180.7,81.3,24.9,Light,Lose Weight,Unknown,Veg,2.84,6.2,14.0,Normal,Borderline,Balanced Diet
2,38,Male,170.3,83.2,28.7,Light,Gain Muscle,Unknown,Veg,2.21,5.6,14.7,Normal,Borderline,Balanced Diet
3,47,Male,168.1,71.8,25.4,Sedentary,Lose Weight,Unknown,Veg,2.22,7.6,14.2,Slightly High,High,Heart-Healthy Diet
4,29,Male,164.3,71.3,26.4,Light,Lose Weight,Unknown,Non-Veg,1.48,7.2,14.1,Normal,Borderline,Balanced Diet


In [7]:
from sklearn.model_selection import train_test_split

target = "recommended_diet"

#features of (X) and (y)
X = df.drop(columns=[target])
y = df[target]

#stratified split : keeping class proprtions the same
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)

# make a copy of it for easy saving or for verifying
train_df = X_train.copy()
train_df[target] = y_train

test_df = y_test.copy()
test_df[target] = y_test

#Need to save train and test split data
import os
os.makedirs("../app/data/processed", exist_ok=True)
train_df.to_csv("../app/data/processed/train.csv", index=False)
test_df.to_csv("../app/data/processed/test.csv", index=False)

print("Train shape", train_df.shape)
print("Test_df", test_df.shape)



Train shape (2000, 15)
Test_df (501,)


In [8]:
#lets check if the class distribution in each split

print("\n overall target dist:")
print(df[target].value_counts(normalize=True).sort_index())

print("\n Train target dist:")
print(train_df[target].value_counts(normalize=True).sort_index())

print("\n Test target dist:")
print(test_df[target].value_counts(normalize=True).sort_index())


 overall target dist:
recommended_diet
Balanced Diet         0.6444
Heart-Healthy Diet    0.0928
High-Protein Diet     0.0284
Iron-Rich Diet        0.0288
Low-Carb Diet         0.2056
Name: proportion, dtype: float64

 Train target dist:
recommended_diet
Balanced Diet         0.6445
Heart-Healthy Diet    0.0925
High-Protein Diet     0.0285
Iron-Rich Diet        0.0290
Low-Carb Diet         0.2055
Name: proportion, dtype: float64

 Test target dist:
recommended_diet
Balanced Diet         0.644
Heart-Healthy Diet    0.094
High-Protein Diet     0.028
Iron-Rich Diet        0.028
Low-Carb Diet         0.206
Name: proportion, dtype: float64


In [9]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline



In [10]:
# detect columns
num_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()

##### Building the preprocessing Transformers

**Numerical Transformer**

In [11]:
numeric_transformer = Pipeline(
        ("scaler", StandardScaler)
)

**Categorical Transformer**

In [12]:
categorical_tansformer = Pipeline([
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

**Combining both Numeric and Categorical**

In [13]:
preprocessor = ColumnTransformer(
    transformers = [
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_tansformer, cat_cols)
    ]
)

**Encoding the *target* column seperately**

In [14]:
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

In [15]:
print(y_train_encoded)

[0 0 4 ... 0 2 0]
