Import libraries

In [30]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Create path

In [31]:
proj_root = Path.cwd().resolve().parent
sys.path.insert(0, str(proj_root))

In [32]:
RAW_DATA = proj_root / "data" / "raw"
PROCESSED_DATA = proj_root / "data" / "processed"

In [33]:
train_path = RAW_DATA / "train.csv"
test_path = RAW_DATA / "test.csv"
processed_path = PROCESSED_DATA / "processed.csv"

Import datasets:

In [34]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

Get a basic overview

In [35]:
df_train.shape, df_test.shape

((911, 12), (911, 11))

In [36]:
df_train.head()

Unnamed: 0,ID,date,meals_served,kitchen_staff,temperature_C,humidity_percent,day_of_week,special_event,past_waste_kg,staff_experience,waste_category,food_waste_kg
0,0,2022-12-19,196,13,27.887273,45.362854,0,0,7.740587,intermediate,dairy,28.946465
1,1,2023-11-21,244,15,10.317872,64.430475,1,0,42.311779,,MeAt,51.549053
2,4,2022-02-01,148,16,27.7143,69.046113,1,0,41.184305,Beginner,MeAt,53.008323
3,5,2023-03-19,157,19,19.173902,46.292823,6,0,41.543492,Beginner,MeAt,48.621527
4,6,2022-07-18,297,10,26.375233,79.741064,0,0,26.525097,Intermediate,MEAT,44.156984


In [37]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                911 non-null    int64  
 1   date              911 non-null    object 
 2   meals_served      911 non-null    int64  
 3   kitchen_staff     911 non-null    int64  
 4   temperature_C     911 non-null    float64
 5   humidity_percent  911 non-null    float64
 6   day_of_week       911 non-null    int64  
 7   special_event     911 non-null    int64  
 8   past_waste_kg     911 non-null    float64
 9   staff_experience  747 non-null    object 
 10  waste_category    911 non-null    object 
 11  food_waste_kg     911 non-null    float64
dtypes: float64(4), int64(5), object(3)
memory usage: 85.5+ KB


In [38]:
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,911.0,528.327113,305.072794,0.0,266.0,531.0,795.5,1049.0
meals_served,911.0,375.405049,502.812717,100.0,211.0,306.0,407.0,4730.0
kitchen_staff,911.0,11.90011,4.285153,5.0,8.0,12.0,15.0,19.0
temperature_C,911.0,22.18928,8.922389,-10.372207,15.684585,22.11504,28.804294,60.0
humidity_percent,911.0,60.761313,17.330821,30.121111,46.017835,61.63396,75.78791,89.982828
day_of_week,911.0,3.01427,2.009542,0.0,1.0,3.0,5.0,6.0
special_event,911.0,0.08562,0.279956,0.0,0.0,0.0,0.0,1.0
past_waste_kg,911.0,27.015691,12.774223,5.008394,16.091383,26.854109,38.149878,49.803703
food_waste_kg,911.0,44.842691,27.934366,10.819048,32.887912,41.14693,50.046681,274.328783


In [39]:
df_test.head()

Unnamed: 0,ID,date,meals_served,kitchen_staff,temperature_C,humidity_percent,day_of_week,special_event,past_waste_kg,staff_experience,waste_category
0,0,2022-12-19,196,13,27.887273,45.362854,0,0,7.740587,intermediate,dairy
1,1,2023-11-21,244,15,10.317872,64.430475,1,0,42.311779,,MeAt
2,4,2022-02-01,148,16,27.7143,69.046113,1,0,41.184305,Beginner,MeAt
3,5,2023-03-19,157,19,19.173902,46.292823,6,0,41.543492,Beginner,MeAt
4,6,2022-07-18,297,10,26.375233,79.741064,0,0,26.525097,Intermediate,MEAT


We are going to combine train & test dataframes for uniform Preprocessing.

In [40]:
#df_test['food_waste_kg'] = np.nan

In [41]:
#df = pd.concat([df_train, df_test], ignore_index = True)

Check for messing values

In [42]:
missing = df_train.isnull().sum()
missing = missing[missing > 0].sort_values(ascending = False)
missing

staff_experience    164
dtype: int64

Check for duplicates

In [43]:
duplicates = df_train.duplicated().sum()
duplicates

np.int64(0)

Unique counts per column

In [44]:
df_train.nunique()

ID                  911
date                867
meals_served        373
kitchen_staff        15
temperature_C       892
humidity_percent    867
day_of_week           7
special_event         2
past_waste_kg       867
staff_experience      4
waste_category        5
food_waste_kg       867
dtype: int64

Value counts for categorical

In [45]:
cat_cols = df_train.select_dtypes(include = "object").columns
for col in cat_cols:
    print(f"\n Unique values:", df_train[col].value_counts(dropna = False))


 Unique values: date
2022-01-28    2
2022-01-13    2
2022-02-01    2
2022-01-11    2
2022-01-05    2
             ..
2024-05-21    1
2022-03-29    1
2022-11-27    1
2023-04-12    1
2024-05-10    1
Name: count, Length: 867, dtype: int64

 Unique values: staff_experience
Beginner        191
EXPERT          186
Intermediate    186
intermediate    184
NaN             164
Name: count, dtype: int64

 Unique values: waste_category
MEAT          210
dairy         180
Vegetables    176
GRAINS        176
MeAt          169
Name: count, dtype: int64


**Pre-Pipeline Manual Cleaning:**

In [46]:
df_train.drop(columns = ['ID'], inplace = True)

In [47]:
for col in cat_cols:
    df_train[col] = df_train[col].str.lower().str.strip()

In [48]:
for col in cat_cols:
    print(f"\n Unique values:", df_train[col].value_counts(dropna = False))


 Unique values: date
2022-01-28    2
2022-01-13    2
2022-02-01    2
2022-01-11    2
2022-01-05    2
             ..
2024-05-21    1
2022-03-29    1
2022-11-27    1
2023-04-12    1
2024-05-10    1
Name: count, Length: 867, dtype: int64

 Unique values: staff_experience
intermediate    370
beginner        191
expert          186
NaN             164
Name: count, dtype: int64

 Unique values: waste_category
meat          379
dairy         180
vegetables    176
grains        176
Name: count, dtype: int64


In [49]:
df_train['staff_experience'].fillna('unknown', inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['staff_experience'].fillna('unknown', inplace = True)


In [50]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              911 non-null    object 
 1   meals_served      911 non-null    int64  
 2   kitchen_staff     911 non-null    int64  
 3   temperature_C     911 non-null    float64
 4   humidity_percent  911 non-null    float64
 5   day_of_week       911 non-null    int64  
 6   special_event     911 non-null    int64  
 7   past_waste_kg     911 non-null    float64
 8   staff_experience  911 non-null    object 
 9   waste_category    911 non-null    object 
 10  food_waste_kg     911 non-null    float64
dtypes: float64(4), int64(4), object(3)
memory usage: 78.4+ KB


**Feature Engineering (Data Features)**

In [51]:
df_train.head(100)

Unnamed: 0,date,meals_served,kitchen_staff,temperature_C,humidity_percent,day_of_week,special_event,past_waste_kg,staff_experience,waste_category,food_waste_kg
0,2022-12-19,196,13,27.887273,45.362854,0,0,7.740587,intermediate,dairy,28.946465
1,2023-11-21,244,15,10.317872,64.430475,1,0,42.311779,unknown,meat,51.549053
2,2022-02-01,148,16,27.714300,69.046113,1,0,41.184305,beginner,meat,53.008323
3,2023-03-19,157,19,19.173902,46.292823,6,0,41.543492,beginner,meat,48.621527
4,2022-07-18,297,10,26.375233,79.741064,0,0,26.525097,intermediate,meat,44.156984
...,...,...,...,...,...,...,...,...,...,...,...
95,2022-04-20,235,5,28.047191,70.585582,2,0,23.673220,intermediate,dairy,37.030818
96,2023-05-22,448,7,19.776069,53.684061,0,1,47.023967,beginner,dairy,61.496623
97,2024-08-24,107,12,28.129639,65.825746,5,0,12.406967,beginner,meat,18.316725
98,2024-08-07,172,10,14.221355,43.750317,2,0,49.696934,intermediate,meat,39.176465


In [52]:
df_train['date'] = pd.to_datetime(df_train['date'])
df_train['month'] = df_train['date'].dt.month
df_train['is_weekend'] = (df_train['date'].dt.weekday >= 5).astype(int)

In [53]:
df_train.drop(columns = ['date'], inplace = True)

In [55]:
df_train.head(30)

Unnamed: 0,meals_served,kitchen_staff,temperature_C,humidity_percent,day_of_week,special_event,past_waste_kg,staff_experience,waste_category,food_waste_kg,month,is_weekend
0,196,13,27.887273,45.362854,0,0,7.740587,intermediate,dairy,28.946465,12,0
1,244,15,10.317872,64.430475,1,0,42.311779,unknown,meat,51.549053,11,0
2,148,16,27.7143,69.046113,1,0,41.184305,beginner,meat,53.008323,2,0
3,157,19,19.173902,46.292823,6,0,41.543492,beginner,meat,48.621527,3,1
4,297,10,26.375233,79.741064,0,0,26.525097,intermediate,meat,44.156984,7,0
5,241,18,16.863506,79.285919,3,0,11.834878,intermediate,dairy,27.39367,3,0
6,443,16,19.888627,77.328136,0,0,22.862659,beginner,vegetables,52.172118,4,0
7,416,16,18.559591,75.786502,5,1,34.599442,intermediate,meat,72.052407,12,1
8,439,18,24.111027,43.395803,4,0,17.459149,expert,dairy,44.284157,7,0
9,267,7,25.412493,89.405183,1,0,23.067392,expert,grains,33.23393,11,0


In [29]:
df_train.to_csv(processed_path, index=False)


**Define Target and Features**

In [60]:
X = df_train.drop(columns = ['food_waste_kg'])
y = df_train['food_waste_kg']

Split the dataset

In [29]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.2, random_state = 42)

**Now we will build a pipeline:**

In [30]:
# we have to treat features differently
categorical_cols = ['staff_experience', 'waste_category']
numerical_cols = [ col for col in X_train.columns if col not in categorical_cols]

In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor

# Encoding only categorical columns
preprocess = ColumnTransformer(
    transformers = [
        ("cat", OrdinalEncoder(), categorical_cols)
    ],

    remainder = "passthrough"
)

# Full Pipeline: Preprocess + model

model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocess),
    ("model", RandomForestRegressor(
        n_estimators = 300,
        random_state = 42,
        max_depth = 15
    ))
])


**Train Model**

In [32]:
model_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


**Evaluate Model**

In [33]:
from sklearn.metrics import mean_absolute_error, r2_score

preds = model_pipeline.predict(X_test)


print("MAE:", mean_absolute_error(y_test, preds))
print("R2:", r2_score(y_test, preds))


MAE: 4.5855692957809895
R2: 0.9220388551326355


In [None]:
corr = df_train.corr(numeric_only=True)['food_waste_kg'].sort_values(ascending=False)
print(corr)


food_waste_kg       1.000000
meals_served        0.852128
past_waste_kg       0.260767
special_event       0.118757
month               0.024007
day_of_week         0.008090
is_weekend          0.007370
kitchen_staff       0.007192
temperature_C      -0.015911
humidity_percent   -0.021793
Name: food_waste_kg, dtype: float64
