In [47]:
import pytorch_lightning as pl
import pandas as pd
import torch
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, FunctionTransformer, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_absolute_error, mean_squared_error
from torchmetrics import MeanSquaredError
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

In [2]:
df = pd.read_csv('data/train.csv')
dfm = pd.read_csv('data/meal_info.csv')
dfc = pd.read_csv('data/fulfilment_center_info.csv')
df = pd.merge(df, dfm, on='meal_id', how='left')
df = pd.merge(df, dfc, on='center_id', how='left')

In [3]:
df.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,category,cuisine,city_code,region_code,center_type,op_area
0,1379560,1,55,1885,136.83,152.29,0,0,177,Beverages,Thai,647,56,TYPE_C,2.0
1,1466964,1,55,1993,136.83,135.83,0,0,270,Beverages,Thai,647,56,TYPE_C,2.0
2,1346989,1,55,2539,134.86,135.86,0,0,189,Beverages,Thai,647,56,TYPE_C,2.0
3,1338232,1,55,2139,339.5,437.53,0,0,54,Beverages,Indian,647,56,TYPE_C,2.0
4,1448490,1,55,2631,243.5,242.5,0,0,40,Beverages,Indian,647,56,TYPE_C,2.0


In [4]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,456548.0,1250096.0,144354.822378,1000000.0,1124998.75,1250183.5,1375140.25,1499999.0
week,456548.0,74.76877,41.524956,1.0,39.0,76.0,111.0,145.0
center_id,456548.0,82.1058,45.975046,10.0,43.0,76.0,110.0,186.0
meal_id,456548.0,2024.337,547.42092,1062.0,1558.0,1993.0,2539.0,2956.0
checkout_price,456548.0,332.2389,152.939723,2.97,228.95,296.82,445.23,866.27
base_price,456548.0,354.1566,160.715914,55.35,243.5,310.46,458.87,866.27
emailer_for_promotion,456548.0,0.08115247,0.273069,0.0,0.0,0.0,0.0,1.0
homepage_featured,456548.0,0.1091999,0.31189,0.0,0.0,0.0,0.0,1.0
num_orders,456548.0,261.8728,395.922798,13.0,54.0,136.0,324.0,24299.0
city_code,456548.0,601.5534,66.195914,456.0,553.0,596.0,651.0,713.0


In [5]:
df["category"] = df["category"].astype("category")
columns_to_drop = ["num_orders", "id"]
test_start_week = df["week"].max() * 0.8
validation_start_week = df["week"].max() * 0.6
df_train = df[df["week"] < validation_start_week]
df_validation = df[(df["week"] >= validation_start_week) & (df["week"] < test_start_week)]
df_test = df[df["week"] >= test_start_week]
X_train = df_train.drop(columns=columns_to_drop)
y_train = df_train["num_orders"]
X_validation = df_validation.drop(columns=columns_to_drop)
y_validation = df_validation["num_orders"]
X_test = df_test.drop(columns=columns_to_drop)
y_test = df_test["num_orders"]

In [6]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263590 entries, 0 to 263589
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   week                   263590 non-null  int64   
 1   center_id              263590 non-null  int64   
 2   meal_id                263590 non-null  int64   
 3   checkout_price         263590 non-null  float64 
 4   base_price             263590 non-null  float64 
 5   emailer_for_promotion  263590 non-null  int64   
 6   homepage_featured      263590 non-null  int64   
 7   category               263590 non-null  category
 8   cuisine                263590 non-null  object  
 9   city_code              263590 non-null  int64   
 10  region_code            263590 non-null  int64   
 11  center_type            263590 non-null  object  
 12  op_area                263590 non-null  float64 
dtypes: category(1), float64(3), int64(7), object(2)
memory usage: 26.4+ MB


In [7]:
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
week,263590.0,44.542365,24.75009,1.0,23.0,45.0,66.0,86.0
center_id,263590.0,82.191878,45.970215,10.0,43.0,76.0,110.0,186.0
meal_id,263590.0,2018.845719,546.80424,1062.0,1543.0,1971.0,2539.0,2956.0
checkout_price,263590.0,329.775977,152.226226,2.97,229.89,291.06,438.5,738.23
base_price,263590.0,350.88183,157.041646,55.35,243.5,309.43,455.93,738.23
emailer_for_promotion,263590.0,0.080318,0.271785,0.0,0.0,0.0,0.0,1.0
homepage_featured,263590.0,0.108984,0.311619,0.0,0.0,0.0,0.0,1.0
city_code,263590.0,601.524018,66.269349,456.0,553.0,596.0,651.0,713.0
region_code,263590.0,56.577393,17.627253,23.0,34.0,56.0,77.0,93.0
op_area,263590.0,4.082859,1.089724,0.9,3.6,4.0,4.5,7.0


In [8]:
X_train.nunique()

week                       86
center_id                  77
meal_id                    51
checkout_price           1892
base_price               1745
emailer_for_promotion       2
homepage_featured           2
category                   14
cuisine                     4
city_code                  51
region_code                 8
center_type                 3
op_area                    30
dtype: int64

In [9]:
# df = pd.DataFrame({
#     'Color': ['Red', 'Blue', 'Green', 'Blue', 'Red'],  # Categorical
#     'Size': ['S', 'M', 'L', 'S', 'L'],  # Ordinal
#     "Category": ['A', 'B', 'A', 'B', 'A'],  # Categorical
#     'Price': [10, 20, 15, 25, 30],  # Continuous
#     'Week': [10, 20, 15, 25, 30]  # Continuous
# })
# ordinal_columns = ["Category"]
# one_hot_columns = ["Size", "Color"]
# numerical_columns = ["Price"]
# unchanged_columns = ["Week"]

ordinal_columns = ["center_id", "meal_id", "category", "city_code", "region_code"]
one_hot_columns = ["cuisine", "center_type"]
numerical_columns = ["checkout_price", "base_price", "op_area"]
unchanged_columns = ["week", "emailer_for_promotion", "homepage_featured"]


def transform_data(df, pipeline):
    if not pipeline:
        ordinal_encoder_pipeline = Pipeline(
            steps=[
                ("ordinal_encoder", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
                ("functional_transformer", FunctionTransformer(lambda df: df + 1)),
            ]
        )
        one_hot_encoder_pipeline = Pipeline(
            steps=[
                ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
            ]
        )
        numerical_encoder_pipeline = Pipeline(
            steps=[
                ("simple_imputer", SimpleImputer(strategy='constant', fill_value=0)),
                ("min_max_scaler", MinMaxScaler())
            ]
        )
        preprocessor = ColumnTransformer(
            transformers=[
                ('ordinal_encoder_pipeline', ordinal_encoder_pipeline, ordinal_columns),
                ("one_hot_encoder_pipeline", one_hot_encoder_pipeline, one_hot_columns),
                ("numerical_encoder_pipeline", numerical_encoder_pipeline, numerical_columns)
            ], remainder='passthrough')
        pipeline = Pipeline(steps=[("preprocessor", preprocessor)])
        encoded = pipeline.fit_transform(df)
    else:
        encoded = pipeline.transform(df)
    new_one_hot_columns = list(
        pipeline.named_steps["preprocessor"].named_transformers_["one_hot_encoder_pipeline"].get_feature_names_out(
            one_hot_columns))
    new_column_names = ordinal_columns + new_one_hot_columns + numerical_columns + unchanged_columns
    return pd.DataFrame(encoded, columns=new_column_names), pipeline


X_train, pipeline = transform_data(X_train, None)

In [10]:
X_test, pipeline = transform_data(X_test, pipeline)
X_validation, pipeline = transform_data(X_validation, pipeline)

In [11]:
X_train[ordinal_columns] = X_train[ordinal_columns].astype("int")
X_test[ordinal_columns] = X_test[ordinal_columns].astype("int")
X_validation[ordinal_columns] = X_validation[ordinal_columns].astype("int")
categorical_cardinalities = [X_train[column].max() for column in ordinal_columns]
X_train_numeric = X_train.drop(columns=ordinal_columns)
X_train_embedding = X_train[ordinal_columns]

X_test_numeric = X_test.drop(columns=ordinal_columns)
X_test_embedding = X_test[ordinal_columns]
X_validation_numeric = X_validation.drop(columns=ordinal_columns)
X_validation_embedding = X_validation[ordinal_columns]

In [12]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)


class CustomDataset(Dataset):
    def __init__(self, X_numeric, X_embedding, y):
        self.X_numeric = torch.tensor(X_numeric.values, dtype=torch.float32).to(device)
        self.X_embedding = torch.tensor(X_embedding.values, dtype=torch.int).to(device)
        self.y = torch.tensor(y.values, dtype=torch.float32).view(-1, 1).to(device)

    def __len__(self):
        return len(self.X_numeric)

    def __getitem__(self, idx):
        return self.X_numeric[idx], self.X_embedding[idx], self.y[idx]


train_dataset = CustomDataset(X_train_numeric, X_train_embedding, y_train)
validation_dataset = CustomDataset(X_validation_numeric, X_validation_embedding, y_validation)
test_dataset = CustomDataset(X_test_numeric, X_test_embedding, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

mps


In [49]:
import torch.nn as nn
import torch.optim as optim


class RegressionModel(pl.LightningModule):
    def __init__(self, numerical_input_dim, embedding_cardinality, embedding_dim=4):
        super().__init__()
        self.embeddings = nn.ModuleList([
            nn.Embedding(num_categories, embedding_dim)
            for num_categories in embedding_cardinality
        ])
        total_embedding_size = len(embedding_cardinality) * embedding_dim
        self.model = nn.Sequential(
            nn.Linear(numerical_input_dim + total_embedding_size, 128),
            nn.ReLU(),
            nn.Linear(128, 16),
            nn.ReLU(),
            nn.Linear(16, 1)  # Regression output
        )
        self.loss_fn = nn.MSELoss()

    def forward(self, numerical, categorical):
        embedded = [embed_layer(categorical[:, i]) for i, embed_layer in enumerate(self.embeddings)]
        embedded = torch.cat(embedded, dim=1)
        x = torch.cat([embedded, numerical], dim=1)
        return self.model(x)

    def training_step(self, batch, batch_idx):
        numerical, categorical, y = batch
        y_pred = self(numerical, categorical)
        loss = self.loss_fn(y_pred, y)
        self.log("train_loss", loss, prog_bar=True, on_epoch=True, on_step=False)
        return loss

    def validation_step(self, batch, batch_idx):
        numerical, categorical, y = batch
        y_pred = self(numerical, categorical)
        loss = self.loss_fn(y_pred, y)
        self.log("val_loss", loss, prog_bar=True, on_epoch=True, on_step=False)

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=0.01)


model = RegressionModel(numerical_input_dim=X_train_numeric.shape[1],
                        embedding_cardinality=categorical_cardinalities).to(device)
model.train()
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=1,
    filename="best_model-{epoch:02d}-{val_loss:.4f}",
    verbose=True
)
early_stopping_callback = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,
    verbose=True
)
trainer = pl.Trainer(max_epochs=50,
                     log_every_n_steps=1,
                     accelerator=str(device),
                     precision="16-mixed",
                     callbacks=[checkpoint_callback, early_stopping_callback])
trainer.fit(model, train_loader, validation_loader)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name       | Type       | Params | Mode 
--------------------------------------------------
0 | embeddings | ModuleList | 804    | train
1 | model      | Sequential | 6.4 K  | train
2 | loss_fn    | MSELoss    | 0      | train
--------------------------------------------------
7.2 K     Trainable params
0         Non-trainable params
7.2 K     Total params
0.029     Total estimated model params size (MB)
13        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/ozge/PycharmProjects/ozge/kaggle/.env/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/Users/ozge/PycharmProjects/ozge/kaggle/.env/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 37714.262
Epoch 0, global step 4119: 'val_loss' reached 37714.26172 (best 37714.26172), saving model to '/Users/ozge/PycharmProjects/ozge/kaggle/food_demand_forecasting/lightning_logs/version_31/checkpoints/best_model-epoch=00-val_loss=37714.2617.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 1588.410 >= min_delta = 0.0. New best score: 36125.852
Epoch 1, global step 8238: 'val_loss' reached 36125.85156 (best 36125.85156), saving model to '/Users/ozge/PycharmProjects/ozge/kaggle/food_demand_forecasting/lightning_logs/version_31/checkpoints/best_model-epoch=01-val_loss=36125.8516.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 3011.125 >= min_delta = 0.0. New best score: 33114.727
Epoch 2, global step 12357: 'val_loss' reached 33114.72656 (best 33114.72656), saving model to '/Users/ozge/PycharmProjects/ozge/kaggle/food_demand_forecasting/lightning_logs/version_31/checkpoints/best_model-epoch=02-val_loss=33114.7266.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 16476: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4, global step 20595: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5, global step 24714: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 6, global step 28833: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Monitored metric val_loss did not improve in the last 5 records. Best score: 33114.727. Signaling Trainer to stop.
Epoch 7, global step 32952: 'val_loss' was not in top 1


In [58]:
model = RegressionModel.load_from_checkpoint(checkpoint_callback.best_model_path,
                                             numerical_input_dim=X_train_numeric.shape[1],
                                             embedding_cardinality=categorical_cardinalities).to(device)


In [59]:
model.eval()
test_to_pred_numeric = torch.tensor(X_test_numeric.values, dtype=torch.float32, device=device)
test_to_pred_embedding = torch.tensor(X_test_embedding.values, dtype=torch.int, device=device)
with torch.no_grad():
    predictions = model.to(str(device))(test_to_pred_numeric, test_to_pred_embedding)
predictions = predictions.cpu().numpy().reshape(-1)
print("Predictions:", predictions)

Predictions: [364.94754 208.78683 200.46094 ... 400.96475 383.45883 159.6444 ]


In [60]:
mae = mean_absolute_error(y_test.values, predictions)
mse = mean_squared_error(y_test.values, predictions)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")

"""
Mean Absolute Error: 94.37028976804564
Mean Squared Error: 32748.60997751085
"""

Mean Absolute Error: 94.37028976804564
Mean Squared Error: 32748.60997751085


'\nMean Absolute Error: 147.2391832704101\nMean Squared Error: 60632.16680838898\n\nMean Absolute Error: 103.89691858055811\nMean Squared Error: 38411.99397804801\n\nMean Absolute Error: 94.41249263131034\nMean Squared Error: 33476.05190155651\n'