In [66]:
import pytorch_lightning as pl
import pandas as pd
import torch
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, FunctionTransformer, MinMaxScaler
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
df = pd.read_csv('data/train.csv')
dfm = pd.read_csv('data/meal_info.csv')
dfc = pd.read_csv('data/fulfilment_center_info.csv')
df = pd.merge(df, dfm, on='meal_id', how='left')
df = pd.merge(df, dfc, on='center_id', how='left')

In [3]:
df.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,category,cuisine,city_code,region_code,center_type,op_area
0,1379560,1,55,1885,136.83,152.29,0,0,177,Beverages,Thai,647,56,TYPE_C,2.0
1,1466964,1,55,1993,136.83,135.83,0,0,270,Beverages,Thai,647,56,TYPE_C,2.0
2,1346989,1,55,2539,134.86,135.86,0,0,189,Beverages,Thai,647,56,TYPE_C,2.0
3,1338232,1,55,2139,339.5,437.53,0,0,54,Beverages,Indian,647,56,TYPE_C,2.0
4,1448490,1,55,2631,243.5,242.5,0,0,40,Beverages,Indian,647,56,TYPE_C,2.0


In [90]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,456548.0,1250096.0,144354.822378,1000000.0,1124998.75,1250183.5,1375140.25,1499999.0
week,456548.0,74.76877,41.524956,1.0,39.0,76.0,111.0,145.0
center_id,456548.0,82.1058,45.975046,10.0,43.0,76.0,110.0,186.0
meal_id,456548.0,2024.337,547.42092,1062.0,1558.0,1993.0,2539.0,2956.0
checkout_price,456548.0,332.2389,152.939723,2.97,228.95,296.82,445.23,866.27
base_price,456548.0,354.1566,160.715914,55.35,243.5,310.46,458.87,866.27
emailer_for_promotion,456548.0,0.08115247,0.273069,0.0,0.0,0.0,0.0,1.0
homepage_featured,456548.0,0.1091999,0.31189,0.0,0.0,0.0,0.0,1.0
num_orders,456548.0,261.8728,395.922798,13.0,54.0,136.0,324.0,24299.0
city_code,456548.0,601.5534,66.195914,456.0,553.0,596.0,651.0,713.0


In [4]:
df["category"] = df["category"].astype("category")
columns_to_drop = ["num_orders", "id"]
test_start_week = df["week"].max() * 0.8
validation_start_week = df["week"].max() * 0.6
df_train = df[df["week"] < validation_start_week]
df_validation = df[(df["week"] >= validation_start_week) & (df["week"] < test_start_week)]
df_test = df[df["week"] >= test_start_week]
X_train = df_train.drop(columns=columns_to_drop)
y_train = df_train["num_orders"]
X_validation = df_validation.drop(columns=columns_to_drop)
y_validation = df_validation["num_orders"]
X_test = df_test.drop(columns=columns_to_drop)
y_test = df_test["num_orders"]

In [5]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263590 entries, 0 to 263589
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   week                   263590 non-null  int64   
 1   center_id              263590 non-null  int64   
 2   meal_id                263590 non-null  int64   
 3   checkout_price         263590 non-null  float64 
 4   base_price             263590 non-null  float64 
 5   emailer_for_promotion  263590 non-null  int64   
 6   homepage_featured      263590 non-null  int64   
 7   category               263590 non-null  category
 8   cuisine                263590 non-null  object  
 9   city_code              263590 non-null  int64   
 10  region_code            263590 non-null  int64   
 11  center_type            263590 non-null  object  
 12  op_area                263590 non-null  float64 
dtypes: category(1), float64(3), int64(7), object(2)
memory usage: 26.4+ MB


In [6]:
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
week,263590.0,44.542365,24.75009,1.0,23.0,45.0,66.0,86.0
center_id,263590.0,82.191878,45.970215,10.0,43.0,76.0,110.0,186.0
meal_id,263590.0,2018.845719,546.80424,1062.0,1543.0,1971.0,2539.0,2956.0
checkout_price,263590.0,329.775977,152.226226,2.97,229.89,291.06,438.5,738.23
base_price,263590.0,350.88183,157.041646,55.35,243.5,309.43,455.93,738.23
emailer_for_promotion,263590.0,0.080318,0.271785,0.0,0.0,0.0,0.0,1.0
homepage_featured,263590.0,0.108984,0.311619,0.0,0.0,0.0,0.0,1.0
city_code,263590.0,601.524018,66.269349,456.0,553.0,596.0,651.0,713.0
region_code,263590.0,56.577393,17.627253,23.0,34.0,56.0,77.0,93.0
op_area,263590.0,4.082859,1.089724,0.9,3.6,4.0,4.5,7.0


In [7]:
X_train.nunique()

week                       86
center_id                  77
meal_id                    51
checkout_price           1892
base_price               1745
emailer_for_promotion       2
homepage_featured           2
category                   14
cuisine                     4
city_code                  51
region_code                 8
center_type                 3
op_area                    30
dtype: int64

In [8]:
# df = pd.DataFrame({
#     'Color': ['Red', 'Blue', 'Green', 'Blue', 'Red'],  # Categorical
#     'Size': ['S', 'M', 'L', 'S', 'L'],  # Ordinal
#     "Category": ['A', 'B', 'A', 'B', 'A'],  # Categorical
#     'Price': [10, 20, 15, 25, 30],  # Continuous
#     'Week': [10, 20, 15, 25, 30]  # Continuous
# })
# ordinal_columns = ["Category"]
# one_hot_columns = ["Size", "Color"]
# numerical_columns = ["Price"]
# unchanged_columns = ["Week"]

ordinal_columns = ["center_id", "meal_id", "category", "city_code", "region_code"]
one_hot_columns = ["cuisine", "center_type"]
numerical_columns = ["checkout_price", "base_price", "op_area"]
unchanged_columns = ["week", "emailer_for_promotion", "homepage_featured"]


def transform_data(df, pipeline):
    if not pipeline:
        ordinal_encoder_pipeline = Pipeline(
            steps=[
                ("ordinal_encoder", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
                ("functional_transformer", FunctionTransformer(lambda df: df + 1)),
            ]
        )
        one_hot_encoder_pipeline = Pipeline(
            steps=[
                ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
            ]
        )
        numerical_encoder_pipeline = Pipeline(
            steps=[
                ("simple_imputer",  SimpleImputer(strategy='constant', fill_value=0)),
                ("min_max_scaler",  MinMaxScaler())
            ]
        )
        preprocessor = ColumnTransformer(
            transformers=[
                ('ordinal_encoder_pipeline', ordinal_encoder_pipeline, ordinal_columns),
                ("one_hot_encoder_pipeline", one_hot_encoder_pipeline, one_hot_columns),
                ("numerical_encoder_pipeline", numerical_encoder_pipeline, numerical_columns)
            ], remainder='passthrough')
        pipeline = Pipeline(steps = [("preprocessor", preprocessor)])
        encoded = pipeline.fit_transform(df)
    else:
        encoded = pipeline.transform(df)
    new_one_hot_columns = list(pipeline.named_steps["preprocessor"].named_transformers_["one_hot_encoder_pipeline"].get_feature_names_out(one_hot_columns))
    new_column_names = ordinal_columns + new_one_hot_columns + numerical_columns + unchanged_columns
    return pd.DataFrame(encoded, columns=new_column_names), pipeline
X_train, pipeline = transform_data(X_train, None)

In [9]:
X_train

Unnamed: 0,center_id,meal_id,category,city_code,region_code,cuisine_Continental,cuisine_Indian,cuisine_Italian,cuisine_Thai,center_type_TYPE_A,center_type_TYPE_B,center_type_TYPE_C,checkout_price,base_price,op_area,week,emailer_for_promotion,homepage_featured
0,24.0,23.0,1.0,31.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.182058,0.141958,0.180328,1.0,0.0,0.0
1,24.0,27.0,1.0,31.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.182058,0.117854,0.180328,1.0,0.0,0.0
2,24.0,39.0,1.0,31.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.179379,0.117898,0.180328,1.0,0.0,0.0
3,24.0,30.0,1.0,31.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.457702,0.559659,0.180328,1.0,0.0,0.0
4,24.0,43.0,1.0,31.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.327136,0.274060,0.180328,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263585,28.0,13.0,3.0,3.0,6.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.542176,0.670367,0.590164,86.0,0.0,1.0
263586,28.0,32.0,3.0,3.0,6.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.522346,0.673295,0.590164,86.0,0.0,1.0
263587,28.0,45.0,10.0,3.0,6.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.403653,0.359375,0.590164,86.0,0.0,0.0
263588,28.0,40.0,10.0,3.0,6.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.416805,0.373536,0.590164,86.0,0.0,0.0


In [10]:
X_test, pipeline = transform_data(X_test, pipeline)
X_test

Unnamed: 0,center_id,meal_id,category,city_code,region_code,cuisine_Continental,cuisine_Indian,cuisine_Italian,cuisine_Thai,center_type_TYPE_A,center_type_TYPE_B,center_type_TYPE_C,checkout_price,base_price,op_area,week,emailer_for_promotion,homepage_featured
0,24.0,23.0,1.0,31.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.200446,0.139117,0.180328,116.0,0.0,0.0
1,24.0,27.0,1.0,31.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.200446,0.142045,0.180328,116.0,0.0,0.0
2,24.0,39.0,1.0,31.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.197808,0.136276,0.180328,116.0,0.0,0.0
3,24.0,30.0,1.0,31.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.401015,0.356534,0.180328,116.0,0.0,0.0
4,24.0,43.0,1.0,31.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.204404,0.146307,0.180328,116.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98470,28.0,13.0,3.0,3.0,6.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.654354,0.627841,0.590164,145.0,0.0,0.0
98471,28.0,32.0,3.0,3.0,6.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.651633,0.624912,0.590164,145.0,0.0,0.0
98472,28.0,45.0,10.0,3.0,6.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.319220,0.389117,0.590164,145.0,0.0,0.0
98473,28.0,40.0,10.0,3.0,6.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.327136,0.377797,0.590164,145.0,0.0,0.0


In [11]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263590 entries, 0 to 263589
Data columns (total 18 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   center_id              263590 non-null  float64
 1   meal_id                263590 non-null  float64
 2   category               263590 non-null  float64
 3   city_code              263590 non-null  float64
 4   region_code            263590 non-null  float64
 5   cuisine_Continental    263590 non-null  float64
 6   cuisine_Indian         263590 non-null  float64
 7   cuisine_Italian        263590 non-null  float64
 8   cuisine_Thai           263590 non-null  float64
 9   center_type_TYPE_A     263590 non-null  float64
 10  center_type_TYPE_B     263590 non-null  float64
 11  center_type_TYPE_C     263590 non-null  float64
 12  checkout_price         263590 non-null  float64
 13  base_price             263590 non-null  float64
 14  op_area                263590 non-nu

In [50]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X.values, dtype=torch.float32).to(device)
        self.y = torch.tensor(y.values, dtype=torch.float32).view(-1, 1).to(device)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

mps


In [51]:
import torch.nn as nn
import torch.optim as optim

class RegressionModel(pl.LightningModule):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 16),
            nn.ReLU(),
            nn.Linear(16, 1)  # Regression output
        )
        self.loss_fn = nn.MSELoss()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = self.loss_fn(y_pred, y)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = self.loss_fn(y_pred, y)
        self.log("val_loss", loss, prog_bar=True)

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=0.01)

# Initialize model with the correct input size
model = RegressionModel(input_dim=X_train.shape[1]).to(device)

In [52]:
model.train()
trainer = pl.Trainer(max_epochs=10, log_every_n_steps=1, accelerator=str(device), precision="16")
trainer.fit(model, train_loader, test_loader)

/Users/ozge/PycharmProjects/ozge/kaggle/.env/lib/python3.9/site-packages/lightning_fabric/connector.py:572: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name    | Type       | Params | Mode 
-----------------------------------------------
0 | model   | Sequential | 4.5 K  | train
1 | loss_fn | MSELoss    | 0      | train
-----------------------------------------------
4.5 K     Trainable params
0         Non-trainable params
4.5 K     Total params
0.018     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/ozge/PycharmProjects/ozge/kaggle/.env/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/Users/ozge/PycharmProjects/ozge/kaggle/.env/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation: |          | 0/? [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation: |          | 0/? [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation: |          | 0/? [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation: |          | 0/? [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation: |          | 0/? [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation: |          | 0/? [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation: |          | 0/? [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation: |          | 0/? [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Validation: |          | 0/? [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [72]:
model.eval()
test_to_pred = torch.tensor(X_test.values, dtype=torch.float32, device=device)
with torch.no_grad():
    predictions = model.to(str(device))(test_to_pred)
predictions = predictions.cpu().numpy().reshape(-1)
print("Predictions:", predictions)

Predictions: [249.31152  159.40718  122.996185 ... 458.02286  439.82938  395.15256 ]


In [73]:
mae = mean_absolute_error(y_test.values, predictions)
mse = mean_squared_error(y_test.values, predictions)
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")

Mean Absolute Error: 147.2391832704101
Mean Squared Error: 60632.16680838898
