In [1]:
# Reload imported modules before every cell execution, so edits to the local
# `sales_forecasting` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Move one level up from the directory the kernel started in, so that relative
# paths such as ".data/..." used below resolve (presumably the project root).
# The starting directory is remembered on first execution: re-running this cell
# always lands in the same place instead of climbing one level higher each time.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '..'))

In [11]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing, metrics
from sklearn.compose import ColumnTransformer

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

from sales_forecasting.utils import timeseries_split
from sales_forecasting.plot import plot_timeseries, plot_feature_importance
from sales_forecasting.features import col_name

In [4]:
# Monthly sales table used by every model below; the path is relative to the
# current working directory.
monthly_sales_path = ".data/df_agg_monthly_oversampled.parquet"
df = pd.read_parquet(monthly_sales_path)

In [5]:
# Two splits of the same frame on `date_block_num`: one bounded by month 33
# (train / validation) and one bounded by month 34 (train+validation / test).
# NOTE(review): the exact meaning of `max_month` and `continuous` is defined in
# sales_forecasting.utils.timeseries_split — confirm which months land in each half.
train_split, valid_split = timeseries_split(
    df,
    max_month=33,
    col='date_block_num',
    continuous=False,
)
train_test_split, test_split = timeseries_split(
    df,
    max_month=34,
    col='date_block_num',
    continuous=False,
)

In [6]:
target_col = 'item_cnt_month'

# Regression targets, clipped to the [0, 20] range on both splits.
train_target = train_split[target_col].clip(0, 20)
valid_target = valid_split[target_col].clip(0, 20)

In [7]:
# Remove the target and the raw identifier / time-index columns; what remains
# is the feature matrix for each split.
cols_to_drop = [target_col, 'date_block_num', 'shop_id', 'item_id']
X_train = train_split.drop(columns=cols_to_drop)
X_valid = valid_split.drop(columns=cols_to_drop)

# MLP

In [8]:
# Categorical columns are one-hot encoded; numeric columns are standardised.
ohe_cols = ['city_id', 'item_category_id', 'general_item_category_id', 'date_month']

lag_cols = col_name("lagged", list(range(1, 12)))
rolling_cols = col_name('rolling', [3, 6, 9])
num_cols = [*lag_cols, *rolling_cols, 'months_since_last_buy']

numeric_step = ('num', preprocessing.StandardScaler(), num_cols)
# handle_unknown='ignore': categories unseen during fit encode as all-zero rows.
categorical_step = ('cat', preprocessing.OneHotEncoder(handle_unknown='ignore'), ohe_cols)

# Columns not listed in either step are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit on the training split only, then apply the fitted transform to validation.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_valid_preprocessed = preprocessor.transform(X_valid)

In [9]:
class SalesPredictionMLPModel(nn.Module):
    """Feed-forward regressor producing one value per input row.

    Each hidden layer is ``Linear -> [BatchNorm1d] -> Dropout -> ReLU``; a final
    ``Linear(last_width, 1)`` produces the prediction.

    Args:
        input_dim: Number of input features.
        hidden_dim: Widths of the hidden layers, in order. Any iterable of ints
            (list, tuple, ...) is accepted; an empty one yields a single linear layer.
        dropout_rate: Dropout probability applied in every hidden layer.
        batch_norm: Whether to insert ``BatchNorm1d`` after each hidden ``Linear``.
    """

    def __init__(self, input_dim: int, hidden_dim: list[int], dropout_rate: float, batch_norm: bool = True):
        super().__init__()

        # Unpacking (instead of `[input_dim] + hidden_dim`) lets tuples and other
        # iterables be used for `hidden_dim`, not only lists.
        dims = [input_dim, *hidden_dim]

        layers: list[nn.Module] = []
        for in_dim, out_dim in zip(dims, dims[1:]):
            layers.append(nn.Linear(in_dim, out_dim))
            if batch_norm:
                layers.append(nn.BatchNorm1d(out_dim))
            # Layer order is kept as-is so the indices inside `self.ff`
            # (and therefore state_dict keys) stay stable.
            layers.extend([nn.Dropout(p=dropout_rate), nn.ReLU()])
        layers.append(nn.Linear(dims[-1], 1))

        self.ff = nn.Sequential(*layers)

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` predictions."""
        return self.ff(X)

In [12]:
class SalesMLPDataset(Dataset):
    """In-memory dataset of ``(features, target)`` float32 tensor pairs.

    Args:
        X: Feature matrix with one row per sample. Either a sparse matrix
            (anything exposing ``toarray()``, e.g. the sparse output of a
            ``ColumnTransformer``) or a dense array-like.
        y: Targets, one per row of ``X`` (pandas Series, ndarray or sequence).

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of rows.
    """

    def __init__(self, X, y):
        if hasattr(X, "toarray"):
            # Cast while still sparse so the dense copy is allocated directly
            # as float32 rather than as float64 and converted afterwards.
            dense = X.astype(np.float32).toarray()
        else:
            # ColumnTransformer returns a dense ndarray when the combined output
            # is not sparse enough; np.array makes a private float32 copy.
            dense = np.array(X, dtype=np.float32)

        self.X = torch.from_numpy(dense)
        self.y = torch.from_numpy(np.array(y, dtype=np.float32)).unsqueeze(1)

        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [27]:
batch_size = 1024

# Seed torch's global RNG so weight initialisation, dropout masks and the
# shuffling order of the training loader are repeatable between runs.
torch.manual_seed(42)

train_dataset = SalesMLPDataset(X_train_preprocessed, train_target)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation data is never shuffled.
valid_dataset = SalesMLPDataset(X_valid_preprocessed, valid_target)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

dropout_rate = 0.2
input_dim = X_train_preprocessed.shape[1]  # number of columns after preprocessing
hidden_dim = [32, ]
lr = 1e-3

model = SalesPredictionMLPModel(input_dim, hidden_dim, dropout_rate=dropout_rate)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)


In [28]:
len(train_dataset)

21137776

In [29]:
def train(model: nn.Module, optimizer, criterion, train_loader: DataLoader, val_loader: DataLoader, n_epochs: int):
    """Train ``model`` for ``n_epochs`` epochs, validating after each one.

    Every batch is unpacked as ``(*inputs, target)`` and the model is called as
    ``model(*inputs)``, so both single-input and multi-input models are supported.

    The reported losses are per-sample averages: each batch loss is weighted by
    its batch size, so a shorter final batch does not skew the epoch value.
    This assumes ``criterion`` returns the batch mean (as ``nn.MSELoss()`` does
    by default).

    Args:
        model: Network to optimise (updated in place).
        optimizer: Optimiser already bound to ``model.parameters()``.
        criterion: Callable ``criterion(predictions, target)`` returning a scalar loss.
        train_loader: Batches used for the gradient updates.
        val_loader: Batches used for evaluation only.
        n_epochs: Number of passes over ``train_loader``.

    Returns:
        Dict with keys ``'train_loss'`` and ``'val_loss'``, each holding one
        float per epoch (``nan`` for an epoch whose loader produced no batches).
    """
    history = {
        'train_loss': [],
        'val_loss': []
    }

    for epoch in range(n_epochs):
        model.train()
        train_loss_sum, train_samples = 0.0, 0
        for *inputs, target in train_loader:
            optimizer.zero_grad()
            predictions = model(*inputs)
            loss = criterion(predictions, target)
            loss.backward()
            optimizer.step()

            train_loss_sum += loss.item() * len(target)
            train_samples += len(target)

        model.eval()
        val_loss_sum, val_samples = 0.0, 0
        with torch.no_grad():
            for *inputs, target in val_loader:
                predictions = model(*inputs)
                loss = criterion(predictions, target)

                val_loss_sum += loss.item() * len(target)
                val_samples += len(target)

        # An empty loader yields nan rather than a ZeroDivisionError.
        train_loss = train_loss_sum / train_samples if train_samples else float('nan')
        val_loss = val_loss_sum / val_samples if val_samples else float('nan')

        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)

        # Adjacent literals (not a backslash continuation inside the string),
        # so source indentation no longer leaks into the printed line.
        print(
            f'Epoch [{epoch+1}/{n_epochs}], '
            f'Train Loss: {train_loss:.2f} [rmse: {np.sqrt(train_loss):.2f}], '
            f'Val Loss: {val_loss:.2f} [rmse: {np.sqrt(val_loss):.2f}]'
        )

    return history

In [30]:
# Keep the returned per-epoch losses instead of discarding them, so the curves
# can be inspected or plotted after training finishes.
mlp_history = train(model, optimizer, criterion, train_loader, valid_loader, n_epochs=10)

Epoch [1/10], Train Loss: 0.19     [rmse: 0.44], Val Loss: 1.09     [rmse: 1.04]
Epoch [2/10], Train Loss: 0.18     [rmse: 0.42], Val Loss: 0.46     [rmse: 0.68]


KeyboardInterrupt: 

In [None]:
def create_time_sequence_vectors(df: pd.DataFrame, maxlen: int, target_month: int) -> pd.DataFrame:
    """Turn one group's monthly rows into a fixed-length sales vector plus target.

    Args:
        df: Rows carrying at least ``date_block_num`` and ``item_cnt_month``.
        maxlen: Length of the output vector; every non-target ``date_block_num``
            must be a valid index into it.
        target_month: ``date_block_num`` whose ``item_cnt_month`` is the target.

    Returns:
        DataFrame with columns ``monthly_sales_array`` (a zero-initialised array
        of length ``maxlen`` holding ``item_cnt_month`` at each non-target
        month's position) and ``y`` (the target-month value, or NaN when the
        group has no target-month row).
    """
    is_target = df['date_block_num'] == target_month
    history_rows = df.loc[~is_target]
    target_rows = df.loc[is_target]

    if target_rows.empty:
        y = [np.nan]
    else:
        y = target_rows['item_cnt_month'].values

    v = np.zeros(maxlen)
    # Cast into a separate index array rather than assigning the column back
    # onto the filtered slice, which triggers pandas' SettingWithCopyWarning.
    month_positions = history_rows['date_block_num'].astype(int).to_numpy()
    v[month_positions] = history_rows['item_cnt_month'].to_numpy()

    return pd.DataFrame({'monthly_sales_array': [v], 'y': y})

def transform_data_to_features(df: pd.DataFrame, max_seq_len: int) -> pd.DataFrame:
    """Build one sales-history vector and target per (shop_id, item_id) pair.

    The target month is the largest ``date_block_num`` in ``df``; only pairs
    that have a row in that month are kept.

    Args:
        df: Frame with ``shop_id``, ``item_id``, ``date_block_num`` and
            ``item_cnt_month`` columns.
        max_seq_len: Length of each ``monthly_sales_array``.

    Returns:
        DataFrame with columns ``shop_id``, ``item_id``, ``monthly_sales_array``
        and ``y``.
    """
    target_month = df['date_block_num'].max()

    # Keep only the pairs present in the target month: the right-hand side is
    # the set of keys seen in that month.
    target_pairs = df.loc[df['date_block_num'] == target_month, ['shop_id', 'item_id']]
    df = df.merge(target_pairs, on=['shop_id', 'item_id'], how='right')

    # The per-group function reads only these two columns, so the grouping keys
    # are not passed into `apply` (pandas deprecates applying over them).
    per_pair = (
        df.groupby(['shop_id', 'item_id'])[['date_block_num', 'item_cnt_month']]
        .apply(create_time_sequence_vectors, maxlen=max_seq_len, target_month=target_month)
    )

    # `apply` adds an unnamed inner index level (the row index of each returned
    # frame). Drop it by position instead of via its auto-generated column
    # name, then turn the group keys back into columns.
    padded_seqs = per_pair.reset_index(level=-1, drop=True).reset_index()

    return padded_seqs

# NOTE(review): `train_split_featurized` and `test_split_featurized` are not
# assigned anywhere in the visible notebook (the split cell creates
# `train_split`, `valid_split`, `train_test_split`, `test_split`) — confirm
# where they come from, otherwise this cell fails on a fresh kernel.
MAX_SEQ_LEN = 35  # length of every monthly sales vector
df_train_vectors = transform_data_to_features(train_split_featurized, MAX_SEQ_LEN)
df_val_vectors = transform_data_to_features(test_split_featurized, MAX_SEQ_LEN)

In [None]:
# Read the item -> category lookup once and reuse it for both frames
# (previously the CSV was parsed twice).
item_categories = pd.read_csv(".data/items.csv")[['item_id', 'item_category_id']]

df_train = df_train_vectors.merge(item_categories, on='item_id', how='left')
df_val = df_val_vectors.merge(item_categories, on='item_id', how='left')

In [None]:
# Clip the sequence-model targets to [0, 20], the same range used for the MLP targets.
df_train['y'] = df_train['y'].clip(lower=0, upper=20)
df_val['y'] = df_val['y'].clip(lower=0, upper=20)

In [None]:
class SalesDataset(Dataset):
    """Dataset yielding ``(shop_id, item_id, category_id, sales_array, target)``.

    All columns of ``df`` are converted to tensors once, at construction:

    * ``shop_id`` / ``item_id`` / ``item_category_id`` -> long tensors of shape ``(n, 1)``
    * ``monthly_sales_array`` (one array per row) -> float32 tensor ``(n, seq_len, 1)``
    * ``y`` -> float32 tensor ``(n, 1)``
    """

    def __init__(self, df):
        self.df = df

        def id_column(name):
            # Integer id column -> (n, 1) long tensor.
            return torch.tensor(df[name].values, dtype=torch.long).unsqueeze(1)

        self.shop_ids = id_column('shop_id')
        self.item_ids = id_column('item_id')
        self.category_ids = id_column('item_category_id')

        # Stack the per-row arrays into (n, seq_len), then add a trailing feature axis.
        stacked_sales = np.vstack(df['monthly_sales_array'].values)
        self.sales_array = torch.tensor(stacked_sales, dtype=torch.float32).unsqueeze(2)

        self.targets = torch.tensor(df['y'].values, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        sample = (
            self.shop_ids[idx],
            self.item_ids[idx],
            self.category_ids[idx],
            self.sales_array[idx],
            self.targets[idx],
        )
        return sample


In [None]:
df_shop_2 = df_train[df_train['shop_id'] == 2].head(20)

# Number of leading months drawn as history; the target is drawn one step after.
# NOTE(review): 32 is the cut-off the plot already used — confirm it matches the
# target month of `df_train`.
history_len = 32
n_cols = 2
# Ceiling division: exactly enough rows for all panels (the previous
# `len // 2 + 1` left an empty row whenever the count was even).
n_rows = (len(df_shop_2) + n_cols - 1) // n_cols

# Set up the plot
plt.figure(figsize=(20, 30))

# One panel per row of df_shop_2: sales history in blue, target in red.
for panel, (_idx, row) in enumerate(df_shop_2.iterrows(), 1):

    sales_array = row.monthly_sales_array[:history_len]

    plt.subplot(n_rows, n_cols, panel)
    plt.plot(sales_array, label='Sales')

    # Mark the months that actually had sales.
    non_zero_values = np.where(sales_array != 0)[0]
    plt.scatter(non_zero_values, sales_array[non_zero_values], color='blue')

    # Plot the continuation from the last history month to the target value
    plt.plot([history_len - 1, history_len], [sales_array[-1], row.y], color='red')
    plt.scatter(history_len, row.y, color='red', label='Target')

    plt.title(f'Item ID: {row.item_id}')
    plt.xlabel('Month')
    plt.ylabel('Sales')
    plt.legend()  # the labels above were set but never displayed

plt.tight_layout()
plt.show()

In [None]:
class SalesPredictionModel(nn.Module):
    """Embeddings + LSTM regressor for monthly sales.

    Shop, item and category ids are embedded; the sales history is summarised by
    the final hidden state of a single-layer LSTM; three scalar interaction
    features are appended; the concatenation is batch-normalised and passed
    through a small feed-forward head that outputs one value per sample.

    Args:
        num_shops: Number of rows in the shop embedding table.
        num_items: Number of rows in the item embedding table.
        num_categories: Number of rows in the category embedding table.
        embedding_size: Width of each embedding; the LSTM hidden size is twice this.
        dropout_rate: Dropout probability inside the feed-forward head
            (the embeddings use a fixed 0.1 dropout).
    """

    def __init__(self, num_shops, num_items, num_categories, embedding_size, dropout_rate):
        super().__init__()

        # --- categorical embeddings ---
        self._embedding_size = embedding_size
        self.shop_embedding = nn.Embedding(num_embeddings=num_shops, embedding_dim=embedding_size)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_size)
        self.category_embedding = nn.Embedding(num_embeddings=num_categories, embedding_dim=embedding_size)
        self.embedding_dropout = nn.Dropout(0.1)

        # --- sequence encoder: one scalar (the monthly sales count) per time step ---
        self._lstm_hidden_size = 2 * embedding_size
        self.lstm = nn.LSTM(input_size=1, hidden_size=self._lstm_hidden_size, batch_first=True)

        # --- regression head ---
        # three embeddings + LSTM state + three scalar interaction features
        self._linear_input_size = self._embedding_size * 3 + self._lstm_hidden_size + 3
        self._dropout_rate = dropout_rate
        self.batch_norm = nn.BatchNorm1d(self._linear_input_size)
        head_layers = [
            nn.Linear(self._linear_input_size, 16),
            nn.ReLU(),
            nn.Dropout(self._dropout_rate),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Dropout(self._dropout_rate),
            # NOTE(review): this second ReLU follows ReLU -> Dropout and so has no
            # effect on the values; it is kept so the layer indices inside `ff` stay the same.
            nn.ReLU(),
            nn.Linear(8, 1),
        ]
        self.ff = nn.Sequential(*head_layers)

    def cross_prod(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """Row-wise sum of the element-wise product, returned with shape ``(batch, 1)``."""
        return (x * y).sum(dim=1, keepdim=True)

    def forward(self, shop_id, item_id, category_id, sales_array):
        """Predict one value per sample.

        ``shop_id`` / ``item_id`` / ``category_id`` are expected as ``(batch, 1)``
        long tensors (the shape ``SalesDataset`` produces); ``sales_array`` as
        ``(batch, seq_len, 1)``. Returns a ``(batch, 1)`` tensor.
        """
        # (batch, 1) ids -> (batch, 1, emb) -> (batch, emb)
        shop_vec = self.shop_embedding(shop_id).squeeze(1)
        item_vec = self.item_embedding(item_id).squeeze(1)
        category_vec = self.category_embedding(category_id).squeeze(1)

        # Interaction scalars are computed from the embeddings before dropout.
        shop_item = self.cross_prod(shop_vec, item_vec)
        shop_category = self.cross_prod(shop_vec, category_vec)
        # NOTE(review): `shop_item` is (batch, 1), so this broadcasts against the
        # category embedding and equals shop_item * sum(category_vec) — confirm
        # that this, rather than sum(shop * item * category), is intended.
        shop_item_category = self.cross_prod(shop_item, category_vec)

        # Dropout is applied in the order shop, item, category.
        shop_vec = self.embedding_dropout(shop_vec)
        item_vec = self.embedding_dropout(item_vec)
        category_vec = self.embedding_dropout(category_vec)

        # Only the final hidden state of the LSTM is used: (1, batch, hidden) -> (batch, hidden)
        _, (final_hidden, _) = self.lstm(sales_array)
        final_hidden = final_hidden.squeeze(0)

        features = torch.cat(
            [
                shop_vec, item_vec, category_vec, final_hidden,
                shop_item, shop_category, shop_item_category,
            ],
            dim=1,
        )
        features = self.batch_norm(features)

        return self.ff(features)

In [None]:
batch_size = 32

# Seed torch's global RNG so weight initialisation, dropout masks and the
# shuffling order of the training loader are repeatable between runs.
torch.manual_seed(42)

# Instantiate the datasets
train_dataset = SalesDataset(df_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = SalesDataset(df_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Parameters
# Embedding tables are indexed by the raw id, so each needs (largest id + 1) rows.
# Size them from BOTH frames: an id that appears only in df_val and exceeds the
# training maximum would otherwise raise an index error during validation.
num_shops = int(max(df_train['shop_id'].max(), df_val['shop_id'].max())) + 1
num_items = int(max(df_train['item_id'].max(), df_val['item_id'].max())) + 1
num_categories = int(max(df_train['item_category_id'].max(), df_val['item_category_id'].max())) + 1
embedding_size = 8     # Embedding size for each categorical feature
dropout_rate = 0.3
lr = 1e-3

# Instantiate the model
model = SalesPredictionModel(num_shops, num_items, num_categories, embedding_size, dropout_rate=dropout_rate)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)


In [None]:
# inp = next(iter(dataloader))
# model = SalesPredictionModel(num_shops, num_items, num_categories, embedding_size)
# model(*inp[:-1]).shape

In [None]:
model

In [None]:
num_epochs = 20

history = {
    'train_loss': [],
    'val_loss': []
}

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss_sum, train_samples = 0.0, 0
    for batch in train_loader:
        shop_ids, item_ids, category_ids, sales_arrays, targets = batch

        optimizer.zero_grad()
        predictions = model(shop_ids, item_ids, category_ids, sales_arrays)
        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so a shorter final batch does not
        # skew the epoch average (assumes the criterion returns a batch mean).
        train_loss_sum += loss.item() * len(targets)
        train_samples += len(targets)

    # ---- validation pass (no gradient tracking, eval-mode layers) ----
    model.eval()
    val_loss_sum, val_samples = 0.0, 0
    with torch.no_grad():
        for batch in val_loader:
            shop_ids, item_ids, category_ids, sales_arrays, targets = batch

            predictions = model(shop_ids, item_ids, category_ids, sales_arrays)
            loss = criterion(predictions, targets)

            val_loss_sum += loss.item() * len(targets)
            val_samples += len(targets)

    # Per-sample averages; nan when a loader produced no batches.
    train_loss = train_loss_sum / train_samples if train_samples else float('nan')
    val_loss = val_loss_sum / val_samples if val_samples else float('nan')

    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)

    print(
        f'Epoch [{epoch+1}/{num_epochs}], '
        f'Train Loss: {train_loss:.2f} [rmse: {np.sqrt(train_loss):.2f}], '
        f'Val Loss: {val_loss:.2f} [rmse: {np.sqrt(val_loss):.2f}]'
    )

In [None]:
# Per-epoch training and validation loss on a single set of axes.
fig, ax = plt.subplots()
ax.plot(history['train_loss'], label='Train Loss')
ax.plot(history['val_loss'], label='Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss Over Time')
ax.legend()
plt.show()

In [None]:
# Final fit: a fresh model trained on df_val, which is later used to predict
# the test month. Note that `train_dataset`, `train_loader`, `model`,
# `criterion` and `optimizer` from the previous experiment are overwritten here.
torch.manual_seed(42)  # repeatable init / dropout / shuffling

# Instantiate the dataset
train_dataset = SalesDataset(df_val)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Parameters
# Embedding tables are indexed by the raw id, so they must cover every id the
# model can meet at prediction time, not only those in df_val. Take the maxima
# over df_val, the item catalogue and the test pairs.
items_catalogue = pd.read_csv(".data/items.csv")
test_pairs = pd.read_csv(".data/test.csv")

num_shops = int(max(df_val['shop_id'].max(), test_pairs['shop_id'].max())) + 1
num_items = int(max(
    df_val['item_id'].max(),
    items_catalogue['item_id'].max(),
    test_pairs['item_id'].max(),
)) + 1
num_categories = int(max(
    df_val['item_category_id'].max(),
    items_catalogue['item_category_id'].max(),
)) + 1

# Instantiate the model
model = SalesPredictionModel(num_shops, num_items, num_categories, embedding_size, dropout_rate=dropout_rate)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)


In [None]:
num_epochs = 33

# Training-only loop for the final model (no validation pass).
history = {
    'train_loss': []
}

for epoch in range(num_epochs):
    model.train()
    train_loss_sum, train_samples = 0.0, 0
    for batch in train_loader:
        shop_ids, item_ids, category_ids, sales_arrays, targets = batch

        optimizer.zero_grad()
        predictions = model(shop_ids, item_ids, category_ids, sales_arrays)
        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so a shorter final batch does not
        # skew the epoch average (assumes the criterion returns a batch mean).
        train_loss_sum += loss.item() * len(targets)
        train_samples += len(targets)

    # Per-sample average; nan when the loader produced no batches.
    train_loss = train_loss_sum / train_samples if train_samples else float('nan')
    history['train_loss'].append(train_loss)

    print(
        f'Epoch [{epoch+1}/{num_epochs}], '
        f'Train Loss: {train_loss:.2f} [rmse: {np.sqrt(train_loss):.2f}]'
    )

In [None]:
# Test pairs are tagged as month 34 with an unknown (NaN) target, then appended
# to the history so the same featurisation can be reused.
# NOTE(review): `test_split_featurized` is not assigned anywhere in the visible
# notebook — confirm where it comes from. Month 34 is presumably later than
# every month in it, which is what makes the test rows the target month.
df_test_raw = (
    pd.read_csv(".data/test.csv", index_col=[0])
    .assign(date_block_num=34, item_cnt_month=np.nan)
)

test_history = pd.concat([test_split_featurized, df_test_raw])
df_test_vectors = transform_data_to_features(test_history, 35)

item_categories = pd.read_csv(".data/items.csv")[['item_id', 'item_category_id']]
df_test = df_test_vectors.merge(item_categories, on='item_id', how='left')

In [None]:
# Tensor view of the test frame; its `y` column is NaN, so the targets it
# yields are placeholders and are ignored at prediction time.
test_dataset = SalesDataset(df_test)

In [None]:
# Predict in batches rather than one row per forward pass. In eval mode batch
# norm uses its running statistics and dropout is disabled, so predictions do
# not depend on how rows are grouped into batches.
inference_batch_size = 1024
test_loader = DataLoader(test_dataset, batch_size=inference_batch_size, shuffle=False)

preds = []

model.eval()
with torch.no_grad():
    # shuffle=False keeps `preds` aligned with the row order of `test_dataset`.
    for shop_ids, item_ids, category_ids, sales_arrays, _targets in test_loader:
        batch_predictions = model(shop_ids, item_ids, category_ids, sales_arrays)
        # (batch, 1) -> flat list of Python floats
        preds.extend(batch_predictions.squeeze(1).tolist())

In [None]:
# Predictions are attached to df_test row by row, then joined back onto the
# original test IDs through the (shop_id, item_id) key.
test_predictions = df_test.assign(predictions=preds)[['shop_id', 'item_id', 'predictions']]
submission_keys = df_test_raw.reset_index()[['ID', 'shop_id', 'item_id']]

df_submission = (
    submission_keys
    .merge(test_predictions, on=['shop_id', 'item_id'], how='left')
    [['ID', 'predictions']]
    .rename(columns={'predictions': 'item_cnt_month'})
)

In [None]:
# Clip predictions to [0, 20], the same range the training targets were clipped to.
df_submission['item_cnt_month'] = df_submission['item_cnt_month'].clip(lower=0, upper=20)

In [None]:
# Write the submission file without the DataFrame index column.
submission_path = ".data/submission_lstm2.csv"
df_submission.to_csv(submission_path, index=False)