In [1]:
import os
# Move the working directory one level up; the relative ".data/..." paths read
# below are resolved against it.
# NOTE(review): this is not idempotent - re-running the cell climbs one more level.
os.chdir('..')

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import torch
from torch.utils.data import DataLoader, TensorDataset, Dataset
from sklearn.model_selection import train_test_split

import torch.nn as nn
import torch.optim as optim

In [3]:
# Load the full sales table and the item catalogue from the local ".data" folder.
df = pd.read_parquet(".data/df_full.parquet")
df_items = pd.read_csv(".data/items.csv")

In [11]:
def timeseries_split(df: pd.DataFrame, col: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Cut ``df`` into three cumulative (expanding-window) frames by month.

    The windows are nested: each later frame contains every row of the
    earlier one plus one additional month.

    Args:
        df: Frame holding the month column ``col``.
        col: Name of an ``np.int64`` month-index column.

    Returns:
        ``(train, valid, test)`` where, with ``m`` the largest value of
        ``col``: train keeps rows with ``col <= m - 2``, valid keeps
        ``col <= m - 1`` and test keeps ``col <= m`` (i.e. every row).

    Raises:
        AssertionError: if ``df[col]`` is not of dtype ``np.int64``.
    """
    assert df[col].dtype == np.int64, "Expected np.int64 timeseries month column"
    months = df[col]
    last_month = months.max()
    # Inclusive upper month bound of each window, oldest window first.
    cutoffs = (last_month - 2, last_month - 1, last_month)
    train_part, valid_part, test_part = (df[months <= cutoff] for cutoff in cutoffs)
    return train_part, valid_part, test_part

In [5]:
# Split by month index; the three frames are cumulative windows (each one
# includes all earlier months), see timeseries_split.
df_train, df_valid, df_test = timeseries_split(df, col="date_block_num")

In [6]:
df_train

Unnamed: 0,date,date_block_num,shop_id,shop_name,item_id,item_name,item_category_id,item_category_name,item_cnt_day
0,2013-01-01,0,2,"Адыгея ТЦ ""Мега""",991,"3D Action Puzzle ""Динозавры"" Тиранозавр",67,Подарки - Развитие,1.0
1,2013-01-01,0,2,"Адыгея ТЦ ""Мега""",1472,"Assassin's Creed 3 [Xbox 360, русская версия]",23,Игры - XBOX 360,1.0
2,2013-01-01,0,2,"Адыгея ТЦ ""Мега""",1905,"Bestseller. Grand Theft Auto: San Andreas [PC,...",30,Игры PC - Стандартные издания,1.0
3,2013-01-01,0,2,"Адыгея ТЦ ""Мега""",2920,Disney. LEGO Пираты Карибского моря (Essential...,21,Игры - PSP,2.0
4,2013-01-01,0,2,"Адыгея ТЦ ""Мега""",3320,"FIFA 13 (с поддержкой PS Move) [PS3, русская в...",19,Игры - PS3,1.0
...,...,...,...,...,...,...,...,...,...
2882330,2015-09-30,32,59,"Ярославль ТЦ ""Альтаир""",15069,Меч и Магия. Герои VII. Эксклюзивное издание [...,28,Игры PC - Дополнительные издания,1.0
2882331,2015-09-30,32,59,"Ярославль ТЦ ""Альтаир""",15255,Мягкая игрушка Angry Birds Зеленая свинка 30см,63,Подарки - Мягкие игрушки,1.0
2882332,2015-09-30,32,59,"Ярославль ТЦ ""Альтаир""",15256,Мягкая игрушка Angry Birds Красная птица 30см ...,63,Подарки - Мягкие игрушки,2.0
2882333,2015-09-30,32,59,"Ярославль ТЦ ""Альтаир""",16184,Настольная игра Морской бой Для путешествий,65,Подарки - Настольные игры (компактные),1.0


In [7]:
df_valid

Unnamed: 0,date,date_block_num,shop_id,shop_name,item_id,item_name,item_category_id,item_category_name,item_cnt_day
0,2013-01-01,0,2,"Адыгея ТЦ ""Мега""",991,"3D Action Puzzle ""Динозавры"" Тиранозавр",67,Подарки - Развитие,1.0
1,2013-01-01,0,2,"Адыгея ТЦ ""Мега""",1472,"Assassin's Creed 3 [Xbox 360, русская версия]",23,Игры - XBOX 360,1.0
2,2013-01-01,0,2,"Адыгея ТЦ ""Мега""",1905,"Bestseller. Grand Theft Auto: San Andreas [PC,...",30,Игры PC - Стандартные издания,1.0
3,2013-01-01,0,2,"Адыгея ТЦ ""Мега""",2920,Disney. LEGO Пираты Карибского моря (Essential...,21,Игры - PSP,2.0
4,2013-01-01,0,2,"Адыгея ТЦ ""Мега""",3320,"FIFA 13 (с поддержкой PS Move) [PS3, русская в...",19,Игры - PS3,1.0
...,...,...,...,...,...,...,...,...,...
2935844,2015-10-31,33,59,"Ярославль ТЦ ""Альтаир""",20730,"Фигурка Scalers Wave 1 Alien 2""",72,Подарки - Фигурки,1.0
2935845,2015-10-31,33,59,"Ярославль ТЦ ""Альтаир""",20866,Фигурка Tom Clancy's Splinter Cell Blacklist S...,72,Подарки - Фигурки,1.0
2935846,2015-10-31,33,59,"Ярославль ТЦ ""Альтаир""",20949,Фирменный пакет майка 1С Интерес белый (34*42)...,71,"Подарки - Сумки, Альбомы, Коврики д/мыши",1.0
2935847,2015-10-31,33,59,"Ярославль ТЦ ""Альтаир""",21369,ХОББИТ: БИТВА ПЯТИ ВОИНСТВ (регион),40,Кино - DVD,1.0


In [8]:
df_test

Unnamed: 0,date,date_block_num,shop_id,shop_name,item_id,item_name,item_category_id,item_category_name,item_cnt_day
0,2013-01-01,0,2,"Адыгея ТЦ ""Мега""",991,"3D Action Puzzle ""Динозавры"" Тиранозавр",67,Подарки - Развитие,1.0
1,2013-01-01,0,2,"Адыгея ТЦ ""Мега""",1472,"Assassin's Creed 3 [Xbox 360, русская версия]",23,Игры - XBOX 360,1.0
2,2013-01-01,0,2,"Адыгея ТЦ ""Мега""",1905,"Bestseller. Grand Theft Auto: San Andreas [PC,...",30,Игры PC - Стандартные издания,1.0
3,2013-01-01,0,2,"Адыгея ТЦ ""Мега""",2920,Disney. LEGO Пираты Карибского моря (Essential...,21,Игры - PSP,2.0
4,2013-01-01,0,2,"Адыгея ТЦ ""Мега""",3320,"FIFA 13 (с поддержкой PS Move) [PS3, русская в...",19,Игры - PS3,1.0
...,...,...,...,...,...,...,...,...,...
3150044,2015-11-01,34,59,"Ярославль ТЦ ""Альтаир""",22162,ЯРОСТЬ,40,Кино - DVD,-1.0
3150045,2015-11-01,34,59,"Ярославль ТЦ ""Альтаир""",22163,ЯРОСТЬ ( регион),40,Кино - DVD,-1.0
3150046,2015-11-01,34,59,"Ярославль ТЦ ""Альтаир""",22164,ЯРОСТЬ (BD),37,Кино - Blu-Ray,-1.0
3150047,2015-11-01,34,59,"Ярославль ТЦ ""Альтаир""",22166,Язык запросов 1С:Предприятия [Цифровая версия],54,Книги - Цифра,-1.0


In [16]:
# Length of the per-(shop, item) monthly sales vector: the largest month index
# present in df_train. transform_data_to_features passes it as `maxlen`.
TIME_SEQUENCE_LEN = df_train['date_block_num'].max()

In [17]:
def create_time_sequence_vectors(x: pd.DataFrame, maxlen: int) -> np.ndarray:
    """Scatter one group's monthly counts into a dense, zero-filled vector.

    Args:
        x: Rows carrying ``date_block_num`` (used as positions in the
            vector) and ``item_cnt_day`` (the value stored at that position).
        maxlen: Length of the returned vector.

    Returns:
        A float array of length ``maxlen``; positions whose month does not
        occur in ``x`` stay 0.
    """
    month_positions = x['date_block_num'].to_numpy()
    month_counts = x['item_cnt_day'].to_numpy()

    series = np.zeros(maxlen)
    series[month_positions] = month_counts
    return series

def transform_data_to_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build one dense monthly-sales vector per (shop_id, item_id) pair.

    The most recent month in ``df`` is excluded, so only earlier months
    contribute to the vectors. Daily counts are summed per month and each
    pair's monthly totals are scattered into a vector of length
    ``TIME_SEQUENCE_LEN`` (module-level constant).

    Args:
        df: Sales rows with ``shop_id``, ``item_id``, ``date_block_num`` and
            ``item_cnt_day`` columns.

    Returns:
        Frame with columns ``shop_id``, ``item_id`` and
        ``monthly_sales_array`` (one numpy vector per row).
    """
    max_month = df['date_block_num'].max()
    # Only the history months are needed here; the previous version also
    # materialised the last-month rows and then never used them.
    df_x = df[df['date_block_num'] != max_month]
    df_x_monthly = df_x.groupby(["shop_id", "item_id", "date_block_num"])['item_cnt_day'].sum().reset_index()
    # Select the two value columns explicitly so that apply() does not operate
    # on the grouping columns (deprecated in recent pandas). Because the name
    # of the resulting Series depends on the selection, name it directly
    # instead of renaming a positional column after reset_index().
    # NOTE: month indices >= TIME_SEQUENCE_LEN raise IndexError inside
    # create_time_sequence_vectors.
    df_x_monthly_series = (
        df_x_monthly.groupby(["shop_id", "item_id"])[['date_block_num', 'item_cnt_day']]
        .apply(create_time_sequence_vectors, maxlen=TIME_SEQUENCE_LEN)
        .rename('monthly_sales_array')
        .reset_index()
    )

    return df_x_monthly_series

In [18]:
# Monthly totals per (shop, item, month) on the training window, with each
# total clipped to the [0, 20] range.
df_train_monthly = (
    df_train
    .groupby(["shop_id", "item_id", "date_block_num"])['item_cnt_day']
    .sum()
    .reset_index()
    .assign(item_cnt_day=lambda monthly: monthly['item_cnt_day'].clip(0, 20))
)

In [19]:
transform_data_to_features(df_train)

  df_x_monthly_series = df_x_monthly.groupby(["shop_id", "item_id"]) \


Unnamed: 0,shop_id,item_id,monthly_sales_array
0,0,30,"[0.0, 31.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,0,31,"[0.0, 11.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,0,32,"[6.0, 10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,0,33,"[3.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,0,35,"[1.0, 14.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
...,...,...,...
411835,59,22154,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
411836,59,22155,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
411837,59,22162,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
411838,59,22164,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [13]:
df_train_monthly

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_day
0,0,30,1,20.0
1,0,31,1,11.0
2,0,32,0,6.0
3,0,32,1,10.0
4,0,33,0,3.0
...,...,...,...,...
1577588,59,22164,27,2.0
1577589,59,22164,30,1.0
1577590,59,22167,9,1.0
1577591,59,22167,11,2.0


In [None]:
# Tiny hand-made group to eyeball the vectorisation: months 0, 3 and 4 carry
# sales, the months in between must come out as zeros.
tst = pd.DataFrame([[0, 10], [3, 20], [4, 7]], columns=['date_block_num', 'item_cnt_day'])

# The helper defined in this notebook is create_time_sequence_vectors and it
# needs an explicit length; the previously referenced create_series_vectors
# is not defined anywhere visible and fails on a fresh kernel.
create_time_sequence_vectors(tst, maxlen=int(tst['date_block_num'].max()) + 1)

In [None]:
# One dense sales vector per (shop, item). df_train_monthly keeps every
# training month (0 .. max), so the vector needs max + 1 slots.
n_history_months = int(df_train_monthly['date_block_num'].max()) + 1

# Uses create_time_sequence_vectors (the helper defined in this notebook); the
# old name create_series_vectors is undefined. The value columns are selected
# explicitly so apply() does not touch the grouping columns, and the resulting
# Series is named directly rather than renaming positional column 0.
df_train_monthly_series = (
    df_train_monthly.groupby(["shop_id", "item_id"])[['date_block_num', 'item_cnt_day']]
    .apply(create_time_sequence_vectors, maxlen=n_history_months)
    .rename('monthly_sales_array')
    .reset_index()
)

In [None]:
df_train_monthly_series

In [16]:
# Attach each item's category id from the catalogue and put the columns in
# the order used downstream.
df_train_monthly_series = (
    df_train_monthly_series
    .merge(df_items[['item_id', 'item_category_id']], on="item_id", how="left")
    .reindex(columns=['shop_id', 'item_id', 'item_category_id', 'monthly_sales_array'])
)

In [None]:
df_train_monthly_series

In [18]:
# Target for each (shop, item): total sales in the validation month only.
# df_valid is a cumulative window (every month up to and including the
# validation month), so it must be restricted to its last month first.
# Otherwise each historical month becomes its own "target" row and the later
# merge on (shop_id, item_id) duplicates the feature rows.
valid_target_month = df_valid['date_block_num'].max()
df_valid_monthly = (
    df_valid[df_valid['date_block_num'] == valid_target_month]
    .groupby(["shop_id", "item_id"])['item_cnt_day']
    .sum()
    .clip(0, 20)
    .reset_index()
)

# Exactly one target row per pair is required by the merge that follows.
assert not df_valid_monthly.duplicated(["shop_id", "item_id"]).any()

In [None]:
df_valid_monthly

In [20]:
# Join the history vectors with the validation targets; the target column is
# renamed to `y`. Pairs without a validation row get NaN (left join).
df_final = (
    df_train_monthly_series
    .merge(df_valid_monthly, on=["shop_id", "item_id"], how="left")
    .rename(columns={'item_cnt_day': 'y'})
)

In [None]:
df_final

In [22]:
# Keep an unfiltered copy (rows with missing `y` included) before df_final is
# narrowed in the next cell; it is reused later when building the test features.
df_final_raw = df_final.copy()

In [23]:
# Train only on pairs that actually have a target value.
df_final = df_final.dropna(subset=['y'])

In [24]:
# Separate the target vector from the feature columns.
y = df_final['y'].to_numpy()
X = df_final.drop(columns="y")

In [None]:
X

In [26]:
class SalesPredictionModel(nn.Module):
    """Feed-forward regressor over id embeddings plus a sales-history vector.

    Shop, item and category ids are each embedded, concatenated with the
    monthly sales vector and passed through three ReLU layers. The output is a
    sigmoid scaled by ``max_output``, so predictions lie in ``[0, max_output]``.

    Args:
        num_shops: Size of the shop embedding table (largest shop id + 1).
        num_items: Size of the item embedding table (largest item id + 1).
        num_categories: Size of the category embedding table.
        embedding_size: Dimension of each of the three embeddings.
        sales_vector_size: Length of the sales-history vector per sample.
        max_output: Upper bound of the prediction range. Defaults to 20.0,
            the value that was previously hard-coded.
    """

    def __init__(self, num_shops, num_items, num_categories, embedding_size, sales_vector_size,
                 max_output=20.0):
        super().__init__()

        # Embedding layers for categorical variables
        self.shop_embedding = nn.Embedding(num_embeddings=num_shops, embedding_dim=embedding_size)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_size)
        self.category_embedding = nn.Embedding(num_embeddings=num_categories, embedding_dim=embedding_size)

        # Fully connected layers; the input is 3 embeddings + the sales vector
        input_size = embedding_size * 3 + sales_vector_size
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.output = nn.Linear(16, 1)

        # Activations
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

        # Plain attribute (not a parameter/buffer), so state_dicts are unchanged.
        self.max_output = max_output

    def forward(self, shop_id, item_id, category_id, sales_array):
        """Predict one value per sample.

        Args:
            shop_id, item_id, category_id: Long tensors of shape ``(batch, 1)``;
                the embedding output is squeezed on dim 1 to ``(batch, emb)``.
            sales_array: Float tensor of shape ``(batch, sales_vector_size)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``[0, max_output]``.
        """
        # Pass categorical inputs through their respective embeddings
        shop_embed = self.shop_embedding(shop_id).squeeze(1)
        item_embed = self.item_embedding(item_id).squeeze(1)
        category_embed = self.category_embedding(category_id).squeeze(1)

        # Concatenate all embeddings with the sales array
        concatenated = torch.cat([shop_embed, item_embed, category_embed, sales_array], dim=1)

        # Fully connected layers with ReLU activation
        x = self.relu(self.fc1(concatenated))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))

        # Sigmoid squashes to (0, 1); scale to the configured target range
        return self.sigmoid(self.output(x)) * self.max_output

In [27]:
class SalesDataset(Dataset):
    """Tensor-backed dataset built once from a feature DataFrame.

    Reads the columns ``shop_id``, ``item_id``, ``item_category_id``,
    ``monthly_sales_array`` (one equal-length numpy vector per row) and ``y``.
    Each sample is the 5-tuple
    ``(shop_id, item_id, category_id, sales_array, target)``.
    """

    def __init__(self, df):
        def id_column(name):
            # Long tensor of shape (n_rows, 1) for an integer id column.
            return torch.tensor(df[name].values, dtype=torch.long)[:, None]

        self.shop_ids = id_column('shop_id')
        self.item_ids = id_column('item_id')
        self.category_ids = id_column('item_category_id')

        # Stack the per-row vectors into a single (n_rows, vector_len) matrix.
        stacked_sales = np.vstack(df['monthly_sales_array'].values)
        self.sales_array = torch.tensor(stacked_sales, dtype=torch.float32)

        self.targets = torch.tensor(df['y'].values, dtype=torch.float32)[:, None]

    def __len__(self):
        return self.shop_ids.shape[0]

    def __getitem__(self, idx):
        fields = (self.shop_ids, self.item_ids, self.category_ids, self.sales_array, self.targets)
        return tuple(field[idx] for field in fields)

# Wrap every row of df_final in one SalesDataset.
# NOTE(review): `dataset` does not appear to be referenced again in the cells
# visible here (the train/val datasets are built separately) - confirm it is needed.
dataset = SalesDataset(df=df_final)


In [28]:
# Random 80/20 row split of df_final (fixed seed), not a split by time.
train_df, val_df = train_test_split(df_final, test_size=0.2, random_state=42)

# One tensor dataset per partition.
train_dataset, val_dataset = SalesDataset(train_df), SalesDataset(val_df)

# Mini-batch loaders: training batches are reshuffled every epoch, validation
# keeps the row order of val_df.
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [None]:
train_df

In [None]:
val_df

In [None]:
next(iter(train_loader))

In [None]:
df_final.item_id.max()

In [33]:
# Parameters
num_shops = df_final['shop_id'].max() + 1        # Example: total unique shops
num_items = df_final['item_id'].max() + 1       # Example: total unique items
num_categories = df_final['item_category_id'].max() + 1    # Example: total unique categories
embedding_size = 8     # Embedding size for each categorical feature
sales_vector_size = X.iloc[0]['monthly_sales_array'].size  # Size of the monthly_sales_array (fixed size)

# Instantiate the model
model = SalesPredictionModel(num_shops, num_items, num_categories, embedding_size, sales_vector_size)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)


In [None]:
model

In [None]:
num_epochs = 20

# Per-epoch mean batch loss for both partitions.
history = {'train_loss': [], 'val_loss': []}

for epoch in range(num_epochs):
    # ---- optimisation pass over the training batches ----
    model.train()
    running_loss = 0.0
    for shop_ids, item_ids, category_ids, sales_arrays, targets in train_loader:
        optimizer.zero_grad()  # gradients accumulate unless cleared each step
        predictions = model(shop_ids, item_ids, category_ids, sales_arrays)
        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # ---- evaluation pass: no gradient tracking, no weight updates ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for shop_ids, item_ids, category_ids, sales_arrays, targets in val_loader:
            predictions = model(shop_ids, item_ids, category_ids, sales_arrays)
            loss = criterion(predictions, targets)
            val_loss += loss.item()

    # Average of the per-batch losses (sum divided by the number of batches).
    history['train_loss'].append(running_loss / len(train_loader))
    history['val_loss'].append(val_loss / len(val_loader))

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {running_loss/len(train_loader)}, Val Loss: {val_loss/len(val_loader)}')

In [None]:
# Learning curves: mean batch loss per epoch for both partitions.
fig, ax = plt.subplots()
for history_key, curve_label in (('train_loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history[history_key], label=curve_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss Over Time')
ax.legend()
plt.show()

In [47]:
# Predict every validation sample, in val_dataset order.
# Inference runs in batches through a non-shuffling DataLoader instead of one
# forward pass per sample. This also avoids using `_` as a loop variable,
# which shadows IPython's last-output variable.
inference_batch_size = 1024
preds = []

model.eval()  # do not rely on the mode left behind by the training cell
with torch.no_grad():
    for shop_ids, item_ids, category_ids, sales_arrays, targets in DataLoader(
        val_dataset, batch_size=inference_batch_size, shuffle=False
    ):
        predictions = model(shop_ids, item_ids, category_ids, sales_arrays)
        # (batch, 1) -> flat list of Python floats
        preds.extend(predictions.squeeze(1).tolist())

assert len(preds) == len(val_dataset)

In [48]:
# Copy of the validation rows with the model output alongside, for inspection.
val_df_preds = val_df.copy()
val_df_preds['predictions'] = preds

In [None]:
val_df_preds

In [78]:
# Load the pairs to score from ".data/test.csv".
# NOTE(review): this rebinds `df_test`, which earlier held the third frame
# returned by timeseries_split; that frame is no longer reachable after this cell.
df_test = pd.read_csv(".data/test.csv")

In [79]:
# Look up each test item in the catalogue and keep only the three id columns.
df_test = (
    df_test
    .merge(df_items, on="item_id", how="left")
    .loc[:, ['shop_id', 'item_id', 'item_category_id']]
)

In [39]:
# Pairs without a validation-month row get a target of 0.
df_final_raw['y'] = df_final_raw['y'].fillna(0)

In [None]:
df_final_raw

In [59]:
# Extend each history vector with the validation-month value `y` as its last
# element. The result is one element longer than `monthly_sales_array`.
df_final_raw['monthly_sales_array_test'] = df_final_raw.apply(lambda x: np.append(x['monthly_sales_array'], x['y']), axis=1)

In [None]:
df[(df.shop_id == 5) & (df.item_id == 5320)]

In [82]:
# Bring the extended history vectors onto the test pairs (left join, so pairs
# without history get NaN), then drop the columns that are not needed and
# give the extended vector the column name SalesDataset reads.
df_test = (
    df_test
    .merge(df_final_raw, on=["shop_id", "item_id", "item_category_id"], how="left")
    .drop(columns=["y", "monthly_sales_array"])
    .rename(columns={'monthly_sales_array_test': 'monthly_sales_array'})
)

In [89]:
def set_zeros(x, size=34):
    """Return ``x`` if it is a numpy array, otherwise an all-zero vector.

    Used to fill cells that hold no array (e.g. NaN left behind by a left
    join) with a zero history.

    Args:
        x: Cell value; returned unchanged (same object) when it is an
            ``np.ndarray``.
        size: Length of the zero vector created for non-array values.
            Defaults to 34, the length that was previously hard-coded.

    Returns:
        ``x`` itself, or ``np.zeros(size)``.
    """
    if isinstance(x, np.ndarray):
        return x
    return np.zeros(size)

In [93]:
# Fill pairs without any history with zeros, then keep only the most recent
# `sales_vector_size` months of every vector.
# The test vectors were built by appending the validation month to the
# training history, so they are one element longer than the vectors the model
# was trained on; fc1 is sized from `sales_vector_size`, and feeding the
# longer vector fails with a shape mismatch. Dropping the oldest months slides
# the window forward while keeping the input width unchanged.
df_test['monthly_sales_array'] = (
    df_test['monthly_sales_array']
    .apply(set_zeros)
    .apply(lambda history_vec: history_vec[-sales_vector_size:])
)

# Every row must now match the model's expected input width.
assert df_test['monthly_sales_array'].map(len).eq(sales_vector_size).all()

In [96]:
# Placeholder target: SalesDataset reads a `y` column, but no true value is
# available for the test pairs.
df_test['y'] = 0

In [97]:
# Tensor dataset over the test pairs (targets are the placeholder zeros).
test_dataset = SalesDataset(df_test)

In [None]:
# Predict every test pair, in test_dataset order.
# Inference runs in batches through a non-shuffling DataLoader instead of one
# forward pass per sample. This also avoids using `_` as a loop variable,
# which shadows IPython's last-output variable.
inference_batch_size = 1024
preds = []

model.eval()  # do not rely on the mode left behind by the training cell
with torch.no_grad():
    for shop_ids, item_ids, category_ids, sales_arrays, targets in DataLoader(
        test_dataset, batch_size=inference_batch_size, shuffle=False
    ):
        predictions = model(shop_ids, item_ids, category_ids, sales_arrays)
        # (batch, 1) -> flat list of Python floats
        preds.extend(predictions.squeeze(1).tolist())

assert len(preds) == len(test_dataset)