In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import sys
import os
sys.path.append(os.path.abspath('..'))
from data.preprocessing import Preprocessor
from data.feature_generation import spec_agg_features
from src.models import LSTMSeq2One

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from sklearn.preprocessing import StandardScaler

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/torch.
warnings.simplefilter('ignore')

# Show all columns when a DataFrame is rendered (no truncation).
pd.options.display.max_columns = None

In [2]:
# Build the modelling frame in one pass: preprocess the raw specs, add the
# aggregate features, drop incomplete rows and order chronologically.
preprocessor = Preprocessor()
spec = (
    spec_agg_features(preprocessor.spec_preprocessing())
    .dropna()
    .sort_values('spec_date')
)
spec.head(2)

Duplicate rows cleaning: 620it [00:03, 202.09it/s]


Unnamed: 0,supplier,supplier_status,spec_date,delivery_period_end,option,declared_price,consent_price,spec_price,volume_requested,volume_contracted,bids_submitted,bids_contracted,id,delivery_length,mean_delivery_length,delivery_length_diff,mean_volume,volume_diff,conversion
3153,ЛЕТО ООО,0.0,2022-07-19,2022-07-31,10.0,26.0,25.0,0.0,140.0,0.0,1.0,0.0,"ЛЕТО ООО_Юг Руси, АО_2022-07-19",12,16.0,4.0,60.0,80.0,0.5
3154,ЛЕТО ООО,0.0,2022-07-20,2022-07-31,10.0,26.0,25.0,25.0,140.0,155.0,1.0,1.0,"ЛЕТО ООО_Юг Руси, АО_2022-07-20",11,16.0,5.0,60.0,80.0,0.333333


In [3]:
class TimeSeriesDataset(Dataset):
    """Dataset that pairs each row's target with a window of strictly earlier rows.

    For sample ``idx`` the features are the last ``window_size`` rows of the
    (time-sorted, scaled) frame whose ``time_col`` value is strictly smaller
    than that sample's time; the label is the sample's ``target_col`` value.
    Windows can be shorter than ``window_size`` near the start of the frame.

    Args:
        df: Source frame. It is not modified; a sorted copy is stored.
        target_col: Name of the label column. It is not scaled and is kept in
            the feature window (as values of earlier rows).
        time_col: Name of the column used for ordering and for the
            "strictly earlier" comparison.
        service_cols: Columns dropped from the feature window and excluded
            from scaling.
        window_size: Maximum number of earlier rows returned per sample.
        min_history: A row becomes a sample only if more than this many rows
            are strictly earlier than it.
        scaler: Optional already-fitted scaler exposing ``transform``. When
            ``None`` a new ``StandardScaler`` is fitted on this frame. Passing
            the scaler fitted on the training split lets a hold-out split be
            scaled with the same statistics.

    Attributes:
        df: Time-sorted frame with scaled feature columns.
        df_temp: Subset of ``df`` whose rows have enough history to be samples.
        scaler: The scaler that was applied (``None`` if nothing was scaled).
    """

    def __init__(self, df, target_col, time_col, service_cols, window_size=30,
                 min_history=5, scaler=None):
        service_cols = list(service_cols)
        excluded = set(service_cols) | {target_col}
        # Keep the frame's own column order so the result is deterministic.
        numeric_cols = [col for col in df.columns if col not in excluded]

        # sort_values returns a new frame, so the caller's frame stays intact.
        # A stable sort keeps the original order of rows sharing a timestamp.
        df = df.sort_values(time_col, kind='stable')

        if numeric_cols:
            if scaler is None:
                scaler = StandardScaler()
                scaled = scaler.fit_transform(df[numeric_cols])
            else:
                scaled = scaler.transform(df[numeric_cols])
            df[numeric_cols] = scaled
        self.scaler = scaler
        self.df = df

        # On a sorted column, the left insertion point of each value equals
        # the number of rows with a strictly smaller time.
        times = df[time_col].to_numpy()
        n_earlier = np.searchsorted(times, times, side='left')
        self.df_temp = df.loc[n_earlier > min_history]

        self.time_col = time_col
        self.target_col = target_col
        self.service_cols = service_cols
        self.window_size = window_size

    def __len__(self):
        """Return the number of rows that have enough history to be samples."""
        return len(self.df_temp)

    def __getitem__(self, idx):
        """Return ``(window, target)`` tensors for the ``idx``-th sample."""
        row = self.df_temp.iloc[idx]
        time = row[self.time_col]

        history = self.df.loc[self.df[self.time_col] < time].iloc[-self.window_size:]
        # `columns=` is required: the positional `axis` argument of
        # DataFrame.drop is not accepted by pandas 2.x.
        x = history.drop(columns=self.service_cols).values
        y = row[self.target_col]
        return torch.tensor(x), torch.tensor(y)

In [4]:
# Chronological 75/25 split: the earliest rows train the model, the latest
# rows are held out. Each dataset is built from its own slice, so each one
# fits its own scaler inside TimeSeriesDataset.
spec = spec.sort_values('spec_date')
split_point = int(0.75 * len(spec))
train_dataset, test_dataset = (
    TimeSeriesDataset(
        part,
        'bids_contracted',
        'spec_date',
        ['supplier', 'spec_date', 'delivery_period_end', 'id'],
    )
    for part in (spec.iloc[:split_point], spec.iloc[split_point:])
)

In [5]:
def pad_collate(batch):
    """Collate variable-length samples into one padded batch.

    Args:
        batch: Sequence of samples where ``sample[0]`` is a feature tensor
            whose first dimension is the sequence length and ``sample[1]`` is
            the target tensor.

    Returns:
        A tuple ``(features, targets)``: ``features`` is the batch-first
        stack of sequences padded to the longest one in the batch, and
        ``targets`` is the stacked targets reshaped to a single column.
    """
    # NOTE(review): pad_sequence appends the padding after the real steps;
    # confirm the model does not read its prediction from a padded last step.
    sequences = [sample[0] for sample in batch]
    padded = pad_sequence(sequences, batch_first=True)

    targets = [sample[1] for sample in batch]
    target_column = torch.stack(targets).view(-1, 1)

    return padded, target_column

In [6]:
# Batches are drawn in dataset order (no shuffling is requested) and padded
# to a common length by pad_collate.
dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=20,
    collate_fn=pad_collate,
)

In [7]:
# NOTE(review): the three positional arguments are presumably input size,
# hidden size and output size -- confirm against src.models.LSTMSeq2One.
model = LSTMSeq2One(15, 15, 1)

# BCELoss expects probabilities in [0, 1], so the model output is presumably
# already passed through a sigmoid -- TODO confirm.
criterion = nn.BCELoss()

optimizer = torch.optim.Adam(params=model.parameters(), lr=5e-4)

In [8]:
def validate(model, val_data, criterion):
    """Compute the mean per-sample loss of ``model`` over ``val_data``.

    Samples are evaluated one at a time (batch size 1), so no padding is
    involved. The model is switched to eval mode and is left in that mode;
    the caller is responsible for calling ``model.train()`` again.

    Args:
        model: Module called as ``model(x)`` with a float tensor that has a
            leading batch dimension of 1.
        val_data: Iterable of ``(x, y)`` tensor pairs without a batch
            dimension.
        criterion: Loss callable taking ``(output, target)``.

    Returns:
        Mean of the per-sample loss values, or ``nan`` when ``val_data``
        yields no samples.
    """
    model.eval()
    loss_history = []
    # Evaluation needs no gradients; disabling autograd avoids building a
    # graph for every validation sample.
    with torch.no_grad():
        for x, y in val_data:
            # Add the batch dimension; the target becomes shape (1, 1).
            x, y = x.unsqueeze(0), y.unsqueeze(0).unsqueeze(0)
            output = model(x.float())
            loss_history.append(criterion(output, y.float()).item())

    if not loss_history:
        return float('nan')
    return np.mean(loss_history)

In [9]:
# Per-epoch mean losses, kept for plotting / inspection after training.
train_history, val_history = [], []

for epoch in range(50):
    accumulated_loss = []
    for i, (x, y) in enumerate(dataloader):
        # validate() leaves the model in eval mode, so switch back here.
        model.train()

        optimizer.zero_grad()
        output = model(x.float())
        loss = criterion(output, y.float())
        loss.backward()
        optimizer.step()

        accumulated_loss.append(loss.item())

    train_loss = np.mean(accumulated_loss)
    val_loss = validate(model, test_dataset, criterion)

    # Checkpointing is intentionally disabled. If it is re-enabled, compare
    # val_loss against the best value seen so far and handle the first epoch,
    # where val_history is still empty and val_history[-1] would fail.

    train_history.append(train_loss)
    val_history.append(val_loss)

    print(f'Train loss: {train_loss}, Val loss: {val_loss}, epoch: {epoch}')

Train loss: 0.6395974801504676, Val loss: 0.7443834519318668, epoch: 0
Train loss: 0.6170504987239838, Val loss: 0.7668174200448263, epoch: 1
Train loss: 0.6125508397020758, Val loss: 0.7657479862146671, epoch: 2
Train loss: 0.6099962778588667, Val loss: 0.7617590922977009, epoch: 3
Train loss: 0.6075369954746674, Val loss: 0.7574549435458269, epoch: 4
Train loss: 0.6049287398868703, Val loss: 0.7527343972285717, epoch: 5


KeyboardInterrupt: 