In [68]:
import numpy as np
import pandas as pd

import seaborn as sns
from datetime import datetime

import sys
import os
sys.path.append(os.path.abspath('..'))
from data.preprocessing import Preprocessor
from data.feature_generation import zpp4_agg_features, zpp4_embed_agg, spec_agg_features

import torch
import torch.nn as nn
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.loader import DataLoader

In [150]:
from sklearn.metrics import accuracy_score, roc_auc_score

import sys
import os
sys.path.append(os.path.abspath('..'))
from src.metrics import apk, mapk

In [2]:
"""
Загрузка данных
"""

# таблица со спецификациями
spec = pd.read_csv('../data/processed_data/specs.csv')
spec['spec_date'] = pd.to_datetime(spec['spec_date'], format='%Y-%m-%d')
spec['delivery_period_end'] = pd.to_datetime(spec['delivery_period_end'], format='%Y-%m-%d')

# таблица с доставками
zpp4 = pd.read_csv('../data/processed_data/zpp4.csv')
zpp4['date'] = pd.to_datetime(zpp4['date'], format='%Y-%m-%d')
zpp4['spec_date'] = pd.to_datetime(zpp4['spec_date'], format='%Y-%m-%d')

In [3]:
"""
Генерация фичей
"""

# фичи эмбеддинги
spec = zpp4_embed_agg(spec)

features = [str(i) for i in range(16)]

spec = spec.dropna()

In [4]:
split_point = spec['spec_date'].quantile(0.8)

train = spec[spec['spec_date'] < split_point]
test = spec[spec['spec_date'] >= split_point]

In [105]:
def df_to_graph(df, features, target):

    # генерация индексов рёбер
    edge_index = []
    for i in range(len(df)):
        for j in range(i):
            edge_index.append([i, j])
        for j in range(i+1, len(df)):
            edge_index.append([i, j])
    
    edge_index = torch.tensor(edge_index, dtype=torch.long).view(2, -1)
    x = torch.tensor(df[features].values, dtype=torch.float)
    y = torch.tensor(df[target].values, dtype=torch.float)
    return Data(x=x, y=y, edge_index=edge_index)

In [106]:
train_dataset = [df_to_graph(train[train['spec_date'] == day], features, 'bids_contracted') for day in train['spec_date'].unique()]
test_dataset = [df_to_graph(test[test['spec_date'] == day], features, 'bids_contracted') for day in test['spec_date'].unique()]

In [228]:
loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [261]:
class GCN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.conv2 = GCNConv(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = self.relu(x)
        x = self.conv2(x, edge_index)
        x = self.sigmoid(x)

        return x

In [281]:
model = GCN(16, 32, 1)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

In [282]:
model.train()
for epoch in range(1000):
    for batch in loader:
        out = model(batch)
        loss = criterion(out, batch.y.view(-1, 1))
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    if epoch % 100 == 0 and epoch != 0:
        print(loss.item())

0.6123367547988892
0.6962488293647766
0.5693516731262207
0.5944064855575562
0.5909778475761414
0.6405900716781616
0.6174203753471375
0.6108258962631226
0.5658377408981323


In [273]:
model(test_dataset[0]).detach().numpy().squeeze()

array([0.5759759 , 0.5183669 , 0.47204107, 0.6619397 , 0.64461374,
       0.5601014 , 0.61113197, 0.32216984, 0.2878099 , 0.26217937,
       0.2547952 , 0.32068512, 0.4416008 , 0.620687  , 0.5026858 ],
      dtype=float32)

In [274]:
test_dataset[0].y.numpy()

array([0., 0., 1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1.],
      dtype=float32)

In [283]:
y_real = test_dataset[0].y.numpy()
y_pred = model(test_dataset[0]).detach().numpy().squeeze()

In [284]:
apk(list(y_real), list(np.round(y_pred)))

0.125

In [285]:
def eval(model, dataset, metric_func=mapk):
    model.eval()
    return np.mean([
        metric_func(data.y.numpy(), model(data).detach().numpy().reshape(-1))
        for data in dataset
    ])

In [286]:
# sas = lambda y_real, y_pred: apk(list(y_real), list(np.round(y_pred)))
sas = lambda y_real, y_pred: (1. if len(np.unique(y_real)) <= 1 else roc_auc_score(y_real, y_pred))

In [287]:
eval(model, train_dataset, sas)

0.6868049039233293

In [288]:
eval(model, test_dataset, sas)

0.6298414681148551

In [37]:
input_features = 3
output_features = 2

x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], dtype=torch.float)
edge_index = torch.tensor([[0, 1, 2, 0, 3], [1, 2, 0, 3, 1]], dtype=torch.long)

data = Data(x=x, edge_index=edge_index)

model = GCN(input_features=input_features, output_features=output_features)

In [10]:
model(data)

tensor([[-1.2854, -0.3237],
        [-1.4819, -0.2577],
        [-1.3450, -0.3018],
        [-1.3129, -0.3134]], grad_fn=<LogSoftmaxBackward0>)

In [60]:
data1 = Data(x=torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float), edge_index=torch.tensor([[0], [1]], dtype=torch.long))
dataset = [data1, data, data1]
loader = DataLoader(dataset, batch_size=2, shuffle=True)