In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
from datetime import datetime

import sys
import os
sys.path.append(os.path.abspath('..'))
from data.preprocessing import Preprocessor
from data.feature_generation import zpp4_agg_features, zpp4_embed_agg, spec_agg_features

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.loader import DataLoader
from torch_geometric.nn.models import GAT

In [2]:
from sklearn.metrics import accuracy_score, roc_auc_score

import sys
import os
sys.path.append(os.path.abspath('..'))
from src.metrics import apk, mapk

In [3]:
"""
Загрузка данных
"""

# таблица со спецификациями
spec = pd.read_csv('../data/processed_data/specs.csv')
spec['spec_date'] = pd.to_datetime(spec['spec_date'], format='%Y-%m-%d')
spec['delivery_period_end'] = pd.to_datetime(spec['delivery_period_end'], format='%Y-%m-%d')

# таблица с доставками
zpp4 = pd.read_csv('../data/processed_data/zpp4.csv')
zpp4['date'] = pd.to_datetime(zpp4['date'], format='%Y-%m-%d')
zpp4['spec_date'] = pd.to_datetime(zpp4['spec_date'], format='%Y-%m-%d')

In [4]:
"""
Генерация фичей
"""

# фичи эмбеддинги
spec = zpp4_embed_agg(spec)

features = [str(i) for i in range(16)]

spec = spec.fillna(0)

In [5]:
# разделение набора данных
split_point = spec['spec_date'].quantile(0.8)
train = spec[spec['spec_date'] < split_point]
test = spec[spec['spec_date'] >= split_point]

In [6]:
def df_to_graph(df, features, target):

    # генерация индексов рёбер
    edge_index = []
    for i in range(len(df)):
        for j in range(i):
            edge_index.append([i, j])
        for j in range(i+1, len(df)):
            edge_index.append([i, j])
    
    edge_index = torch.tensor(edge_index, dtype=torch.long).view(2, -1)
    x = torch.tensor(df[features].values, dtype=torch.float)
    y = torch.tensor(df[target].values, dtype=torch.float)
    return Data(x=x, y=y, edge_index=edge_index)

In [7]:
train_dataset = [df_to_graph(train[train['spec_date'] == day], features, 'bids_contracted') for day in train['spec_date'].unique()]
test_dataset = [df_to_graph(test[test['spec_date'] == day], features, 'bids_contracted') for day in test['spec_date'].unique()]

In [8]:
loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [261]:
class GCN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.conv2 = GCNConv(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = self.relu(x)
        x = self.conv2(x, edge_index)
        x = self.sigmoid(x)

        return x

In [9]:
model = GAT(16, 32, 2, 1, dropout=0.05)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [10]:
model.train()
for epoch in range(1000):
    for batch in loader:
        out = model(batch.x, batch.edge_index)
        out = F.sigmoid(out)
        loss = criterion(out, batch.y.view(-1, 1))
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    if epoch % 10 == 0 and epoch != 0:
        print(loss.item())

0.6765897274017334
0.6563298106193542
0.6590983867645264
0.6618518829345703
0.6428015232086182
0.6332069039344788
0.6503934264183044
0.6404200196266174
0.6260976791381836
0.6427507400512695
0.6247395277023315
0.6220357418060303
0.6391492486000061
0.6189703345298767
0.6284834146499634
0.6135326027870178
0.6166964173316956
0.6078242659568787
0.6172433495521545
0.6130870580673218
0.6057910323143005
0.6295979619026184
0.603305459022522
0.5951284170150757
0.5925212502479553
0.6068510413169861
0.5869389176368713
0.6077035665512085
0.6227427124977112
0.613279402256012
0.5921937823295593
0.6076918244361877
0.6195913553237915
0.5941442251205444
0.6116681098937988
0.6003482341766357
0.6231428384780884
0.6258254647254944
0.5992532968521118
0.599688708782196
0.6295122504234314
0.6086942553520203
0.6024848222732544
0.6057611107826233
0.632943868637085
0.5843992829322815
0.6055697798728943
0.59395831823349
0.6086865067481995
0.6110958456993103
0.5980188846588135
0.6209535002708435
0.5919452905654907

KeyboardInterrupt: 

In [11]:
F.sigmoid(model(test_dataset[1].x, test_dataset[0].edge_index)).detach().numpy().squeeze()

array([0.8016743 , 0.7280059 , 0.834715  , 0.82985306, 0.7701392 ,
       0.80561376, 0.8282531 , 0.75745785, 0.5418888 , 0.58603686,
       0.8087782 , 0.6818333 ], dtype=float32)

In [12]:
test_dataset[1].y.numpy()

array([1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1.], dtype=float32)

In [13]:
y_real = test_dataset[0].y.numpy()
y_pred = F.sigmoid(model(test_dataset[0].x, test_dataset[0].edge_index)).detach().numpy().squeeze()

In [14]:
apk(list(y_real), list(np.round(y_pred)))

0.16666666666666666

In [15]:
def eval(model, dataset, metric_func=mapk):
    model.eval()
    return np.mean([
        metric_func(data.y.numpy(), model(data.x, data.edge_index).detach().numpy().reshape(-1))
        for data in dataset
    ])

In [16]:
# sas = lambda y_real, y_pred: apk(list(y_real), list(np.round(y_pred)))
sas = lambda y_real, y_pred: (1. if len(np.unique(y_real)) <= 1 else roc_auc_score(y_real, y_pred))

In [17]:
eval(model, train_dataset, sas)

0.658758639283379

In [18]:
eval(model, test_dataset, sas)

0.5827347138933746

In [37]:
input_features = 3
output_features = 2

x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], dtype=torch.float)
edge_index = torch.tensor([[0, 1, 2, 0, 3], [1, 2, 0, 3, 1]], dtype=torch.long)

data = Data(x=x, edge_index=edge_index)

model = GCN(input_features=input_features, output_features=output_features)

In [10]:
model(data)

tensor([[-1.2854, -0.3237],
        [-1.4819, -0.2577],
        [-1.3450, -0.3018],
        [-1.3129, -0.3134]], grad_fn=<LogSoftmaxBackward0>)

In [60]:
data1 = Data(x=torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float), edge_index=torch.tensor([[0], [1]], dtype=torch.long))
dataset = [data1, data, data1]
loader = DataLoader(dataset, batch_size=2, shuffle=True)