## Установка библиотек

In [None]:
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
conda install -c dglteam/label/cu118 dgl
conda install pyg -c pyg
conda install scikit-learn pandas pyyaml ipywidgets

## Импорт библиотек

In [1]:
import os
os.environ["DGLBACKEND"] = "pytorch"
from pathlib import Path
import traceback

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

import dgl
import dgl.nn as dglnn
import dgl.function as fn
from dgl.data import DGLDataset

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import OneHotEncoder

---

## Работа с датасетом

In [None]:
train = pd.read_csv('train_dataset_VK/train.csv')
test  = pd.read_csv('train_dataset_VK/test.csv')
attr  = pd.read_csv('train_dataset_VK/attr.csv')

In [3]:
train

Unnamed: 0,ego_id,u,v,t,x1,x2,x3
0,0,131,84,148.0,5.669200e-07,0.000000,0.0
1,0,135,164,396.7,6.246274e-02,0.000000,0.0
2,0,47,15,,0.000000e+00,0.000000,1.0
3,0,5,4,594.5,4.962974e-02,0.000000,0.0
4,0,176,219,45.5,1.237935e+00,0.000000,0.0
...,...,...,...,...,...,...,...
122280367,1709396984692,3,5,34.6,2.307750e+00,1.098612,0.0
122280368,1709396984692,1,5,53.8,3.729143e+00,3.496508,1.0
122280369,1709396984692,1,7,1.5,4.286984e+00,0.000000,0.0
122280370,1709396984692,5,11,2.0,3.500757e+00,0.000000,0.0


In [4]:
attr

Unnamed: 0,ego_id,u,age,city_id,sex,school,university
0,0,227,68,-1,1,778293348,-1
1,0,45,38,237065842,1,82803468,238500268
2,0,142,60,237065842,1,196560139,-1
3,0,280,66,-1,2,963209731,720783270
4,0,41,18,-1,2,308862409,-1
...,...,...,...,...,...,...,...
14930743,1709396984692,2,16,492149712,2,769209871,-1
14930744,1709396984692,12,15,-1,1,-1,-1
14930745,1709396984692,18,23,-1,1,-1,-1
14930746,1709396984692,4,16,650683235,1,-1,-1


In [45]:
train.corr()

Unnamed: 0,ego_id,u,v,t,x1,x2,x3
ego_id,1.0,0.001828,0.00184,0.00142,0.000228,-0.001061,-0.000929
u,0.001828,1.0,0.21833,-0.065169,0.011669,0.001498,-0.064033
v,0.00184,0.21833,1.0,-0.092008,-0.019556,-0.022957,-0.022212
t,0.00142,-0.065169,-0.092008,1.0,-0.190351,-0.07375,-0.02547
x1,0.000228,0.011669,-0.019556,-0.190351,1.0,0.67887,0.08936
x2,-0.001061,0.001498,-0.022957,-0.07375,0.67887,1.0,0.136608
x3,-0.000929,-0.064033,-0.022212,-0.02547,0.08936,0.136608,1.0


### Разделение датасета на CSV

In [4]:
!mkdir train
!mkdir test

In [5]:
train.ego_id.unique().shape, test.ego_id.unique().shape

((61786,), (20596,))

In [20]:
from multiprocessing import Pool

lookup_train = train["ego_id"].value_counts()
unique_train = train["ego_id"].unique()

cum_train = np.cumsum(lookup_train[unique_train])
cum_train = np.hstack(([0], cum_train))

table_train = {unique_train[i-1] : (cum_train[i-1], cum_train[i]) for i in range(1, cum_train.shape[0])}


def split_train_csv(id):
    low, high = table_train[id]
    train.iloc[low : high].to_csv(f"train/{id}.csv")


lookup_test = test["ego_id"].value_counts()
unique_test = test["ego_id"].unique()

cum_test = np.cumsum(lookup_test[unique_test])
cum_test = np.hstack(([0], cum_test))

table_test = {unique_test[i-1] : (cum_test[i-1], cum_test[i]) for i in range(1, cum_test.shape[0])}


def split_test_csv(id):
    low, high = table_test[id]
    test.iloc[low : high].to_csv(f"test/{id}.csv")

In [26]:
pool = Pool()

pool.map(split_train_csv, train["ego_id"].unique())
pool.map(split_test_csv, test["ego_id"].unique())

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [None]:
pd.read_csv('test/8.csv')

Unnamed: 0.1,Unnamed: 0,ego_id,u,v,t,x1,x2,x3
0,0,8,20,19,185.7,3.839089e-04,0.000000,0.0
1,1,8,131,125,161.4,4.034464e-01,0.000000,0.0
2,2,8,73,56,127.0,8.554643e-05,0.000000,0.0
3,3,8,0,4,594.5,2.886418e-01,0.000000,0.0
4,4,8,63,73,127.0,4.281692e-07,0.000000,0.0
...,...,...,...,...,...,...,...,...
1013,1013,8,132,17,24.1,1.826740e+00,1.791759,0.0
1014,1014,8,29,14,346.9,,0.000000,0.0
1015,1015,8,56,59,80.0,,0.000000,0.0
1016,1016,8,14,11,300.1,,0.000000,0.0


### Формирование датасета DGL

In [2]:
attr  = pd.read_csv('train_dataset_VK/attr.csv')

In [None]:
def make_ohe(param):
    ohe = OneHotEncoder(sparse_output=False)

    encoded_data = ohe.fit_transform(param.reshape(-1, 1))
    
    # pad to the size 9, we can have at most 300 friends -> 300 differnt cities -> need 9 bits to represent 300 id's
    encoded_data = np.pad(encoded_data, ((0, 0), (300 - encoded_data.shape[1], 0)), mode='constant')

    return encoded_data

def make_embeds(params):
    params = params[["age", "sex", "city_id", "school", "university"]].values.T

    out = np.empty((params.shape[1], 902))

    out[:, 0] = params[0]
    out[:, 1] = params[1]
    out[:, 2:302] = make_ohe(params[2])
    out[:, 302:602] = make_ohe(params[3])
    out[:, 602:902] = make_ohe(params[4])
    return out

In [None]:
class VKDataset(dgl.data.DGLDataset):
    def __init__(self, data_path : str | Path, attr : pd.DataFrame):
        super().__init__("VKDataset")
        self.data_path = data_path
        self.attr = attr

        # sorted only for debugging
        self.file_names = sorted(os.listdir(data_path), key = lambda x: int(x[:-4]))

        self.ohe = OneHotEncoder()

        # faster lookup through attrs
        lookup_attr = attr["ego_id"].value_counts()
        unique_attr = attr["ego_id"].unique()

        cum_attr = np.cumsum(lookup_attr[unique_attr])
        cum_attr = np.hstack(([0], cum_attr))

        self.table_attr = {unique_attr[i-1] : (cum_attr[i-1], cum_attr[i]) for i in range(1, cum_attr.shape[0])}


    def __getitem__(self, id):
        id = int(self.file_names[id][:-4])

        # print(id)

        edges_data = pd.read_csv(f"{self.data_path}/{id}.csv")

        low, high = self.table_attr[id]
        nodes_data = attr.iloc[low : high]

        # missing ids filling
        table = {nodes_data.iloc[i].u : i for i in range(nodes_data.shape[0])}

        # print(nodes_data.dtypes)

        m = max(edges_data["u"].max(), edges_data["v"].max())
        new_data = []
        add_num = 0
        for i in range(m+1):
            if i not in table:
                table[i] = nodes_data.shape[0] + add_num
                new_data.append([id, table[i], -1, -1, -1, -1, -1])
                add_num += 1
        
        # print(nodes_data.dtypes)

        nodes_data = pd.concat((nodes_data, pd.DataFrame(new_data, columns = nodes_data.columns, dtype="int64")), ignore_index=True)

        # print(nodes_data.dtypes)

        # weird bug, where df dtypes turn to objects
        # nodes_data = nodes_data.astype({col : "int64" for col in nodes_data.columns})

        edges_data["u"] = edges_data["u"].apply(lambda x: table[x])
        edges_data["v"] = edges_data["v"].apply(lambda x: table[x])
        
        node_features_age = torch.from_numpy(nodes_data["age"].to_numpy()).float()
        # node_features_city = torch.from_numpy(nodes_data["city_id"].to_numpy())
        node_features_sex = torch.from_numpy(nodes_data["sex"].to_numpy()).float()

        node_features_city = torch.from_numpy(self.ohe.fit_transform(nodes_data[["city_id"]]).toarray())
        node_features_school = torch.from_numpy(self.ohe.fit_transform(nodes_data[["school"]]).toarray())
        node_features_university = torch.from_numpy(self.ohe.fit_transform(nodes_data[["university"]]).toarray())
        # print(node_features_city)

        # node_features = torch.tensor([node_features_age, node_features_sex, *node_features_city[0], *node_features_school[0], *node_features_university[0]])
        
        # node_features = torch.tensor([node_features_age, node_features_sex, ])
        node_features = torch.tensor(make_embeds(nodes_data))
        # node_features = torch.from_numpy(np.vstack((node_features_age, node_features_sex)).T)

        # node_features_school = torch.from_numpy(nodes_data["school"].to_numpy())
        # node_features_university = torch.from_numpy(nodes_data["university"].to_numpy())


        # node_labels = torch.from_numpy(
        #     nodes_data["Club"].astype("category").cat.codes.to_numpy()
        # )

        # edge_features_t = torch.from_numpy(edges_data["t"].to_numpy())
        edge_features_x1 = torch.from_numpy(edges_data["x1"].to_numpy())
        # edge_features_x2 = torch.from_numpy(edges_data["x2"].to_numpy())

        edges_src = torch.from_numpy(edges_data["u"].to_numpy())
        edges_dst = torch.from_numpy(edges_data["v"].to_numpy())

        # graph = dgl.graph(
        #     (edges_src, edges_dst), num_nodes=nodes_data.shape[0]
        # )

        graph = dgl.graph(
            (np.concatenate([edges_src, edges_dst]), np.concatenate([edges_dst, edges_src])), num_nodes=nodes_data.shape[0]
        )
        
        # graph.ndata["age"] = node_features_age
        # graph.ndata["city_id"] = node_features_city
        # graph.ndata["sex"] = node_features_sex

        # graph.ndata["school"] = node_features_school
        # graph.ndata["university"] = node_features_university

        graph.ndata["feature"] = node_features

        # graph.edata["t"] = torch.concatenate((edge_features_t, edge_features_t))
        graph.edata["x1"] = torch.concatenate((edge_features_x1, edge_features_x1))
        # graph.edata["x2"] = edge_features_x2

        # If your dataset is a node classification dataset, you will need to assign
        # masks indicating whether a node belongs to training, validation, and test set.

        # n_nodes = nodes_data.shape[0]
        # n_train = int(n_nodes * 0.6)
        # n_val = int(n_nodes * 0.2)
        # train_mask = torch.zeros(n_nodes, dtype=torch.bool)
        # val_mask = torch.zeros(n_nodes, dtype=torch.bool)
        # test_mask = torch.zeros(n_nodes, dtype=torch.bool)
        # train_mask[:n_train] = True
        # val_mask[n_train : n_train + n_val] = True
        # test_mask[n_train + n_val :] = True
        # graph.ndata["train_mask"] = train_mask
        # graph.ndata["val_mask"] = val_mask
        # graph.ndata["test_mask"] = test_mask

        return graph #, node_features, edge_features_x1

    def __len__(self):
        return len(self.file_names)

    def process(self):
        pass

    # def getitem(self, i):
    #     return self.graph

    # def len(self):
    #     return 1

# dataset = VKDataset("data/train", attr)
# graph = dataset[0]
# # 53373
# print(graph)

In [None]:
dataset_train = VKDataset("train", attr)
dataloader_train = dgl.dataloading.GraphDataLoader(dataset_train, batch_size = 1024, shuffle = True, drop_last=False, num_workers=4)

# dataset_test = VKDataset("data/test", attr)
# dataloader_test = dgl.dataloading.GraphDataLoader(dataset_test, batch_size = 10, drop_last=False, num_workers=2)

---

## Модель

### Обучение

In [None]:
class SAGE(nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats):
        super().__init__()
        self.conv_in = dglnn.SAGEConv(
            in_feats=in_feats, out_feats=hid_feats, aggregator_type='pool')
        self.conv2 = dglnn.SAGEConv(
            in_feats=hid_feats, out_feats=hid_feats, aggregator_type='pool')
        self.conv_last = dglnn.SAGEConv(
            in_feats=hid_feats, out_feats=out_feats, aggregator_type='lstm')


    def forward(self, graph, inputs):
        # inputs are features of nodes
        h = self.conv_in(graph, inputs)
        h = F.tanh(h)
        h = self.conv_last(graph, h)
        return h


class DotProductPredictor(nn.Module):
    def forward(self, graph, h):
        # h contains the node representations computed from the GNN defined
        # in the node classification section (Section 5.1).
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            return graph.edata['score']

In [None]:
class Encoder(nn.Module):
    def __init__(self, in_features, hidden_features, out_features):
        super().__init__()
        self.encoder_hidden_layer = nn.Linear(
            in_features=in_features, out_features=hidden_features
        )
        self.encoder_output_layer = nn.Linear(
            in_features=hidden_features, out_features=out_features
        )

    def forward(self, features):
        activation = self.encoder_hidden_layer(features)
        activation = torch.relu(activation)
        code = self.encoder_output_layer(activation)
        return code


class Model(nn.Module):
    def __init__(self, in_features=902, hidden_features=50, out_features=5):
        super().__init__()
        self.sage = SAGE(128, hidden_features, out_features)
        self.pred = DotProductPredictor()
        self.dropout = nn.Dropout(0.5)
        self.node_encoder = Encoder(in_features, 256, 128).float()
    
    def forward(self, g, x):
        x = self.node_encoder(x)
        h = self.sage(g, x)
        h = self.pred(g, h)
        return h

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [None]:
model = Model()

In [None]:
opt = torch.optim.Adam(model.parameters())

loss_func = nn.MSELoss()

# shuffle = np.random.permutation(len(dataset_train))
for epoch in range(10):
    for graph in (pbar := tqdm(dataloader_train)):
        try:
            if graph.num_edges() > 10_000: # skip very large graphs
                continue

            node_features, label = graph.ndata["feature"], graph.edata["x1"]

            graph, node_features, label = (graph.to(device), node_features.to(device).float(), label.to(device).float())
            
            opt.zero_grad()
            
            pred = model(graph, node_features)
            pred = pred.squeeze()
            loss = loss_func(label, pred)
            # loss = ((pred - label) ** 2).mean()
            
            loss.backward()
            opt.step()
            
            pbar.set_description(f"loss: {loss.item():.3f}")
            
            # if i > 10000:
            #     break
        except Exception:
            print(traceback.format_exc())

    torch.save(model.state_dict(), f"model/849_{epoch}.pth")

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

KeyboardInterrupt: 