In [1]:
# PyTorch Geometric extension wheels. The active line uses the torch 2.0.0 + CUDA 11.8 wheel index;
# the commented-out line below is the CPU-only variant of the same install.
# NOTE(review): no torch_geometric import appears in the import cell of this notebook — confirm these are still needed.
# !pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.0.0+cu118.html
!pip install torch_geometric

# DGL from the CUDA 11.8 wheel index, plus dglgo.
!pip install dgl -f https://data.dgl.ai/wheels/cu118/repo.html
!pip install dglgo -f https://data.dgl.ai/wheels-test/repo.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-2.0.0+cu118.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.dgl.ai/wheels/cu118/repo.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.dgl.ai/wheels-test/repo.html


In [2]:
from google.colab import drive

# Mount Google Drive at /content/drive; later cells load and save model checkpoints
# and write the submission file under /content/drive/MyDrive.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Fetch the dataset archive from Google Drive by file id.
# Per the recorded output, this id resolves to /content/Moscow0.zip (about 3 GB).
!pip install gdown
!gdown 1-WqODEO_TWy-fBV6f0r1B1tAlKkdS4hL

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading...
From: https://drive.google.com/uc?id=1-WqODEO_TWy-fBV6f0r1B1tAlKkdS4hL
To: /content/Moscow0.zip
100% 2.99G/2.99G [00:34<00:00, 87.3MB/s]


In [4]:
# Unpack the downloaded archive (-q: quiet).
!unzip -q Moscow0.zip

In [5]:
# Rename the extracted folder to "data", the path prefix every later cell reads from.
!mv Moscow0 data

In [6]:
import torch
from torch import nn
import torch.nn.functional as F

import dgl
import dgl.function as fn
import dgl.nn as dglnn

import networkx as nx

import numpy as np

import pandas as pd
import polars as pl
import pyarrow as pr
from pyarrow import csv

from dask import dataframe as dd

from pathlib import Path

from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import OneHotEncoder

import traceback

import os

from tqdm.notebook import tqdm

from sklearn.metrics import mean_squared_error as mse

In [7]:
# Silence pandas' chained-assignment warning, which is noisy in this notebook.
pd.options.mode.chained_assignment = None

In [8]:
# Node attribute table covering all ego networks; VKDataset slices it per ego_id.
attr = pd.read_csv("data/attr.csv")

In [9]:
# NOTE(review): this module-level encoder is not referenced by any other cell shown in this
# notebook (make_ohe builds its own encoder and VKDataset keeps one on self.ohe).
ohe = OneHotEncoder(sparse_output=False)

In [10]:
def make_ohe(param, width=300):
    """One-hot encode a 1-D array of category ids into a fixed-width matrix.

    The distinct values of ``param`` are sorted and each one gets its own
    column. The block of one-hot columns is right-aligned: the result is padded
    with zero columns on the LEFT so that every call returns exactly ``width``
    columns, independent of how many distinct values this particular input has.

    The default width of 300 follows the original author's note that an ego
    network has at most 300 members, hence at most 300 distinct values per
    attribute.

    Args:
        param: 1-D array-like of category ids (one entry per node).
        width: Number of columns in the returned matrix.

    Returns:
        ``float64`` array of shape ``(len(param), width)`` with exactly one
        ``1.0`` per row.

    Raises:
        ValueError: if ``param`` contains more than ``width`` distinct values
            (previously this surfaced as an opaque negative-pad error).
    """
    values = np.asarray(param).ravel()

    # Sorted distinct categories and, for every row, the index of its category.
    categories, codes = np.unique(values, return_inverse=True)
    codes = codes.reshape(-1)

    n_categories = categories.shape[0]
    if n_categories > width:
        raise ValueError(
            f"cannot one-hot encode {n_categories} distinct values into {width} columns"
        )

    encoded_data = np.zeros((values.shape[0], width), dtype=np.float64)
    # Shift by the amount of left padding so the categories fill the last columns.
    encoded_data[np.arange(values.shape[0]), (width - n_categories) + codes] = 1.0

    return encoded_data

def make_embeds(params):
    """Assemble the 902-column node feature matrix for one ego network.

    Column layout of the result (one row per row of ``params``):
        0        -> age
        1        -> sex
        2-301    -> one-hot of city_id
        302-601  -> one-hot of school
        602-901  -> one-hot of university

    Args:
        params: DataFrame with the columns ``age``, ``sex``, ``city_id``,
            ``school`` and ``university``.

    Returns:
        ``float64`` numpy array of shape ``(len(params), 902)``.
    """
    age, sex, city, school, university = params[
        ["age", "sex", "city_id", "school", "university"]
    ].values.T

    features = np.empty((age.shape[0], 902))
    features[:, 0] = age
    features[:, 1] = sex

    # Each categorical attribute occupies its own 300-column one-hot band.
    for start, column in ((2, city), (302, school), (602, university)):
        features[:, start:start + 300] = make_ohe(column)

    return features

In [11]:
class VKDataset(dgl.data.DGLDataset):
    """Dataset of ego networks: one bidirectional DGL graph per edge-list CSV.

    Every file ``<data_path>/<ego_id>.csv`` is read as an edge list with the
    columns ``u``, ``v`` and ``x1``. Node attributes come from ``attr``; the
    rows belonging to one ``ego_id`` are found through a precomputed positional
    ``(start, stop)`` range.

    NOTE(review): the range table is derived from cumulative per-``ego_id`` row
    counts in order of first appearance, so it is only correct when all rows of
    an ego network are stored contiguously in ``attr`` — confirm for the data.
    """

    def __init__(self, data_path : str | Path, attr : pd.DataFrame):
        """
        Args:
            data_path: Directory containing files named ``<ego_id>.csv``.
            attr: Node attribute table with at least the columns ``ego_id``,
                ``u``, ``age``, ``sex``, ``city_id``, ``school``, ``university``.
        """
        super().__init__("VKDataset")
        self.data_path = data_path
        self.attr = attr

        # Numeric order of ego ids. Code that maps batch index -> ego id relies
        # on this ordering being deterministic.
        self.file_names = sorted(os.listdir(data_path), key=lambda name: int(name[:-4]))

        # No longer used internally; kept so existing code touching `.ohe` still works.
        self.ohe = OneHotEncoder()

        # ego_id -> (start, stop) positional row range inside `attr`.
        counts = attr["ego_id"].value_counts()
        ego_ids = attr["ego_id"].unique()
        bounds = np.hstack(([0], counts.loc[ego_ids].to_numpy().cumsum()))
        self.table_attr = {ego: (bounds[i], bounds[i + 1]) for i, ego in enumerate(ego_ids)}

    def __getitem__(self, id):
        """Build the graph for the ``id``-th file (in sorted ego-id order).

        Returns:
            A ``dgl.DGLGraph`` in which every edge of the CSV appears in both
            directions (all forward edges first, then all reversed edges), with
            ``ndata["feature"]`` holding the ``make_embeds`` matrix (float64)
            and ``edata["x1"]`` holding the edge value duplicated for both
            directions.
        """
        ego_id = int(self.file_names[id][:-4])

        edges_data = pd.read_csv(f"{self.data_path}/{ego_id}.csv")

        low, high = self.table_attr[ego_id]
        # Use the table handed to the constructor, not whatever global `attr` exists.
        nodes_data = self.attr.iloc[low:high]

        # Original node id `u` -> row position inside this network's node table.
        table = {u: row for row, u in enumerate(nodes_data["u"].to_numpy())}

        # Nodes that occur in the edge list range but have no attribute row get
        # a placeholder row with -1 for every attribute.
        # NOTE(review): assumes `attr` has exactly seven columns, starting with
        # ego_id and u — TODO confirm against the CSV header.
        m = max(edges_data["u"].max(), edges_data["v"].max())
        new_data = []
        for node in range(m + 1):
            if node not in table:
                table[node] = nodes_data.shape[0] + len(new_data)
                new_data.append([ego_id, table[node], -1, -1, -1, -1, -1])

        # dtype="int64" is explicit because the original author saw the
        # concatenated columns degrade to object dtype without it.
        nodes_data = pd.concat(
            (nodes_data, pd.DataFrame(new_data, columns=nodes_data.columns, dtype="int64")),
            ignore_index=True,
        )

        edges_src = torch.from_numpy(edges_data["u"].map(table).to_numpy())
        edges_dst = torch.from_numpy(edges_data["v"].map(table).to_numpy())
        edge_features_x1 = torch.from_numpy(edges_data["x1"].to_numpy())

        node_features = torch.from_numpy(make_embeds(nodes_data))

        # Store every edge in both directions so messages flow both ways.
        graph = dgl.graph(
            (torch.cat([edges_src, edges_dst]), torch.cat([edges_dst, edges_src])),
            num_nodes=nodes_data.shape[0],
        )

        graph.ndata["feature"] = node_features
        # Same ordering as the edges above: forward values, then the same values again.
        graph.edata["x1"] = torch.cat((edge_features_x1, edge_features_x1))

        return graph

    def __len__(self):
        """Number of ego-network files found in ``data_path``."""
        return len(self.file_names)

    def process(self):
        """No up-front processing: graphs are built on demand in ``__getitem__``."""
        pass

# Smoke test: build the training dataset and print the first graph's schema.
dataset = VKDataset("data/train", attr)
graph = dataset[0]
# 53373  -- NOTE(review): meaning not evident from the notebook; possibly an ego id used while debugging.
print(graph)

Graph(num_nodes=300, num_edges=3284,
      ndata_schemes={'feature': Scheme(shape=(902,), dtype=torch.float64)}
      edata_schemes={'x1': Scheme(shape=(), dtype=torch.float64)})


In [28]:
# Training split: shuffled batches of 128 ego networks, assembled by 10 worker processes.
dataset_train = VKDataset("data/train", attr)
dataloader_train = dgl.dataloading.GraphDataLoader(
    dataset_train,
    batch_size=128,
    shuffle=True,
    drop_last=False,
    num_workers=10,
)

# Held-out split: small unshuffled batches.
dataset_test = VKDataset("data/test", attr)
dataloader_test = dgl.dataloading.GraphDataLoader(
    dataset_test,
    batch_size=10,
    drop_last=False,
    num_workers=2,
)



In [20]:
class SAGE(nn.Module):
    """GraphSAGE node encoder using LSTM neighbour aggregation.

    Three ``SAGEConv`` layers are registered (``conv_in``, ``conv2``,
    ``conv_last``), but ``forward`` applies only ``conv_in -> tanh -> conv_last``.

    NOTE(review): ``conv2`` is never called. It is still a registered
    submodule, so its weights are part of the ``state_dict``; removing it would
    change the keys that ``load_state_dict`` expects.
    """

    def __init__(self, in_feats, hid_feats, out_feats):
        super().__init__()

        def lstm_conv(n_in, n_out):
            # All layers share the same aggregator; only the sizes differ.
            return dglnn.SAGEConv(in_feats=n_in, out_feats=n_out, aggregator_type='lstm')

        # Registration order is kept as conv_in, conv2, conv_last.
        self.conv_in = lstm_conv(in_feats, hid_feats)
        self.conv2 = lstm_conv(hid_feats, hid_feats)
        self.conv_last = lstm_conv(hid_feats, out_feats)

    def forward(self, graph, inputs):
        """Map node features ``inputs`` on ``graph`` to node representations."""
        hidden = F.tanh(self.conv_in(graph, inputs))
        return self.conv_last(graph, hidden)


class DotProductPredictor(nn.Module):
    """Scores each edge with the dot product of its two endpoint representations."""

    def forward(self, graph, h):
        """Return one score per edge of ``graph`` given node representations ``h``.

        The temporary ``'h'`` / ``'score'`` fields are written inside a local
        scope, so the caller's graph is not modified.
        """
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            edge_scores = graph.edata['score']
        return edge_scores

In [14]:
class Encoder(nn.Module):
    """Two-layer perceptron: ``Linear -> ReLU -> Linear``.

    Args:
        in_features: Size of each input vector.
        hidden_features: Width of the hidden layer.
        out_features: Size of the produced code.
    """

    def __init__(self, in_features, hidden_features, out_features):
        super().__init__()
        self.encoder_hidden_layer = nn.Linear(in_features, hidden_features)
        self.encoder_output_layer = nn.Linear(hidden_features, out_features)

    def forward(self, features):
        """Encode ``features`` (last dimension ``in_features``) to ``out_features`` values."""
        hidden = torch.relu(self.encoder_hidden_layer(features))
        return self.encoder_output_layer(hidden)


class Model(nn.Module):
    """Edge regressor: MLP node encoder -> GraphSAGE -> dot-product edge score.

    Raw node features (``in_features`` wide) are compressed by an ``Encoder``
    with a fixed 256-unit hidden layer to a fixed 128 values per node. ``SAGE``
    turns those into ``out_features``-dimensional node representations, and the
    predictor scores every edge from its two endpoints.
    """

    def __init__(self, in_features=902, hidden_features=100, out_features=15):
        super().__init__()
        # Submodules are registered in the order sage, pred, node_encoder.
        self.sage = SAGE(128, hidden_features, out_features)
        self.pred = DotProductPredictor()
        self.node_encoder = Encoder(in_features, 256, 128).float()

    def forward(self, g, x):
        """Return one score per edge of ``g`` for node features ``x``."""
        encoded = self.node_encoder(x)
        node_repr = self.sage(g, encoded)
        return self.pred(g, node_repr)

In [15]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [16]:
# Instantiate with the default sizes (902 input features, 100 hidden, 15 output).
model = Model()

In [21]:
# Warm-start from a checkpoint stored on Drive; map_location='cpu' places the loaded tensors on the CPU.
model.load_state_dict(torch.load('/content/drive/MyDrive/737_2.pth', map_location='cpu'))
# Switches to eval mode; as the cell's last expression it also prints the module tree.
# The following cell puts the model back into train mode.
model.eval()

Model(
  (sage): SAGE(
    (conv_in): SAGEConv(
      (feat_drop): Dropout(p=0.0, inplace=False)
      (lstm): LSTM(128, 128, batch_first=True)
      (fc_neigh): Linear(in_features=128, out_features=100, bias=False)
      (fc_self): Linear(in_features=128, out_features=100, bias=True)
    )
    (conv2): SAGEConv(
      (feat_drop): Dropout(p=0.0, inplace=False)
      (lstm): LSTM(100, 100, batch_first=True)
      (fc_neigh): Linear(in_features=100, out_features=100, bias=False)
      (fc_self): Linear(in_features=100, out_features=100, bias=True)
    )
    (conv_last): SAGEConv(
      (feat_drop): Dropout(p=0.0, inplace=False)
      (lstm): LSTM(100, 100, batch_first=True)
      (fc_neigh): Linear(in_features=100, out_features=15, bias=False)
      (fc_self): Linear(in_features=100, out_features=15, bias=True)
    )
  )
  (pred): DotProductPredictor()
  (node_encoder): Encoder(
    (encoder_hidden_layer): Linear(in_features=902, out_features=256, bias=True)
    (encoder_output_layer): 

In [22]:
# Move the model to the selected device and enable training mode.
model = model.to(device).train()

In [29]:
# Ego networks with more edges than this are left out of training.
MAX_EDGES_PER_GRAPH = 10_000
N_EPOCHS = 10

opt = torch.optim.Adam(model.parameters())

loss_func = nn.MSELoss()


def _drop_large_graphs(batched_graph, max_edges):
    """Return ``batched_graph`` without member graphs larger than ``max_edges``.

    The limit is applied per ego network. Checking ``num_edges()`` of the whole
    batch instead would compare the SUM over all graphs in the batch, which
    rejects practically every batch once ``batch_size`` is larger than a few.

    Returns:
        The (possibly re-batched) graph, or ``None`` when no member graph is
        small enough.
    """
    edges_per_graph = batched_graph.batch_num_edges()
    if bool((edges_per_graph <= max_edges).all()):
        return batched_graph

    kept = [g for g in dgl.unbatch(batched_graph) if g.num_edges() <= max_edges]
    return dgl.batch(kept) if kept else None


model.train()

for epoch in range(N_EPOCHS):
    for graph in (pbar := tqdm(dataloader_train)):
        try:
            graph = _drop_large_graphs(graph, MAX_EDGES_PER_GRAPH)
            if graph is None:
                continue

            node_features, label = graph.ndata["feature"], graph.edata["x1"]

            # Features are stored as float64; the model's parameters are float32.
            graph = graph.to(device)
            node_features = node_features.to(device).float()
            label = label.to(device).float()

            opt.zero_grad()

            # The predictor yields shape (num_edges, 1). Drop only that last
            # axis so a single-edge batch does not collapse to a scalar.
            pred = model(graph, node_features).squeeze(-1)
            loss = loss_func(pred, label)  # (input, target) order

            loss.backward()
            opt.step()

            pbar.set_description(f"loss: {loss.item():.3f}")
        except Exception:
            # Deliberately best-effort: report the failing batch and keep
            # training instead of aborting the whole run.
            print(traceback.format_exc())

    # One checkpoint per epoch on Drive.
    torch.save(model.state_dict(), f"/content/drive/MyDrive/857_{epoch}.pth")

  0%|          | 0/483 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ff8678c9ab0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ff8678c9ab0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/

  0%|          | 0/483 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ff8678c9ab0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ff8678c9ab0>
<function _MultiProcessingDataLoaderIter.__del__ at 0x7ff8678c9ab0>
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1478, in __del__
  File "/usr/local/lib/python3.10/dist-packages/torch/u

  0%|          | 0/483 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ff8678c9ab0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ff8678c9ab0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/

  0%|          | 0/483 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ff8678c9ab0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ff8678c9ab0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/

KeyboardInterrupt: ignored

### Submit

In [None]:
# model = Model().to(device)

# model.load_state_dict(torch.load("/content/drive/MyDrive/model_228.csv"))

# model = model.to(device)

In [None]:
# Download the example submission file from Google Drive by file id
# (presumably saved as submission.csv, which the next cells read — confirm).
!gdown 1-3BbzlQAfD_VQvtE8189P8UfSv0UvciK

In [None]:
# Load the submission template; it is split per ego_id below and later filled with predictions in its x1 column.
submit = pd.read_csv("submission.csv")

In [None]:
# split submit dataset
!mkdir submit

submit = pd.read_csv("submission.csv")

from multiprocessing import Pool

lookup_submit = submit["ego_id"].value_counts()
unique_submit = submit["ego_id"].unique()

cum_submit = np.cumsum(lookup_submit[unique_submit])
cum_submit = np.hstack(([0], cum_submit))

table_submit = {unique_submit[i-1] : (cum_submit[i-1], cum_submit[i]) for i in range(1, cum_submit.shape[0])}


def split_submit_csv(id):
    low, high = table_submit[id]
    submit.iloc[low : high].to_csv(f"submit/{id}.csv", index=None)

pool = Pool()

pool.map(split_submit_csv, submit["ego_id"].unique())

!mv submit data/submit

In [None]:
# Submission graphs are served one at a time and in dataset order, so the i-th batch
# corresponds to the i-th file of dataset_submit.
dataset_submit = VKDataset("data/submit", attr)
dataloader_submit = dgl.dataloading.GraphDataLoader(
    dataset_submit,
    batch_size=1,
    drop_last=False,
    num_workers=2,
    shuffle=False,
)

In [None]:
# Debug: shape of whatever `pred` an earlier cell left in the kernel.
pred.shape

In [None]:
submit = pd.read_csv("submission.csv")
# Predictions are floats; make sure the target column can hold them.
submit["x1"] = submit["x1"].astype("float64")


def calculate_metric(truth, pred):
    """Return the mean squared error between ``truth`` and ``pred``.

    NOTE(review): callers that were commented out labelled this value "RMSE",
    but it is the plain (squared) MSE. Take the square root at the call site if
    RMSE is what should be reported.
    """
    # `squared=True` was the default and the keyword is gone from newer
    # scikit-learn releases, so it is simply omitted.
    return mse(truth, pred)


metric_history = []

# ego_id -> (start, stop) positional row range inside `submit`.
# NOTE(review): only correct when every ego_id's rows are contiguous in the file.
lookup_submit = submit["ego_id"].value_counts()
unique_submit = submit["ego_id"].unique()

cum_submit = np.cumsum(lookup_submit[unique_submit])
cum_submit = np.hstack(([0], cum_submit))

table_submit = {unique_submit[i-1] : (cum_submit[i-1], cum_submit[i]) for i in range(1, cum_submit.shape[0])}

# The loop maps batch index -> file -> ego_id, which requires exactly one graph per batch
# in dataset order (batch_size=1, shuffle=False).
assert len(dataloader_submit) == len(dataset_submit), "dataloader_submit must use batch_size=1"

x1_column = submit.columns.get_loc("x1")

model.eval()
with torch.no_grad():
    for i, graph in (pbar := tqdm(enumerate(dataloader_submit), total = len(dataloader_submit))):
        # The dataset iterates files sorted by ego id, which need not match the
        # order of ego ids in submission.csv, so look the rows up by ego id
        # rather than by batch position.
        ego_id = int(dataset_submit.file_names[i][:-4])

        node_features = graph.ndata["feature"].to(device).float()
        graph = graph.to(device)

        pred = model(graph, node_features).squeeze(-1).cpu().numpy()
        # Every edge is stored in both directions (forward edges first), so only
        # the first half of the scores lines up with the rows of the CSV.
        pred = pred[:pred.shape[0] // 2]

        low, high = table_submit[ego_id]
        # Single positional assignment on the frame itself; a chained
        # `submit["x1"].iloc[...] = ...` may write to a temporary copy.
        submit.iloc[low:high, x1_column] = pred

In [None]:
# Persist the filled-in submission to Drive, without the pandas index column.
submit.to_csv("/content/drive/MyDrive/submit2cups_v2.2.csv", index=False)

In [None]:
# Debug: edge count of the last `graph` left in the kernel.
graph.num_edges()

In [None]:
# Debug: pull the first batch from the submission loader and move it to the device.
g = next(iter(dataloader_submit)).to(device)

In [None]:
# Debug: display the batch fetched above.
g

In [None]:
# Debug: display the same batch again (duplicate of the previous cell).
g

In [None]:
# Debug: run the model on that single batch.
# NOTE(review): runs outside torch.no_grad(), so the result carries autograd history.
pred = model(g, g.ndata["feature"].float()).squeeze()

In [None]:
# Debug: display the predictions computed above.
pred

# split dataset

In [None]:
# Output folders for the per-ego-network CSV files written by the cells below.
!mkdir train
!mkdir test
!mkdir submit

In [None]:
# First version of the split: one boolean-mask scan of the whole table per ego_id,
# writing <folder>/<ego_id>.csv (the pandas index is written too, since index is not disabled).
# NOTE(review): `train` and `test` are not defined in any cell shown in this notebook, so
# they must already exist in the kernel. The loop variable `id` also shadows the builtin
# for later cells. A later cell performs the same split with positional slices and a process pool.
for id in train["ego_id"].unique():
    train[train["ego_id"] == id].to_csv(f"train/{id}.csv")

for id in test["ego_id"].unique():
    test[test["ego_id"] == id].to_csv(f"test/{id}.csv")

In [None]:
from multiprocessing import Process


def split_train_csv(id):
    """Write the rows of `train` belonging to ego network `id` to train/<id>.csv."""
    train[train["ego_id"] == id].to_csv(f"train/{id}.csv")


def split_test_csv(id):
    """Write the rows of `test` belonging to ego network `id` to test/<id>.csv.

    This function used to be declared as a second `split_train_csv`, which
    silently replaced the train splitter and left `split_test_csv` undefined.
    """
    test[test["ego_id"] == id].to_csv(f"test/{id}.csv")


# NOTE(review): this creates one OS process per ego network and starts them all
# at once; the Pool-based cell further down bounds the number of workers.
processes_train = [Process(target=split_train_csv, args=(id,)) for id in train["ego_id"].unique()]

processes_test = [Process(target=split_test_csv, args=(id,)) for id in test["ego_id"].unique()]


# Finish the train split completely before starting the test split.
for process in processes_train:
    process.start()

for process in processes_train:
    process.join()

for process in processes_test:
    process.start()

for process in processes_test:
    process.join()


In [None]:
# Scratch: 5x5 frame of random integers in [0, 5), used to try out value_counts below.
data = pd.DataFrame(np.random.randint(0, 5, size=(5, 5)))

In [None]:
# Scratch: value -> occurrence count for column 1 of the toy frame.
lookup = dict(data[1].value_counts())

In [None]:
# Scratch: running totals of 1..4, i.e. the cumulative-count idea behind the row-range tables.
np.cumsum(np.arange(1, 5))

In [None]:
from multiprocessing import Pool

# ego_id -> (start, stop) positional row range inside `train`.
# NOTE(review): built from cumulative row counts in order of first appearance, so it is
# only correct when every ego_id's rows are contiguous in the table — confirm.
lookup_train = train["ego_id"].value_counts()
unique_train = train["ego_id"].unique()

cum_train = np.cumsum(lookup_train[unique_train])
cum_train = np.hstack(([0], cum_train))

table_train = {unique_train[i-1] : (cum_train[i-1], cum_train[i]) for i in range(1, cum_train.shape[0])}


def split_train_csv(id):
    """Write the rows of `train` belonging to ego network `id` to train/<id>.csv."""
    low, high = table_train[id]
    train.iloc[low : high].to_csv(f"train/{id}.csv", index=None)


# Same construction for `test`.
lookup_test = test["ego_id"].value_counts()
unique_test = test["ego_id"].unique()

cum_test = np.cumsum(lookup_test[unique_test])
cum_test = np.hstack(([0], cum_test))

table_test = {unique_test[i-1] : (cum_test[i-1], cum_test[i]) for i in range(1, cum_test.shape[0])}


def split_test_csv(id):
    """Write the rows of `test` belonging to ego network `id` to test/<id>.csv."""
    low, high = table_test[id]
    test.iloc[low : high].to_csv(f"test/{id}.csv", index=None)


# The context manager shuts the worker processes down after both (blocking)
# maps have finished, instead of leaving them alive for the rest of the session.
with Pool() as pool:
    pool.map(split_train_csv, train["ego_id"].unique())

    pool.map(split_test_csv, test["ego_id"].unique())

In [None]:
!mkdir submit

submit = pd.read_csv("submission.csv")

from multiprocessing import Pool

lookup_submit = submit["ego_id"].value_counts()
unique_submit = submit["ego_id"].unique()

cum_submit = np.cumsum(lookup_submit[unique_submit])
cum_submit = np.hstack(([0], cum_submit))

table_submit = {unique_submit[i-1] : (cum_submit[i-1], cum_submit[i]) for i in range(1, cum_submit.shape[0])}


def split_submit_csv(id):
    low, high = table_submit[id]
    submit.iloc[low : high].to_csv(f"submit/{id}.csv", index=None)

pool = Pool()

pool.map(split_submit_csv, submit["ego_id"].unique())

In [None]:
# Place the per-ego submission files next to data/train and data/test.
# NOTE(review): if data/submit already exists, mv nests the folder as data/submit/submit.
!mv submit data/submit