In [158]:
import os
import pandas as pd
import numpy as np

### Data

In [711]:
from typing import Union

import torch

class AddTrainValTestMask(object):
    r"""Adds a node-level random split via :obj:`train_mask`, :obj:`val_mask`
    and :obj:`test_mask` attributes to the :obj:`data` object.

    Args:
        split (string): The type of dataset split (:obj:`"train_rest"`,
            :obj:`"test_rest"`, :obj:`"random"`).
            If set to :obj:`"train_rest"`, all nodes except those in the
            validation and test sets will be used for training (as in the
            `"FastGCN: Fast Learning with Graph Convolutional Networks via
            Importance Sampling" <https://arxiv.org/abs/1801.10247>`_ paper).
            If set to :obj:`"test_rest"`, all nodes except those in the
            training and validation sets will be used for test (as in the
            `"Pitfalls of Graph Neural Network Evaluation"
            <https://arxiv.org/abs/1811.05868>`_ paper).
            If set to :obj:`"random"`, train, validation, and test sets will be
            randomly generated, according to :obj:`num_train_per_class`,
            :obj:`num_val` and :obj:`num_test` (as in the `"Semi-supervised
            Classification with Graph Convolutional Networks"
            <https://arxiv.org/abs/1609.02907>`_ paper).
        num_splits (int, optional): The number of splits to add. If bigger
            than :obj:`1`, the shape of masks will be
            :obj:`[num_nodes, num_splits]`, and :obj:`[num_nodes]` otherwise.
            (default: :obj:`1`)
        num_train_per_class (int, optional): The number of training samples
            per class in case of :obj:`"test_rest"` and :obj:`"random"` split.
            (default: :obj:`20`)
        num_val (int or float, optional): The number of validation samples.
            If float, it represents the ratio of samples to include in the
            validation set. (default: :obj:`500`)
        num_test (int or float, optional): The number of test samples in case
            of :obj:`"train_rest"` and :obj:`"random"` split. If float, it
            represents the ratio of samples to include in the test set.
            (default: :obj:`1000`)
    """
    def __init__(
        self,
        split: str,
        num_splits: int = 1,
        num_train_per_class: int = 20,
        num_val: Union[int, float] = 500,
        num_test: Union[int, float] = 1000,
    ):
        assert split in ['train_rest', 'test_rest', 'random']
        self.split = split
        self.num_splits = num_splits
        self.num_train_per_class = num_train_per_class
        self.num_val = num_val
        self.num_test = num_test

    def __call__(self, data):
        train_masks, val_masks, test_masks = [], [], []
        for _ in range(self.num_splits):
            train_mask, val_mask, test_mask = self.__sample_split__(data)
            train_masks.append(train_mask)
            val_masks.append(val_mask)
            test_masks.append(test_mask)

        data.train_mask = torch.stack(train_masks, dim=-1).squeeze(-1)
        data.val_mask = torch.stack(val_masks, dim=-1).squeeze(-1)
        data.test_mask = torch.stack(test_masks, dim=-1).squeeze(-1)

        return data

    def __sample_split__(self, data):
        train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)

        if isinstance(self.num_val, float):
            num_val = round(data.num_nodes * self.num_val)
        else:
            num_val = self.num_val

        if isinstance(self.num_test, float):
            num_test = round(data.num_nodes * self.num_test)
        else:
            num_test = self.num_test

        if self.split == 'train_rest':
            perm = torch.randperm(data.num_nodes)
            val_mask[perm[:num_val]] = True
            test_mask[perm[num_val:num_val + num_test]] = True
            train_mask[perm[num_val + num_test:]] = True

        else:
            num_classes = int(data.y.max().item()) + 1
            for c in range(num_classes):
                idx = (data.y == c).nonzero(as_tuple=False).view(-1)
                idx = idx[torch.randperm(idx.size(0))]
                idx = idx[:self.num_train_per_class]
                train_mask[idx] = True

            remaining = (~train_mask).nonzero(as_tuple=False).view(-1)
            remaining = remaining[torch.randperm(remaining.size(0))]

            val_mask[remaining[:num_val]] = True

            if self.split == 'test_rest':
                test_mask[remaining[num_val:]] = True
            elif self.split == 'random':
                test_mask[remaining[num_val:num_val + num_test]] = True

        return train_mask, val_mask, test_mask

    def __repr__(self):
        return '{}(split={})'.format(self.__class__.__name__, self.split)

In [713]:
# transform
transform = transforms.Compose([AddTrainValTestMask('train_rest', num_val=500,
                                                            num_test=500)])

In [714]:
from torch_geometric.datasets import Twitch
# download
os.chdir('C:/Users/rustem.kamilyanov/trainee/reports/4')
data1 = Twitch(root='.', name='EN', transform=transform)

################
features = data1[0].x
edges = data1[0].edge_index
targets = data1[0].y
################

################
train_mask = np.random.choice([0, 1], size=len(features), p=[0.2, 0.8])
test_mask = np.ones_like(train_mask) - train_mask
#
data = Data(x=features, 
            edge_index=edges,
            y=targets,
            train_mask=train_mask,
            test_mask=test_mask)
#################

# принты
users = np.unique([x for y in data.edge_index.t().tolist() for x in y])
print(f'уникальных пользователей: {len(users)}')
print(f'размерность фичи: {features.shape}')
print(f'количество взаимосвязей : {len(edges.t())}')

уникальных пользователей: 7126
размерность фичи: torch.Size([7126, 128])
количество взаимосвязей : 77774


In [717]:
data1[0]

Data(x=[7126, 128], edge_index=[2, 77774], y=[7126], train_mask=[7126], val_mask=[7126], test_mask=[7126])

### Model

In [682]:
import torch
import torch.nn as nn

from torch_geometric.nn import GCNConv
embedding_size = 64

class GCN(nn.Module):
    def __init__(self):
        # Init parent
        super(GCN, self).__init__()
        torch.manual_seed(42)

        # GCN layers
        self.initial_conv = GCNConv(data.num_node_features, embedding_size)
        self.conv1 = GCNConv(embedding_size, embedding_size)
        self.conv2 = GCNConv(embedding_size, embedding_size)
        self.conv3 = GCNConv(embedding_size, embedding_size)
        
        # softmax layer
        self.sigmoid = nn.Sigmoid()
        
        # Output layer
        self.linear = nn.Linear(embedding_size, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        # First Conv layer
        x = self.initial_conv(x, edge_index)
        x = torch.tanh(x)

        # Other Conv layers
        x = self.conv1(x, edge_index)
        x = torch.tanh(x)
        x = self.conv2(x, edge_index)
        x = torch.tanh(x)
        x = self.conv3(x, edge_index)
        x = torch.tanh(x)
          
        # Global Pooling (stack different aggregations)
        # hidden = torch.cat([gmp(hidden, batch_index), 
        #                   gap(hidden, batch_index)], dim=1)

        # Apply a final (linear) classifier and sigmoid
        out = self.linear(x)
        out = self.sigmoid(out)

        return out


In [718]:
import mlflow

# params
model = GCN()
lr = 0.01
loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# model train
model.train()

mlflow.set_tracking_uri('http://localhost:5000')
mlflow.set_experiment('gnn')
with mlflow.start_run():
    
    for epoch in range(10):
        optimizer.zero_grad()
        out_probs = model(data1)
         
        out_labels = torch.tensor([1.0 if x >= thr else 0 for x in out_probs], 
                                  requires_grad=True)
        true_labels = data1.y.to(torch.float32)

        loss = loss_func(out_labels[data.train_mask], true_labels[data.train_mask])
        #loss = loss_func(out_labels, true_labels)
        loss.backward()
        optimizer.step()
        
        mlflow.log_metric(key='train_loss_history', 
                          value=loss.item(), 
                         step=epoch)
    
    mlflow.end_run()

AttributeError: 