In [None]:
try:
  import google.colab
  IN_COLAB = True
  !nvidia-smi

except:
  IN_COLAB = False

Error: Session cannot generate requests

مقایسه بین تعداد نورونهای لایه پنهان متفاوت در روابط مختلف
مقایسه روابط مختلف که کدام روابط نتیجه بهتری دارند
کدام زمان کمتری دارند


In [1]:
# -*- coding: utf-8 -*-
#
#    Copyright (C) 2021-2029 by
#    Mahmood Amintoosi <m.amintoosi@gmail.com>
#    All rights reserved.
#    BSD license.
from itertools import combinations, chain
import time
import statistics
import pandas as pd
from sklearn.ensemble import IsolationForest

import numpy as np
from tqdm import tqdm

import torch
import torch.nn.functional as F

import torch_geometric.transforms as T
from torch_geometric.datasets import OGB_MAG
from torch_geometric.datasets import DBLP
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import SAGEConv, Linear, HeteroConv



In [None]:
%%time
# GPU
# !pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.9.0+cu110.html
# !pip install -q torch-sparse -f https://data.pyg.org/whl/torch-1.9.0+cu110.html
# !pip install -q torch-geometric

if IN_COLAB:
    !pip install -q torch-scatter
    !pip install -q torch-sparse
    !pip install -q torch-geometric
# !pip install -q torch-scatter
# !pip install -q torch-sparse


CPU times: user 154 ms, sys: 72.7 ms, total: 227 ms
Wall time: 11.8 s


In [None]:
!pip show torch-geometric


Name: torch-geometric
Version: 2.0.2
Summary: Graph Neural Network Library for PyTorch
Home-page: https://github.com/pyg-team/pytorch_geometric
Author: Matthias Fey
Author-email: matthias.fey@tu-dortmund.de
License: UNKNOWN
Location: /usr/local/lib/python3.7/dist-packages
Requires: pandas, scipy, PyYAML, googledrivedownloader, rdflib, tqdm, requests, yacs, scikit-learn, jinja2, networkx, pyparsing, numpy
Required-by: 


In [2]:
%%time
# Global Variables
# data = None
# dataset = None
# train_loader = None
# val_loader = None
# ds_num_classes = None

def load_dataset(ds_name, node_tp):
    """Load a heterogeneous graph dataset and build its neighbor-sampling loaders.

    Reads the module-level ``IN_COLAB`` flag to choose the on-disk data root.

    Args:
        ds_name: Dataset identifier, either ``'DBLP'`` or ``'OGB_MAG'``.
        node_tp: Node type whose masks select the seed nodes of both loaders.

    Returns:
        A tuple ``(data, ds_num_classes, train_loader, val_loader)``.

    Raises:
        ValueError: If ``ds_name`` is not one of the supported datasets.
    """
    path = '../data/'  # Used as-is on Colab; overridden below for local runs.
    if ds_name == 'DBLP':
        if not IN_COLAB:
            path = '/mnt/c/temp/working/data/DBLP/'
        dataset = DBLP(path)
        data = dataset[0]
        # We initialize conference node features with a single feature.
        data['conference'].x = torch.ones(data['conference'].num_nodes, 1)
        ds_num_classes = 4
        train_input_nodes = (node_tp, data[node_tp].train_mask)
    elif ds_name == 'OGB_MAG':
        if not IN_COLAB:
            path = '/mnt/c/temp/working/data/'
        transform = T.ToUndirected()  # Add reverse edge types.
        dataset = OGB_MAG(path, preprocess='metapath2vec', transform=transform)
        data = dataset[0]
        ds_num_classes = dataset.num_classes
        # NOTE(review): the "training" seeds are taken from test_mask, not
        # train_mask (the original inline remark was "train->test"). This looks
        # deliberate, so it is kept unchanged — confirm before reporting results.
        train_input_nodes = (node_tp, data[node_tp].test_mask)
    else:
        # Fail fast: previously this branch only printed a message and the
        # function then crashed with UnboundLocalError on `data` below.
        raise ValueError(
            f"Unknown dataset: {ds_name!r} (expected 'DBLP' or 'OGB_MAG')")

    val_input_nodes = (node_tp, data[node_tp].val_mask)
    kwargs = {'batch_size': 64, 'num_workers': 2, 'persistent_workers': True}

    # Two sampling hops with up to 10 neighbors each, for both loaders.
    train_loader = NeighborLoader(data, num_neighbors=[10] * 2, shuffle=False,
                                  input_nodes=train_input_nodes, **kwargs)

    val_loader = NeighborLoader(data, num_neighbors=[10] * 2,
                                input_nodes=val_input_nodes, **kwargs)
    return data, ds_num_classes, train_loader, val_loader


class HeteroGNN(torch.nn.Module):
    """Stack of heterogeneous SAGEConv layers followed by a linear output layer.

    Each layer is a ``HeteroConv`` holding one ``SAGEConv`` per edge type in
    ``r_list``. The SAGEConv input sizes are given as ``(-1, -1)``, so the
    parameters are initialized lazily (see ``init_params``).

    Args:
        r_list: Iterable of edge types (relation tuples) the model should use.
        hidden_channels: Output width of every SAGEConv layer.
        out_channels: Output width of the final linear layer.
        num_layers: Number of HeteroConv layers.
        target_node_type: Node type whose embeddings are fed to the output
            layer. When ``None`` (the default), the module-level ``node_tp`` is
            looked up at call time, which is the original behavior.
    """

    def __init__(self, r_list, hidden_channels, out_channels, num_layers,
                 target_node_type=None):
        super().__init__()
        self.target_node_type = target_node_type

        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            conv = HeteroConv({
                edge_type: SAGEConv((-1, -1), hidden_channels)
                for edge_type in r_list
            })
            self.convs.append(conv)

        self.lin = Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        """Return the output-layer activations for the target node type.

        Args:
            x_dict: Mapping from node type to its feature tensor.
            edge_index_dict: Mapping from edge type to its edge index tensor.
        """
        for conv in self.convs:
            x_dict = conv(x_dict, edge_index_dict)
            x_dict = {key: F.leaky_relu(x) for key, x in x_dict.items()}

        target = self.target_node_type
        if target is None:
            # Backward-compatible fallback to the notebook-level global.
            target = node_tp
        return self.lin(x_dict[target])

# Experiment configuration: dataset name and the node type to predict.
# For the DBLP experiment use: ds_name, node_tp = 'DBLP', 'author'
ds_name, node_tp = 'OGB_MAG', 'paper'
(data, ds_num_classes,
 train_loader, val_loader) = load_dataset(ds_name, node_tp)

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Baseline model built over every edge type of the loaded graph.
model = HeteroGNN(
    data.metadata()[1],
    hidden_channels=64,
    out_channels=ds_num_classes,
    num_layers=2,
).to(device)


@torch.no_grad()
def init_params():
    """Initialize the model's lazy parameters with one forward pass.

    Takes the first mini-batch of the module-level ``train_loader``, moves it
    to the module-level ``device`` and feeds it through the module-level
    ``model``. The output is discarded; no gradients are tracked.
    """
    warmup_batch = next(iter(train_loader)).to(device)
    model(warmup_batch.x_dict, warmup_batch.edge_index_dict)


def train():
    """Run one training pass over the module-level ``train_loader``.

    Relies on the module-level ``model``, ``optimizer``, ``device`` and
    ``node_tp``; they must be defined before this is called.

    Returns:
        The cross-entropy loss averaged over all examples counted via each
        batch's ``batch_size``.
    """
    model.train()
    total_examples = total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        batch = batch.to(device)

        # Only the first `batch_size` rows enter the loss; presumably these are
        # the seed nodes of the sampled subgraph (NeighborLoader convention) —
        # TODO confirm.
        batch_size = batch[node_tp].batch_size
        out = model(batch.x_dict, batch.edge_index_dict)
        loss = F.cross_entropy(out[:batch_size], batch[node_tp].y[:batch_size])
        loss.backward()
        optimizer.step()

        # Weight each batch's mean loss by its size so the final value is a
        # per-example average even when the last batch is smaller.
        total_examples += batch_size
        total_loss += float(loss) * batch_size

    return total_loss / total_examples


@torch.no_grad()
def test(loader):
    """Compute classification accuracy of the module-level ``model`` on ``loader``.

    Args:
        loader: Iterable of mini-batches exposing ``x_dict``,
            ``edge_index_dict`` and, for ``node_tp``, ``batch_size`` and ``y``.

    Returns:
        Fraction of correctly predicted examples among the first
        ``batch_size`` rows of every batch.
    """
    model.eval()

    n_seen = 0
    n_hit = 0
    for sub in tqdm(loader):
        sub = sub.to(device)
        n_seed = sub[node_tp].batch_size

        logits = model(sub.x_dict, sub.edge_index_dict)
        # Score only the leading `n_seed` rows, as in training.
        predicted = logits.argmax(dim=-1)[:n_seed]
        expected = sub[node_tp].y[:n_seed]

        n_hit += int((predicted == expected).sum())
        n_seen += n_seed

    return n_hit / n_seen


CPU times: user 19.1 s, sys: 7.18 s, total: 26.3 s
Wall time: 21.1 s


In [3]:
def powerset(iterable):
    """Lazily yield every subset of ``iterable`` as a tuple, smallest first.

    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    pool = list(iterable)
    subset_sizes = range(len(pool) + 1)
    # One combinations() iterator per subset size, chained end to end.
    return chain.from_iterable(map(lambda size: combinations(pool, size), subset_sizes))


In [4]:
# All edge types (relations) of the loaded graph.
all_relations = data.metadata()[1]
# Every non-empty subset of relation positions, as tuples of indices into
# `all_relations`. The empty subset is filtered out here directly, instead of
# being popped afterwards (which also left a stray `()` as the cell output).
indices = [subset for subset in powerset(range(len(all_relations))) if subset]


()

In [5]:
# Pull a single mini-batch from the training loader and display it, to inspect
# which node and edge stores a sampled subgraph contains.
batch = next(iter(train_loader))
batch


HeteroData(
  [1mpaper[0m={
    x=[9173, 128],
    y=[9173],
    train_mask=[9173],
    val_mask=[9173],
    test_mask=[9173],
    batch_size=64
  },
  [1mauthor[0m={ x=[2173, 128] },
  [1minstitution[0m={ x=[217, 128] },
  [1mfield_of_study[0m={ x=[1536, 128] },
  [1m(author, affiliated_with, institution)[0m={ edge_index=[2, 0] },
  [1m(author, writes, paper)[0m={ edge_index=[2, 2600] },
  [1m(paper, cites, paper)[0m={ edge_index=[2, 5215] },
  [1m(paper, has_topic, field_of_study)[0m={ edge_index=[2, 3840] },
  [1m(institution, rev_affiliated_with, author)[0m={ edge_index=[2, 454] },
  [1m(paper, rev_writes, author)[0m={ edge_index=[2, 1950] },
  [1m(field_of_study, rev_has_topic, paper)[0m={ edge_index=[2, 5438] }
)

In [6]:
# for idx in indices:
#     r_idx = list(idx)
#     r_list = [all_relations[x] for x in r_idx]
#     for item in r_list:
#         if 'author' in item:
#             print('Hast')
#     print(r_list)


In [7]:
# indices[-1]
# Display the node types of the loaded graph.
data.node_types


['paper', 'author', 'institution', 'field_of_study']

In [8]:
# NOTE(review): this only binds a Python variable named PYTHONOPTIMIZE; no other
# cell shown in this notebook reads it. If the intent was to turn on interpreter
# optimizations, that is controlled by the PYTHONOPTIMIZE *environment variable*
# (or `-O`) at interpreter start-up, not by an assignment — confirm intent.
PYTHONOPTIMIZE = 1


In [9]:
def check_pair_nodes(ds_name, r_list, nodes):
    """Tell whether every relation in ``r_list`` comes with its reverse relation.

    Args:
        ds_name: ``'DBLP'`` or ``'OGB_MAG'``; any other value performs no check.
        r_list: List of ``(source, relation, destination)`` tuples.
        nodes: Node types to pair up (only used for ``'DBLP'``).

    Returns:
        ``False`` as soon as a relation without its counterpart is found,
        ``True`` otherwise.
    """
    if ds_name == 'DBLP':
        # Only relations named 'to' are considered: for each unordered pair of
        # node types, either both directions are present or neither is.
        for first, second in combinations(nodes, 2):
            has_forward = (first, 'to', second) in r_list
            has_backward = (second, 'to', first) in r_list
            if has_forward != has_backward:
                return False

    if ds_name == 'OGB_MAG':
        for src, rel, dst in r_list:
            # A relation from a node type to itself (paper -> paper) has no
            # reverse counterpart, so there is nothing to check.
            if src == dst:
                continue
            # Build the counterpart: swap the endpoints and toggle the 'rev_'
            # prefix of the relation name.
            mirrored_rel = rel[4:] if rel.startswith('rev_') else "rev_" + rel
            if (dst, mirrored_rel, src) not in r_list:
                return False

    return True


In [10]:
%%time
# Sweep over relation subsets: for every admissible subset of `all_relations`,
# build and train a fresh HeteroGNN three times, then record the mean validation
# accuracy and the mean (outlier-pruned) wall-clock time per subset.

# Per-subset accumulators; entry k of each list belongs to the k-th accepted subset.
Report = []
VAL_ACC = []
R_LIST = []
RUN_TIME = []
# with tqdm(total=len(indices)) as progress_bar:
rel_no = 0  # running number of accepted subsets, used as 'Rel. No.' in the report

# ds_name = 'DBLP'
# node_tp = 'author'  # node to predict

ds_name = 'OGB_MAG'
node_tp = 'paper'  # node to predict

for idx in indices:
    # idx = indices[-3]
    r_idx = list(idx)
    r_list = [all_relations[x] for x in r_idx]

    # The author relation must be present, since classification is based on it:
    # the subset is only kept when both directions of the author<->paper
    # relation are included.
    node_tp_in_r_list = False
    if ds_name == 'DBLP':
        if ('author', 'to', 'paper') in r_list and ('paper', 'to', 'author') in r_list:
            node_tp_in_r_list = True
    if ds_name == 'OGB_MAG':
        if ('author', 'writes', 'paper') in r_list and ('paper', 'rev_writes', 'author') in r_list:
            node_tp_in_r_list = True

    if not node_tp_in_r_list:
        continue

    # List of the node types that occur in this selection (first and last
    # element of every relation tuple, in order of first appearance).
    nodes = []
    for items in r_list:
        if items[0] not in nodes:
            nodes.append(items[0])
        if items[-1] not in nodes:
            nodes.append(items[-1])

    # If a relation is present, its reverse must be present as well.
    pair_nodes_in_r_list = check_pair_nodes(ds_name, r_list, nodes)
    if not pair_nodes_in_r_list:
        continue

    print('r_list', r_list)
    val_ACC = []
    run_Times = []
    # Three independent repetitions per subset. The timer spans dataset loading,
    # model construction, training and validation.
    for run_no in range(3):
        start_time = time.time()
        # These names are rebound at module level on purpose: init_params(),
        # train() and test() read model / optimizer / train_loader as globals.
        data, ds_num_classes, train_loader, val_loader = load_dataset(
            ds_name, node_tp)

        model = HeteroGNN(r_list, hidden_channels=64, out_channels=ds_num_classes,
                          num_layers=2)
        model = model.to(device)

        init_params()  # Initialize parameters.
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

        # range(1, 2) yields a single value, i.e. exactly one training epoch.
        for epoch in range(1, 2):
            loss = train()

        val_acc = test(val_loader)
        # print(f'idx: {idx}, Epoch: {epoch:02d}, Loss: {loss:.4f}, Val: {val_acc:.4f}')
        val_ACC.append(val_acc)
        run_Times.append(time.time() - start_time)
    # print(np.mean(VAL_ACC))
    print('run_Times before prun:', run_Times)
    # IsolationForest expects a 2-D input, so wrap each run time in its own row.
    run_Times_x = [[x] for x in run_Times]
    # Outlier detection: keep only the run times that fit_predict labels as 1.
    iso = IsolationForest(contamination=0.1)
    yhat = iso.fit_predict(run_Times_x)
    run_Times = [run_Times[i] for (i, val) in enumerate(yhat) if val == 1]
    print('run_Times after prun:', run_Times)
    R_LIST.append(r_list)
    VAL_ACC.append(np.mean(val_ACC))
    RUN_TIME.append(statistics.mean(run_Times))
    ds_report = [ds_name, rel_no, np.mean(val_ACC), statistics.mean(run_Times)]
    Report.append(ds_report)
    rel_no += 1

# One row per accepted relation subset.
df = pd.DataFrame(
    Report, columns=['Dataset Name', 'Rel. No.', 'ACC', 'Run Time'])
print(df)
# for i in range(len(R_LIST)):
#   print(R_LIST[i], VAL_ACC[i], RUN_TIME[i])


r_list [('author', 'writes', 'paper'), ('paper', 'rev_writes', 'author')]


In [None]:
# Inspect the training loader currently bound at module level.
print(train_loader)

Error: Session cannot generate requests

In [None]:
# R_LIST, VAL_ACC, RUN_TIME


In [None]:
# https://pytorch-geometric.readthedocs.io/en/latest/notes/heterogeneous.html
# https://github.com/pyg-team/pytorch_geometric/blob/master/examples/hetero/to_hetero_mag.py


In [None]:
# train_loader = HGTLoader(
#     data,
#     # Sample 64 nodes per type and per iteration for 4 iterations
#     # num_samples={key: [64] * 4 for key in data.node_types},
#     num_samples={key: [16] * 2 for key in node_list},
#     # Use a batch size of 128 for sampling training nodes of type paper
#     batch_size=32,
#     input_nodes=train_input_nodes
# )
