In [None]:
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.


# Custom Link Prediction Model

In [None]:
import networkx as nx
import random
# Fix Python's RNG so the shuffle below yields the same train/test split on
# every run of the notebook.
random.seed(0)

# Synthetic graph: `num_clusters` clusters of `num_nodes_in_cluster` nodes each.
num_clusters = 30
num_nodes_in_cluster = 12
g = nx.connected_caveman_graph(num_clusters, num_nodes_in_cluster)

# Shuffle the edges and hold out the first `test_ratio` fraction for testing;
# everything after the cutoff is used for training.
test_ratio = 0.2  # Ratio of edges in a test dataset
edge_list = list(g.edges())
random.shuffle(edge_list)
test_cutoff = int(test_ratio * len(edge_list))
test_dataset, train_dataset = (
    set(edge_list[:test_cutoff]),
    set(edge_list[test_cutoff:]),
)

In [None]:
import os
# Directory that receives data.json, meta.json and the converter output.
working_dir = "/tmp/caveman"
# exist_ok=True makes re-running this cell a no-op when the directory is
# already there, and makedirs also creates missing parent directories, which
# os.mkdir would fail on.
os.makedirs(working_dir, exist_ok=True)

import json
# Serialize the graph as one JSON object per node (one object per line).
# Each node carries a single float feature and its outgoing edges; edge_type 0
# marks training edges and edge_type 1 marks held-out (test) edges.
nodes = []
for node_id in g:
    # Cluster index of the node; scaled by the cluster count it becomes the
    # node's only float feature.
    cluster_id = float(node_id // num_nodes_in_cluster)
    train_list = []
    test_list = []
    for neighbor_id in nx.neighbors(g, node_id):
        # The split sets store every undirected edge once, in the orientation
        # networkx reported it, while this loop visits each edge from both
        # endpoints. Look the pair up in both orientations; checking only
        # (node_id, neighbor_id) would file the reverse direction of every
        # training edge under the test edge type.
        if (node_id, neighbor_id) in train_dataset or (neighbor_id, node_id) in train_dataset:
            train_list.append(neighbor_id)
        else:
            test_list.append(neighbor_id)
    node = {
        "node_weight": 1,
        "node_id": node_id,
        "node_type": 0,
        "uint64_feature": None,
        "float_feature": {
            "0": [cluster_id / num_clusters],
        },
        "binary_feature": None,
        # Training edges first, then test edges, as separate edge types.
        "edge": [
            {
                "src_id": node_id,
                "dst_id": neighbor_id,
                "edge_type": edge_type,
                "weight": 1.0,
            }
            for edge_type, neighbor_ids in ((0, train_list), (1, test_list))
            for neighbor_id in neighbor_ids
        ],
    }
    nodes.append(node)

# Join once at the end instead of growing a string inside the loop.
data = "".join(json.dumps(node) + "\n" for node in nodes)

data_filename = working_dir + "/data.json"
# The file is only written, never read back through this handle, so plain "w"
# is sufficient.
with open(data_filename, "w") as f:
    f.write(data)

# Graph schema handed to the converter together with data.json: how many node
# and edge types exist and how many features of each kind they carry.
# Serializing a dict guarantees well-formed JSON; the previous hand-written
# string relied on backslash line continuations and embedded the source
# indentation into the file.
meta = json.dumps(
    {
        "node_float_feature_num": 1,
        "edge_binary_feature_num": 0,
        "edge_type_num": 2,
        "edge_float_feature_num": 0,
        "node_type_num": 2,
        "node_uint64_feature_num": 0,
        "node_binary_feature_num": 0,
        "edge_uint64_feature_num": 0,
    }
)
meta_filename = working_dir + "/meta.json"
# Write-only access, so "w" is enough.
with open(meta_filename, "w") as f:
    f.write(meta)

In [None]:
import deepgnn.graph_engine.snark.convert as convert
from deepgnn.graph_engine.snark.decoders import DecoderType
# Run the snark converter over data.json / meta.json, decoding them as JSON
# and writing the result into `working_dir` as a single partition.
partitions = 1
convert.MultiWorkersConverter(
    decoder_type=DecoderType.JSON,
    graph_path=data_filename,
    meta_path=meta_filename,
    output_dir=working_dir,
    partition_count=partitions,
).convert()

In [None]:
from dataclasses import dataclass
import argparse
import numpy as np
import torch
from deepgnn.pytorch.modeling.base_model import BaseModel
from deepgnn.graph_engine import FeatureType, SamplingStrategy, GEEdgeSampler, GraphEngineBackend
from deepgnn.pytorch.common.utils import set_seed
from deepgnn.pytorch.common.dataset import TorchDeepGNNDataset
from deepgnn.pytorch.modeling import BaseModel
from deepgnn.pytorch.training import run_dist
from deepgnn.pytorch.common.metrics import F1Score

In [None]:
@dataclass
class LinkPredictionQueryParameter:
    """Settings shared by LinkPredictionQuery and LinkPrediction.

    The feature and label entries are (index, dimension) pairs that
    LinkPredictionQuery packs into the arrays it passes to the graph engine.
    """

    # Stored on the parameter object; the query code visible in this notebook
    # takes its edge types from the `edge_types` call argument instead.
    neighbor_edge_types: np.array
    # Index of the node feature to fetch.
    feature_idx: int
    # Width of that feature; also used to reshape neighbor features and to size
    # the model's encoder matrix.
    feature_dim: int
    # Index and width of the label feature (packed into `label_meta`; not read
    # by the visible model code).
    label_idx: int
    label_dim: int
    # Feature type passed to `node_features` and to the BaseModel constructor.
    feature_type: FeatureType = FeatureType.FLOAT
    # Label type; not read by the code visible in this notebook.
    label_type: FeatureType = FeatureType.FLOAT


class LinkPredictionQuery:
    """Graph queries that turn a batch of edges into link-prediction inputs.

    For every edge endpoint the query fetches the node's own features and the
    mean of its sampled neighbors' features. `query_training` does this for the
    given (positive) edges and for an equally sized set of negative pairs made
    of each source node and a randomly sampled node.
    """

    def __init__(self, p: LinkPredictionQueryParameter):
        """Store the parameters and pre-build the [index, dimension] arrays."""
        self.p = p
        self.label_meta = np.array([[p.label_idx, p.label_dim]], np.int32)
        self.feat_meta = np.array([[p.feature_idx, p.feature_dim]], np.int32)

    def _query(self, g, nodes, edge_types):
        """Fetch node features and mean neighbor features for `nodes`.

        Args:
            g: graph engine client providing `sample_neighbors` and
                `node_features`.
            nodes: one-dimensional node ids, as a torch tensor, numpy array or
                any sequence numpy can convert.
            edge_types: edge types to follow when sampling neighbors.

        Returns:
            Tuple `(node_features, nbs_agg)`: the features of `nodes`, and the
            features of their sampled neighbors averaged over the neighbor
            axis.
        """
        # Convert explicitly instead of wrapping `.detach().numpy()` in a
        # catch-all try/except, which hid genuine conversion errors (for
        # example tensors living on another device).
        if isinstance(nodes, torch.Tensor):
            nodes = nodes.detach().cpu().numpy()
        node_ids = np.asarray(nodes).astype(dtype=np.int64)

        # Sample neighbors for every input node; the first element of the
        # result holds the neighbor ids.
        nbs = g.sample_neighbors(nodes=node_ids, edge_types=edge_types)[0]

        # Extract features for all neighbors in one flat request.
        nbs_features = g.node_features(
            nodes=nbs.reshape(-1),
            features=self.feat_meta,
            feature_type=self.p.feature_type,
        )

        # Reshape the feature tensor to [nodes, neighbors, features]
        # and aggregate along the neighbors dimension.
        nbs_agg = nbs_features.reshape(list(nbs.shape) + [self.p.feature_dim]).mean(1)
        node_features = g.node_features(
            nodes=node_ids,
            features=self.feat_meta,
            feature_type=self.p.feature_type,
        )
        return node_features, nbs_agg

    def query_training(self, ge, edges, edge_types = np.array([0], dtype=np.int32)):
        """Build the model inputs for one batch of training edges.

        Args:
            ge: graph engine client.
            edges: two-dimensional array or tensor of edges; only the first two
                columns (source id, destination id) are used.
            edge_types: edge types followed when sampling neighbors.

        Returns:
            A list of ten entries: `[edges, src, src_nbs, dst, dst_nbs]` for
            the positive edges followed by the same five entries for the
            negative pairs.
        """
        # Convert straight to int64. Going through `torch.Tensor(...)` first
        # would pass the ids through float32 and round large node ids.
        edges = torch.as_tensor(edges[:, :2], dtype=torch.int64)
        src, src_nbs = self._query(ge, edges[:, 0], edge_types)
        dst, dst_nbs = self._query(ge, edges[:, 1], edge_types)
        context = [edges, src, src_nbs, dst, dst_nbs]

        # Prepare negative examples: edges between source nodes and random nodes.
        # The pairs are stacked as [num_edges, 2] so that column 0 holds the
        # sources and column 1 the random nodes. Laying both out as a single
        # row made `[:, 0]` / `[:, 1]` pick just the first two source nodes and
        # never used the random ones.
        num_edges = len(edges)
        source_nodes = edges[:, 0]
        random_nodes = torch.as_tensor(
            ge.sample_nodes(num_edges, node_types=0, strategy=SamplingStrategy.Weighted),
            dtype=torch.int64,
        ).reshape(num_edges)
        neg_inputs = torch.stack((source_nodes, random_nodes), dim=1)
        src, src_nbs = self._query(ge, neg_inputs[:, 0], edge_types)
        dst, dst_nbs = self._query(ge, neg_inputs[:, 1], edge_types)
        # Carry the negative pairs themselves (same shape and dtype as `edges`)
        # rather than repeating the positive edges.
        context += [neg_inputs, src, src_nbs, dst, dst_nbs]

        return context


In [None]:
class LinkPrediction(BaseModel):
    """Scores node pairs from the absolute difference of their features.

    A pair is described by |dst - src| for the nodes' own features
    concatenated with |dst_nbs - src_nbs| for their aggregated neighbor
    features. That vector is projected by `encode`, reduced to one value by
    `weight`, and squashed with a sigmoid.
    """

    def __init__(self, q_param):
        """Create the query object and the two trainable matrices.

        Args:
            q_param: LinkPredictionQueryParameter with the feature settings.
        """
        self.q = LinkPredictionQuery(q_param)
        super().__init__(
            feature_type=q_param.feature_type,
            feature_idx=q_param.feature_idx,
            feature_dim=q_param.feature_dim,
            feature_enc=None
        )
        self.feat_dim = q_param.feature_dim
        self.embed_dim = 16
        # The input is twice the feature width: node difference plus neighbor
        # difference. Storage is allocated with torch.empty and filled by the
        # Xavier initializers below (same order as before, so seeded runs draw
        # the same values).
        self.encode = torch.nn.Parameter(
            torch.empty(self.embed_dim, 2 * self.feat_dim, dtype=torch.float32)
        )
        self.weight = torch.nn.Parameter(
            torch.empty(1, self.embed_dim, dtype=torch.float32)
        )
        torch.nn.init.xavier_uniform_(self.weight)
        torch.nn.init.xavier_uniform_(self.encode)

        self.metric = F1Score()

    def get_score(self, context: torch.Tensor, edge_types: np.array):
        """Return link probabilities for one group of five context entries.

        Args:
            context: `[edges, src, src_nbs, dst, dst_nbs]`; `edges` is not used
                for scoring.
            edge_types: accepted for interface compatibility; not used here.

        Returns:
            Tensor of shape [1, number of pairs] with values in (0, 1).
        """
        _, src, src_nbs, dst, dst_nbs = context
        # Inputs are treated as constants (detached), as before, but stay
        # torch tensors: the former numpy round trip failed for tensors that
        # are not on the CPU and rebuilt the data on the CPU regardless of
        # where the parameters live.
        src, src_nbs, dst, dst_nbs = (
            torch.as_tensor(v).detach() for v in (src, src_nbs, dst, dst_nbs)
        )

        diff = torch.abs(dst - src)
        diff_nbs = torch.abs(dst_nbs - src_nbs)
        final = torch.cat((diff, diff_nbs), dim=1).to(
            device=self.encode.device, dtype=self.encode.dtype
        )

        embed = self.encode.mm(final.t())
        score = self.weight.mm(embed)
        return torch.sigmoid(score)

    def forward(self, context: torch.Tensor, edge_types: np.array = np.array([0], dtype=np.int32)):
        """Compute loss, predictions and labels for one batch.

        Args:
            context: the ten entries produced by
                `LinkPredictionQuery.query_training`, each with a leading
                dimension that is squeezed away here.
            edge_types: forwarded to `get_score`.

        Returns:
            Tuple `(loss, pred, label)`: the sum of the mean binary
            cross-entropies of the positive and negative pairs, the boolean
            predictions (score >= 0.5) and the boolean ground truth, positives
            first.
        """
        context = [v.squeeze(0) for v in context]
        bce = torch.nn.functional.binary_cross_entropy

        # First five entries describe real edges, the last five negative pairs.
        pos_label = self.get_score(context[:5], edge_types)
        true_xent = bce(
            input=pos_label, target=torch.ones_like(pos_label), reduction="mean"
        )

        neg_label = self.get_score(context[5:], edge_types)
        negative_xent = bce(
            input=neg_label, target=torch.zeros_like(neg_label), reduction="mean"
        )

        # Both terms are already scalars, so no further reduction is needed.
        loss = true_xent + negative_xent

        pred = torch.cat((pos_label.reshape(-1), neg_label.reshape(-1))) >= .5
        label = torch.cat(
            (
                torch.ones_like(pos_label, dtype=bool).reshape(-1),
                torch.zeros_like(neg_label, dtype=bool).reshape(-1),
            )
        )
        return loss, pred, label


In [None]:
def create_model(args: argparse.Namespace):
    """Build the link prediction model.

    Args:
        args: parsed command line arguments; only `args.seed` is read here.
            The feature and label settings are fixed in this function rather
            than taken from `args`.

    Returns:
        A LinkPrediction instance.
    """
    # Compare against None so that an explicit seed of 0 is honoured; a plain
    # truthiness check silently skipped it.
    if args.seed is not None:
        set_seed(args.seed)

    p = LinkPredictionQueryParameter(
            neighbor_edge_types=np.array([0], np.int32),
            feature_idx=0,
            feature_dim=2,
            label_idx=1,
            label_dim=1,
        )

    return LinkPrediction(p)

def create_optimizer(args: argparse.Namespace, model: BaseModel, world_size: int):
    """Return an Adam optimizer over the model's trainable parameters.

    `args` and `world_size` are part of the expected signature but are not
    used: the learning rate is fixed at 0.0001 here and the `--learning_rate`
    command line value is not consulted.
    """
    trainable_params = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(trainable_params, lr=0.0001)
    return optimizer



In [None]:
def create_dataset(
    args: argparse.Namespace,
    model: BaseModel,
    rank: int = 0,
    world_size: int = 1,
    backend: GraphEngineBackend = None,
):
    """Create the edge dataset that feeds the model.

    Edges of type 0 are sampled with GEEdgeSampler and each batch is passed
    through the model's `query_training` function.

    Args:
        args: parsed command line arguments; `sample_file` and `batch_size`
            are read.
        model: model whose `q.query_training` builds the inputs.
        rank: index of this worker.
        world_size: total number of workers.
        backend: graph engine backend handed to the dataset.
    """
    dataset = TorchDeepGNNDataset(
        # What is sampled and how samples become model inputs.
        sampler_class=GEEdgeSampler,
        edge_types=np.array([0]),
        query_fn=model.q.query_training,
        backend=backend,
        # Sampler settings.
        sample_files=args.sample_file,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True,
        # Prefetching.
        prefetch_queue_size=2,
        prefetch_worker_size=2,
        # Position of this worker among all workers.
        worker_index=rank,
        num_workers=world_size,
    )
    return dataset


In [None]:
def init_args(parser):
    """Hook for registering model-specific command line arguments.

    This model defines none, so `parser` is left untouched.
    """
    # Examples of arguments a model could register here:
    #   parser.add_argument("--hidden_dim", type=int, default=8, help="hidden layer dimension.")
    #   parser.add_argument("--head_num", type=str2list_int, default="8,1", help="the number of attention headers.")
    return None


In [None]:
# Not needed for .py file runs
# Remember the original `init_args` exactly once. Later cells rebind
# `init_args` to a wrapper, so on a re-run the name must keep pointing at the
# unwrapped function.
if "init_args_base" not in globals():
    init_args_base = init_args

In [None]:
# Not needed for .py file runs
# Output directory for this run; the random suffix keeps runs apart.
MODEL_DIR = f"tmp/gat_{np.random.randint(9999999)}"

# Command line used in place of sys.argv for the training run.
arg_list = (
    # Graph location, run mode and engine selection.
    [
        "--data_dir", "/tmp/caveman",
        "--mode", "train",
        "--trainer", "base",
        "--backend", "snark",
        "--graph_type", "local",
        "--converter", "skip",
    ]
    # Node, feature and label layout.
    + [
        "--node_type", "0",
        "--feature_idx", "0",
        "--feature_dim", "2",
        "--label_idx", "1",
        "--label_dim", "1",
    ]
    # Training loop settings.
    + [
        "--batch_size", "64",
        "--learning_rate", ".001",
        "--num_epochs", "100",
        "--log_by_steps", "16",
        "--use_per_step_metrics",
    ]
    # Model, metric and checkpoint locations all share MODEL_DIR.
    + [
        "--model_dir", MODEL_DIR,
        "--metric_dir", MODEL_DIR,
        "--save_path", MODEL_DIR,
    ]
)

def init_args_wrap(init_args_base):
    """Wrap an argument hook so the parser reads `arg_list` instead of sys.argv.

    The returned hook first calls `init_args_base(parser)` and then replaces
    `parser.parse_args` with a zero-argument callable that parses the
    module-level `arg_list`.
    """
    def init_args_new(parser):
        init_args_base(parser)
        original_parse_args = parser.parse_args

        def parse_notebook_args():
            # `arg_list` is resolved when parsing happens, so the list that is
            # current at that moment is the one that gets parsed.
            return original_parse_args(arg_list)

        parser.parse_args = parse_notebook_args
    return init_args_new

# Wrap the saved original hook (`init_args_base`), not the current `init_args`,
# so re-running this cell does not wrap an already wrapped hook.
init_args = init_args_wrap(init_args_base)

In [None]:
# Run with the hooks defined above; `init_args` makes the parser read the
# current `arg_list` (set to "--mode train" in the preceding cell).
run_dist(
    init_args_fn=init_args,
    init_model_fn=create_model,
    init_dataset_fn=create_dataset,
    init_optimizer_fn=create_optimizer,
)

In [None]:
# Not needed for .py file runs
# Command line used in place of sys.argv for the evaluation run. It reuses
# MODEL_DIR so the directories match the ones given to the training run.
arg_list = (
    # Graph location, run mode and engine selection.
    [
        "--data_dir", "/tmp/caveman",
        "--mode", "evaluate",
        "--trainer", "base",
        "--backend", "snark",
        "--graph_type", "local",
        "--converter", "skip",
    ]
    # Node, feature and label layout.
    + [
        "--node_type", "0",
        "--feature_idx", "0",
        "--feature_dim", "2",
        "--label_idx", "1",
        "--label_dim", "1",
    ]
    # Loop settings.
    + [
        "--batch_size", "64",
        "--learning_rate", ".0",
        "--num_epochs", "100",
        "--log_by_steps", "16",
        "--use_per_step_metrics",
    ]
    # Model, metric and checkpoint locations all share MODEL_DIR.
    + [
        "--model_dir", MODEL_DIR,
        "--metric_dir", MODEL_DIR,
        "--save_path", MODEL_DIR,
    ]
)

def init_args_wrap(init_args_base):
    """Wrap an argument hook so the parser reads `arg_list` instead of sys.argv.

    The returned hook first calls `init_args_base(parser)` and then replaces
    `parser.parse_args` with a zero-argument callable that parses the
    module-level `arg_list`.
    """
    def init_args_new(parser):
        init_args_base(parser)
        original_parse_args = parser.parse_args

        def parse_notebook_args():
            # `arg_list` is resolved when parsing happens, so the list that is
            # current at that moment is the one that gets parsed.
            return original_parse_args(arg_list)

        parser.parse_args = parse_notebook_args
    return init_args_new

# Wrap the saved original hook (`init_args_base`), not the current `init_args`,
# so re-running this cell does not wrap an already wrapped hook.
init_args = init_args_wrap(init_args_base)

In [None]:
# Run again with the same hooks; `init_args` makes the parser read the current
# `arg_list` (set to "--mode evaluate" in the preceding cell).
run_dist(
    init_args_fn=init_args,
    init_model_fn=create_model,
    init_dataset_fn=create_dataset,
    init_optimizer_fn=create_optimizer,
)