From https://docs.dgl.ai/en/0.6.x/tutorials/blitz/4_link_predict.html

In [18]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from dgl.data import DGLDataset
import numpy as np
import scipy.sparse as sp
import networkx as nx
import itertools

In [2]:
edges = pd.read_csv('../data/twitch_gamers/large_twitch_edges.csv', header=0)
edges.columns = ['source', 'target']
features = pd.read_csv('../data/twitch_gamers/large_twitch_features.csv', header=0)

In [3]:
G = nx.from_pandas_edgelist(edges, source='source', target='target', create_using=nx.Graph())
degree_dict = dict(G.degree)
selected_nodes = []
for i in degree_dict:
    if degree_dict[i] >= 500:
        selected_nodes.append(i)
        
edges_filtered = edges[edges["source"].isin(selected_nodes)]
edges_filtered = edges[edges["target"].isin(selected_nodes)]

In [4]:
features['created_at'] = pd.to_datetime(features['created_at'])
features['updated_at'] = pd.to_datetime(features['updated_at'])

features['delta_days'] = (features['updated_at'] - features['created_at']).dt.total_seconds()/(60*60*24)

features['language'] = pd.factorize(features['language'])[0]

In [5]:
nodes_filtered = set(edges_filtered['source'].tolist() + edges_filtered['target'].tolist())
features_filtered = features.loc[nodes_filtered]

In [6]:
Sdict = dict(zip(nodes_filtered, range(len(nodes_filtered))))
edges_filtered['source'] = edges_filtered['source'].map(Sdict)
edges_filtered['target'] = edges_filtered['target'].map(Sdict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
class TwitchGamersDataset(DGLDataset):
    def __init__(self):
        super().__init__(name='twitch_gamers')

    def process(self):
        nodes_data = features_filtered
        edges_data = edges_filtered

        node_features = torch.from_numpy(nodes_data[['views', 'mature', 'life_time', 'delta_days', 'dead_account', 'language']].to_numpy()).float()
        node_labels = torch.from_numpy(nodes_data['affiliate'].astype('category').cat.codes.copy().to_numpy())
        edges_src = torch.from_numpy(edges_data['source'].to_numpy())
        edges_dst = torch.from_numpy(edges_data['target'].to_numpy())

        self.graph = dgl.graph((edges_src, edges_dst), num_nodes=nodes_data.shape[0])
        self.graph.ndata['feat'] = node_features
        self.graph.ndata['label'] = node_labels

        # If your dataset is a node classification dataset, you will need to assign
        # masks indicating whether a node belongs to training, validation, and test set.
        n_nodes = nodes_data.shape[0]
        n_train = int(n_nodes * 0.6)
        n_val = int(n_nodes * 0.2)
        train_mask = torch.zeros(n_nodes, dtype=torch.bool)
        val_mask = torch.zeros(n_nodes, dtype=torch.bool)
        test_mask = torch.zeros(n_nodes, dtype=torch.bool)
        train_mask[:n_train] = True
        val_mask[n_train:n_train + n_val] = True
        test_mask[n_train + n_val:] = True
        self.graph.ndata['train_mask'] = train_mask
        self.graph.ndata['val_mask'] = val_mask
        self.graph.ndata['test_mask'] = test_mask

    def __getitem__(self, i):
        return self.graph

    def __len__(self):
        return 1

dataset = TwitchGamersDataset()
graph = dataset[0]

print(graph)

Graph(num_nodes=37281, num_edges=567609,
      ndata_schemes={'feat': Scheme(shape=(6,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int8), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})


In [8]:
# Split edge set for training and testing
u, v = graph.edges()

eids = np.arange(graph.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = graph.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())), shape=(graph.number_of_nodes(), graph.number_of_nodes()))
adj_neg = 1 - adj.todense() - np.eye(graph.number_of_nodes())
neg_u, neg_v = np.where(adj_neg != 0)

neg_eids = np.random.choice(len(neg_u), graph.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

In [10]:
train_g = dgl.remove_edges(graph, eids[:test_size])

In [11]:
from dgl.nn import SAGEConv

# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [13]:
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=graph.number_of_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=graph.number_of_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=graph.number_of_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=graph.number_of_nodes())

In [14]:
import dgl.function as fn

class DotPredictor(nn.Module):
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            # Compute a new edge feature named 'score' by a dot-product between the
            # source node feature 'h' and destination node feature 'h'.
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            # u_dot_v returns a 1-element vector for each edge so you need to squeeze it.
            return g.edata['score'][:, 0]

In [15]:
class MLPPredictor(nn.Module):
    def __init__(self, h_feats):
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        """
        Computes a scalar score for each edge of the given graph.

        Parameters
        ----------
        edges :
            Has three members ``src``, ``dst`` and ``data``, each of
            which is a dictionary representing the features of the
            source nodes, the destination nodes, and the edges
            themselves.

        Returns
        -------
        dict
            A dictionary of new edge features.
        """
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}

    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

In [20]:
model = GraphSAGE(train_g.ndata['feat'].shape[1], 16)
# You can replace DotPredictor with MLPPredictor.
pred = MLPPredictor(16)
#pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [21]:
# ----------- 3. set up loss and optimizer -------------- #
# in this case, loss will in training loop
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)

# ----------- 4. training -------------------------------- #
all_logits = []
for e in range(100):
    # forward
    h = model(train_g, train_g.ndata['feat'])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))

# ----------- 5. check results ------------------------ #
from sklearn.metrics import roc_auc_score
with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))

In epoch 0, loss: 942634.9375
In epoch 5, loss: 179779.328125
In epoch 10, loss: 47121.12109375
In epoch 15, loss: 40462.96484375
In epoch 20, loss: 34539.96484375
In epoch 25, loss: 17707.451171875
In epoch 30, loss: 10971.734375
In epoch 35, loss: 8044.42822265625
In epoch 40, loss: 4697.12548828125
In epoch 45, loss: 4674.82373046875
In epoch 50, loss: 3349.573974609375
In epoch 55, loss: 3291.9384765625
In epoch 60, loss: 2754.181640625
In epoch 65, loss: 2100.3935546875
In epoch 70, loss: 1784.59130859375
In epoch 75, loss: 1522.168701171875
In epoch 80, loss: 1268.560791015625
In epoch 85, loss: 1119.8466796875
In epoch 90, loss: 1249.9302978515625
In epoch 95, loss: 1282.6982421875
AUC 0.9543738222047904
