In [1]:
!pip install dgl

Collecting dgl
  Downloading dgl-0.6.1-cp37-cp37m-manylinux1_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.0 MB/s 
Installing collected packages: dgl
Successfully installed dgl-0.6.1


In [7]:
import dgl
from dgl.data import DGLDataset

import os

import torch
import torch.nn as nn
import torch.nn.functional as F

## Creating a dataset for node classification from CSV files

Dataset taken from [Zachary's Karate Club](https://en.wikipedia.org/wiki/Zachary%27s_karate_club)

In [4]:
import urllib.request
import pandas as pd

In [5]:
# Download the node table (club members) next to the notebook.
urllib.request.urlretrieve(
    url='https://data.dgl.ai/tutorial/dataset/members.csv',
    filename='./members.csv',
)

# Download the edge table (interactions between members). As the last
# expression of the cell, the returned (local path, headers) tuple is displayed.
urllib.request.urlretrieve(
    url='https://data.dgl.ai/tutorial/dataset/interactions.csv',
    filename='./interactions.csv',
)

('./interactions.csv', <http.client.HTTPMessage at 0x7efbbca20e10>)

We treat the club members as graph nodes and the interactions between them as edges.

In [9]:
# Node table: one row per club member.
nodes_data = pd.read_csv(filepath_or_buffer='./members.csv')
# Edge table: one row per interaction between two members.
edges_data = pd.read_csv(filepath_or_buffer='./interactions.csv')

In [10]:
# Preview the first rows; a bare last expression gets the notebook's rich
# table rendering instead of the plain-text dump produced by print().
nodes_data.head()

   Id    Club  Age
0   0  Mr. Hi   44
1   1  Mr. Hi   37
2   2  Mr. Hi   37
3   3  Mr. Hi   40
4   4  Mr. Hi   30


In [14]:
# Turn the textual 'Club' column into integer class labels: the column is cast
# to a pandas categorical and its integer codes are wrapped in a tensor.
node_labels = torch.from_numpy(
    nodes_data['Club']
    .astype('category')
    .cat.codes
    .to_numpy()
)
# Show the number of labels followed by the label tensor itself.
print(len(node_labels), node_labels, sep='\n')

34
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=torch.int8)


In [11]:
# Preview the first rows; a bare last expression gets the notebook's rich
# table rendering instead of the plain-text dump produced by print().
edges_data.head()

   Src  Dst    Weight
0    0    1  0.043591
1    0    2  0.282119
2    0    3  0.370293
3    0    4  0.730570
4    0    5  0.821187


In [17]:
class KarateClubDataset(DGLDataset):
  """Zachary's Karate Club as a single-graph DGL dataset for node classification.

  The graph is built from two CSV files in the working directory:
  ``./members.csv`` (one row per node; columns ``Club`` and ``Age`` are used)
  and ``./interactions.csv`` (one row per edge; columns ``Src``, ``Dst`` and
  ``Weight`` are used).
  """

  def __init__(self):
    super().__init__(name='karate_club')

  def process(self):
    """Read the CSV files and build ``self.graph``.

    Attaches to the graph:
      * ``ndata['feat']``   - the ``Age`` column,
      * ``ndata['label']``  - integer category codes of the ``Club`` column,
      * ``edata['weight']`` - the ``Weight`` column,
      * ``ndata['train_mask' | 'val_mask' | 'test_mask']`` - boolean split masks.
    """
    nodes_data = pd.read_csv('./members.csv')
    edges_data = pd.read_csv('./interactions.csv')
    # Row i of members.csv is used as node i.
    # NOTE(review): this assumes the rows are ordered by node id - confirm.
    n_nodes = nodes_data.shape[0]

    node_features = torch.from_numpy(nodes_data['Age'].to_numpy())
    # Club names -> integer category codes.
    # NOTE(review): the codes keep pandas' compact integer dtype; cast them
    # (e.g. .long()) at the use site if a loss function requires int64 targets.
    node_labels = torch.from_numpy(nodes_data['Club'].astype('category').cat.codes.to_numpy())
    edge_features = torch.from_numpy(edges_data['Weight'].to_numpy())
    edges_src = torch.from_numpy(edges_data['Src'].to_numpy())
    edges_dst = torch.from_numpy(edges_data['Dst'].to_numpy())

    # num_nodes is passed explicitly, so the node count comes from
    # members.csv rather than from the ids that appear in the edge list.
    self.graph = dgl.graph((edges_src, edges_dst), num_nodes=n_nodes)
    self.graph.ndata['feat'] = node_features
    self.graph.ndata['label'] = node_labels
    self.graph.edata['weight'] = edge_features

    # Split masks by row order (no shuffling): the first 60% of the nodes are
    # for training, the next 20% for validation and the remainder for testing.
    n_train = int(n_nodes * 0.6)
    n_val = int(n_nodes * 0.2)
    train_mask = torch.zeros(n_nodes, dtype=torch.bool)
    val_mask = torch.zeros(n_nodes, dtype=torch.bool)
    test_mask = torch.zeros(n_nodes, dtype=torch.bool)
    train_mask[:n_train] = True
    val_mask[n_train : n_train + n_val] = True
    test_mask[n_train + n_val:] = True
    self.graph.ndata['train_mask'] = train_mask
    self.graph.ndata['val_mask'] = val_mask
    self.graph.ndata['test_mask'] = test_mask

  def __getitem__(self, i):
    """Return the dataset's only graph.

    Args:
      i: index of the graph; only ``0`` (or ``-1``) is valid.

    Raises:
      IndexError: if ``i`` is out of range. Besides rejecting bad indices,
        this lets Python's ``__getitem__``-based iteration fallback
        (``for g in dataset``) stop after the single graph instead of
        yielding the same graph forever.
    """
    n_graphs = len(self)
    if not -n_graphs <= i < n_graphs:
      raise IndexError(
          f'index {i} is out of range for a dataset with {n_graphs} graph(s)'
      )
    return self.graph

  def __len__(self):
    """Return the number of graphs in the dataset (always 1)."""
    return 1

In [21]:
# Build the dataset. NOTE(review): the DGLDataset constructor is presumably
# what invokes process() and populates the graph - confirm against DGL docs.
dataset = KarateClubDataset()
# The dataset holds exactly one graph (its __len__ returns 1), stored at index 0.
graph = dataset[0]

In [19]:
# Print the graph summary: node and edge counts plus the name, shape and dtype
# of every node/edge feature attached in KarateClubDataset.process().
print(graph)

Graph(num_nodes=34, num_edges=156,
      ndata_schemes={'feat': Scheme(shape=(), dtype=torch.int64), 'label': Scheme(shape=(), dtype=torch.int8), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.float64)})
