In [1]:
import numpy as np
import scipy.sparse as sp
import torch

In [2]:
def encode_onehot(labels):
    classes = set(labels)
    classes_dict = {c: np.identity(len(classes))[i, :] for i, c in
                    enumerate(classes)}
    labels_onehot = np.array(list(map(classes_dict.get, labels)),
                             dtype=np.int32)
    return labels_onehot

In [3]:
# encode_onehot: encode labels to onehot format
# (1) use set(labels) to get all classes
# (2) build dict with (class: onehot) pairs
# (3) map labels into onehot format via dict.get

labels = ['dog', 'cat', 'fish', 'dog', 'fish', 'cat']
encode_onehot(labels)

array([[0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0]], dtype=int32)

In [4]:
# sp.csr_matrix: sparse matrix
# sp format: (row_num, column_num), non-zero value (float32)
data = [[1, 0, 2], [0, 0, 3], [0, 4, 0]]
mtx = sp.csr_matrix(data, dtype=np.float32)
print(mtx)
mtx.todense()

  (0, 0)	1.0
  (0, 2)	2.0
  (1, 2)	3.0
  (2, 1)	4.0


matrix([[1., 0., 2.],
        [0., 0., 3.],
        [0., 4., 0.]], dtype=float32)

In [5]:
# row-normalize: xij / sum_j(xij)
def normalize(mx):
    """Row-normalize sparse matrix"""
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return mx

In [6]:
# Row-normalize sparse matrix
mx = normalize(mtx)
print(mx)

  (0, 2)	0.6666667
  (0, 0)	0.33333334
  (1, 2)	1.0
  (2, 1)	1.0


In [7]:
# Convert a scipy sparse matrix to a torch sparse tensor
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

In [8]:
# torch sparse tensor format: indices(row, col), values, size
print(sparse_mx_to_torch_sparse_tensor(mx))

tensor(indices=tensor([[0, 0, 1, 2],
                       [2, 0, 2, 1]]),
       values=tensor([0.6667, 0.3333, 1.0000, 1.0000]),
       size=(3, 3), nnz=4, layout=torch.sparse_coo)


In [9]:
def load_data(path="../data/cora/", dataset="cora"):
    """Load citation network dataset (cora only for now)"""
    print('Loading {} dataset...'.format(dataset))
    
    # idx_features_labels: <paper_id> <word_attributes> + <class_label>
    idx_features_labels = np.genfromtxt("{}{}.content".format(path, dataset),
                                        dtype=np.dtype(str))
    # features and labels
    # features as sparse matrix, and float32 for further normalization
    features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
    labels = encode_onehot(idx_features_labels[:, -1])

    # build graph
    idx = np.array(idx_features_labels[:, 0], dtype=np.int32) # paper_id
    idx_map = {j: i for i, j in enumerate(idx)} # dict with paper_id: row_id
    # edges_unordered: <paper_id of cited paper> <paper_id of citing paper>
    edges_unordered = np.genfromtxt("{}{}.cites".format(path, dataset),
                                    dtype=np.int32)
    # edges: <num_id of cited paper> <num_id of citing paper>
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                     dtype=np.int32).reshape(edges_unordered.shape)
    # adj sparse matrix
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]),
                        dtype=np.float32)

    # build symmetric adjacency matrix
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    features = normalize(features)
    adj = normalize(adj + sp.eye(adj.shape[0]))

    idx_train = range(140)
    idx_val = range(200, 500)
    idx_test = range(500, 1500)

    features = torch.FloatTensor(np.array(features.todense()))
    labels = torch.LongTensor(np.where(labels)[1])
    adj = sparse_mx_to_torch_sparse_tensor(adj)

    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)

    return adj, features, labels, idx_train, idx_val, idx_test

In [11]:
"""load citation network data (cora)"""
# (1) features and labels
# (1.1) read from cora.content as np array
# (1.2) features as normalized sparse matrix
# (1.3) labels as onehot format
# (2) build graph
# (2.1) read from cora.cites as np array
# (2.2) edges as <num_id of cited paper> <num_id of citing paper>, that is (row, col)
# (2.3) build adj sparse matrix with (row, col)
# (2.4) symmetric, self-looped and normalized adj (not exactly GCN)
# (3) torch tensor format (features, labels, adj)
# (4) torch tensor format (idx of train, val, test)
adj, features, labels, idx_train, idx_val, idx_test = load_data()
print(adj)
print(features)
print(labels)

Loading cora dataset...
tensor(indices=tensor([[   0,    8,   14,  ..., 1389, 2344, 2707],
                       [   0,    0,    0,  ..., 2707, 2707, 2707]]),
       values=tensor([0.1667, 0.1667, 0.0500,  ..., 0.2000, 0.5000, 0.2500]),
       size=(2708, 2708), nnz=13264, layout=torch.sparse_coo)
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([0, 1, 5,  ..., 3, 4, 0])
