In [1]:
import sys

In [2]:
# Make the parent directory importable so that the `src` package can be found.
# Guarded so that re-running this cell does not pile up duplicate entries.
if ".." not in sys.path:
    sys.path.append("..")

In [3]:
import torch
from src.data.graph import Graph

In [4]:
!pwd

'pwd' is not recognized as an internal or external command,
operable program or batch file.


#### We will start exploring the graph by instantiating a Graph object. That requires defining an argument dictionary.


In [5]:
# Settings passed to Graph(...) for the karate dataset.
args = dict(
    graph="karate",       # name of the graph
    basepath="../data",   # directory that holds the graph files
    task="classify",      # the task
    test_frac=0.20,       # fraction of the edges used for the test split
    val_frac=0.20,        # fraction of the edges used for the validation split
    gpu=False,            # no GPU is needed for exploring the graph
)

In [6]:
# Build the Graph from the `args` settings; loading logs the dataset name and the number of edges found.
graph = Graph(**args)

Loading karate dataset...
Found 78 edges


  return torch.sparse.FloatTensor(indices, values, shape)


### Each graph object has three important attributes
1. Node features
2. Node labels
3. The adjacency matrix (sparse coo matrix)

In [7]:
# extract the number of nodes and features
num_nodes, num_features = graph.features.shape
num_nodes, num_features

(34, 34)

In [8]:
# Display the node labels.
# NOTE(review): the recorded output shows every karate label as 0 — confirm the labels are loaded correctly for this dataset.
graph.labels

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

##### Extract the edge information

In [9]:
# coalesce() merges duplicate entries of the sparse COO adjacency; indices() then holds one column per stored entry,
# so transposing and calling tolist() yields a Python list with one [node1, node2] pair per entry.
edges = graph.adj.coalesce().indices().T.tolist()

In [13]:
edges[:4]  # each element is a [node1, node2] pair, i.e. an edge between node1 and node2

[[0, 0], [0, 1], [0, 2], [0, 3]]

In [10]:
# Count the [node1, node2] pairs stored in the adjacency matrix.
# NOTE(review): this is larger than the "Found 78 edges" log line; presumably every undirected
# edge is stored in both directions plus one self-loop per node (2 * 78 + 34 = 190) — confirm.
num_edges = len(edges)
num_edges

190

In [11]:
from collections import defaultdict
def find_degree_per_node(edges):
    """Print the average, maximum and minimum in-degree found in ``edges``.

    Args:
        edges: iterable of two-element ``(src, tgt)`` pairs (lists or tuples).
            Each pair counts as one incoming edge for ``tgt``; a self-loop
            such as ``[0, 0]`` is counted like any other pair.

    Returns:
        None. The statistics are printed on one line in the order
        ``average max min``. Only nodes that appear as a target at least
        once take part, so nodes without incoming edges do not lower the
        average or the minimum.

    If ``edges`` is empty, a short notice is printed instead of failing with
    ``ZeroDivisionError``.
    """
    in_degree = defaultdict(int)

    # Only the target endpoint matters for in-degree; the source is ignored.
    for _, tgt in edges:
        in_degree[tgt] += 1

    if not in_degree:
        # Nothing to average over: report it rather than dividing by zero.
        print("No edges given; degree statistics are undefined.")
        return

    degrees = in_degree.values()
    avg_in_degree = sum(degrees) / len(in_degree)
    max_in_degree = max(degrees)
    min_in_degree = min(degrees)

    print(avg_in_degree, max_in_degree, min_in_degree)
    
        

In [12]:
# Prints, in order: average in-degree, maximum in-degree, minimum in-degree.
find_degree_per_node(edges)

5.588235294117647 18 2


## Cora

In [14]:
# Settings passed to Graph(...) for the cora dataset.
args = dict(
    graph="cora",         # name of the graph
    basepath="../data",   # directory that holds the graph files
    task="classify",      # the task
    test_frac=0.20,       # fraction of the edges used for the test split
    val_frac=0.20,        # fraction of the edges used for the validation split
    gpu=False,            # no GPU is needed for exploring the graph
)

In [15]:
# Rebind `graph` to the cora dataset described by `args`; loading logs the dataset name and the number of edges found.
graph = Graph(**args)

Loading cora dataset...
Found 5429 edges


In [16]:
# Unpack the 2-D shape of the feature matrix into (number of nodes, number of features) and display it.
num_nodes, num_features = graph.features.shape
num_nodes, num_features

(2708, 1433)

In [20]:
# Show the label tensor next to its distinct values, i.e. the set of classes present in the dataset.
graph.labels, graph.labels.unique()

(tensor([2, 5, 4,  ..., 1, 0, 2]), tensor([0, 1, 2, 3, 4, 5, 6]))

In [21]:
# Flatten the coalesced sparse adjacency into a list of [node1, node2] pairs,
# then count the stored pairs and display that count.
edges = graph.adj.coalesce().indices().t().tolist()
num_edges = len(edges)
num_edges

13264

In [23]:
# Prints, in order: average in-degree, maximum in-degree, minimum in-degree.
find_degree_per_node(edges)

4.89807976366322 169 2


## Citeseer

In [24]:
# Settings passed to Graph(...) for the Citeseer dataset.
# NOTE(review): the name is capitalised, unlike "karate" and "cora" — confirm it matches the
# directory/file name under basepath, since that matters on case-sensitive filesystems.
args = dict(
    graph="Citeseer",     # name of the graph
    basepath="../data",   # directory that holds the graph files
    task="classify",      # the task
    test_frac=0.20,       # fraction of the edges used for the test split
    val_frac=0.20,        # fraction of the edges used for the validation split
    gpu=False,            # no GPU is needed for exploring the graph
)

In [25]:
# Rebind `graph` to the Citeseer dataset described by `args`; loading logs the dataset name and the number of edges found.
graph = Graph(**args)

Loading Citeseer dataset...
Found 4715 edges


In [26]:
# extract the number of nodes and features
num_nodes, num_features = graph.features.shape
num_nodes, num_features

(3312, 3703)

In [27]:
# Show the label tensor next to its distinct values, i.e. the set of classes present in the dataset.
graph.labels, graph.labels.unique()

(tensor([1, 4, 1,  ..., 4, 2, 5]), tensor([0, 1, 2, 3, 4, 5]))

In [34]:
# Peek at row 0 of the sparse adjacency matrix without rebinding `edges`: that name is
# reserved for the Python list of [node1, node2] pairs that find_degree_per_node expects.
# NOTE(review): the stored values are fractional rather than 0/1, so the adjacency
# appears to be normalised — confirm against the Graph implementation.
graph.adj[0]

tensor(indices=tensor([[  0,  99, 111, 381, 415, 514, 585, 690, 691, 783, 784,
                        954]]),
       values=tensor([0.1538, 0.0769, 0.0769, 0.0769, 0.0769, 0.0769, 0.0769,
                      0.0769, 0.0769, 0.0769, 0.0769, 0.0769]),
       size=(3312,), nnz=12, layout=torch.sparse_coo)

In [28]:
# Flatten the coalesced sparse adjacency into a list of [node1, node2] pairs,
# then count the stored pairs and display that count.
edges = graph.adj.coalesce().indices().t().tolist()
num_edges = len(edges)
num_edges

12384

In [29]:
# Prints, in order: average in-degree, maximum in-degree, minimum in-degree.
find_degree_per_node(edges)

3.739130434782609 100 1
