# Dataset analysis
This notebook is a simple analysis of the datasets used in the experiments.

In [1]:
import os
import shutil

import pandas as pd
from torch_geometric.datasets import TUDataset

# Names of the datasets that the cells below load through TUDataset.
datasets = [
    'IMDB-BINARY',
    'IMDB-MULTI',
    'REDDIT-BINARY',
    'SYNTHETIC',
    'Synthie',
    'MUTAG',
    'MSRC_9',
    'ENZYMES',
]

# Scratch directory used as the dataset root. Anything left over from a
# previous run is deleted first, so every run starts from an empty directory.
DATA_PATH = './data/tmp'
if os.path.exists(DATA_PATH):
    shutil.rmtree(DATA_PATH)
os.makedirs(DATA_PATH, exist_ok=True)

## Properties
We will go through the datasets collecting information to characterize them.

In [2]:
# One summary record (dict) per dataset; the DataFrame is built once at the end
# so that `meta_info` only ever refers to the final table.
meta_records = []

for name in datasets:
    dataset = TUDataset(root=DATA_PATH, name=name, use_node_attr=True)
    num_graphs = len(dataset)

    # Running totals and maxima over all graphs of this dataset.
    # `data.num_edges` counts directed entries; PyG stores each undirected edge
    # in both directions, so edge statistics are halved when reported below.
    node_sum, edge_sum = 0, 0
    max_nodes, max_edges, max_degree = 0, 0, 0
    for data in dataset:
        node_sum += data.num_nodes
        edge_sum += data.num_edges

        max_nodes = max(max_nodes, data.num_nodes)
        max_edges = max(max_edges, data.num_edges)

        # Counting occurrences of each index in the first row of edge_index
        # gives the per-node degree. `.max()` raises on an empty tensor, so
        # graphs without edges are skipped (they cannot raise the maximum).
        if data.num_edges > 0:
            max_degree = max(
                max_degree, data.edge_index[0].bincount().max().item())

    meta_records.append({
        'Name': name,
        '#graphs': num_graphs,
        '#classes': dataset.num_classes,
        'Avg. #nodes': node_sum / num_graphs,
        'Avg. #edges': edge_sum / num_graphs / 2,
        'Max. #nodes': max_nodes,
        'Max. #edges': max_edges / 2,
        'Max. degree': max_degree,
        '#features': dataset.num_features,
        '#attributes': dataset.num_node_attributes
    })

meta_info = pd.DataFrame(meta_records)
meta_info

Downloading https://www.chrsmrrs.com/graphkerneldatasets/IMDB-BINARY.zip
Processing...
Done!
Downloading https://www.chrsmrrs.com/graphkerneldatasets/IMDB-MULTI.zip
Processing...
Done!
Downloading https://www.chrsmrrs.com/graphkerneldatasets/REDDIT-BINARY.zip
Processing...
Done!
Downloading https://www.chrsmrrs.com/graphkerneldatasets/SYNTHETIC.zip
Processing...
Done!
Downloading https://www.chrsmrrs.com/graphkerneldatasets/Synthie.zip
Processing...
Done!
Downloading https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip
Processing...
Done!
Downloading https://www.chrsmrrs.com/graphkerneldatasets/MSRC_9.zip
Processing...
Done!
Downloading https://www.chrsmrrs.com/graphkerneldatasets/ENZYMES.zip
Processing...
Done!


Unnamed: 0,Name,#graphs,#classes,Avg. #nodes,Avg. #edges,Max. #nodes,Max. #edges,Max. degree,#features,#attributes
0,IMDB-BINARY,1000,2,19.773,96.531,136,1249.0,135,0,0
1,IMDB-MULTI,1500,3,13.001333,65.935333,89,1467.0,88,0,0
2,REDDIT-BINARY,2000,2,429.627,497.754,3782,4071.0,3062,0,0
3,SYNTHETIC,300,2,100.0,196.0,100,196.0,8,9,1
4,Synthie,400,4,95.0,172.9275,100,212.0,20,15,15
5,MUTAG,188,2,17.930851,19.792553,28,33.0,4,7,0
6,MSRC_9,221,8,40.579186,97.936652,55,145.0,16,10,0
7,ENZYMES,600,6,32.633333,62.136667,126,149.0,9,21,18


## Balance
In this section we analyze the class balance of the datasets by checking the number of samples per class.

In [3]:
# Class balance report: for every dataset, print its name, then the list of
# sample counts per class label (position i holds the count for label i),
# then a blank separator line.
for name in datasets:
    dataset = TUDataset(root=DATA_PATH, name=name)
    class_counts = dataset._data.y.bincount().tolist()
    print(name, class_counts, '', sep='\n')

IMDB-BINARY
[500, 500]

IMDB-MULTI
[500, 500, 500]

REDDIT-BINARY
[1000, 1000]

SYNTHETIC
[150, 150]

Synthie
[93, 107, 110, 90]

MUTAG
[63, 125]

MSRC_9
[19, 30, 30, 30, 23, 30, 29, 30]

ENZYMES
[100, 100, 100, 100, 100, 100]

