# Analyze IMDB Datasets

In [17]:
import os.path as osp
import numpy as np
import networkx as nx
import time
import random
import matplotlib.pyplot as plt
import torch
from torch import nn
import torch.nn.functional as F
from sklearn.model_selection import StratifiedKFold
import pandas as pd


# Torch Geometric 
try: 
    from torch_geometric.data import DataLoader, Data
    from torch_geometric.data.dataloader import Collater
    from torch_geometric.datasets import TUDataset
    from torch_geometric.utils import degree
    from torch_geometric.utils.convert import from_networkx
    from torch_geometric.nn import GINConv, GINEConv, global_add_pool
except ModuleNotFoundError: 
    !pip install torch_geometric
    from torch_geometric.data import DataLoader, Data
    from torch_geometric.data.dataloader import Collater
    from torch_geometric.datasets import TUDataset
    from torch_geometric.utils import degree
    from torch_geometric.utils.convert import from_networkx
    from torch_geometric.nn import GINConv, GINEConv, global_add_pool
    
# Pytorch Metric Learning
try: 
    from pytorch_metric_learning import losses
except ModuleNotFoundError:
    !pip install pytorch-metric-learning
    from pytorch_metric_learning import losses

## Download IMDB Datasets

In [2]:
class MyFilter:
    """Pre-filter predicate that keeps graphs with at most ``MAX_NODES`` nodes."""

    # Upper bound (inclusive) on the node count of a graph that is kept.
    MAX_NODES = 70

    def __call__(self, data):
        """Return whether ``data.num_nodes`` does not exceed ``MAX_NODES``."""
        return data.num_nodes <= self.MAX_NODES

class MyPreTransform:
    """Pre-transform that sets ``data.x`` to one-hot encoded node degrees."""

    def __call__(self, data):
        """Overwrite ``data.x`` with a float one-hot degree encoding and return ``data``.

        The degree is counted over the first row of ``data.edge_index`` and
        encoded with 69 classes.

        NOTE(review): ``MyFilter`` admits graphs with up to 70 nodes; if such a
        graph contained a node of degree 69 the 69-class encoding would not
        cover it -- confirm this cannot occur for the datasets in use.
        """
        node_degrees = degree(data.edge_index[0], data.num_nodes, dtype=torch.long)
        data.x = node_degrees
        data.x = F.one_hot(node_degrees, num_classes=69).float()
        return data

In [4]:
# Load IMDB-BINARY (downloading if needed) under <parent of the working dir>/data/IMDB-BINARY.
# Graphs are screened by MyFilter and then given degree features by MyPreTransform.
path = osp.dirname(osp.realpath("./"))
path = osp.join(path, 'data', 'IMDB-BINARY')

imdb_b = TUDataset(path, name="IMDB-BINARY", pre_transform=MyPreTransform(), pre_filter=MyFilter())

In [5]:
# Show the dataset's repr: its name followed by the number of graphs it holds.
print(imdb_b)

IMDB-BINARY(996)


In [6]:
# Load IMDB-MULTI (downloading if needed) under <parent of the working dir>/data/IMDB-MULTI.
# Graphs are screened by MyFilter and then given degree features by MyPreTransform.
path = osp.dirname(osp.realpath("./"))
path = osp.join(path, 'data', 'IMDB-MULTI')

imdb_m = TUDataset(path, name="IMDB-MULTI", pre_transform=MyPreTransform(), pre_filter=MyFilter())

In [7]:
# Show the dataset's repr: its name followed by the number of graphs it holds.
print(imdb_m)

IMDB-MULTI(1498)


## Analyze IMDB-BINARY

In [19]:
# Point the shared `dataset` name at IMDB-BINARY; the statistics cell below iterates over it.
dataset = imdb_b

In [20]:
# Collect, for every graph, its node count and the largest degree among its nodes.
n = []
degs = []
for g in dataset:
    num_nodes = g.num_nodes
    deg = degree(g.edge_index[0], num_nodes, dtype=torch.long)
    n.append(num_nodes)
    degs.append(deg.max())

# NOTE: the "Degree" lines below summarise the per-graph *maximum* degree
# (mean / max / min of those maxima), not the degree of every node.
max_deg_per_graph = torch.stack(degs)
nodes_per_graph = torch.tensor(n).float()

print(f'Mean Degree: {max_deg_per_graph.float().mean()}')
print(f'Max Degree: {max_deg_per_graph.max()}')
print(f'Min Degree: {max_deg_per_graph.min()}')

# Mean node count rounded to the nearest integer.
mean_n = nodes_per_graph.mean().round().long().item()
print(f'Mean number of nodes: {mean_n}')
print(f'Max number of nodes: {nodes_per_graph.max().round().long().item()}')
print(f'Min number of nodes: {nodes_per_graph.min().round().long().item()}')
print(f'Number of graphs: {len(dataset)}')

Mean Degree: 18.486948013305664
Max Degree: 68
Min Degree: 11
Mean number of nodes: 19
Max number of nodes: 69
Min number of nodes: 12
Number of graphs: 996


## Analyze IMDB-MULTI

In [21]:
# Point the shared `dataset` name at IMDB-MULTI; the statistics cell below iterates over it.
dataset = imdb_m

In [22]:
# Collect, for every graph, its node count and the largest degree among its nodes.
n = []
degs = []
for g in dataset:
    num_nodes = g.num_nodes
    deg = degree(g.edge_index[0], num_nodes, dtype=torch.long)
    n.append(num_nodes)
    degs.append(deg.max())

# NOTE: the "Degree" lines below summarise the per-graph *maximum* degree
# (mean / max / min of those maxima), not the degree of every node.
max_deg_per_graph = torch.stack(degs)
nodes_per_graph = torch.tensor(n).float()

print(f'Mean Degree: {max_deg_per_graph.float().mean()}')
print(f'Max Degree: {max_deg_per_graph.max()}')
print(f'Min Degree: {max_deg_per_graph.min()}')

# Mean node count rounded to the nearest integer.
mean_n = nodes_per_graph.mean().round().long().item()
print(f'Mean number of nodes: {mean_n}')
print(f'Max number of nodes: {nodes_per_graph.max().round().long().item()}')
print(f'Min number of nodes: {nodes_per_graph.min().round().long().item()}')
print(f'Number of graphs: {len(dataset)}')

Mean Degree: 11.907209396362305
Max Degree: 62
Min Degree: 6
Mean number of nodes: 13
Max number of nodes: 63
Min number of nodes: 7
Number of graphs: 1498
