In [689]:
# NOTE(review): later cells use names that are never imported explicitly in the
# cells shown here (e.g. np, pickle, open_session, User, DATAFRAMES_FOLDER);
# presumably they arrive through this wildcard import — confirm.
from experiments.datasets import *
import torch
from os import listdir
from torch_geometric.data import Data
# Nothing is installed by this cell: it only records the torch version in the
# TORCH environment variable and prints it.
import os
import torch
from sklearn.metrics import classification_report, f1_score
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

2.0.1+cpu


In [626]:
# Load the user graph; later cells read it through the global name `g`.
print("Loading user graph")
g = load_ig_graph()

# Pre-computed centralities are read from a pickle file (path is relative to
# the notebook's working directory). pickle.load can execute arbitrary code,
# so only point this at a file you produced yourself.
fname = "../../../centralities.pickle"
print("Loading pre-computed centralities")
with open(fname, 'rb') as f:
    centralities = pickle.load(f)


Loading user graph
Loading pre-computed centralities


In [675]:
# DATAFRAMES_PATH = "/users/pcelayes/repos/sna_classifier/data/dataframes"
from os.path import join
import pandas as pd


# user id (str, parsed from the file name) -> list of index values of that
# user's pickled dataframe, one dict per split.
train_samples = {}
test_samples = {}

for fname in listdir(DATAFRAMES_FOLDER):
    # Only names with a single underscore are considered.
    if fname.count("_") > 1:
        continue

    is_train = fname.startswith("dfXtrain_")
    is_test = fname.startswith("dfXtest_")
    if not (is_train or is_test):
        continue

    # Parse the id BEFORE testing it. The test branch used to check `user_id`
    # before assigning it, so it looked at the value left over from the
    # previous file (or raised NameError on the very first file).
    user_id = fname.split(".")[0].split("_")[-1]
    if not user_id:
        continue

    df_path = join(DATAFRAMES_FOLDER, fname)
    # pd.read_pickle unpickles the file: only use it on trusted data.
    if is_train:
        Xy_train = pd.read_pickle(df_path)
        train_samples[user_id] = Xy_train.index.values.tolist()
    else:
        Xy_test = pd.read_pickle(df_path)
        test_samples[user_id] = Xy_test.index.values.tolist()

In [676]:
# Number of user ids for which a training split was loaded.
len(train_samples)

4662

In [677]:
# Number of user ids for which a test split was loaded.
len(test_samples)

4662

In [678]:
# Peek at the first (user_id, tweet_ids) entry; this displays the full id list.
list(train_samples.items())[0]

('781293',
 [637358147288571904,
  637328670911295489,
  637312767419092992,
  642115562596925440,
  642302593499308032,
  643123742529441792,
  643846692375273479,
  642399265898242048,
  642026192057536512,
  641811542988713984,
  641879116829065217,
  641250524428410880,
  637098873475960832,
  636924188784242689,
  636856587412488192,
  637087969413890048,
  637087172210896896,
  644295594421420032,
  636512739075170304,
  636337247638437888,
  636335698665205760,
  636332617223270400,
  636333398768898048,
  636277718666084352,
  636276285875036160,
  637234718556200960,
  637201723661152256,
  637048015660941312,
  637715920929062913,
  637667083145322496,
  637666490087555072,
  642267175034744833,
  636561383082041344,
  642062333653708800,
  636364605502169089,
  642050438649417728,
  637093772438978561,
  644183774578786304,
  644183354246590464,
  637025779373707265,
  637004149389946880,
  636939165620391936,
  636899478709137408,
  644126931668103168,
  636025395222024192,

In [564]:
# NOTE(review): `neighbours` is not assigned at notebook level in the cells
# shown here (only as a local inside samples_to_data_objects) — presumably
# leftover kernel state; confirm this cell survives Restart & Run All.
len(neighbours)

616

In [653]:
def get_onehot(user_id):
    """Return a one-hot vector over the vertices of the global graph `g`.

    The vector has one entry per vertex in `g.vs`; the entry at the position
    where `str(user_id)` appears in the `twid` vertex attribute is 1, every
    other entry is 0.

    Raises:
        ValueError: if `str(user_id)` is not among the `twid` values.
    """
    twids = g.vs['twid']
    position = twids.index(str(user_id))
    onehot = np.zeros(len(g.vs))
    onehot[position] = 1
    return onehot

In [654]:
# Sanity check: a one-hot vector sums to 1.
# NOTE(review): `neighbour_ids` is not assigned at notebook level in the cells
# shown here (only as a local inside create_pyg_data_objects) — presumably
# leftover kernel state; confirm.
get_onehot(neighbour_ids[20]).sum()

1.0

### Implementación más sencilla

- sin embeddings
- sólo el vecindario de u
- features: is_u, is_rt, centralidades (misma info que paper anterior)

In [681]:
def create_pyg_data_objects(tweet_ids, user_id, neighbour_users):
    '''
    Build one pytorch-geometric `Data` graph per tweet, describing the
    'neighbour activity' around a central user.

    All graphs share the same nodes, edges and centrality features; only the
    `is_retweeting` column and the label differ from tweet to tweet.

    nodes (fixed row order): the central user first, then `neighbour_users`
    features (columns of x):
        centralities      one column per entry of the global `centralities`
        is_central_user   1 on row 0, 0 elsewhere
        is_retweeting     1 if the tweet id is in that neighbour's timeline;
                          always 0 on the central user's row (see below)
    label y: 1 if the tweet id is in the central user's timeline, else 0

    Args:
        tweet_ids: iterable of tweet ids; one Data object is built per id.
        user_id: id used to look the central user up through a new session.
        neighbour_users: list of user objects exposing `.id` and `.timeline`;
            must not contain the central user.

    Returns:
        list of `Data(x, edge_index, y)`, in the order of `tweet_ids`.

    Raises:
        KeyError: if a node id does not appear in `g.vs["twid"]`.
    '''
    # NOTE(review): this session is never closed here — confirm whether
    # open_session() needs explicit cleanup.
    s = open_session()
    user = s.query(User).get(user_id)

    ### Compute index mappings (igraph, twid, row_id)

    # Fixed node order used for the rows of x in every sample generated for
    # this central user.
    user_id = user.id
    index_users = [user] + neighbour_users
    index_twid = [user_id] + [u.id for u in neighbour_users]

    # One pass over all graph vertices. A set keeps each membership test
    # constant-time; testing against the list made this scan quadratic.
    wanted_twids = set(index_twid)
    index_twid_to_igraph_map = {}
    for ig, label in enumerate(g.vs["twid"]):
        twid = int(label)
        if twid in wanted_twids:
            index_twid_to_igraph_map[twid] = ig
    index_ig = [index_twid_to_igraph_map[twid] for twid in index_twid]

    ### Compute fixed centrality features
    index_centralities = [np.array(m)[index_ig] for m in centralities]
    centralities_matrix = np.vstack(index_centralities).transpose()

    is_central_col = np.zeros((len(index_users), 1))
    is_central_col[0, 0] = 1

    ### Compute edges
    # Keep only edges whose two endpoints are in the subgraph, then translate
    # igraph vertex indices into row indices of x.
    ig_to_row = {ig: row for row, ig in enumerate(index_ig)}
    edges = g.es.select(_within=index_ig)
    edge_pairs = [(ig_to_row[a], ig_to_row[b]) for (a, b) in (e.tuple for e in edges)]
    # reshape(-1, 2) keeps the result 2 x 0 when there are no edges; a bare
    # transpose of an empty 1-D tensor would raise.
    edge_index = torch.tensor(edge_pairs, dtype=torch.long).reshape(-1, 2).t().contiguous()

    ### Compute retweeting features for each example
    # Timelines do not depend on the tweet: collect each user's tweet ids once
    # instead of rebuilding the list for every (tweet, user) pair.
    timeline_ids = [{t.id for t in u.timeline} for u in index_users]
    central_timeline = timeline_ids[0]

    data_objects = []
    for tweet_id in tweet_ids:
        is_retweeting_col = np.array(
            [[float(tweet_id in ids)] for ids in timeline_ids]
        )
        # The central user's own retweet is exactly the label y. Leaving it in
        # x would leak the target into the features, so it is masked out.
        is_retweeting_col[0, 0] = 0

        x = np.hstack([centralities_matrix, is_central_col, is_retweeting_col])
        x = torch.tensor(x, dtype=torch.float32)
        y = torch.tensor(int(tweet_id in central_timeline), dtype=torch.long)

        data_objects.append(Data(x=x, edge_index=edge_index, y=y))

    return data_objects

In [698]:
def samples_to_data_objects(samples, max_users=4, max_tweets=200):
    """Turn a {user_id: tweet_ids} mapping into a flat list of graph samples.

    For each selected user, the user is looked up in a session, its level-2
    neighbours are fetched with `get_level2_neighbours`, and
    `create_pyg_data_objects` builds one sample per selected tweet.

    Args:
        samples: dict mapping user id -> list of tweet ids.
        max_users: only the first `max_users` entries of `samples` are used;
            None means all. Defaults to the previously hard-coded 4.
        max_tweets: only the first `max_tweets` tweet ids per user are used;
            None means all. Defaults to the previously hard-coded 200.

    Returns:
        list with the samples of every processed user, concatenated in order.
    """
    data_objects = []
    s = open_session()

    for user_id, tweet_ids in list(samples.items())[:max_users]:
        print(user_id)
        user = s.query(User).get(user_id)
        neighbours = get_level2_neighbours(user, s)
        # remove central user from neighbours
        neighbour_users = [u for u in neighbours if u.id != user.id]
        data_objects += create_pyg_data_objects(tweet_ids[:max_tweets], user_id, neighbour_users)
    return data_objects

In [699]:
# Build the training graphs; the user/tweet limits are applied inside
# samples_to_data_objects.
train_data_objects = samples_to_data_objects(train_samples)

781293
148887961
273744824
206106207


In [700]:
# Build the test graphs the same way. The recorded output lists different
# user ids than the training run.
test_data_objects = samples_to_data_objects(test_samples)

1493451997
20182089
10012122
117251043


# Graph Classification with Graph Neural Networks

[Previous: Node Classification with Graph Neural Networks](https://colab.research.google.com/drive/14OvFnAXggxB8vM4e8vSURUp1TaKnovzX)

In this tutorial session we will have a closer look at how to apply **Graph Neural Networks (GNNs) to the task of graph classification**.
Graph classification refers to the problem of classifying entire graphs (in contrast to nodes), given a **dataset of graphs**, based on some structural graph properties.
Here, we want to embed entire graphs, and we want to embed those graphs in such a way so that they are linearly separable given a task at hand.

In [701]:
# Fix the torch RNG seed so runs are repeatable.
torch.manual_seed(12345)

# The datasets are plain Python lists of Data objects built above.
train_dataset = train_data_objects
test_dataset = test_data_objects

print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')

Number of training graphs: 800
Number of test graphs: 800


In [702]:
from torch_geometric.loader import DataLoader

# Mini-batches of 64 graphs; only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Print a summary of every training batch (one block of output per batch).
for step, data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 64
DataBatch(x=[36086, 7], edge_index=[2, 1173141], y=[64], batch=[36086], ptr=[65])

Step 2:
Number of graphs in the current batch: 64
DataBatch(x=[37483, 7], edge_index=[2, 1228717], y=[64], batch=[37483], ptr=[65])

Step 3:
Number of graphs in the current batch: 64
DataBatch(x=[35494, 7], edge_index=[2, 1152682], y=[64], batch=[35494], ptr=[65])

Step 4:
Number of graphs in the current batch: 64
DataBatch(x=[36972, 7], edge_index=[2, 1254876], y=[64], batch=[36972], ptr=[65])

Step 5:
Number of graphs in the current batch: 64
DataBatch(x=[38426, 7], edge_index=[2, 1304534], y=[64], batch=[38426], ptr=[65])

Step 6:
Number of graphs in the current batch: 64
DataBatch(x=[37364, 7], edge_index=[2, 1249450], y=[64], batch=[37364], ptr=[65])

Step 7:
Number of graphs in the current batch: 64
DataBatch(x=[36439, 7], edge_index=[2, 1213035], y=[64], batch=[36439], ptr=[65])

Step 8:
Number of graphs in the current batch: 64
DataBatch(x=[36971,

## Training a Graph Neural Network (GNN)

Training a GNN for graph classification usually follows a simple recipe:

1. Embed each node by performing multiple rounds of message passing
2. Aggregate node embeddings into a unified graph embedding (**readout layer**)
3. Train a final classifier on the graph embedding

There exist multiple **readout layers** in the literature, but the most common one is to simply take the average of node embeddings:

$$
\mathbf{x}_{\mathcal{G}} = \frac{1}{|\mathcal{V}|} \sum_{v \in \mathcal{V}} \mathbf{x}^{(L)}_v
$$

PyTorch Geometric provides this functionality via [`torch_geometric.nn.global_mean_pool`](https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#torch_geometric.nn.glob.global_mean_pool), which takes in the node embeddings of all nodes in the mini-batch and the assignment vector `batch` to compute a graph embedding of size `[batch_size, hidden_channels]` for each graph in the batch.

The final architecture for applying GNNs to the task of graph classification then looks as follows and allows for complete end-to-end training:

In [703]:
# Read the feature width from the first training graph. The bare name `x` is
# only a local inside create_pyg_data_objects, so `x.shape[1]` worked only
# with leftover kernel state and fails on a fresh kernel.
num_node_features = train_dataset[0].x.shape[1]
num_node_features

7

In [704]:
# Binary target: create_pyg_data_objects builds y as 0 or 1.
num_classes = 2

In [705]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool


class GCN(torch.nn.Module):
    """Three GCNConv layers, mean-pool readout and a linear classifier.

    Input and output widths come from the notebook globals
    `num_node_features` and `num_classes`; only the hidden width is a
    constructor argument.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # Re-seed so that the initial weights are the same on every construction.
        torch.manual_seed(12345)
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        """Return one row of `num_classes` scores per graph in the batch."""
        # Node embeddings: ReLU after the first two convolutions, none after the third.
        hidden = F.relu(self.conv1(x, edge_index))
        hidden = F.relu(self.conv2(hidden, edge_index))
        hidden = self.conv3(hidden, edge_index)

        # Readout: average node embeddings per graph -> [batch_size, hidden_channels]
        pooled = global_mean_pool(hidden, batch)

        # Classifier head; dropout is only active in training mode.
        pooled = F.dropout(pooled, p=0.5, training=self.training)
        return self.lin(pooled)

# Instantiate the network with 5 hidden channels and show its layers.
model = GCN(hidden_channels=5)
print(model)

GCN(
  (conv1): GCNConv(7, 5)
  (conv2): GCNConv(5, 5)
  (conv3): GCNConv(5, 5)
  (lin): Linear(in_features=5, out_features=2, bias=True)
)


Here, we again make use of the [`GCNConv`](https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#torch_geometric.nn.conv.GCNConv) with $\mathrm{ReLU}(x) = \max(x, 0)$ activation for obtaining localized node embeddings, before we apply our final classifier on top of a graph readout layer.

Let's train our network for a few epochs to see how well it performs on the training as well as test set:

In [707]:
from IPython.display import Javascript
# Colab-specific call that caps the height of this cell's output area.
# NOTE(review): `google.colab` presumably exists only in the Colab frontend —
# confirm this line is harmless (or remove it) when running elsewhere.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

# Adam with lr=0.01, cross-entropy on the model's raw class scores.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train():
    """Run one pass over `train_loader`, updating the global `model` in place.

    Uses the notebook globals `model`, `train_loader`, `criterion` and
    `optimizer`. Returns nothing.
    """
    model.train()
    for batch in train_loader:
        # Forward pass and loss for this mini-batch.
        logits = model(batch.x, batch.edge_index, batch.batch)
        batch_loss = criterion(logits, batch.y)
        # Backpropagate, apply the update, then clear gradients for the next batch.
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x, data.edge_index, data.batch)  
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return correct / len(loader.dataset)  # Derive ratio of correct predictions.


# range(1, 3) runs two epochs; accuracy is measured on both loaders after each.
for epoch in range(1, 3):
    train()
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

<IPython.core.display.Javascript object>

Epoch: 001, Train Acc: 0.9812, Test Acc: 0.9675
Epoch: 002, Train Acc: 0.9812, Test Acc: 0.9675


## Save Model

In [716]:
# Saves the whole model object (not only its state_dict) to MODEL_PATH,
# relative to the current working directory.
MODEL_PATH = "ggn-model.torch"
torch.save(model, MODEL_PATH)

## Evaluate Model

In [717]:
# Reload the model object saved at MODEL_PATH. torch.load unpickles the file,
# so only load files you created yourself.
model = torch.load(MODEL_PATH)
# Set inference mode explicitly so dropout is off whatever mode the model was
# saved in.
model.eval()

preds = []
labels = []
with torch.no_grad():  # no gradients are needed for evaluation
    for data in test_loader:
        out = model(data.x, data.edge_index, data.batch)
        # .tolist() yields plain Python ints; `list += tensor` used to append
        # 0-d tensors one by one.
        preds += out.argmax(dim=1).tolist()  # Predicted class = highest score.
        labels += data.y.tolist()

f1_score(labels, preds)

In [721]:
# Per-class precision/recall/F1. In the recorded output class 1 scores 0.00 on
# all three while accuracy is 0.97, so accuracy alone is misleading here.
print(classification_report(labels, preds))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       774
           1       0.00      0.00      0.00        26

    accuracy                           0.97       800
   macro avg       0.48      0.50      0.49       800
weighted avg       0.94      0.97      0.95       800



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
