In [24]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import json
from dataloader import Twitch, Git
import tensorflow as tf
from sklearn.model_selection import train_test_split
from spektral.layers import GCNConv, GATConv
from spektral.models.gcn import GCN
from spektral.transforms import LayerPreprocess, AdjToSpTensor
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

from models import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [59]:
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

def train_step(inputs, labels, mask, model, optimizer, loss_fn, with_ewc=False):
  with tf.GradientTape() as tape:
    pred = model(inputs)
    loss = loss_fn(labels[mask], pred[mask])

    if with_ewc:
      loss += model.penalty_loss()
      
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))
  
  return loss

def load_task(dataset="Twitch", language=None, path="./twitch", feature_dim=None):
  dataset = dataset.lower()

  if dataset == "twitch":
    dataset = Twitch(language, path=path, feature_dim=feature_dim, normalize_x=True,
                    transforms=[LayerPreprocess(GCNConv), AdjToSpTensor()])
  elif dataset == "git":
    dataset = Git(normalize_x=True, transforms=[LayerPreprocess(GCNConv), AdjToSpTensor()])
  elif dataset == "pubmed":
    dataset = Citation("pubmed", normalize_x=True, transforms=[LayerPreprocess(GCNConv), AdjToSpTensor()])

  graph = dataset[0]
  x, adj, y = graph.x, graph.a, graph.y

  # the mask are computed randomly inside the dataloader
  mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te

  return (x, adj, y), (mask_tr, mask_va, mask_te)

In [19]:
def apx_fisher(model, inputs):
  """ Approximate the fisher matrices of the model parameters """
  fisher_matrices = {p: tf.zeros_like(v.value()) for p,v in enumerate(model.trainable_variables)}

  with tf.GradientTape() as tape:
    pred = model(inputs)

    # get loglikelihood of predictions
    loglike = tf.nn.log_softmax(pred)
  
  # compute gradient wrt loglikelihood
  loglike_grads = tape.gradient(loglike, model.trainable_variables)

  # approximate i-th Fisher matrix with gradients squared
  for i,g in enumerate(loglike_grads):
    fisher_matrices[i] += tf.reduce_mean(g**2, axis=0)
  
  return fisher_matrices

  def penalty_loss(fisher_matrix, current_weights, prior_weights, lambda_=0.1):
    loss = 0

    for u, v, w in zip(fisher_matrix, current_weights, prior_weights):
        loss += tf.reduce_sum(u*tf.square(v-w))
    
    return 0.5 * lambda_ * loss

In [65]:
from os import listdir

DATASET_NUM_CLASSES = {"twitch": 2, "pubmed": 3, "citeseer": 2}

class ModelEval():
    def __init__(self, dataset="twitch", path="./data/twitch/", print_freq=10):
        super(ModelEval, self).__init__()

        self.n_labels = 2
        dataset = dataset.lower()
        self.print_freq = print_freq
        self.tasks = dict()
    
    def add_task(self, dataset="pubmed", path=None, **kwargs):
        if dataset.lower() == "twitch":
            languages = list(filter(lambda x: "txt" not in x, listdir(path)))
            for lang in languages:
                data, masks = load_task(dataset=dataset, path=path, language=kwargs["language"])
                self.tasks[dataset][lang] = {"data": data, "masks": masks}
        else:
            data, masks = load_task(dataset=dataset)
            self.tasks[dataset] = {"data": data, "masks": masks}
        
        self.n_labels = DATASET_NUM_CLASSES[dataset]
    
    def train(self, dataset, epochs=20, cl=False, with_ewc=False, lr=0.001, lambda_=0.1):
        task = self.tasks[dataset]
        
        # Twitch dataset has multiple sub-tasks
        if language is not None:
            task = task[language]
        
        x, adj, y = task["data"]
        mask_tr, _, _ = task["masks"]

        # initialize model if not already trained
        if not cl:
            self.model = GNNNodeClassifier(n_labels=self.n_labels)
        
        optim = Adam(learning_rate=lr)
        loss_fn = CategoricalCrossentropy()
        accuracy = CategoricalAccuracy()
        
        for i in range(epochs):
            with tf.GradientTape() as tape:
                pred = self.model([x, adj])
                loss = loss_fn(y[mask_tr], pred[mask_tr])
                accuracy.update_state(y_pred=pred, y_true=y)

                # if continual learning and EWC is enabled add l2 loss wrt previous weights
                if cl and with_ewc:
                    loss += penalty_loss(self.fisher_matrix, self.model.get_weights(),
                                         self.prior_weights, lambda_=lambda_)
            
            acc = round(accuracy.result().numpy(), 3)
            
            if i % self.print_freq == 0:
                print(f"epoch={i}, train_loss={loss:.3f}, train_acc={acc:.3f}")
                
        
            gradients = tape.gradient(loss, self.model.trainable_variables)
            optim.apply_gradients(zip(gradients, self.model.trainable_variables))

        self.prior_weights = self.model.get_weights()
        self.fisher_matrix = apx_fisher(self.model, [x, adj])
    
    def test(self, language=None):
        if language is not None:
            task = self.tasks[language]
        else:
            task = self.tasks

        x, adj, y = task["data"]
        _, _, mask_te = task["masks"]


        pred = self.model([x, adj])
        acc = CategoricalAccuracy()(y_pred=pred, y_true=y).numpy()
        print(f"test_acc={acc:.3f}")

        return pred

In [66]:
model_eval = ModelEval(dataset="pubmed")
model_eval.train(epochs=50)
pred = model_eval.test()

#print(round(acc, 3))

Pre-processing node features


  self._set_arrayXarray(i, j, x)


epoch=0, train_loss=1.099, train_acc=0.382
epoch=10, train_loss=1.086, train_acc=0.658
epoch=20, train_loss=1.039, train_acc=0.675
epoch=30, train_loss=0.912, train_acc=0.671
epoch=40, train_loss=0.681, train_acc=0.667
test_acc=0.739


In [362]:
model_eval = ModelEval(dataset="pubmed")
model_eval.train("DE", epochs=100)
print("Accuracy on DE after training on DE: ", model_eval.test("DE"))

model_eval.train("RU", cl=True, with_ewc=False, lambda_=0.1, epochs=100)
print("Accuracy on RU after training on DE+FR w/o EWC: ", model_eval.test("RU"))
print("Accuracy on DE after training on DE+FR w/o EWC: ", model_eval.test("DE"))

Accuracy on DE after training on DE:  0.6042105263157894
Accuracy on RU after training on DE+FR w/o EWC:  0.7548460661345496
Accuracy on DE after training on DE+FR w/o EWC:  0.3957894736842105


In [364]:
model_eval = ModelEval()
model_eval.train("DE", epochs=50)
model_eval.train("RU", cl=True, with_ewc=True, lambda_=0.1, epochs=50)
print("Accuracy on FR after training on DE+FR w EWC: ", model_eval.test("DE"))
print("Accuracy on DE after training on DE+FR w EWC: ", model_eval.test("RU"))

1.0
Accuracy on FR after training on DE+FR w EWC:  0.6047368421052631
1.0
Accuracy on DE after training on DE+FR w EWC:  0.2451539338654504


In [251]:
model_eval = ModelEval()
model_eval.train("FR")
print("Accuracy on FR after training on FR w EWC: ", model_eval.test("RU"))

Accuracy on FR after training on FR w EWC:  0.7548460661345496


In [253]:
print("Accuracy on FR after training on DE+FR w EWC: ", model_eval.test("FR"))

Accuracy on FR after training on DE+FR w EWC:  0.6312977099236641


In [264]:
data, _ = load_task("DE", path="./data/twitch/")

In [269]:
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
pca.fit_transform(data[0]).shape

(9498, 100)

In [56]:
(x, adj, y), (mask_tr, _, mask_te) = load_task(language="DE", path="./data/twitch/", dataset="Twitch")

In [52]:
mask_tr.sum() / len(mask_tr), mask_te.sum() / len(mask_te)

(0.5999157717414193, 0.20004211412929038)

In [66]:
tf.reduce_sum(tf.sparse.to_dense(adj)[0,:])

<tf.Tensor: shape=(), dtype=float64, numpy=0.19624381498210133>

In [366]:
from spektral.datasets.graphsage import GraphSage
x, adj, y, mask_tr, mask_va, mask_te = GraphSage("ppi")

Downloading ppi dataset.


ConnectionError: HTTPConnectionPool(host='snap.stanford.edu', port=80): Max retries exceeded with url: /graphsage/ppi.zip (Caused by NewConnectionError("<urllib3.connection.HTTPConnection object at 0x000001F768E8C940>: Failed to establish a new connection: [WinError 10060] Impossibile stabilire la connessione. Risposta non corretta della parte connessa dopo l'intervallo di tempo oppure mancata risposta dall'host collegato"))

In [373]:
from dataloader import Git

data = Git(path="./data/git_web_ml/")

37700 37700


In [375]:
data[0]

Graph(n_nodes=37700, n_node_features=4004, n_edge_features=None, n_labels=2)