In [106]:
%load_ext autoreload
%autoreload 2

import tensorflow as tf
from sklearn.model_selection import train_test_split
from spektral.layers import GCNConv
from spektral.transforms import LayerPreprocess, AdjToSpTensor
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from models import *
import numpy as np


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [150]:
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

def load_task(dataset="pubmed", n_splits=3, feature_dim=None):
  dataset = dataset.lower()

  if dataset in ["pubmed"]:
    dataset = Citation(dataset, normalize_x=True, transforms=[LayerPreprocess(GCNConv)])

  graph = dataset[0]
  x, adj, y = graph.x, graph.a, graph.y

  # the mask are computed randomly inside the dataloader
  mask_tr, mask_va, mask_te = dataset.mask_tr, dataset.mask_va, dataset.mask_te

  # split dataset for continual learning
  size = x.shape[0]
  idx_target = np.random.randint(low=0, high=n_splits, size=size)
  tasks = dict()

  for i in range(n_splits):
    idx = (idx_target==i)
    #sparse_adj = tf.sparse.from_dense(adj)[idx,idx]
    tasks[i] = (x[idx,:], adj[idx], y[idx,:]), (mask_tr[idx], mask_va[idx], mask_te[idx])

  return tasks

In [156]:
data[2].shape

(6566, 3)

In [151]:
data, masks = load_task()[0]
a = data[1]

Pre-processing node features


  self._set_arrayXarray(i, j, x)


In [117]:
size = data[0].shape[0]
idx_target = np.random.randint(low=0, high=3, size=size)
idx = np.where(idx_target==0)[0]


array([    0,     3,     4, ..., 19699, 19700, 19709], dtype=int64)

In [19]:
def apx_fisher(model, inputs):
  """ Approximate the fisher matrices of the model parameters """
  fisher_matrices = {p: tf.zeros_like(v.value()) for p,v in enumerate(model.trainable_variables)}

  with tf.GradientTape() as tape:
    pred = model(inputs)

    # get loglikelihood of predictions
    loglike = tf.nn.log_softmax(pred)
  
  # compute gradient wrt loglikelihood
  loglike_grads = tape.gradient(loglike, model.trainable_variables)

  # approximate i-th Fisher matrix with gradients squared
  for i,g in enumerate(loglike_grads):
    fisher_matrices[i] += tf.reduce_mean(g**2, axis=0)
  
  return fisher_matrices

  def penalty_loss(fisher_matrix, current_weights, prior_weights, lambda_=0.1):
    loss = 0

    for u, v, w in zip(fisher_matrix, current_weights, prior_weights):
        loss += tf.reduce_sum(u*tf.square(v-w))
    
    return 0.5 * lambda_ * loss

In [96]:
from os import listdir

class ModelEval():
    def __init__(self, dataset="pubmed", print_freq=10):
        super(ModelEval, self).__init__()

        self.n_labels = 2
        dataset = dataset.lower()
        self.print_freq = print_freq
        self.tasks = dict()
    
    def add_task(self, dataset="pubmed"):
        data, masks = load_task(dataset=dataset)
        self.tasks[dataset] = {"data": data, "masks": masks}
        self.n_labels = DATASET_NUM_CLASSES[dataset]
    
    def train_step(self, mask_tr, continual_learning=False, use_ewc=False, lambda_=0.1):
        with tf.GradientTape() as tape:
            pred = self.model([x, adj])
            loss = self.loss_fn(y[mask_tr], pred[mask_tr])
            self.accuracy.update_state(y_pred=pred, y_true=y)

            # if continual learning and EWC is enabled add l2 loss wrt previous weights
            if continual_learning and use_ewc:
                loss += penalty_loss(self.fisher_matrix, self.model.get_weights(),
                                        self.prior_weights, lambda_=lambda_)
            
            gradients = tape.gradient(loss, self.model.trainable_variables)
            self.optim.apply_gradients(zip(gradients, self.model.trainable_variables))
            
        return round(self.accuracy.result().numpy(), 3), loss
    
    def train(self, dataset="pubmed", epochs=20, continual_learning=False, use_ewc=False,
              lr=0.001, lambda_=0.1, verbose=True):
        task = self.tasks[dataset]
        x, adj, y = task["data"]
        mask_tr, _, _ = task["masks"]

        # initialize model if not already trained
        if not continual_learning:
            self.model = GNNNodeClassifier(n_labels=self.n_labels)
            self.optim = Adam(learning_rate=lr)
            self.loss_fn = CategoricalCrossentropy()
        
        self.accuracy = CategoricalAccuracy()
        
        for i in range(epochs):
            acc, loss = self.train_step(mask_tr=mask_tr, continual_learning=continual_learning,
                                        use_ewc=use_ewc, lambda_=lambda_)
            
            if i % self.print_freq == 0 and verbose:
                print(f"epoch={i}, train_loss={loss:.3f}, train_acc={acc:.3f}")

        self.prior_weights = self.model.get_weights()
        self.fisher_matrix = apx_fisher(self.model, [x, adj])
    
    def test(self, dataset="pubmed", verbose=True):
        task = self.tasks[dataset]
        x, adj, y = task["data"]
        _, _, mask_te = task["masks"]


        pred = self.model([x, adj])
        acc = CategoricalAccuracy()(y_pred=pred[mask_te], y_true=y[mask_te]).numpy()

        if verbose:
            print(f"test_acc={acc:.3f}")

        return acc

In [101]:
data, masks = load_task("citeseer")
print(data[2].shape)

Pre-processing node features


  r_inv = np.power(rowsum, -1).flatten()
  self._set_arrayXarray(i, j, x)


(3327, 6)


In [98]:
model_eval = ModelEval()
dataset = "pubmed"
model_eval.add_task(dataset)
model_eval.train(dataset, epochs=50, verbose=False)
pubmed_acc = model_eval.test(dataset, verbose=False)

model_eval = ModelEval()
dataset = "citeseer"
model_eval.add_task(dataset)
model_eval.train(dataset, epochs=50, verbose=False)
citeseer_acc = model_eval.test(dataset, verbose=False)

print(f"Pubmed test_acc={pubmed_acc}")
print(f"Citeseer test_acc={citeseer_acc}")

Pre-processing node features


  self._set_arrayXarray(i, j, x)


Pre-processing node features


  r_inv = np.power(rowsum, -1).flatten()
  self._set_arrayXarray(i, j, x)


IndexError: boolean index did not match indexed array along dimension 0; dimension is 19717 but corresponding boolean dimension is 3327

In [None]:
dataset = "citeseer",
model_eval = ModelEval()
model_eval.add_task(dataset)
model_eval.train(dataset=dataset, epochs=50)
acc = model_eval.test(dataset)

In [95]:
model_eval.add_task(dataset="citeseer")

Pre-processing node features


  r_inv = np.power(rowsum, -1).flatten()
  self._set_arrayXarray(i, j, x)


In [362]:
model_eval.train("citeseer", epochs=50, continual_learning=True)
print("Accuracy after training on Pubmed and Cite")
print("Accuracy on Citeseer after training on Pubmed+Citeseer: ", model_eval.test("citeseer"))
print("Accuracy on Pubmed")

model_eval.train("RU", cl=True, with_ewc=False, lambda_=0.1, epochs=100)
print("Accuracy on RU after training on DE+FR w/o EWC: ", model_eval.test("RU"))
print("Accuracy on DE after training on DE+FR w/o EWC: ", model_eval.test("DE"))

Accuracy on DE after training on DE:  0.6042105263157894
Accuracy on RU after training on DE+FR w/o EWC:  0.7548460661345496
Accuracy on DE after training on DE+FR w/o EWC:  0.3957894736842105


In [364]:
model_eval = ModelEval()
model_eval.train("DE", epochs=50)
model_eval.train("RU", cl=True, with_ewc=True, lambda_=0.1, epochs=50)
print("Accuracy on FR after training on DE+FR w EWC: ", model_eval.test("DE"))
print("Accuracy on DE after training on DE+FR w EWC: ", model_eval.test("RU"))

1.0
Accuracy on FR after training on DE+FR w EWC:  0.6047368421052631
1.0
Accuracy on DE after training on DE+FR w EWC:  0.2451539338654504


In [251]:
model_eval = ModelEval()
model_eval.train("FR")
print("Accuracy on FR after training on FR w EWC: ", model_eval.test("RU"))

Accuracy on FR after training on FR w EWC:  0.7548460661345496


In [253]:
print("Accuracy on FR after training on DE+FR w EWC: ", model_eval.test("FR"))

Accuracy on FR after training on DE+FR w EWC:  0.6312977099236641


In [264]:
data, _ = load_task("DE", path="./data/twitch/")

In [269]:
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
pca.fit_transform(data[0]).shape

(9498, 100)

In [56]:
(x, adj, y), (mask_tr, _, mask_te) = load_task(language="DE", path="./data/twitch/", dataset="Twitch")

In [52]:
mask_tr.sum() / len(mask_tr), mask_te.sum() / len(mask_te)

(0.5999157717414193, 0.20004211412929038)

In [66]:
tf.reduce_sum(tf.sparse.to_dense(adj)[0,:])

<tf.Tensor: shape=(), dtype=float64, numpy=0.19624381498210133>

In [366]:
from spektral.datasets.graphsage import GraphSage
x, adj, y, mask_tr, mask_va, mask_te = GraphSage("ppi")

Downloading ppi dataset.


ConnectionError: HTTPConnectionPool(host='snap.stanford.edu', port=80): Max retries exceeded with url: /graphsage/ppi.zip (Caused by NewConnectionError("<urllib3.connection.HTTPConnection object at 0x000001F768E8C940>: Failed to establish a new connection: [WinError 10060] Impossibile stabilire la connessione. Risposta non corretta della parte connessa dopo l'intervallo di tempo oppure mancata risposta dall'host collegato"))

In [373]:
from dataloader import Git

data = Git(path="./data/git_web_ml/")

37700 37700


In [375]:
data[0]

Graph(n_nodes=37700, n_node_features=4004, n_edge_features=None, n_labels=2)