<a href="https://colab.research.google.com/github/markkod/pc3-enzymes/blob/exploratory_da/task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# The preinstalled PyTorch version has to match the version that the pre-compiled
# pytorch-geometric packages (downloaded in a later cell) were built for.
# Versions on Google Colab change quickly, so if a strange error shows up later on,
# check the installed torch version as follows:

import torch
torch.__version__

'1.7.0+cu101'

In [4]:
# Enable IPython's autoreload extension so that edited modules are re-imported
# automatically instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

In [5]:
# Environment setup for generating variations of the kernels yourself.
# Original script: https://ucloud.univie.ac.at/index.php/s/E3YKph0jkpbw8TN


# Download the TUDataset repository.
!git clone https://github.com/chrsmrrs/tudataset.git
# When using the standalone script instead of this notebook, move it to tudataset/tud_benchmark.

# Install PyTorch Geometric: https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html
# These are the GPU wheels (cu101) from the torch-1.7.0 index; for the CPU version replace cu101 with cpu.
%pip --no-cache-dir install torch-scatter==latest+cu101 -f https://pytorch-geometric.com/whl/torch-1.7.0.html
%pip --no-cache-dir install torch-sparse==latest+cu101 -f https://pytorch-geometric.com/whl/torch-1.7.0.html
%pip --no-cache-dir install torch-cluster==latest+cu101 -f https://pytorch-geometric.com/whl/torch-1.7.0.html
%pip --no-cache-dir install torch-spline-conv==latest+cu101 -f https://pytorch-geometric.com/whl/torch-1.7.0.html
# NOTE(review): torch-geometric (and the "latest" wheels above) are not pinned to a
# version, so a re-run may install a different release than the one used here.
%pip --no-cache-dir install torch-geometric

# pybind11 and the Eigen headers are used by the g++ build of kernel_baselines in a later cell.
# NOTE(review): apt-get is called without -y; confirm it does not stop at a confirmation prompt.
%pip --no-cache-dir install pybind11
!sudo apt-get install libeigen3-dev



Cloning into 'tudataset'...
remote: Enumerating objects: 485, done.[K
remote: Counting objects: 100% (485/485), done.[K
remote: Compressing objects: 100% (366/366), done.[K
remote: Total 3344 (delta 244), reused 300 (delta 114), pack-reused 2859[K
Receiving objects: 100% (3344/3344), 8.47 MiB | 22.59 MiB/s, done.
Resolving deltas: 100% (2371/2371), done.
Looking in links: https://pytorch-geometric.com/whl/torch-1.7.0.html
Collecting torch-scatter==latest+cu101
[?25l  Downloading https://pytorch-geometric.com/whl/torch-1.7.0/torch_scatter-latest%2Bcu101-cp36-cp36m-linux_x86_64.whl (11.9MB)
[K     |████████████████████████████████| 11.9MB 10.9MB/s 
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.0.5
Looking in links: https://pytorch-geometric.com/whl/torch-1.7.0.html
Collecting torch-sparse==latest+cu101
[?25l  Downloading https://pytorch-geometric.com/whl/torch-1.7.0/torch_sparse-latest%2Bcu101-cp36-cp36m-linux_x86_64.whl (24.3MB)
[K    

In [11]:
# Build the C++ kernel baselines as a Python extension module.
# NOTE(review): this first `%cd ..` is followed directly by a change to an absolute
# path, so it looks redundant.
%cd ..
%cd /content/tudataset/tud_benchmark/kernel_baselines/
! ls
# Compile kernel_baselines.cpp together with src/*cpp into a shared library, using the
# Eigen and pybind11 include paths; the output is written one directory up as
# kernel_baselines<extension-suffix>, next to the tud_benchmark Python code.
! g++ -I /usr/include/eigen3 -O3 -shared -std=c++11 -fPIC `python3 -m pybind11 --includes`  kernel_baselines.cpp src/*cpp -o ../kernel_baselines`python3-config --extension-suffix`
# Move up to tud_benchmark, where the compiled module was placed.
%cd ..

/content/tudataset
/content/tudataset/tud_benchmark/kernel_baselines
kernel_baselines.cpp  src
/content/tudataset/tud_benchmark


In [12]:
# Show which CUDA toolkit directory /usr/local/cuda points to.
!ls -al /usr/local/cuda

lrwxrwxrwx 1 root root 9 Dec 21 17:24 /usr/local/cuda -> cuda-10.1


In [53]:
import os
import torch
import numpy as np
import kernel_baselines as kb
import auxiliarymethods
from auxiliarymethods import datasets as dp
from scipy.sparse import save_npz
from scipy.sparse import load_npz
from auxiliarymethods import auxiliary_methods as aux
from auxiliarymethods import kernel_evaluation as ke

In [14]:
def setup_directory(dir_name, verbose=False):
    """
    Create a directory (including missing parent directories) if it does not exist yet.

    Parameters:
    -------------
    dir_name: str, path + name to directory
    verbose: bool, indicates whether directory creation should be printed or not.

    Raises:
    -------------
    RuntimeError: if the directory could not be created; the underlying
        exception is attached as the cause.
    """
    if os.path.exists(dir_name):
        # The path is already there; leave it untouched.
        return
    try:
        # exist_ok=True avoids a spurious failure when the directory appears
        # between the existence check above and this call.
        os.makedirs(dir_name, exist_ok=True)
    except Exception as e:
        # Chain the original error so the full traceback stays available.
        raise RuntimeError(
            "Could not create directory: {}\n {}".format(dir_name, e)) from e
    if verbose:
        print("Created Directory: {}".format(dir_name))


In [16]:
# Edge labels are switched off for every kernel computed in this cell.
use_edge_labels = False

# Run the whole pipeline twice: first with node labels, then without them.
# NOTE(review): the original comment here read "Except IMDB-BINARY"; only ENZYMES
# is processed, so it is presumably a leftover from a multi-dataset version.
for USE_LABELS in (True, False):
    for dataset, use_labels in [("ENZYMES", USE_LABELS)]:
        # Each label setting gets its own output directory below "kernels".
        label_dir = "node_labels" if use_labels else "without_labels"
        base_path = os.path.join("kernels", label_dir)
        setup_directory(base_path)

        print("Start processing data set ", dataset)
        # Download dataset.
        classes = dp.get_dataset(dataset)

        # --- Weisfeiler-Lehman subtree kernel ---
        print("Start computing Weisfeihler-Lehman gram matrix and vector representations")
        # range(1, iterations) stops before `iterations`, so i takes the values 1..5.
        # 0 would take just the node labels themselves into account,
        # 1 considers nearest neighbours, 2 one layer deeper and so on.
        iterations = 6
        for i in range(1, iterations):
            print("Start iteration ", i)

            # Dense gram matrix, stored as a semicolon-separated CSV file.
            wl_gram_file = os.path.join(base_path, f"{dataset}_gram_matrix_wl{i}.csv")
            gram_matrix_wl = kb.compute_wl_1_dense(dataset, i, use_labels, use_edge_labels)
            np.savetxt(wl_gram_file, gram_matrix_wl, delimiter=";")

            # Sparse vector representation, stored as a compressed .npz file.
            wl_vectors_file = os.path.join(base_path, f"{dataset}_vectors_wl{i}.npz")
            vectors_wl = kb.compute_wl_1_sparse(dataset, i, use_labels, use_edge_labels)
            save_npz(wl_vectors_file, vectors_wl, compressed=True)

        # --- Graphlet kernel ---
        print("Start computing Graphlet gram matrix")
        graphlet_gram_file = os.path.join(base_path, f"{dataset}_gram_matrix_graphlet.csv")
        gram_matrix_graphlet = kb.compute_graphlet_dense(dataset, use_labels, use_edge_labels)
        np.savetxt(graphlet_gram_file, gram_matrix_graphlet, delimiter=";")

        print("Start computing Graphlet vector representation")
        graphlet_vectors_file = os.path.join(base_path, f"{dataset}_vectors_graphlet.npz")
        vectors_graphlet = kb.compute_graphlet_sparse(dataset, use_labels, use_edge_labels)
        save_npz(graphlet_vectors_file, vectors_graphlet, compressed=True)

        # --- Shortest path kernel (takes no edge-label flag) ---
        print("Start computing Shortest path gram matrix")
        sp_gram_file = os.path.join(base_path, f"{dataset}_gram_matrix_shortestpath.csv")
        gram_matrix_shortestpath = kb.compute_shortestpath_dense(dataset, use_labels)
        np.savetxt(sp_gram_file, gram_matrix_shortestpath, delimiter=";")

        print("Start computing Shortest path vector representation")
        sp_vectors_file = os.path.join(base_path, f"{dataset}_vectors_shortestpath.npz")
        vectors_shortestpath = kb.compute_shortestpath_sparse(dataset, use_labels)
        save_npz(sp_vectors_file, vectors_shortestpath, compressed=True)



Start processing data set  ENZYMES
Start computing Weisfeihler-Lehman gram matrix and vector representations
Start iteration  1
Start iteration  2
Start iteration  3
Start iteration  4
Start iteration  5
Start computing Graphlet gram matrix
Start computing Graphlet vector representation
Start computing Shortest path gram matrix
Start computing Shortest path vector representation
Start processing data set  ENZYMES
Start computing Weisfeihler-Lehman gram matrix and vector representations
Start iteration  1
Start iteration  2
Start iteration  3
Start iteration  4
Start iteration  5
Start computing Graphlet gram matrix
Start computing Graphlet vector representation
Start computing Shortest path gram matrix
Start computing Shortest path vector representation


In [54]:
def find_keys_with_condition(data, cond):
    """Return a list of the keys of `data` that contain `cond`, in key order."""
    return [key for key in data.keys() if cond in key]

def load_data(base_dir='/content/tudataset/tud_benchmark/kernels/node_labels',
              dataset='ENZYMES'):
  """
  Load the stored gram matrices and sparse vector files of one data set.

  Files are expected at `<base_dir>/<dataset>_<type>_<algo>.<ext>`, where
  gram matrices are semicolon-separated `.csv` files and vectors are `.npz`
  files. The defaults point at the ENZYMES node-label kernels in the Colab
  checkout, so `load_data()` behaves as before.

  Parameters:
  -------------
  base_dir: str, directory containing the kernel files
  dataset: str, data set name used as file name prefix

  Returns:
  -------------
  dict of the form {'gram_matrix': {...}, 'vectors': {...}}; each inner dict
  maps 'wl', 'shortestpath' and 'graphlet' to a list of loaded objects. The
  five WL iterations (wl1..wl5) are collected, in order, under the single
  key 'wl'.
  """
  # Each representation type is stored with its own file extension.
  formats = [('gram_matrix', 'csv'), ('vectors', 'npz')]
  algos = ['wl1', 'wl2', 'wl3', 'wl4', 'wl5', 'shortestpath', 'graphlet']

  result = {}
  for t, e in formats:
    result[t] = {}
    for a in algos:
      # wl1..wl5 all belong to the same kernel family.
      algo_name = 'wl' if 'wl' in a else a
      file_name = os.path.join(base_dir, '{0}_{1}_{2}.{3}'.format(dataset, t, a, e))

      if e == 'csv':
        f = np.loadtxt(file_name, delimiter=';')
      else:
        f = load_npz(file_name)

      result[t].setdefault(algo_name, []).append(f)
  return result

In [55]:
# Load all stored kernel files into one nested dict and display it.
# NOTE(review): the bare `data` expression prints every array; consider showing
# only the keys/shapes to keep the output short.
data = load_data()
data

{'gram_matrix': {'graphlet': [array([[11177.,  6596.,  4333., ..., 11081., 13908., 10164.],
          [ 6596.,  4879.,  3005., ...,  4882.,  5786.,  3278.],
          [ 4333.,  3005.,  2132., ...,  2814.,  3236.,  1674.],
          ...,
          [11081.,  4882.,  2814., ..., 20397., 22060., 18291.],
          [13908.,  5786.,  3236., ..., 22060., 28850., 24883.],
          [10164.,  3278.,  1674., ..., 18291., 24883., 22359.]])],
  'shortestpath': [array([[ 73924.,  30546.,  35514., ...,  79166., 117398.,  96672.],
          [ 30546.,  15744.,  16882., ...,  36894.,  56388.,  46384.],
          [ 35514.,  16882.,  21316., ...,  33136.,  54940.,  44536.],
          ...,
          [ 79166.,  36894.,  33136., ..., 316796., 227164., 199264.],
          [117398.,  56388.,  54940., ..., 227164., 337252., 289668.],
          [ 96672.,  46384.,  44536., ..., 199264., 289668., 256200.]])],
  'wl': [array([[ 874.,  502.,  585., ...,  973.,  998.,  935.],
          [ 502.,  360.,  362., ...,  59

In [64]:
def eval_kernel(kernel, classes, mode, n_reps=10, all_std=True):
  """
  Normalize a list of kernel representations and evaluate them with an SVM.

  Parameters:
  -------------
  kernel: list of gram matrices or feature-vector matrices (one per variant)
  classes: class labels, passed through to the evaluation routine
  mode: str, 'LINEAR' selects feature-vector normalization and the linear SVM
        evaluation; any other value selects gram-matrix normalization and the
        kernel SVM evaluation
  n_reps: int, forwarded as `num_repetitions`
  all_std: bool, forwarded as `all_std`

  Returns:
  -------------
  Whatever the selected `ke.*_svm_evaluation` function returns.
  """
  use_linear = mode == 'LINEAR'

  print(f'Starting normalization of {len(kernel)} elements...')
  # Pick the normalizer once instead of branching on every element.
  normalize = aux.normalize_feature_vector if use_linear else aux.normalize_gram_matrix
  normalized = [normalize(array) for array in kernel]
  print(f'Normalization finished, starting {mode} SVM...')

  evaluate = ke.linear_svm_evaluation if use_linear else ke.kernel_svm_evaluation
  return evaluate(normalized, classes, num_repetitions=n_reps, all_std=all_std)

def eval_all(data, dataset='ENZYMES'):
  """
  Evaluate every kernel in `data` with an SVM and collect the results.

  Parameters:
  -------------
  data: dict of the form {data_type: {kernel_name: [representations, ...]}};
        entries under the key 'vectors' are evaluated in 'LINEAR' mode, all
        other data types in 'KERNEL' mode
  dataset: str, name of the data set whose class labels are fetched via
           `dp.get_dataset`; defaults to 'ENZYMES'

  Returns:
  -------------
  dict with the same two-level key structure as `data`, holding the value
  returned by `eval_kernel` for each kernel.
  """
  classes = dp.get_dataset(dataset)
  result = {}
  for data_type, kernels in data.items():
    mode = 'LINEAR' if data_type == 'vectors' else 'KERNEL'
    scores = result[data_type] = {}
    print('MODE:', mode)
    for kernel, representations in kernels.items():
      print(f'Evaluating {kernel} SVM...')
      scores[kernel] = eval_kernel(representations, classes, mode)
      print(f'{data_type}-{kernel} : {scores[kernel]}')
  return result



In [65]:
# Run the SVM evaluation for every loaded kernel; the returned result dict is displayed as the cell output.
eval_all(data)

MODE: KERNEL
Evaluating wl SVM...
Starting normalization of 5 elements...
Normalization finished, starting KERNEL SVM...
gram_matrix-wl : (50.33333333333333, 1.1279282877125771, 6.411794687223782)
Evaluating shortestpath SVM...
Starting normalization of 1 elements...
Normalization finished, starting KERNEL SVM...
gram_matrix-shortestpath : (41.15, 1.131493604832911, 6.1540185606191065)
Evaluating graphlet SVM...
Starting normalization of 1 elements...
Normalization finished, starting KERNEL SVM...
gram_matrix-graphlet : (30.56666666666667, 0.727247474309049, 5.422279143599222)
MODE: LINEAR
Evaluating wl SVM...
Starting normalization of 5 elements...
Normalization finished, starting LINEAR SVM...
vectors-wl : (51.36666666666666, 1.2106013198223256, 6.567512635863157)
Evaluating shortestpath SVM...
Starting normalization of 1 elements...
Normalization finished, starting LINEAR SVM...
vectors-shortestpath : (40.06666666666667, 1.2498888839501803, 6.337367136455188)
Evaluating graphlet SVM

  return self.astype(np.float_)._mul_scalar(1./other)


Normalization finished, starting LINEAR SVM...
vectors-graphlet : (30.85, 0.8214147686901061, 5.619880386232038)


{'gram_matrix': {'graphlet': (30.56666666666667,
   0.727247474309049,
   5.422279143599222),
  'shortestpath': (41.15, 1.131493604832911, 6.1540185606191065),
  'wl': (50.33333333333333, 1.1279282877125771, 6.411794687223782)},
 'vectors': {'graphlet': (30.85, 0.8214147686901061, 5.619880386232038),
  'shortestpath': (40.06666666666667, 1.2498888839501803, 6.337367136455188),
  'wl': (51.36666666666666, 1.2106013198223256, 6.567512635863157)}}