In [1]:
from torch_geometric.datasets import Planetoid
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
import numpy as np
import copy
import time 
import matplotlib.pyplot as plt
import pickle # Lokales Speichern von Objekten

from GNM_Toolbox.tools.tools import *
from GNM_Toolbox.gnm import *
from GNM_Toolbox.data.dataloader import *

# Load the Cora dataset through the toolbox loader. load_dataset is not defined in
# this notebook; presumably it is provided by one of the GNM_Toolbox star imports above.
dataset = load_dataset('Cora')

Found dataset on harddrive.


In [2]:
def find_each_nearest(target_list, out_list):
    """Find, for every target value, the index of the nearest entry of ``out_list``.

    Given a ``target_list`` ``(a_0, a_1, a_2, ...)`` and an ``out_list``
    ``((b_0, x), (b_1, x), (b_2, x), ...)``, the returned list ``l`` of indices
    is such that ``abs(target_list[i] - out_list[l[i]][0])`` is minimal for
    every ``i < len(target_list)``.

    Both sequences are expected to be sorted in ascending order
    (``out_list`` by the first component of its entries). When two
    neighbouring entries are equally close to a target, the later one wins.

    Args:
        target_list: Sorted sequence of numbers to approximate.
        out_list: Sorted sequence of tuples whose first component is compared
            against the targets.

    Returns:
        A list with one index into ``out_list`` per element of ``target_list``.

    Raises:
        ValueError: If ``out_list`` is empty while ``target_list`` is not.
    """
    if len(target_list) == 0:
        # Nothing to match; previously this case failed with an IndexError.
        return []
    n_candidates = len(out_list)
    if n_candidates == 0:
        raise ValueError('out_list must not be empty when targets are given')

    result = []
    j = 0
    for target in target_list:
        # Because both sequences are sorted, the best index never moves
        # backwards, so j is carried over from one target to the next.
        # The bounds check also covers an out_list with a single entry.
        while (j + 1 < n_candidates
               and abs(target - out_list[j][0]) >= abs(target - out_list[j + 1][0])):
            j += 1
        result.append(j)
    return result
            
def get_best_values_indices(targets, lambdas):
    """Return, per target, the index of the closest entry of ``lambdas``.

    ``lambdas`` is sorted IN PLACE by the first component of its entries
    before the lookup, so the returned indices refer to the sorted list that
    the caller holds afterwards.
    """
    def first_component(entry):
        return entry[0]

    lambdas.sort(key=first_component)
    nearest_indices = find_each_nearest(targets, lambdas)
    return nearest_indices

In [3]:
def h(x):
    """Map each row of ``x`` to ``exp(s/13 - 4) - (s - 15)/15`` with ``s`` the row sum.

    Args:
        x: 2-D tensor; the sum is taken over ``dim=1``.

    Returns:
        1-D tensor with one value per row of ``x``.
    """
    row_sums = torch.sum(x, dim=1)
    scale, shift, centre, width = 13, 4, 15, 15
    exponential_part = torch.exp(row_sums / scale - shift)
    linear_part = (row_sums - centre) / width
    return exponential_part - linear_part
    
def pi_test(X, y):
    """Evaluate ``sigmoid(-log(35) + h(X) + 1.6 * y)`` element-wise.

    Args:
        X: Feature tensor passed on to ``h``.
        y: Tensor of labels, one per row of ``X``.

    Returns:
        Tensor of values in (0, 1), one per row of ``X``.
    """
    intercept = -torch.log(torch.tensor(35.))
    weight_h = 1
    weight_y = 1.6
    logits = intercept + weight_h * h(X) + weight_y * y
    return torch.sigmoid(logits)

def pi_complicated(X, y):
    """Evaluate ``1 / (1 + 35 * exp(h - 1.6 * y))`` per row of ``X``.

    ``h`` is ``exp(s/13 - 4) - (s - 15)/15`` with ``s`` the row sum of ``X``.
    The computation stays entirely in torch (``torch.exp`` instead of
    ``np.exp``), so it also works for CUDA tensors and for tensors that
    require gradients, where the implicit tensor-to-NumPy conversion fails.

    NOTE(review): ``pi_test`` in this notebook corresponds to ``exp(-h - 1.6*y)``
    in the denominator, i.e. both terms enter with the opposite sign compared
    to this function. The signs here are kept exactly as they were; confirm
    which variant is intended.

    Args:
        X: 2-D feature tensor.
        y: 1-D tensor of labels, one per row of ``X``.

    Returns:
        1-D tensor of values in (0, 1), one per row of ``X``.
    """
    row_sums = torch.sum(X, dim=1)
    h_values = torch.exp(row_sums / 13 - 4) - (row_sums - 15) / 15
    pi = 1 / (1 + 35 * torch.exp(h_values - 1.6 * y))
    return pi

def pi_simple(x, y):
    """Evaluate ``sigmoid(0 + 1 * y)`` element-wise; ``x`` is accepted but not used.

    Args:
        x: Ignored; present so the signature matches the other pi functions.
        y: Tensor of labels.

    Returns:
        Tensor with the sigmoid of each label.
    """
    intercept, slope = 0, 1
    logits = intercept + slope * y
    return torch.sigmoid(logits)

def create_mask_from_pi(data, pi):
    """Draw a random boolean mask with inclusion probabilities ``pi(data.x, data.y)``.

    Each entry of the mask is an independent Bernoulli draw (via the global
    NumPy random state) whose success probability is the corresponding entry
    of ``pi(data.x, data.y)``.

    Args:
        data: Object exposing the attributes ``x`` and ``y``.
        pi: Callable ``pi(x, y)`` returning one probability per element
            (a torch tensor or anything NumPy can convert to an array).

    Returns:
        1-D ``torch.bool`` tensor (on the CPU) that is ``True`` where the draw succeeded.
    """
    p = pi(data.x, data.y)
    if torch.is_tensor(p):
        # NumPy can neither read CUDA tensors nor tensors that track gradients,
        # so detach the probabilities and copy them to host memory explicitly.
        p = p.detach().cpu().numpy()
    else:
        p = np.asarray(p)
    draws = np.random.binomial(size=p.shape[0], n=1, p=p)
    return torch.tensor(draws == 1, dtype=torch.bool)

def split_known_mask_into_val_and_train_mask(known, ratio=0.8):
    """Randomly distribute the ``True`` entries of ``known`` over two disjoint masks.

    Every ``True`` entry is assigned to the training mask with probability
    ``ratio`` (one draw from the global NumPy random state per entry, in index
    order) and to the validation mask otherwise.

    Args:
        known: 1-D tensor marking the entries that may be used.
        ratio: Probability that a known entry ends up in the training mask.

    Returns:
        Tuple ``(val_mask, train_mask)`` of boolean tensors shaped like
        ``known`` - note that the validation mask comes first.
    """
    train_mask = torch.zeros_like(known, dtype=torch.bool)
    val_mask = torch.zeros_like(known, dtype=torch.bool)
    for position, is_known in enumerate(known.tolist()):
        if not (is_known == True):
            continue
        goes_to_train = np.random.binomial(1, ratio) == 1
        chosen_mask = train_mask if goes_to_train else val_mask
        chosen_mask[position] = True
    return val_mask, train_mask

def calculate_lambda(train_mask, y):
    """Return the ratio (#labels equal to 1) / (#labels equal to 0) among ``y[train_mask]``.

    Labels other than 0 and 1 are ignored.

    Raises:
        ZeroDivisionError: If no selected label equals 0.
    """
    selected_labels = y[train_mask].tolist()
    count_class_0 = sum(1 for label in selected_labels if label == 0)
    count_class_1 = sum(1 for label in selected_labels if label == 1)
    return count_class_1 / count_class_0

def time_to_string(i):
    """Format a duration given in seconds (float) as a short human-readable string.

    Durations of an hour or more are shown as ``'Xh Ymin Zsec'``, durations of
    a minute or more as ``'Ymin Zsec'``, and anything shorter as seconds with
    two decimals.
    """
    hours = int(i / 3600)
    remainder = i - hours * 3600
    minutes = int(remainder / 60)
    remainder = remainder - minutes * 60
    if hours > 0:
        return '{}h {}min {}sec'.format(hours, minutes, int(remainder))
    if minutes > 0:
        return '{}min {}sec'.format(minutes, int(remainder))
    return '{:.2f}sec'.format(remainder)

def print_status(i, N, starttime = None):
    """Print a one-line progress bar that overwrites itself (line ends with ``\\r``).

    Args:
        i: Zero-based index of the current iteration; assumed to run from 0 to N-1.
        N: Total number of iterations.
        starttime: Optional ``time.time()`` timestamp taken before the first
            iteration. When given, the line additionally shows
            ``(time spent|estimated time to go|time per iteration)``.
    """
    # Length of the progress bar in characters
    l = 20
    # With a single iteration, i/(N-1) would divide by zero; that one iteration
    # is also the last one, so the bar is shown completely filled.
    done = l if N <= 1 else int(i/(N-1) * l)
    counterstring = '({}/{})'.format(i+1, N)
    barstring = '|'+u'\u2588'*done + ' '*(l-done)+'|'
    if starttime is None:
        print('{} {}'.format(counterstring, barstring), end='\r')
    else:
        time_spent = time.time() - starttime
        iterations_done = i
        iterations_to_go = N-i-1
        # No iteration has finished yet at i == 0, so there is no estimate.
        if iterations_done == 0:
            time_per_iteration = 0
        else:
            time_per_iteration = time_spent/iterations_done
        time_to_go = time_per_iteration * iterations_to_go

        time_string = '({}|{}|{})'.format(time_to_string(time_spent), time_to_string(time_to_go), time_to_string(time_per_iteration))

        print('{} {} {}'.format(counterstring, barstring, time_string), end='\r')
        
def print_double_status(i, N, j, M, starttime = None):
    """Print a one-line, space-padded progress bar that overwrites itself.

    Args:
        i: Zero-based index of the outer iteration; assumed to run from 0 to N-1.
        N: Total number of outer iterations.
        j: Index of the inner iteration (0 to M-1); accepted but not used by
            the current implementation.
        M: Total number of inner iterations; accepted but not used by the
            current implementation.
        starttime: Optional ``time.time()`` timestamp taken before the first
            iteration. When given, the line additionally shows
            ``(time spent|estimated time to go|time per iteration)``.
    """
    # Length of the progress bar in characters
    l = 20
    # With a single outer iteration, i/(N-1) would divide by zero; that one
    # iteration is also the last one, so the bar is shown completely filled.
    done = l if N <= 1 else int(i/(N-1) * l)
    counterstring = '({}/{})'.format(i+1, N)
    barstring = '|'+u'\u2588'*done + ' '*(l-done)+'|'
    if starttime is None:
        print('{} {}               '.format(counterstring, barstring), end='\r')
    else:
        time_spent = time.time() - starttime
        # The current iteration is counted as done here, so the divisor is never zero.
        iterations_done = i+1
        iterations_to_go = N-i-1
        time_per_iteration = time_spent/iterations_done
        time_to_go = time_per_iteration * iterations_to_go

        time_string = '({}|{}|{})             '.format(time_to_string(time_spent), time_to_string(time_to_go), time_to_string(time_per_iteration))

        print('{} {} {}               '.format(counterstring, barstring, time_string), end='\r')
    

In [4]:
# Set up data: take the first graph of the dataset and move it to the GPU if one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = dataset[0].to(device)
print(type(data))
data.num_classes = 2
# Binarise the labels: original class 3 becomes class 1, every other class becomes class 0
y = (data.y == 3).to(data.y.dtype)
data.y = y
print(type(data))

<class 'torch_geometric.data.data.Data'>
<class 'torch_geometric.data.data.Data'>


In [None]:
# Find masks such that lambda has specific values
targets = [1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7]
worst_allowed_diff = 0.005
worst_diff = 1

# masks[t] holds the best candidate found so far for target t as a tuple
# (lambda, train_mask, val_mask); None until the first candidate was drawn.
masks = {t: None for t in targets}

# Keep drawing random masks until every target is matched within worst_allowed_diff.
# NOTE(review): no random seed is set, so the masks found differ from run to run.
i = 0
while worst_diff > worst_allowed_diff:
    i += 1
    known_mask = create_mask_from_pi(data, pi_complicated)
    val_mask, train_mask = split_known_mask_into_val_and_train_mask(known_mask)
    l = calculate_lambda(train_mask, data.y)

    # Update masks: the new candidate replaces the stored one wherever it is closer to the target
    for t in targets:
        best = masks[t]
        if best is None or abs(best[0] - t) > abs(l - t):
            masks[t] = (l, train_mask, val_mask)

    # Update worst_diff: largest remaining distance between a target and its best lambda
    worst_diff = max(abs(t - masks[t][0]) for t in targets)
    print('At iteration {:6} worst_diff is {:.4f}'.format(i, worst_diff), end='\r')

# Save masks. The file is overwritten ('wb') rather than appended to: the cell that
# reads masks.pkl calls pickle.load once and therefore only sees the first object in
# the file, so appending would leave it with stale masks after a re-run.
with open('masks.pkl', 'wb') as output:
    pickle.dump(masks, output, pickle.HIGHEST_PROTOCOL)

In [5]:
# Load the stored masks and keep only those belonging to the selected target values
subset = [1, 1.2, 1.5, 1.7, 2, 2.2, 2.5]
with open('masks.pkl', 'rb') as instream:
    all_masks = pickle.load(instream)
masks = dict((k, all_masks[k]) for k in subset)

In [6]:
# Real Data analysis with GNM and SM
N = 120
M = len(masks)
t_0 = time.time()

all_models = dict()

# Iterate over the masks. The dictionary key is deliberately not bound to a loop
# variable named `h`: that would overwrite the function h() defined earlier in this
# notebook, which pi_test() calls.
for i, (var_lambda, train_mask, val_mask) in enumerate(masks.values()):
    gnm_models = list()
    sm_models = list()

    # Train N models of each kind; [1:] drops the first element of each training result
    for j in range(N):
        print_status(i*N + j, N*M, t_0)
        gnm_models.append(train_net_with_gnm(data, train_mask, val_mask)[1:])
        sm_models.append(train_one_net(data, train_mask, val_mask)[1:])
    all_models[var_lambda] = (gnm_models, sm_models)

# NOTE(review): 'ab' appends one more pickled object on every run; a single
# pickle.load only returns the first one - confirm that the reader expects this.
with open('real-data-results.pkl', 'ab') as output:
    pickle.dump(all_models, output, pickle.HIGHEST_PROTOCOL)

(840/840) |████████████████████| (3h 49min 33sec|0.00sec|16.42sec))))))))

In [None]:
# Real Data analysis with SM and SM5
N = 30
M = len(masks)
t_0 = time.time()

all_models = dict()

# Iterate over the masks. The dictionary key is deliberately not bound to a loop
# variable named `h`: that would overwrite the function h() defined earlier in this
# notebook, which pi_test() calls.
for i, (var_lambda, train_mask, val_mask) in enumerate(masks.values()):
    sm5_models = list()
    sm_models = list()

    # Train N models of each kind
    for j in range(N):
        print_status(i*N + j, N*M, t_0)
        sm5_models.append(train_one_net(data, train_mask, val_mask))
        sm_models.append(train_net(data, train_mask, val_mask))
    all_models[var_lambda] = (sm5_models, sm_models)

# NOTE(review): 'ab' appends one more pickled object on every run; a single
# pickle.load only returns the first one - confirm that the reader expects this.
with open('real-data-results-3.pkl', 'ab') as output:
    pickle.dump(all_models, output, pickle.HIGHEST_PROTOCOL)