In [1]:
from torch_geometric.datasets import Planetoid
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
import numpy as np
import copy
import time 
import matplotlib.pyplot as plt
import pickle # Lokales Speichern von Objekten
import keyboard

from GNM_Toolbox.tools.tools import *
from GNM_Toolbox.gnm import *
from GNM_Toolbox.data.dataloader import *

# Load the Cora dataset. `load_dataset` is not imported by name in this cell;
# presumably it is provided by one of the GNM_Toolbox star imports — verify.
dataset = load_dataset('Cora')

Found dataset on harddrive.


# Some Helper Functions

In [2]:
def find_each_nearest(target_list, out_list):
    """For every target, find the index of the closest entry in ``out_list``.

    Given targets ``(a_0, a_1, a_2, ...)`` and entries
    ``((b_0, x), (b_1, x), (b_2, x), ...)``, return a list ``l`` of indices
    such that ``abs(target_list[i] - out_list[l[i]][0])`` is minimal for
    every ``i < len(target_list)``.

    Both lists are expected to be sorted (targets by value, entries by their
    first component); the scan only ever moves forward through ``out_list``.
    When two neighbouring entries are equally close, the later one wins.

    Args:
        target_list: sorted sequence of numeric target values.
        out_list: sorted sequence of tuples whose first component is numeric.

    Returns:
        A list with one index into ``out_list`` per target.

    Raises:
        ValueError: if ``out_list`` is empty while targets were requested.
    """
    n_out = len(out_list)
    if len(target_list) == 0:
        # Nothing to match; the original indexed target_list[0] and crashed.
        return []
    if n_out == 0:
        raise ValueError('out_list must contain at least one entry')

    result = []
    cursor = 0
    for target in target_list:
        # Advance while the next entry exists and is at least as close as the
        # current one. The bound check also covers a one-element out_list,
        # where the original read out_list[1] unconditionally.
        while (cursor + 1 < n_out
               and abs(target - out_list[cursor][0]) >= abs(target - out_list[cursor + 1][0])):
            cursor += 1
        result.append(cursor)
    return result
            
def get_best_values_indices(targets, lambdas):
    """Sort ``lambdas`` by first component and match each target to its nearest entry.

    ``lambdas`` is sorted in place; the returned indices refer to positions in
    that sorted list, so the caller must keep using the same (now sorted) list.

    Args:
        targets: sorted sequence of target values.
        lambdas: list of tuples whose first component is the value to match.

    Returns:
        The index list produced by ``find_each_nearest``.
    """
    def first_component(entry):
        return entry[0]

    lambdas.sort(key=first_component)
    nearest_indices = find_each_nearest(targets, lambdas)
    return nearest_indices

def h(x):
    """Evaluate ``exp(s / 13 - 4) - (s - 15) / 15`` where ``s`` is the row sum of ``x``.

    Args:
        x: tensor that is summed along dimension 1.

    Returns:
        Tensor with one value per row of ``x``.
    """
    a0, a1, a2, a3 = 13, 4, 15, 15
    row_sums = torch.sum(x, dim=1)
    exponential_part = torch.exp(row_sums / a0 - a1)
    linear_part = (row_sums - a2) / a3
    return exponential_part - linear_part
    
def pi_test(X, y):
    """Return ``sigmoid(-log(35) + h(X) + 1.6 * y)``.

    Args:
        X: tensor passed to ``h``.
        y: tensor (or scalar) added with weight 1.6.

    Returns:
        Tensor of values in (0, 1).
    """
    intercept = -torch.log(torch.tensor(35.))
    h_weight, y_weight = 1, 1.6
    logits = intercept + h_weight * h(X) + y_weight * y
    return torch.sigmoid(logits)

def pi_complicated(X, y):
    """Return ``1 / (1 + 35 * exp(h - 1.6 * y))`` with ``h`` computed from the row sums of ``X``.

    ``h`` is ``exp(s / 13 - 4) - (s - 15) / 15`` for the row sum ``s``.

    Everything is computed with torch so the function works for tensors on
    any device and for tensors that track gradients. The previous version
    called ``np.exp`` on a torch tensor, which requires an implicit
    tensor-to-NumPy conversion and fails for CUDA or grad-tracking tensors.

    Args:
        X: tensor that is summed along dimension 1.
        y: tensor with one entry per row of ``X``.

    Returns:
        Tensor with one value in (0, 1) per row of ``X``.
    """
    row_sums = torch.sum(X, dim=1)
    # Named h_values so the module-level function `h` is not shadowed.
    h_values = torch.exp(row_sums / 13 - 4) - (row_sums - 15) / 15
    return 1 / (1 + 35 * torch.exp(h_values - 1.6 * y))

def pi_simple(x, y):
    """Return ``sigmoid(0 + 1 * y)``; ``x`` is accepted but not used.

    Args:
        x: ignored.
        y: tensor fed to the sigmoid.

    Returns:
        Tensor of values in (0, 1) with the shape of ``y``.
    """
    offset, slope = 0, 1
    logits = offset + slope * y
    return torch.sigmoid(logits)

def create_mask_from_pi(data, pi):
    """Draw a random boolean mask with per-entry probabilities ``pi(data.x, data.y)``.

    Each entry is an independent Bernoulli draw from ``np.random.binomial``.

    Args:
        data: object exposing ``x`` and ``y`` attributes.
        pi: callable ``pi(x, y)`` returning one probability per entry
            (a torch tensor or a NumPy array).

    Returns:
        A CPU ``torch.bool`` tensor with ``p.shape[0]`` entries.
    """
    p = pi(data.x, data.y)
    if isinstance(p, torch.Tensor):
        # np.random.binomial needs a host array. Passing the tensor directly
        # relies on an implicit conversion that fails for CUDA tensors and
        # for tensors that require grad.
        p = p.detach().cpu().numpy()
    draws = np.random.binomial(n=1, p=p, size=p.shape[0])
    return torch.tensor(draws == 1)

def split_known_mask_into_val_and_train_mask(known, ratio=0.8):
    """Randomly split the entries selected by ``known`` into train and validation masks.

    One Bernoulli(``ratio``) sample is drawn per known entry, in index order:
    a 1 sends the entry to the training mask, a 0 to the validation mask.
    The draw is vectorised instead of looping over the tensor in Python.

    Args:
        known: 1-D mask tensor; entries equal to True/1 are split.
        ratio: probability that a known entry goes to the training mask.

    Returns:
        ``(val_mask, train_mask)`` — note the order — as boolean tensors with
        the shape and device of ``known``.
    """
    train_mask = torch.zeros_like(known, dtype=torch.bool)
    val_mask = torch.zeros_like(known, dtype=torch.bool)

    known_idx = torch.nonzero(known == 1).flatten()
    draws = np.random.binomial(1, ratio, size=known_idx.shape[0])
    goes_to_train = torch.as_tensor(draws == 1, device=known_idx.device)

    train_mask[known_idx[goes_to_train]] = True
    val_mask[known_idx[~goes_to_train]] = True
    return val_mask, train_mask

def calculate_lambda(train_mask, y):
    """Return the ratio (#class-1 labels) / (#class-0 labels) among ``y[train_mask]``.

    Labels other than 0 and 1 are ignored. The counts are computed with
    vectorised comparisons instead of a Python loop over tensor elements.

    Args:
        train_mask: mask used to index ``y``.
        y: label tensor.

    Returns:
        The ratio as a Python float.

    Raises:
        ZeroDivisionError: if the selection contains no class-0 label.
    """
    labels = y[train_mask]
    n_class_0 = int((labels == 0).sum())
    n_class_1 = int((labels == 1).sum())
    if n_class_0 == 0:
        raise ZeroDivisionError('no class-0 label selected by train_mask; lambda is undefined')
    return n_class_1 / n_class_0
        
def insert_into_list(l, item, t):
    """Insert ``item`` into ``l``, which is ordered by distance of ``entry[0]`` to ``t``.

    The list is modified in place; nothing is returned.

    Args:
        l: list of tuples, ordered by ``abs(t - entry[0])`` (closest first).
        item: tuple to insert; its first component is compared against ``t``.
        t: target value.
    """
    if not l:
        l.append(item)
        return

    def distance_to_target(entry):
        return abs(t - entry[0])

    new_distance = distance_to_target(item)

    # At least as close as the current best entry: goes to the front.
    if new_distance <= distance_to_target(l[0]):
        l.insert(0, item)
        return

    # Otherwise take the first gap whose neighbours bracket the new distance.
    for position in range(1, len(l)):
        before = distance_to_target(l[position - 1])
        after = distance_to_target(l[position])
        if before <= new_distance <= after:
            l.insert(position, item)
            return

    # Farther away than every existing entry.
    l.append(item)

# Data Setup

In [3]:
# Set up data
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = dataset[0].to(device)
data.num_classes = 2
# Binarise the labels: original class 3 becomes class 1, every other class becomes class 0.
# NOTE(review): the previous comment described the opposite mapping; the code below is authoritative.
y = torch.zeros_like(data.y)
y[data.y == 3] = 1
data.y = y

# Find Fitting Train and Val Masks

In [None]:
# Find masks such that lambda has specific values
# Target lambda values (see calculate_lambda) for which masks are collected.
targets = [1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2, 2.1, 2.2, 2.3, 2.4, 2.5]
# target -> list of (lambda, train_mask, val_mask) tuples, filled by the search loop
masks = dict()
# target -> number of inserted masks with abs(lambda - target) < worst_allowed_diff
fitting_masks = dict()
# targets for which the search loop has recorded more than 40 fitting masks
targets_done = set()
worst_allowed_diff = 0.005
# Starts above worst_allowed_diff so that the search loop's condition holds initially.
worst_diff = 1
num_masks = 40 # num_masks_per_lambda

# Start every target with an empty mask list and a fitting count of zero.
for t in targets:
    masks[t] = list()
    fitting_masks[t] = 0
    
# NOTE(review): the search cell begins by reading 'm_masks.pkl'; the line below
# looks like the way that file is created on a first run — confirm.
#pickle_write('m_masks.pkl', (masks, fitting_masks, targets_done))

def print_fitting(fitting_masks, i, cap=40):
    """Print a one-line progress report of the fitting-mask counts.

    Each count is printed as a number, or as ``'<cap>+'`` once it exceeds
    ``cap``. The line ends with a carriage return so that the next report
    overwrites it.

    Args:
        fitting_masks: mapping whose values are the counts to display.
        i: iteration number shown at the end of the line.
        cap: largest count printed verbatim (previously hard-coded to 40).
    """
    parts = [
        ' {}'.format(count) if count <= cap else ' {}+'.format(cap)
        for count in fitting_masks.values()
    ]
    print(''.join(parts) + ' at iteration {}'.format(i), end='\r')

In [None]:
# Resume from the state stored on disk. This replaces the masks, fitting_masks
# and targets_done objects initialised in the configuration cell.
# NOTE(review): presumably requires 'm_masks.pkl' to exist already — confirm.
masks, fitting_masks, targets_done = pickle_read('m_masks.pkl')
i = 0
# NOTE(review): t_start is not read anywhere in this cell.
t_start = time.time()
# Sample random train/val splits until every unfinished target's worst kept
# mask is within worst_allowed_diff, 40000 iterations have run, or every
# target is marked done.
while worst_diff > worst_allowed_diff and i < 40000 and len(set(targets)-targets_done) > 0:
    i += 1
    # Draw a random set of known nodes, split it, and compute its lambda.
    known_mask = create_mask_from_pi(data, pi_complicated)
    val_mask, train_mask = split_known_mask_into_val_and_train_mask(known_mask)
    l = calculate_lambda(train_mask, data.y)
    item = (l, train_mask, val_mask)
    
    # Update masks
    # set(targets)-targets_done builds a fresh set, so adding to targets_done
    # inside the loop does not disturb the iteration.
    for t in set(targets)-targets_done:
        diff = abs(l - t)
        masks_ind = masks[t]
        if len(masks_ind) < num_masks:
            # Still room for this target: always keep the new split.
            insert_into_list(masks_ind, item, t)
            if diff < worst_allowed_diff:
                fitting_masks[t] += 1
                #print_fitting(fitting_masks, i)
                if fitting_masks[t] > 40:
                    targets_done.add(t)
        elif abs(masks_ind[num_masks-1][0] - t) > diff:
            # List is full, but the new split is closer than the entry at
            # index num_masks-1: insert it and cut the list back to num_masks.
            insert_into_list(masks_ind, item, t)
            masks[t] = masks_ind[0:num_masks]
            # NOTE(review): same bookkeeping as in the branch above, except
            # that the progress print is active here.
            if diff < worst_allowed_diff:
                fitting_masks[t] += 1
                print_fitting(fitting_masks, i)
                if fitting_masks[t] > 40:
                    targets_done.add(t)
    
    # Update worst_diff
    # worst_diff becomes the largest distance between an unfinished target
    # and the last entry of its mask list (0 if no unfinished target remains).
    # NOTE(review): worst_diff_old is not read anywhere in this cell.
    worst_diff_old = worst_diff
    worst_diff = 0
    for t in set(targets)-targets_done:
        length = len(masks[t])
        diff = abs(t - masks[t][length-1][0])
        if diff > worst_diff:
            worst_diff = diff
            
# Save masks
pickle_write('m_masks.pkl', (masks, fitting_masks, targets_done))

# Actual Training

In [6]:
# Load masks
# Only the first element of the stored tuple (the mask dictionary) is used.
all_masks,_,_ = pickle_read('m_masks.pkl')
# Target lambda values used for training; each must be a key of all_masks.
subset = [1.2, 1.5, 1.7, 2, 2.2]
choosen_masks = {k: all_masks[k] for k in subset}

In [12]:
# Real Data analysis with different GNMs and SM
# Number of models trained per mask and per method.
IT_per_mask = 5
# NOTE(review): used for the progress total; assumes every selected lambda
# has as many masks as lambda 1.5 — confirm.
NB_masks = len(choosen_masks[1.5])
M = len(choosen_masks)
t_0 = time.time()

# lambda -> tuple of the four result lists built in the loop below
all_models = dict()

# Iterate over the selected lambda values (the keys of choosen_masks)
for i, l in enumerate(choosen_masks):
    gnm_models_one_net_optimized = list()
    gnm_models_one_net = list()
    gnm_models_5_nets = list()
    sm_models = list()
    for j, mask_tupel in enumerate(choosen_masks[l]):
        # The stored lambda value (first tuple element) is not needed here.
        _, train_mask, val_mask = mask_tupel

        # Train IT_per_mask models with each method on this mask
        for k in range(IT_per_mask):
            print_status(i * NB_masks * IT_per_mask + j * IT_per_mask + k, M * NB_masks * IT_per_mask, t_0)
            # Each entry keeps everything but the first element of the
            # training function's return value, followed by the mask index j.
            # NOTE(review): the next two lines make the identical call, so
            # the "optimized" list is not trained differently — confirm intent.
            gnm_models_one_net_optimized.append((*train_net_with_gnm_one_model(data, train_mask, val_mask)[1:], j))
            gnm_models_one_net.append((*train_net_with_gnm_one_model(data, train_mask, val_mask)[1:], j))
            gnm_models_5_nets.append((*train_net_with_gnm_new_model_each_epoch(data, train_mask, val_mask)[1:], j))
            sm_models.append((*train_one_net(data, train_mask, val_mask)[1:], j))
    all_models[l] = (gnm_models_one_net_optimized, gnm_models_one_net, gnm_models_5_nets, sm_models)
    # Checkpoint after every lambda; each part file holds all results so far.
    pickle_write('real-data-results-final-part-{}.pkl'.format(i), all_models)
    
pickle_write('real-data-results-final.pkl', all_models)

(637/1000) |████████████        | (22h 24min 50sec|12h 47min 34sec|2min 6sec)))

KeyboardInterrupt: 

In [10]:
# Scratch check: star-unpacking a tuple inside a tuple literal appends the
# extra element, the same pattern the training loop uses to attach the mask index.
(*(1, 2, 3), 1)

(1, 2, 3, 1)