# L1 Analyse Citeseer 2 class

Um dieses Experiment selbst auszuführen, führe die Zellen der Reihe nach von oben nach unten aus.

In [None]:
from torch_geometric.datasets import Planetoid
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
import numpy as np
import copy
import time 
import matplotlib.pyplot as plt
import pickle # Lokales Speichern von Objekten
import keyboard
from scipy import interpolate

from GNM_Toolbox import *

# Load the Citeseer dataset. load_dataset is not defined in this file;
# presumably it comes from the GNM_Toolbox star import - verify.
dataset = load_dataset('Citeseer')

## Einige Hilfsfunktionen

In [None]:
def find_each_nearest(target_list, out_list):
    """Return, for every target, the index of the nearest entry in ``out_list``.

    Given targets ``(a_0, a_1, a_2, ...)`` and entries
    ``((b_0, x), (b_1, x), (b_2, x), ...)``, the result ``l`` is chosen so
    that ``abs(target_list[i] - out_list[l[i]][0])`` is minimal for each
    ``i``. Both sequences are expected to be sorted in ascending order
    (``out_list`` by its first component); the search walks both with a
    single forward pass and never moves backwards.

    On a tie between two neighbouring entries the later index wins.

    Args:
        target_list: Sorted sequence of target values.
        out_list: Sorted sequence of indexable entries whose element ``[0]``
            is compared against the targets.

    Returns:
        A list with one index into ``out_list`` per target. Empty when
        ``target_list`` is empty.

    Raises:
        ValueError: If targets are given but ``out_list`` is empty.
    """
    result = []
    n_targets = len(target_list)
    if n_targets == 0:
        return result
    if len(out_list) == 0:
        raise ValueError('out_list must not be empty when targets are given')

    last = len(out_list) - 1
    i, j = 0, 0
    while i < n_targets:
        if j == last:
            # No further candidate to compare against: every remaining
            # target maps to the last entry.
            result.extend([j] * (n_targets - i))
            break
        diff_0 = abs(target_list[i] - out_list[j][0])
        diff_1 = abs(target_list[i] - out_list[j + 1][0])
        if diff_0 >= diff_1:
            # The next entry is at least as close, so advance the candidate.
            j += 1
        else:
            result.append(j)
            i += 1
    return result
            
def get_best_values_indices(targets, lambdas):
    """Sort ``lambdas`` in place and map each target to its nearest entry.

    Args:
        targets: Target values, handed unchanged to ``find_each_nearest``.
        lambdas: List of indexable entries; it is sorted IN PLACE by the
            first component of each entry before the lookup.

    Returns:
        Whatever ``find_each_nearest(targets, lambdas)`` returns for the
        sorted list.
    """
    def first_component(entry):
        return entry[0]

    lambdas.sort(key=first_component)
    nearest = find_each_nearest(targets, lambdas)
    return nearest

def create_mask_from_pi(data, pi):
    """Sample a boolean mask with one Bernoulli draw per entry of ``pi``.

    ``pi(data.x, data.y)`` is evaluated once and its values are used as the
    success probabilities of independent single-trial binomial draws from
    NumPy's global random state.

    Args:
        data: Object exposing ``x`` and ``y`` attributes.
        pi: Callable ``pi(x, y)`` returning a 1-D collection of
            probabilities (a torch tensor or anything NumPy accepts).

    Returns:
        A boolean torch tensor (on the CPU) with ``p.shape[0]`` entries;
        ``True`` where the draw succeeded.
    """
    p = pi(data.x, data.y)
    if isinstance(p, torch.Tensor):
        # NumPy cannot read tensors that live on an accelerator or that
        # track gradients, so hand it a detached CPU copy.
        p = p.detach().cpu().numpy()
    draws = np.random.binomial(size=p.shape[0], n=1, p=p)
    mask = torch.tensor(draws == 1)
    return mask.bool()

def split_known_mask_into_val_and_train_mask(known, ratio=0.8):
    """Randomly distribute the set entries of ``known`` over two masks.

    Every position where ``known`` is set is assigned to the training mask
    with probability ``ratio`` (one draw from NumPy's global random state
    per position, in ascending index order) and to the validation mask
    otherwise. Positions where ``known`` is not set stay ``False`` in both.

    Args:
        known: 1-D torch mask.
        ratio: Probability that a known position goes to the training mask.

    Returns:
        ``(val_mask, train_mask)`` - note the order - as boolean tensors
        shaped like ``known``.
    """
    train_mask = torch.zeros_like(known, dtype=torch.bool)
    val_mask = torch.zeros_like(known, dtype=torch.bool)
    known_positions = torch.nonzero(known == 1).flatten().tolist()
    for position in known_positions:
        goes_to_train = np.random.binomial(1, ratio) == 1
        chosen = train_mask if goes_to_train else val_mask
        chosen[position] = True
    return val_mask, train_mask

def calculate_lambda(train_mask, y):
    """Return the ratio (#entries equal to 1) / (#entries equal to 0) in ``y[train_mask]``.

    Entries with any other value are ignored.

    Args:
        train_mask: Mask or index used to select entries of ``y``.
        y: Label collection supporting ``y[train_mask]``.

    Returns:
        The count of ones divided by the count of zeros, as a Python float.

    Raises:
        ZeroDivisionError: If the selection contains no entry equal to 0.
    """
    selected = y[train_mask]
    count_class_0 = int((selected == 0).sum())
    count_class_1 = int((selected == 1).sum())
    return count_class_1 / count_class_0
        
def insert_into_list(l, item, t):
    """Insert ``item`` into ``l`` according to its distance from ``t``.

    The distance of an entry is ``abs(t - entry[0])``. The item goes to the
    front when the list is empty or when it is at least as close as the
    current first entry; otherwise it is placed between the first pair of
    neighbours whose distances bracket it, and appended when no such pair
    exists. A list that is already ordered by increasing distance therefore
    stays ordered.

    Args:
        l: List of indexable entries; modified in place.
        item: Entry to insert; ``item[0]`` is compared against ``t``.
        t: Target value.

    Returns:
        None.
    """
    if len(l) == 0:
        l.insert(0, item)
        return

    def distance(entry):
        return abs(t - entry[0])

    new_distance = distance(item)
    if new_distance <= distance(l[0]):
        l.insert(0, item)
        return

    for position in range(1, len(l)):
        closer = distance(l[position - 1])
        farther = distance(l[position])
        if closer <= new_distance <= farther:
            l.insert(position, item)
            return

    l.append(item)
    
def setBoxColors(bp, edge_color='red', face_color='red'):
    """Colour the artists of one box plot.

    Args:
        bp: Mapping with the keys 'boxes', 'whiskers', 'fliers', 'means',
            'medians' and 'caps' (the call site in this file passes the
            return value of ``plt.boxplot``).
        edge_color: Colour applied to every artist group via ``color``.
        face_color: Colour applied afterwards to the box faces and to the
            marker edges of the fliers.
    """
    artist_groups = ('boxes', 'whiskers', 'fliers', 'means', 'medians', 'caps')
    for group in artist_groups:
        plt.setp(bp[group], color=edge_color)

    # Applied after the loop so these override the edge colour where they overlap
    boxes = bp["boxes"]
    plt.setp(boxes, facecolor=face_color)
    fliers = bp["fliers"]
    plt.setp(fliers, markeredgecolor=face_color)
    
def plot_data(all_models, colors, names, dist=(-0.15, 0.15)):
    """Draw grouped box plots per lambda and print matching LaTeX table rows.

    For every key of ``all_models`` (taken in ascending order) one group of
    boxes is drawn at x position 1, 2, ... and labelled with that key
    truncated to two decimals. Within a group, the k-th model collection is
    drawn at offset ``dist[k]`` in colour ``colors[k]``; collections beyond
    the shortest of ``all_models[key]``, ``colors`` and ``dist`` are ignored.

    For each collection a LaTeX table row with the mean and variance of its
    values is printed: the first collection is labelled NLL, the second L_1,
    and the following ones L_1 with the noise level sigma.

    Args:
        all_models: Mapping from lambda to a sequence of model collections.
            Each model is indexable; ``model[1]`` is either a number or a
            list whose last element is used (presumably an accuracy
            history - verify against the training helper).
        colors: One matplotlib colour per model collection.
        names: Legend labels, in the same order as ``colors``.
        dist: Horizontal offset of each collection inside its group.

    Note:
        The lower y limit is fixed at 0.65; the upper limit follows the
        largest plotted value.
    """
    M = len(all_models)
    # Sort once so that box positions, printed rows and tick labels agree
    # even when the mapping was not filled in ascending key order.
    lambdas = sorted(all_models)
    sigma = [0, 0.0025, 0.005, 0.0075, 0.01, 0.0125, 0.015, 0.0175, 0.02]

    # Set up
    plt.figure(figsize=(18, 8))
    ax = plt.axes()
    maximum = 0
    for i, l in enumerate(lambdas, start=1):
        for models, color, (dd, d) in zip(all_models[l], colors, enumerate(dist)):
            # Use the last entry when a model stores a list of values
            acc = [
                model[1][-1] if isinstance(model[1], list) else model[1]
                for model in models
            ]
            if dd == 0:
                print('\\hline')
                print('{} & \\textbf{{NLL}} & \\textbf{{{:.3f}}} & \\textbf{{{}}} \\\\'.format(l, np.mean(acc), np.format_float_scientific(np.var(acc), precision=2)))
            elif dd == 1:
                print('& $\\textbf{{L}}_1$ & \\textbf{{{:.3f}}} & \\textbf{{{}}} \\\\'.format(np.mean(acc), np.format_float_scientific(np.var(acc), precision=2)))
            else:
                print('& $L_1$ + $\\sigma$ = {} & {:.3f} & {} \\\\'.format(sigma[dd-1], np.mean(acc), np.format_float_scientific(np.var(acc), precision=2)))

            # Track the largest value for the upper y limit
            if maximum < max(acc):
                maximum = max(acc)
            bp = plt.boxplot([acc], positions=[i+d], widths=0.5 / len(dist), patch_artist=True)
            setBoxColors(bp, 'black', color)

    # Dashed separators between neighbouring groups and x-axis labels
    seperation_lines = [i + 0.5 for i in range(1, M)]
    plt.vlines(seperation_lines, 0, 1, colors='grey', linestyles='dashed')
    ax.set_xticks(list(range(1, M+1)))
    ax.set_yticks([0.3, 0.4, 0.5, 0.6, 0.65, 0.7, 0.725, 0.75, 0.775, 0.8, 0.825, 0.85, 0.875, 0.9])
    ax.set_xticklabels([int(l * 100)/100 for l in lambdas])
    plt.ylim(0.65, maximum + 0.01)
    plt.xlim(0.4, M + 0.6)

    # Draw temporary lines in each colour, use them as legend handles,
    # then hide them again.
    helps = list()
    for color in colors:
        h, = plt.plot([1, 1], color=color)
        helps.append(h)
    ax.legend(helps, names, prop={'size': 16})
    ax.grid(True, axis='y')
    ax.set_xlabel(r'$\lambda$')
    ax.set_ylabel(r'Genauigkeit auf unbekannten Daten')
    for h in helps:
        h.set_visible(False)
        

def print_fitting(fitting_masks, i, num_masks):
    """Print a one-line progress read-out of the per-target counters.

    Each counter is printed as-is while it does not exceed ``num_masks``
    and as ``'<num_masks>+'`` once it does. The line ends with a carriage
    return instead of a newline, so the next call overwrites it.

    Args:
        fitting_masks: Mapping whose values are the counters to show.
        i: Iteration number appended to the line.
        num_masks: Cap above which a counter is shown as ``num_masks+``.
    """
    pieces = []
    for count in fitting_masks.values():
        if count > num_masks:
            pieces.append(' {}+'.format(num_masks))
        else:
            pieces.append(' {}'.format(count))
    line = ''.join(pieces) + ' at iteration {}'.format(i)
    print(line, end='\r')

## Informationen über das Setup

Der Citeseer-Datensatz besteht aus 6 Klassen.

Die Knoten verteilen sich wie folgt auf die Klassen 0 bis 5: 249, 590, 668, 701, 596, 523.

Deshalb definieren wir die bisherige Klasse 3 (die größte Klasse mit 701 Knoten) als neue Klasse 0 und fassen alle übrigen Klassen zur neuen Klasse 1 zusammen.

Insgesamt haben wir 3327 Knoten mit je 3703 Features.

## Data Setup

In [None]:
def h(x):
    """Compute ``exp(s / 13 - 4) - (s - 15) / 15`` with ``s = x.sum(dim=1)``.

    Args:
        x: Tensor with at least two dimensions; it is summed over dim 1.

    Returns:
        Tensor with dim 1 of ``x`` reduced away.
    """
    exp_scale, exp_shift, lin_shift, lin_scale = 13, 4, 15, 15
    row_sums = torch.sum(x, dim=1)
    exponential_part = torch.exp(row_sums / exp_scale - exp_shift)
    linear_part = (row_sums - lin_shift) / lin_scale
    return exponential_part - linear_part
    
def pi(X, y):
    """Return ``sigmoid(-log(4) + h(X) - y)`` element-wise.

    Args:
        X: Input handed to ``h``.
        y: Tensor (or number) subtracted from ``h(X)`` before the sigmoid.

    Returns:
        Tensor of values in the open interval (0, 1).
    """
    bias = -torch.log(torch.tensor(4.))
    weight_h, weight_y = 1, -1
    logits = bias + weight_h * h(X) + weight_y * y
    return torch.sigmoid(logits)

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
data = dataset[0].to(device)
# Binarise the labels: original class 3 becomes class 0, every other class becomes class 1
y_tmp = data.y == 3
data.y = torch.where(y_tmp, torch.zeros_like(data.y), torch.ones_like(data.y))
data.num_classes = 2

In [None]:
# Sample 1000 masks and plot the empirical distribution of lambda
# (ratio of class-1 to class-0 nodes inside the sampled mask).
lambdas = list()

for i in range(1000):
    mask = create_mask_from_pi(data, pi)
    var_lambda = calculate_lambda(mask, data.y)
    lambdas.append(var_lambda)
    # Live read-out: fraction of nodes selected by the current mask.
    # Computed in one tensor operation; no artificial delay between samples.
    print('{:.3f}'.format(mask.float().mean().item()), end='\r')

plt.hist(lambdas, bins=40);

## Finde passende Masken

In [None]:
# Configuration and state for the search of masks whose lambda hits specific values
targets = [1, 1.5, 2]
num_masks = 40  # number of masks kept per target lambda
worst_allowed_diff = 0.01
worst_diff = 1
masks = {target: [] for target in targets}  # stores the actual masks per target
num_fitting_masks = dict.fromkeys(targets, 0)  # number of masks that fit a target
targets_done = set()  # targets that already have enough masks

In [None]:
# Repeatedly sample masks and keep, per target lambda, the num_masks samples
# whose lambda is closest to that target. The loop stops when the worst kept
# sample of every open target is within worst_allowed_diff, when every target
# is marked done, or after 40000 iterations.
i = 0
# NOTE(review): t_start is not read anywhere in the visible code.
t_start = time.time()
while worst_diff > worst_allowed_diff and i < 40000 and len(set(targets)-targets_done) > 0:
    i += 1
    # Draw a fresh mask, split it into validation/training parts and
    # measure lambda on the training part only.
    known_mask = create_mask_from_pi(data, pi)
    val_mask, train_mask = split_known_mask_into_val_and_train_mask(known_mask)
    l = calculate_lambda(train_mask, data.y)
    item = (l, train_mask, val_mask)

    # Update masks
    for t in set(targets)-targets_done:
        diff = abs(l - t)
        masks_ind = masks[t]
        if len(masks_ind) < num_masks:
            # Still room for this target: always keep the sample.
            insert_into_list(masks_ind, item, t)
            if diff < worst_allowed_diff:
                num_fitting_masks[t] += 1
                print_fitting(num_fitting_masks, i, num_masks)
                # NOTE(review): strict '>' - a target is only marked done once
                # MORE than num_masks fitting samples were counted; confirm
                # this is intended.
                if num_fitting_masks[t] > num_masks:
                    targets_done.add(t)
        elif abs(masks_ind[num_masks-1][0] - t) > diff:
            # List is full and the new sample beats the entry at index
            # num_masks-1: insert it and cut the list back to num_masks entries.
            insert_into_list(masks_ind, item, t)
            masks[t] = masks_ind[0:num_masks]
            if diff < worst_allowed_diff:
                num_fitting_masks[t] += 1
                print_fitting(num_fitting_masks, i, num_masks)
                if num_fitting_masks[t] > num_masks:
                    targets_done.add(t)

    # Update worst_diff: largest distance between an open target and the
    # last entry of its list.
    # NOTE(review): worst_diff_old is not read anywhere in the visible code.
    worst_diff_old = worst_diff
    worst_diff = 0
    for t in set(targets)-targets_done:
        length = len(masks[t])
        diff = abs(t - masks[t][length-1][0])
        if diff > worst_diff:
            worst_diff = diff

# Save masks
# NOTE(review): the disabled call below refers to 'fitting_masks', which is
# not defined in the visible code (the counter dict is num_fitting_masks).
#pickle_write('masks_citeseer_small.pkl', (masks, fitting_masks, targets_done))

## Analysiere, ob die Loss-Funktion $L_1$ tatsächlich besser als NLL ist

In [None]:
# Restrict the experiment to the masks found for these target lambdas
subset = [1, 1.5, 2]
all_masks = masks
choosen_masks = {target: all_masks[target] for target in subset}

# Evaluate the actual pi once on the full data
pi_true = pi(data.x, data.y)

# Three masks: 10, 15, 20
# Eight + 1 noise levels: 0, 0.0025, 0.005, 0.0075, 0.01, 0.0125, 0.015, 0.0175, 0.02

In [None]:
# Evaluate the 'perfect' loss (exact pi), the 'perfect' loss with noisy pi, and plain NLL.
# print_status, train_one_net and inverse_weighted_categorial_crossentropy_loss are not
# defined in this file (presumably provided by GNM_Toolbox - verify).
IT_per_mask = 4
NB_masks = len(choosen_masks[1.0])
M = len(choosen_masks)
t_0 = time.time()
noise_levels = [0, 0.0025, 0.005, 0.0075, 0.01, 0.0125, 0.015, 0.0175, 0.02]
all_models = dict()

# Iterate over the target lambdas and their masks
for i, l in enumerate(choosen_masks):
    sms_models = list() # SM Standard
    smn_models = [list() for _ in noise_levels] # SM Advanced, one list per noise level

    for j, mask_tupel in enumerate(choosen_masks[l]):
        _, train_mask, val_mask = mask_tupel

        # Train IT_per_mask models per mask and loss variant
        for k in range(IT_per_mask):
            print_status(i * NB_masks * IT_per_mask + j * IT_per_mask + k, M * NB_masks * IT_per_mask, t_0)
            # One noise sample per repetition, scaled by every noise level
            noise = torch.randn_like(pi_true)
            for m, noise_level in enumerate(noise_levels):
                # Deliberately NOT named 'pi': that would overwrite the global
                # function pi and break re-running the earlier cells.
                pi_noisy = pi_true + noise_level * noise
                smn_models[m].append((*train_one_net(data,
                                                  train_mask,
                                                  val_mask,
                                                  loss_function=inverse_weighted_categorial_crossentropy_loss(pi_noisy[train_mask], reduction='mean'),
                                                  val_loss_function=inverse_weighted_categorial_crossentropy_loss(pi_noisy[val_mask], reduction='mean'))[1:], j))
            sms_models.append((*train_one_net(data, train_mask, val_mask)[1:], j))
    all_models[l] = (sms_models, *smn_models)
    
#pickle_write('l1_analysis-citeseer_small.pkl', all_models)

In [None]:
# One colour per loss variant: blue for NLL, then a gradient over the nine noise levels
box_colors = ['b']
box_colors.extend((0.3 + i * 0.0875, 0.8 - i * 0.1, 0.2 - i * 0.01) for i in range(9))

legend_names = [
    'NLL',
    r'$L_1$ + $\pi$',
    r'$L_1$ + $\pi_{0.0025}$',
    r'$L_1$ + $\pi_{0.005}$',
    r'$L_1$ + $\pi_{0.0075}$',
    r'$L_1$ + $\pi_{0.01}$',
    r'$L_1$ + $\pi_{0.0125}$',
    r'$L_1$ + $\pi_{0.015}$',
    r'$L_1$ + $\pi_{0.0175}$',
    r'$L_1$ + $\pi_{0.02}$',
]

# Horizontal offset of each box inside its lambda group
box_offsets = [-0.4, -0.31, -0.22, -0.13, -0.04, 0.04, 0.13, 0.22, 0.31, 0.4]

plot_data(all_models, tuple(box_colors), legend_names, dist=box_offsets)