In [None]:
%config IPCompleter.greedy=True
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../")
    
def moving_average(l, n=10):
    """Return the simple moving average of ``l`` over a sliding window of ``n`` items.

    A list of running totals is maintained so that each window mean costs one
    subtraction and one division.  No value is produced until ``n`` items have
    been consumed, so for ``n >= 1`` the result holds ``max(len(l) - n + 1, 0)``
    entries.
    """
    running_totals = [0]
    averages = []
    for count, value in enumerate(l, start=1):
        running_totals.append(running_totals[-1] + value)
        if count < n:
            # window not full yet
            continue
        window_sum = running_totals[count] - running_totals[count - n]
        averages.append(window_sum / n)
    return averages

In [None]:
# Metric series parsed from the meta-training history log, one list per record type.
meta_train_train_loss, meta_train_valid_loss = [], []
meta_train_train_acc, meta_train_valid_acc = [], []
meta_valid_train_loss, meta_valid_valid_loss = [], []
meta_valid_train_acc, meta_valid_valid_acc = [], []

meta_train_loss_ratio = []
meta_train_acc_ratio = []
meta_valid_loss_ratio = []
meta_valid_acc_ratio = []

base_log_dir = '../log'

# Maps each record title found in the log to the (first value, second value)
# lists that the record's two comma-separated numbers are appended to.
history_series = {
    "META_TRAIN_TRAIN:": (meta_train_train_loss, meta_train_train_acc),
    "META_TRAIN_VALID:": (meta_train_valid_loss, meta_train_valid_acc),
    "META_VALID_TRAIN:": (meta_valid_train_loss, meta_valid_train_acc),
    "META_VALID_VALID:": (meta_valid_valid_loss, meta_valid_valid_acc),
    "META_TRAIN_RATIO:": (meta_train_loss_ratio, meta_train_acc_ratio),
    "META_VALID_RATIO:": (meta_valid_loss_ratio, meta_valid_acc_ratio),
}

# Each record looks like "<TITLE>: <value>,<value>".  The `with` block makes
# sure the log file is closed even when a record fails to parse.
with open(base_log_dir + '/meta_training_history.txt') as history_file:
    for meta in history_file:
        meta = meta.split()
        if not meta:
            # blank line (e.g. trailing newline at the end of the log)
            continue
        title = meta[0]
        if title not in history_series:
            # records with other titles are not plotted by this notebook
            continue
        meta_results = meta[1].split(',')
        first_series, second_series = history_series[title]
        first_series.append(float(meta_results[0]))
        second_series.append(float(meta_results[1]))
        
# Meta-training set-up used to size the smoothing window.
n_train_sets = 64
meta_batch_size = 8
meta_batches_per_epoch = n_train_sets // meta_batch_size

# The window spans four times the number of meta-batches in one epoch.
avg_n = 4 * meta_batches_per_epoch

# Replace every meta-train series by its moving average (same order as before:
# losses, accuracies, then ratios).
(meta_train_train_loss, meta_train_valid_loss,
 meta_train_train_acc, meta_train_valid_acc,
 meta_train_loss_ratio, meta_train_acc_ratio) = [
    moving_average(series, n=avg_n)
    for series in (meta_train_train_loss, meta_train_valid_loss,
                   meta_train_train_acc, meta_train_valid_acc,
                   meta_train_loss_ratio, meta_train_acc_ratio)
]

In [None]:
import matplotlib.pyplot as plt

# Key/value pairs of the training configuration, as printed at the top of the
# training log.
training_conf_log = {}

# `with` closes the log file as soon as parsing is finished.
with open(base_log_dir + '/train-meta-model.log') as conf_log_file:
    for line in conf_log_file:
        # Lines starting with '2018' are skipped (presumably timestamped log
        # records -- TODO confirm); the remaining lines are "key: value" pairs.
        if line.startswith('2018'):
            continue
        # maxsplit=1 keeps values that themselves contain ': ' intact.
        split_line = line.strip().split(': ', 1)
        if len(split_line) == 2:
            training_conf_log[split_line[0]] = split_line[1]
        # lines without a separator (e.g. blank lines) are ignored
        if line.startswith('lr_schedule:'):
            lr_schedule = line.split('lr_schedule:')[-1]
            # NOTE(review): eval() executes whatever text the log contains.
            # Only run this on logs you produced yourself; if the schedule is
            # always a plain literal, ast.literal_eval would be a safer choice.
            lr_schedule = eval(lr_schedule)

print('TRAINING CONFIGURATION:')
print('\n'.join(map(str, training_conf_log.items())))

plt.rcParams["figure.figsize"] = (16,10)


def _mark_point(ax, x, y, shift, color, label):
    """Draw a highlighted marker at (x, y), annotate it with ``y`` and return the marker handle."""
    handle, = ax.plot([x], [y], marker='o', markersize=8, color=color, label=label)
    ax.annotate('{:.5f}'.format(y), xy=(x, y), xytext=(x - 0.5, y + shift),
                horizontalalignment='left', verticalalignment='top')
    return handle


def draw_figure(train_data, valid_data, title, min_fun = None,
                train_label='train', valid_label='valid', points_per_epoch=1):
    """Plot a train (and optionally valid) series with the learning-rate schedule overlaid.

    Parameters
    ----------
    train_data : sequence of numbers
        Series drawn first; its length also limits which schedule entries are shown.
    valid_data : sequence of numbers or None
        Optional second series.
    title : str
        Figure title.
    min_fun : callable or None
        ``min`` or ``max``; when given together with a non-empty ``valid_data``
        the first and the best point of ``valid_data`` are highlighted.
    train_label, valid_label : str
        Legend labels of the two series.
    points_per_epoch : int
        Number of plotted points per epoch, used to convert the epoch of every
        schedule entry into an x position.

    The schedule is read from the module-level ``lr_schedule``; each entry is
    indexed as ``(epoch, learning_rate)``.
    """
    ax = plt.gca()
    train, = ax.plot(train_data, '-o', label=train_label)
    legend_handles = [train]
    if valid_data is not None:
        valid, = ax.plot(valid_data, '-o', label=valid_label)
        legend_handles.append(valid)

    # An empty series has neither a starting nor a best point to highlight.
    if valid_data is not None and min_fun is not None and len(valid_data) > 0:
        # Move the text below the point when looking for a minimum, above otherwise.
        shift = -0.01 if min_fun is min else 0.01
        begin_handle = _mark_point(ax, 0, valid_data[0], shift, "orange", 'starting')

        best = min_fun(valid_data)
        best_ind = list(valid_data).index(best)
        print("Best at:", best_ind)
        best_handle = _mark_point(ax, best_ind, best, shift, "green", 'best')

        legend_handles += [begin_handle, best_handle]

    ax2 = ax.twinx()
    ax2.set_ylabel('meta-learning rate', color='black')
    # Filter x and y together so every rate stays paired with its own position
    # even when the schedule is not sorted by epoch.
    n_points = len(train_data)
    lr_points = [(entry[0] * points_per_epoch, entry[1]) for entry in lr_schedule]
    lr_points = [point for point in lr_points if point[0] < n_points]
    lr_x = [point[0] for point in lr_points]
    lr_y = [point[1] for point in lr_points]
    try:
        # current Matplotlib spelling of the option
        ax2.set_yscale("log", nonpositive='clip')
    except (TypeError, ValueError):
        # older Matplotlib releases only know the per-axis keyword
        ax2.set_yscale("log", nonposy='clip')
    ax2.step(lr_x, lr_y, where='post', color='black', linewidth=0.5)

    plt.legend(handles=legend_handles)
    plt.title(title)
    plt.show()
    
    
# Meta-train curves are smoothed, so they have `meta_batches_per_epoch` points per epoch.
draw_figure(meta_train_train_loss, meta_train_valid_loss,
           "Average meta-batch loss (meta-train, moving average n = {})".format(avg_n), min,
           points_per_epoch=meta_batches_per_epoch)

draw_figure(meta_train_train_acc, meta_train_valid_acc,
           "Average meta-batch accuracy (meta-train, moving average n = {})".format(avg_n), max,
           points_per_epoch=meta_batches_per_epoch)

draw_figure(meta_train_loss_ratio, None,
           "Average meta-batch ratio of valid loss to train loss (meta-train, moving average n = {})".format(avg_n),
           train_label='valid/train ratio loss ratio (meta-train)',
           points_per_epoch=meta_batches_per_epoch)

# Meta-valid curves are plotted unsmoothed with the default of one point per epoch.
draw_figure(meta_valid_train_loss, meta_valid_valid_loss, "Average loss (meta-valid)", min)

draw_figure(meta_valid_train_acc, meta_valid_valid_acc, "Average accuracy (meta-valid)", max)

# The legend label names the meta-valid data actually plotted here.
draw_figure(meta_valid_loss_ratio, None,
           "Average ratio of valid loss to train loss (meta-valid)",
           train_label='valid/train ratio loss ratio (meta-valid)')

In [None]:
# Eigenvals analysis
import os
import numpy as np

# eigen[epoch][step] -> first four entries of the 'E' array saved for that step.
eigen = {}

d = base_log_dir + '/eigenvals'
for eigen_dir in (o for o in os.listdir(d) if os.path.isdir(os.path.join(d,o))):
    # the epoch number is the last '_'-separated token of the directory name
    epoch = int(eigen_dir.split('_')[-1])
    eigen_path = os.path.join(d, eigen_dir)
    for eigen_file in os.listdir(eigen_path):
        # the step number is the second '_'-separated token of the file name
        step = int(eigen_file.split('_')[1])
        file_path = os.path.join(eigen_path, eigen_file)
        # The archive returned by np.load keeps the file open until it is
        # closed; the context manager releases it after the array is read.
        with np.load(file_path) as vals:
            eigen.setdefault(epoch, {})[step] = vals['E'][:4]

In [None]:
# Number of eigenvalue series plotted; the loading cell keeps four values per step.
n_eigenvals = 4
eigenvals = [[] for _ in range(n_eigenvals)]
# std_eigenvals = []

# For every epoch (in increasing order) average the i-th eigenvalue over all steps.
for epoch in sorted(eigen.keys()):
    for i in range(n_eigenvals):
        eigenvals[i].append(np.mean([v[i] for v in eigen[epoch].values()]))
    # std = np.std(list(eigen[epoch].values()))

# std_lower_draw = [avg - 0.5*std for avg, std in zip(avg_eigenvals, std_eigenvals)]
# std_upper_draw = [avg + 0.5*std for avg, std in zip(avg_eigenvals, std_eigenvals)]

for i in range(n_eigenvals):
    plt.plot(eigenvals[i], label='eigenval {}'.format(i+1))
plt.legend()

# ax.plot(std_upper_draw, c='r', label='avg_eig + 0.5*std', linewidth=0.5)
# ax.plot(std_lower_draw, c='r', label='avg_eig - 0.5*std', linewidth=0.5)

# Title and axis label describe all plotted series, not a fixed "3" / "1st".
plt.title("Average absolute values of {} biggest eigenval of minimum".format(n_eigenvals))
plt.xlabel('Meta-Validation epoch')
plt.ylabel('Average absolute value of eigenvalue')
plt.show()

In [None]:
# Learning/Forget rate analysis
import os
import numpy as np

# Per-epoch averages of the per-step mean / std of the forget and learning rates.
avg_f, std_f = [], []
avg_lr, std_lr = [], []

d = base_log_dir + '/stats'
epochs = 0  # number of epochs that contributed a data point
# Directories are visited in increasing order of their trailing epoch number.
for stats_dir in sorted((o for o in os.listdir(d) if os.path.isdir(os.path.join(d,o))),
                       key=lambda name: int(name.split('_')[-1])):
    # NOTE(review): this skips every directory whose name ends in '0'
    # (epochs 0, 10, 20, ...), not only epoch 0 -- confirm this is intended.
    if stats_dir.endswith('0'):
        continue
    avg_f_ep, std_f_ep = 0, 0
    avg_lr_ep, std_lr_ep = 0, 0
    n_steps = 0
    epoch = int(stats_dir.split('_')[-1])
    stats_path = os.path.join(d, stats_dir)
    for stats_file in os.listdir(stats_path):
        # Not used below; parsing it still rejects unexpectedly named files.
        step = int(stats_file.split('_')[1])
        file_path = os.path.join(stats_path, stats_file)
        # Close the archive after reading so file descriptors are not leaked.
        with np.load(file_path) as vals:
            # forget rates are optional; files without them add nothing to the sums
            if 'forget_rate_history' in vals:
                avg_f_ep += np.mean(vals['forget_rate_history'])
                std_f_ep += np.std(vals['forget_rate_history'])
            avg_lr_ep += np.mean(vals['learning_rate_history'])
            std_lr_ep += np.std(vals['learning_rate_history'])

        n_steps += 1
    # ignore epochs that are not yet run
    if n_steps < 32:
        continue
    avg_f_ep /= n_steps
    avg_lr_ep /= n_steps
    std_f_ep /= n_steps
    std_lr_ep /= n_steps

    avg_f.append(avg_f_ep)
    avg_lr.append(avg_lr_ep)
    std_f.append(std_f_ep)
    std_lr.append(std_lr_ep)

    epochs += 1
    

In [None]:
def _plot_mean_with_std_band(avg, std, title, ylabel, label):
    """Plot ``avg`` per meta-validation epoch together with the ``avg +/- 0.5*std`` band.

    ``avg`` and ``std`` are equally long sequences; ``label`` is the legend
    entry of the mean curve and the prefix of the two band entries.
    """
    lower = [a - 0.5*s for a, s in zip(avg, std)]
    upper = [a + 0.5*s for a, s in zip(avg, std)]

    # Create figure and axes together so that the title, the labels and the
    # curves all end up on the same axes.
    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.set_xlabel('Meta-Validation epoch')
    ax.set_ylabel(ylabel)
    ax.plot(avg,  c='b', label=label, linewidth=2.0)
    ax.plot(upper, c='r', label=label + ' + 0.5*std', linewidth=0.5)
    ax.plot(lower, c='r', label=label + ' - 0.5*std', linewidth=0.5)
    ax.legend()
    plt.show()


_plot_mean_with_std_band(avg_f, std_f, "Average forget rates", 'Average forget rate', 'avg_f')

_plot_mean_with_std_band(avg_lr, std_lr, "Average learning rates", 'Average learning rate', 'avg_lr')

In [None]:
# Prepare data for comparison of meta-optimizer with SGD and Adam

import os
from src.datasets.cifar import load_cifar100, cifar_input_shape
from src.datasets.metadataset import load_meta_dataset

from src.training.training_configuration import read_configuration
from src.model.learner.simple_cnn import build_simple_cnn
from keras.optimizers import SGD, Adam

# Read the training configuration stored in the directory named by $LOG_DIR
# (a KeyError is raised when the environment variable is not set).
train_conf_path = os.path.join(os.environ['LOG_DIR'], 'training_configuration.yml')
conf = read_configuration(train_conf_path)

# Only X_train is used below; the other three values are unpacked but unused here.
X_train, y_train, X_test, y_test = load_cifar100()

meta_dataset_path = '../data/cifar100_64_64_2.h5'
meta_dataset = load_meta_dataset(meta_dataset_path, X_train)

# Build the learner, load its initial weights from $CONF_DIR and keep a copy of
# them in `initial_learner_weights`, which the comparison cells pass along.
learner = build_simple_cnn(cifar_input_shape, conf.classes_per_learner_set)
learner.load_weights(os.path.join(os.environ['CONF_DIR'], 'initial_learner_weights.h5'))
initial_learner_weights = learner.get_weights()

# Default learning rates; the grid-search cells overwrite them when they are run.
best_sgd_lr = 0.03
best_adam_lr = 0.03

# Compiled with a zero-learning-rate SGD as a placeholder optimizer.
learner.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.0),
                metrics=['accuracy'])  # dummy optimizer

In [None]:
# find best LR for SGD optimizer

from src.utils.comparison import learning_rate_grid_search
from keras.optimizers import SGD

trainings_per_dataset = 4


def sgd_factory(lr):
    """Build a new SGD optimizer using ``lr`` as its learning rate."""
    optimizer = SGD(lr=lr)
    return optimizer


# Try each candidate learning rate and keep the one the search returns.
best_sgd_lr = learning_rate_grid_search(
    lr_values=[0.01, 0.03, 0.05],
    optimizer_factory=sgd_factory,
    learner=learner,
    initial_learner_weights=initial_learner_weights,
    meta_dataset=meta_dataset,
    n_learner_batches=conf.n_learner_batches,
    learner_batch_size=conf.learner_batch_size,
    trainings_per_dataset=trainings_per_dataset,
)

print("Best SGD LR:", best_sgd_lr)

In [None]:
# find best LR for Adam optimizer

from src.utils.comparison import learning_rate_grid_search
from keras.optimizers import Adam

trainings_per_dataset = 4


def adam_factory(lr):
    """Build a new Adam optimizer using ``lr`` as its learning rate."""
    optimizer = Adam(lr=lr)
    return optimizer


# Try each candidate learning rate and keep the one the search returns.
best_adam_lr = learning_rate_grid_search(
    lr_values=[0.025, 0.03, 0.035],
    optimizer_factory=adam_factory,
    learner=learner,
    initial_learner_weights=initial_learner_weights,
    meta_dataset=meta_dataset,
    n_learner_batches=conf.n_learner_batches,
    learner_batch_size=conf.learner_batch_size,
    trainings_per_dataset=trainings_per_dataset,
)

print("Best Adam LR:", best_adam_lr)

In [None]:
from src.utils.comparison import compare_optimizers
from keras.optimizers import SGD, Adam
from src.model.meta_learner.lstm_model import lstm_meta_learner
from src.isotropy.lanczos import TopKEigenvaluesBatched
import tensorflow as tf
import keras.backend as K

# Eigenvalue callback handed to the meta-learner.  K comes from the training
# configuration (as in the meta-optimizer analysis cell further down) instead
# of a hard-coded 4, so both cells build the meta-model the same way.
eigenvals_callback = TopKEigenvaluesBatched(K=conf.hessian_eigenvalue_features,
                                            batch_size=conf.learner_batch_size, logger=None,
                                            save_dir="", save_eigenv=1)
eigenvals_callback.model = learner
eigenvals_callback.compile()


meta_model = lstm_meta_learner(learner, eigenvals_callback, conf)

meta_model.predict_model.compile(loss='mae',  # we don't use loss here anyway
                                 optimizer=SGD(lr=0.0))  # dummy optimizer

# Load the trained meta-learner weights stored in $LOG_DIR.
best_meta_learner_weights_path = os.path.join(os.environ['LOG_DIR'], "meta_weights.h5")
meta_model.load_weights(best_meta_learner_weights_path)

meta_optimizer = meta_model.predict_model


# Optimizer factories: each one is called with a training set and returns the
# optimizer to use on it.
def sgd_opt(train_x, train_y):
    """Return a fresh SGD optimizer at ``best_sgd_lr``; the data is ignored."""
    return SGD(lr=best_sgd_lr)

def adam_opt(train_x, train_y):
    """Return a fresh Adam optimizer at ``best_adam_lr``; the data is ignored."""
    return Adam(lr=best_adam_lr)

def meta_opt(train_x, train_y):
    """Point the eigenvalue callback at the given data, reset the meta-optimizer state and return it."""
    eigenvals_callback.X = train_x
    eigenvals_callback.y = train_y

    meta_optimizer.reset_states()
    return meta_optimizer


# The order matters: the results are read back by index as META, SGD, Adam.
optimizer_factories = [meta_opt, sgd_opt, adam_opt]

trainings_per_dataset = 4

comparison_results = compare_optimizers(meta_dataset=meta_dataset,
                                        optimizer_factories=optimizer_factories,
                                        n_learner_batches=conf.n_learner_batches,
                                        learner_batch_size=conf.learner_batch_size,
                                        learner=learner,
                                        initial_learner_weights=initial_learner_weights,
                                        trainings_per_dataset=trainings_per_dataset)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def _report_loss_difference(first_losses, second_losses, first_name, second_name):
    """Plot the histogram of ``first - second`` losses and print how often ``first`` was worse.

    Returns the list of per-training differences.
    """
    differences = [first - second for first, second in zip(first_losses, second_losses)]

    plt.hist(differences, bins=64)
    plt.title("{} vs {} (loss with {} - loss with {})".format(
        first_name, second_name, first_name.lower(), second_name.lower()))
    plt.show()

    # a positive difference means the first optimizer ended with the higher loss
    worse_count = sum(1 if s > 0 else 0 for s in differences)
    worse_percent = worse_count / len(differences) * 100
    print("{} was worse than {} in {}% trainings".format(first_name, second_name, worse_percent))
    return differences


# Results are indexed in the order the optimizers were compared: META, SGD, Adam.
comparison_results = [np.array(r) for r in comparison_results]
meta_results, sgd_results, adam_results = comparison_results[:3]

# Count, per training, which optimizer reached the lowest loss
# (ties go to the optimizer listed first).
wins = [0, 0, 0]
for i in range(len(meta_results)):
    res = [meta_results[i], sgd_results[i], adam_results[i]]
    wins[res.index(min(res))] += 1
best_meta, best_sgd, best_adam = wins

percent_per_training = 100.0/len(meta_results)
print("SGD was best in {:d} trainings ({:.2f}%)".format(best_sgd, best_sgd * percent_per_training))
print("Adam was best in {:d} trainings ({:.2f}%)".format(best_adam, best_adam * percent_per_training))
print("META was best in {:d} trainings ({:.2f}%)".format(best_meta, best_meta * percent_per_training))

plt.rcParams["figure.figsize"] = (10,10)

plt.pie([best_sgd, best_adam, best_meta], labels=['SGD', 'Adam', 'META'])
plt.title("Best trainings (in terms of valid loss)")
plt.show()

plt.rcParams["figure.figsize"] = (16,10)

# Pairwise comparisons of the optimizers.
sgd_vs_adam = _report_loss_difference(sgd_results, adam_results, 'SGD', 'ADAM')
meta_vs_sgd = _report_loss_difference(meta_results, sgd_results, 'META', 'SGD')
meta_vs_adam = _report_loss_difference(meta_results, adam_results, 'META', 'ADAM')

In [None]:
from src.utils.comparison import analyze_training
import numpy as np

def analyze_optimizer_training(optimizer_factory, trainings_per_dataset = 1):
    """Run ``analyze_training`` with one optimizer and plot every recorded series.

    Parameters
    ----------
    optimizer_factory : callable
        Factory passed through to ``analyze_training``.
    trainings_per_dataset : int
        Passed through to ``analyze_training``.

    Returns
    -------
    tuple of numpy arrays
        ``(train_losses, train_accuracies, valid_losses, valid_accuracies,
        hessian_eigen)`` as returned by ``analyze_training``, each converted
        with ``np.array``.

    For each of the five arrays one figure is drawn showing the mean over
    axis 0 and the ``mean +/- 0.5*std`` band (axis 0 presumably indexes the
    individual trainings -- TODO confirm).  The dataset, learner and
    configuration are read from the notebook's module-level variables.
    """
    histories = analyze_training(meta_dataset=meta_dataset,
                                 optimizer_factory=optimizer_factory,
                                 n_learner_batches=conf.n_learner_batches,
                                 learner_batch_size=conf.learner_batch_size,
                                 learner=learner,
                                 initial_learner_weights=initial_learner_weights,
                                 trainings_per_dataset=trainings_per_dataset)
    train_losses, train_accuracies, valid_losses, valid_accuracies, hessian_eigen = \
        [np.array(history) for history in histories]

    for metric_name, set_name, data in [('loss', 'train', train_losses),
                                        ('accuracy', 'train', train_accuracies),
                                        ('loss', 'valid', valid_losses),
                                        ('accuracy', 'valid', valid_accuracies),
                                        ('hessian spectral norm', 'train', hessian_eigen)]:
        avg = np.mean(data, axis=0)
        std = np.std(data, axis=0)

        # element-wise band around the mean
        std_lower_draw = avg - 0.5 * std
        std_upper_draw = avg + 0.5 * std

        # Create figure and axes together so that the title, the labels and
        # the curves all end up on the same axes.
        fig, ax = plt.subplots()
        ax.set_title("Average {} during training ({} set)".format(metric_name, set_name))
        ax.set_xlabel('Training step')
        ax.set_ylabel('Average {} ({} set)'.format(metric_name, set_name))
        short_label = 'avg_{}_{}'.format(set_name, metric_name)
        ax.plot(avg,  c='b', label=short_label, linewidth=2.0)
        ax.plot(std_upper_draw, c='r', label=short_label + ' + 0.5*std', linewidth=0.5)
        ax.plot(std_lower_draw, c='r', label=short_label + ' - 0.5*std', linewidth=0.5)
        ax.legend()
        plt.show()

    return train_losses, train_accuracies, valid_losses, valid_accuracies, hessian_eigen

In [None]:
def sgd_opt(train_x, train_y):
    """Optimizer factory: a fresh SGD at ``best_sgd_lr`` (the data arguments are ignored)."""
    optimizer = SGD(lr=best_sgd_lr)
    return optimizer

# Three-line banner, then the analysis with three trainings per dataset.
print("*" * 50, "ANALYSIS OF SGD OPTIMIZER", "*" * 50, sep="\n")
sgd_analysis = analyze_optimizer_training(optimizer_factory=sgd_opt,
                                          trainings_per_dataset=3)

In [None]:
def adam_opt(train_x, train_y):
    """Optimizer factory: a fresh Adam at ``best_adam_lr`` (the data arguments are ignored)."""
    # Use the learning rate selected for Adam, not the one selected for SGD.
    return Adam(lr=best_adam_lr)

print("*" * 50)
print("ANALYSIS OF ADAM OPTIMIZER")
print("*" * 50)
adam_analysis = analyze_optimizer_training(adam_opt, trainings_per_dataset = 3)

In [None]:
from src.model.meta_learner.lstm_model import lstm_meta_learner
from src.isotropy.lanczos import TopKEigenvaluesBatched

# Eigenvalue callback for the meta-learner; K is taken from the training
# configuration.  The learner is attached before compile() is called.
eigenvals_callback = TopKEigenvaluesBatched(K=conf.hessian_eigenvalue_features, 
                                            batch_size=conf.learner_batch_size, logger=None,
                                            save_dir="", save_eigenv=1)
eigenvals_callback.model = learner
eigenvals_callback.compile()

# Rebuild the meta-learner and compile its prediction model with a placeholder
# loss and a zero-learning-rate optimizer.
meta_model = lstm_meta_learner(learner, eigenvals_callback, conf)
meta_model.predict_model.compile(loss='mae',  # we don't use loss here anyway
                                 optimizer=SGD(lr=0.0))  # dummy optimizer

# Load the trained meta-learner weights stored in $LOG_DIR.
best_meta_learner_weights_path = os.path.join(os.environ['LOG_DIR'], "meta_weights.h5")
meta_model.load_weights(best_meta_learner_weights_path)

meta_optimizer = meta_model.predict_model

def meta_opt(train_x, train_y):
    """Point the eigenvalue callback at the given data, reset the meta-optimizer state and return it."""
    eigenvals_callback.X = train_x
    eigenvals_callback.y = train_y

    # the same meta_optimizer object is returned on every call, so its state is reset first
    meta_optimizer.reset_states()
    return meta_optimizer

print("*" * 50)
print("ANALYSIS OF META OPTIMIZER")
print("*" * 50)
meta_analysis = analyze_optimizer_training(meta_opt, trainings_per_dataset = 3)

In [None]:
import pandas as pd

# Series recorded while training the learner with the meta-optimizer.
train_losses, train_accuracies, valid_losses, valid_accuracies, hessian_eigen = meta_analysis

# Average every series over axis 0 (presumably the individual trainings -- TODO
# confirm), leaving one value per remaining index.
avg_train_losses = train_losses.mean(axis=0)
avg_train_accuracies = train_accuracies.mean(axis=0)
avg_valid_losses = valid_losses.mean(axis=0)
avg_valid_accuracies = valid_accuracies.mean(axis=0)
avg_hessian_eigen = hessian_eigen.mean(axis=0)

df = pd.DataFrame({'train_loss': avg_train_losses, 'train_acc': avg_train_accuracies,
                   'valid_loss': avg_valid_losses, 'valid_acc': avg_valid_accuracies,
                   'hessian_eigen': avg_hessian_eigen})
# `key` is passed by keyword: newer pandas releases deprecate passing it positionally.
df.to_hdf('../log/learner_training_shape.h5', key='training_history')