# Analysis
The previous notebook carried out the brute-force hyperparameter search. In contrast, this notebook focuses on the analysis of these data. For that purpose it plots the averaged results of the exhaustive parameter search per corpus and sample size used for training, followed by a discussion of the results.
The second part of this notebook deals with the most successful configurations rather than with averaged results.

The `plot_correletaion_matrix` function plots the results of the parameter search for one corpus at a given sample size.

In [None]:
%pylab inline
from matplotlib import pyplot as plt
from itertools import product, cycle
from collections import OrderedDict, defaultdict

def plot_correletaion_matrix(df, parameter, sample_size, title):
    """Plot a matrix of pairwise parameter interactions for one sample size.

    The figure has one row and one column per key of ``parameter``.  The
    subplot in a given row/column draws one line per value of the row
    variable, showing the mean of the ``mean`` column against the values of
    the column variable.  Subplots on the diagonal carry the legend for
    their row, and all subplots of a row share the same y-range.

    Relies on names defined elsewhere in the notebook: ``np``, ``plt`` and
    the module-level ``plot_params`` dict (x-axis scale per variable).

    Args:
        df: Data frame with a ``sample_size`` column, a ``mean`` column and
            one column per key of ``parameter``.
        parameter: Mapping from variable name to the values that are used as
            x-axis ticks in the bottom row (presumably the grid-search
            values -- confirm against the previous notebook).
        sample_size: Only rows with this ``sample_size`` are plotted.
        title: Text drawn centred above the figure.
    """
    variables = list(parameter.keys())
    num_vars = len(variables)
    df = df.loc[df['sample_size'] == sample_size]

    # Mean score per value combination, keyed by (column variable, row
    # variable).  Computed once and reused for the limits and the plotting.
    grouped_means = OrderedDict()
    for x, y in product(variables, variables):
        grouped_means[(y, x)] = df['mean'].groupby([df[x], df[y]]).mean()

    colors = ['tomato', 'black', 'green', 'c', 'gold', 'tan', 'royalblue', 'brown', 'purple']

    # calc min and max values per row
    min_max = defaultdict(lambda: {'min': 1, 'max': 0})
    for (_, variable), means in grouped_means.items():
        limits = min_max[variable]
        # Argument order matters: the current limit is kept when the new
        # value is NaN (e.g. for an empty group).
        limits['min'] = min(limits['min'], means.min())
        limits['max'] = max(limits['max'], means.max())

    # squeeze=False keeps ``ax`` two-dimensional even for a single variable
    fig, ax = plt.subplots(num_vars, num_vars, figsize=(9.75, 9.75), squeeze=False)

    grid_positions = product(range(num_vars), range(num_vars))
    for p_coor, (param_names, means) in zip(grid_positions, grouped_means.items()):
        axis = ax[p_coor[0], p_coor[1]]
        table = means.unstack()
        data = {row: np.array([(x, y) for x, y in values.items()])
                for row, values in table.iterrows()}

        # set log scale if config demands
        if plot_params[param_names[0]]['scale'] == 'log':
            axis.set_xscale('log')

        for c, (k, v) in zip(cycle(colors), data.items()):
            axis.plot(v[:, 0], v[:, 1], c=c, label=k)

        if not data:
            # nothing was drawn, leave the empty subplot untouched
            continue

        axis.set_ylim(
            max(min_max[param_names[1]]['min'] * .95, 0),
            min(min_max[param_names[1]]['max'] * 1.05, 1)
        )

        # special treatment for diagonal items
        if p_coor[0] == p_coor[1]:
            axis.legend(loc='center right')
            # Booleans instead of 'off': current Matplotlib treats the
            # string as truthy and would leave ticks and grid switched on.
            axis.tick_params(axis='both', which='both',
                             left=False, bottom=False, right=False, top=False)
            axis.grid(False)

    for axis in ax[:-1, :].flatten():
        axis.tick_params(
            axis='x',
            which='both',
            top=False,
            labelbottom=False)
        axis.tick_params(
            axis='y',
            which='both',
            right=False)

    # horizontal labels
    for (variable, value), axis in zip(parameter.items(), ax[-1, :].flatten()):
        axis.xaxis.set_label_text(variable)
        axis.xaxis.set_ticks(value)

    # vertical labels
    for v, axis in zip(variables, ax[:, 0].flatten()):
        axis.yaxis.set_label_text(v)

    # disable y-axis tick label for most subplots
    for axis in ax[:, 1:].flatten():
        axis.tick_params(
            axis='y',
            labelleft=False)

    plt.tight_layout()
    fig.text(.5, 1, title, horizontalalignment='center', fontsize=20)
    plt.show()

In the following cell the meta configuration (i.e. sample sizes and corpus configurations) from the previous notebook is loaded, and the parameters for the plotting are set. **Note:** Enter the actual filename into the `results` tuple. The `corpus` variable also needs to be set to the corpus that you wish to analyze.

In [None]:
import pandas as pd
import pickle

experiment_data = None
corpus = 'amazon'
# One-element tuple of result files.  The trailing comma is required: without
# it ``results`` is a plain string and ``results[-1]`` is only its last
# character instead of the filename.
results = ('YYYYMMDD-HH:MM_gs_results.pkl',)  # enter actual filename here
# NOTE: unpickling can execute arbitrary code -- only load result files that
# you created yourself.
with open(results[-1], 'rb') as fh:
    experiment_data = pickle.load(fh)

# largest training sample size of the experiment
max_sample_size = max(experiment_data['sample_sizes'])

# LaTeX fragments used to label each grid-search variable in the figures.
experiment_data['param_labels'] = dict([
    ('trans__size', r'd'),
    ('trans__iter', r'epoch'),
    ('trans__alpha', r'\alpha'),
    ('trans__negative', r'ns'),
    ('trans__window', r'win'),
    ('trans__hs', r'hs'),
    ('trans__dm', r'arch'),
    ('cls__k', r'k'),
])

# x-axis scale ('linear' or 'log') to use for each variable when plotting
plot_params = {
    name: {'scale': scale}
    for name, scale in (
        ('trans__size', 'linear'),
        ('trans__iter', 'linear'),
        ('trans__alpha', 'log'),
        ('trans__min_alpha', 'log'),
        ('trans__negative', 'linear'),
        ('trans__min_count', 'linear'),
        ('trans__window', 'linear'),
        ('trans__hs', 'linear'),
        ('trans__dm', 'linear'),
        ('cls__k', 'linear'),
        ('sample_size', 'log'),
    )
}

# Plotting style

In [None]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from itertools import product

# Colors and fonts: global Matplotlib/seaborn settings that apply to every
# figure created after this cell has run.
plt.style.use('seaborn-paper')
sns.set_style("white")
sns.set_context("paper")
mpl.rc("font", family="cmr10")

# Sizes: shared by the plotting functions below, which pass
# ``plot_width``/``plot_height`` as the figure size (inches) and ``fontsize``
# to axis labels and titles.
plot_width=4.802 # the width of a lncs column
plot_height = plot_width / 1.618  # width-to-height ratio of roughly the golden ratio
fontsize = 8

# linestyles
def linestyle_gen():
    """Return an iterator over all (color, linestyle) combinations.

    The pairs are produced color by color: every line style in black
    first, then every line style in grey.
    """
    black = (0, 0, 0)
    grey = (.6, .6, .6)
    solid, dashed, dotted = '-', '--', ':'
    return product((black, grey), (solid, dashed, dotted))

# Plot description
The following plots depict the results per corpus and sample size (that is, each plot deals with one sample size on one corpus).  
Each plot creates an overview of all parameters that were taken into account. The plots are organized in a row-wise manner; elements on the diagonal of these plots contain the legend for all the other plots in that row. Likewise, the labels on the left of the leftmost subplot in a row specify the variable of interest in that row.
All y-axes in the plots depict the mean success rate of the classification task. Each subplot describes the influence of the corresponding variables on the mean results of the parameter search.

In [None]:
from itertools import product

# One overview figure per corpus and sample size (corpora in the outer loop).
for name, subexperiment in experiment_data['corpora'].items():
    for size in experiment_data['sample_sizes']:
        plot_correletaion_matrix(
            df=subexperiment['gs_results'],
            parameter=experiment_data['gs_params'],
            sample_size=size,
            title='{}@{} Samples'.format(name, size),
        )

# Closer look onto some of the previous plots

In [None]:
import matplotlib.pylab as plt
from itertools import cycle

def plot_correlations(experiment_data, corpus, sample_size, params, y_margin=.1):
    """Plot the mean score against one parameter, one line per value of another.

    The figure is written to
    ``figures/correlation_<param1>_<param2>_<corpus>@<sample_size>.eps`` and
    then shown.  Relies on ``np``, ``plot_width``, ``plot_height`` and
    ``linestyle_gen`` being defined elsewhere in the notebook.

    Args:
        experiment_data: Dict providing
            ``['corpora'][corpus]['gs_results']`` (a data frame with
            ``sample_size`` and ``mean`` columns plus one column per
            parameter) and ``['param_labels']`` (LaTeX label per parameter).
        corpus: Key of the corpus inside ``experiment_data['corpora']``.
        sample_size: Only rows with this ``sample_size`` are used.
        params: 4-tuple ``(param1, param2, s, t)``: ``param1`` selects the
            lines, ``param2`` is the x-axis variable, ``s`` is the x-axis
            scale passed to ``set_xscale`` and ``t`` is a callable applied
            to each ``param1`` value before it is put into the legend.
        y_margin: Relative margin added below the smallest and above the
            largest plotted value; the y-range is clipped to ``[0, 1]``.
    """
    df = experiment_data['corpora'][corpus]['gs_results']
    df = df.loc[df['sample_size'] == sample_size]

    param1, param2, s, t = params
    param1_name = experiment_data['param_labels'][param1]
    param2_name = experiment_data['param_labels'][param2]

    # rows: values of param1 (one line each), columns: values of param2
    table = df['mean'].groupby([df[param1], df[param2]]).mean().unstack()

    fig, ax = plt.subplots(figsize=(plot_width, plot_height))

    for (instance, series), (c, linestyle) in zip(table.iterrows(), cycle(linestyle_gen())):
        data = np.array(list(series.items()))
        label = r'$\theta_{' + param1_name + r'}=' + str(t(instance)) + r'$'
        ax.plot(data[:, 0], data[:, 1], linestyle, color=c, label=label)

    ax.grid(True)
    ax.set_ylabel('Accuracy')
    ax.set_xlabel(r'$\theta_{' + param2_name + r'}$')
    ax.set_xscale(s)

    if table.size:
        # Limits are derived from ALL lines; taking them from the last line
        # only would clip every line that leaves that line's range.
        x_values = np.asarray(table.columns, dtype=float)
        ax.set_xlim((x_values.min(), x_values.max()))
        ax.set_ylim((
            max(0, np.nanmin(table.values) * (1 - y_margin)),
            min(1, np.nanmax(table.values) * (1 + y_margin))))
        ax.legend(loc='upper left')

    plt.savefig('figures/correlation_{}_{}_{}@{}.eps'
        .format(param1, param2, corpus, sample_size), bbox_inches='tight', )
    plt.show()

In [None]:
variables = experiment_data['gs_params'].keys()

# Visit every ordered pair of two different grid-search parameters.
for param1, param2 in product(variables, repeat=2):
    if param1 != param2:
        params = (param1, param2, 'linear', float)
        plot_correlations(experiment_data, corpus, max_sample_size, params, y_margin=.4)

# Best scoring results

In [None]:
def load_dataframe(experiment_data, corpus_name='amazon', sample_size=10000, start=None, end=None):
    """Return the grid-search results of one corpus, best score first.

    The ``gs_results`` frame of ``corpus_name`` is ordered by its ``mean``
    column (descending), reduced to the rows whose ``sample_size`` equals
    the requested one, and finally sliced positionally with ``start:end``.
    """
    results = experiment_data['corpora'][corpus_name]['gs_results']
    ranked = results.sort_values('mean', ascending=False)
    has_requested_size = ranked['sample_size'] == sample_size
    return ranked.loc[has_requested_size].iloc[slice(start, end)]

In [None]:
# Ranked results for the 20newsgroups corpus; show the ten best rows.
df = load_dataframe(experiment_data, '20newsgroups', 100000)
df.head(10)

In [None]:
# ten rows around the middle of `df`
df.iloc[slice(int(len(df) / 2 - 5), int(len(df) / 2 + 5))]

In [None]:
# last ten rows of `df`
df.tail(10)

In [None]:
from itertools import cycle

def plot_best_classifiers(experiment_data, parameter='trans__size'):
    """Plot ranked scores per value of ``parameter`` on a 2x2 grid.

    Each subplot covers one (corpus, sample size) combination -- at most
    four, because the grid is 2x2 -- and contains one line per value of
    ``parameter``: the ``mean`` scores of all configurations with that
    value, best first, drawn against their rank.  The figure is written to
    ``figures/best_classifiers_<label>.eps`` and then shown.

    Relies on ``np``, ``product``, ``plot_width``, ``plot_height``,
    ``fontsize``, ``linestyle_gen`` and ``load_dataframe`` being defined
    elsewhere in the notebook.

    Args:
        experiment_data: Dict with the keys ``corpora``, ``sample_sizes``,
            ``param_labels`` and ``gs_params``.
        parameter: Name of the grid-search variable to split the lines by.
    """
    corpora = list(experiment_data['corpora'].keys())
    sample_sizes = experiment_data['sample_sizes']
    param_name = experiment_data['param_labels'][parameter]
    instances = experiment_data['gs_params'][parameter]

    fig, axis = plt.subplots(2, 2, figsize=(plot_width, plot_height))

    for ax, (corpus, sample_size) in zip(axis.flatten(), product(corpora, sample_sizes)):
        # The ranking does not depend on the parameter value, so it is loaded
        # (and sorted) once per subplot instead of once per line.
        df = load_dataframe(experiment_data, corpus_name=corpus, sample_size=sample_size)

        longest = 0
        for instance, (c, s) in zip(instances, cycle(linestyle_gen())):
            df_sub = df.loc[df[parameter] == instance]
            label = r'$\theta_{' + param_name + '}=' + str(instance) + '$'
            ax.plot(range(len(df_sub)), df_sub['mean'], color=c, linestyle=s, label=label)
            longest = max(longest, len(df_sub))

        ax.set_xlabel('Rank', fontsize=fontsize)
        ax.set_ylabel('Accuracy', fontsize=fontsize)
        ax.set_title('{}, {} training docs'.format(corpus, sample_size), fontsize=fontsize)
        ax.set_ylim((0, 1))
        if longest:
            # five equally spaced ticks; skipped for empty data because the
            # step would be zero
            ax.set_xticks([int(tick) for tick in np.arange(0, longest + 1, longest / 5)])
        ax.grid(True)
        # ``set_axis_bgcolor`` was replaced by ``set_facecolor`` in newer
        # Matplotlib releases; use whichever this installation provides.
        set_background = getattr(ax, 'set_facecolor', None) or getattr(ax, 'set_axis_bgcolor')
        set_background('white')

    plt.rc('xtick', labelsize=fontsize)
    plt.rc('ytick', labelsize=fontsize)
    plt.rc('axes', labelsize=fontsize)

    plt.legend(loc="upper left", bbox_to_anchor=(1.1, 2.8))
    plt.subplots_adjust(wspace=.35, hspace=.65)

    plt.savefig('figures/best_classifiers_{}.eps'.format(param_name), bbox_inches='tight')
    plt.show()

In [None]:
# one ranking figure per grid-search variable
for param in experiment_data['gs_params']:
    plot_best_classifiers(experiment_data, param)

# Probabilistic View 
Let's assume there is a function $f: \mathbb{R}^{n} \to [0, 1]$, $\theta \mapsto f(\theta)$, that defines the success rate of our model in the classification task at hand. As this function is unknown, we can't examine it analytically.  
But by means of the gridsearch that we have carried out, we possess several instances of this function like for example:

$$f(\theta_1=20, \theta_2=0.1, \ldots, \theta_n=100) = 0.659.$$

We can gain further insight if we investigate these instances probabilistically. We can choose an arbitrary instance of a variable (e.g. $\theta_{learning\,rate}=.25$) and consider all data points that fulfill this constraint as being sampled from:

$$f(\theta \setminus \theta_{learning\,rate} \, \mid \,\theta_{learning\,rate}=.25) \sim \mathcal{N}(\mu,\,\sigma^{2}).$$

Calculating $\mu$ and $\sigma^{2}$ allows us to plot and compare for example:  
$$f(\theta \setminus \theta_{learning\,rate} \, \mid \,\theta_{learning\,rate}=.25)$$
$$f(\theta \setminus \theta_{learning\,rate} \, \mid \,\theta_{learning\,rate}=.1)$$

In the following we will try this approach by using the $\theta_{size}$ parameter which determines the embedding size. We will plot $\mathcal{N}(\mu_{\theta_{size}=2},\,\sigma^{2}_{\theta_{size}=2})$, $\mathcal{N}(\mu_{\theta_{size}=8},\,\sigma^{2}_{\theta_{size}=8})$ and $\mathcal{N}(\mu_{\theta_{size}=24},\,\sigma^{2}_{\theta_{size}=24})$ as well as the histogram of the underlying data.

In [None]:
import numpy as np
import scipy.stats as stats
from matplotlib import pyplot as plt

# The surrounding text and the legend label analyse the embedding size, so
# that is the variable the data has to be split by.
parameter = 'trans__size'
df = load_dataframe(experiment_data, corpus_name='amazon', sample_size=max_sample_size)
for instance in experiment_data['gs_params'][parameter]:
    sub_df = df.loc[df[parameter] == instance]
    # sorted so that the fitted curve is drawn from left to right
    data = sorted(sub_df['mean'])
    # density of the normal distribution fitted to the scores of this value
    fit = stats.norm.pdf(data, np.mean(data), np.std(data))
    plt.plot(data, fit, '-', label=r'$\mathcal{N}_{size}=' + str(instance) + '$')
    # NOTE(review): `normed` is the legacy keyword; newer Matplotlib releases
    # expect `density=True` instead -- adapt to the installed version.
    plt.hist(data, alpha=.2, normed=True)

plt.yticks([])
plt.legend(loc=2)

plt.show()

From this plot we can see two interesting aspects:
1. We can see that 8 and 24 dimensional embeddings on average perform better than 2 dimensional ones.
2. The assumption that the sampled data is normally distributed is inapplicable.

### Gaussian Kernel Density Estimation
By means of Kernel Density Estimation (KDE) we can obtain a distribution that models the observed data much better than the normal distributions in the previous section. The following plots depict the probability density functions obtained from KDE. Comparing those to the histogram in the previous plot, it can be seen that they resemble the underlying data much better.

In [None]:
from scipy.stats.kde import gaussian_kde
from numpy import linspace
import matplotlib.pyplot as plt


def p_given(experiment_data, parameter, corpus_name='amazon', sample_size=10000):
    """Plot a kernel density estimate of the scores per value of ``parameter``.

    For every value of ``parameter`` a Gaussian KDE is fitted to the
    ``mean`` scores of all configurations using that value.  All estimates
    are drawn into one figure, which is written to
    ``figures/PDF_<label>_<corpus_name>@<sample_size>.eps`` and then shown.

    Relies on ``cycle``, ``plot_width``, ``plot_height``, ``fontsize``,
    ``linestyle_gen`` and ``load_dataframe`` being defined elsewhere in the
    notebook.

    Args:
        experiment_data: Dict with the keys ``corpora``, ``gs_params`` and
            ``param_labels``.
        parameter: Name of the grid-search variable to condition on.
        corpus_name: Corpus whose results are analysed; also used in the
            name of the output file.
        sample_size: Only results for this sample size are used.
    """
    df = load_dataframe(experiment_data, corpus_name=corpus_name, sample_size=sample_size)
    values = experiment_data['gs_params'][parameter]
    param_name = experiment_data['param_labels'][parameter]

    # scores per parameter value, each list in ascending order
    sorted_data = [sorted(df[df[parameter] == value]['mean']) for value in values]

    # kernel density estimation
    KDEs = [gaussian_kde(succ_rate) for succ_rate in sorted_data]

    min_val = df['mean'].min()
    max_val = df['mean'].max()
    x = linspace(min_val, max_val, 100)

    # NOTE(review): every density is scaled by the number of scores of the
    # FIRST parameter value -- confirm that all values have the same count.
    num_data = len(sorted_data[0])

    fig, ax = plt.subplots()
    fig.set_size_inches(plot_width, plot_height)

    for kde, (c, style), value in zip(KDEs, cycle(linestyle_gen()), values):
        label = r'$P(\theta \: | \: \theta_{' + param_name + '}=' + str(value) + ')$'
        ax.plot(x, kde(x)/num_data, color=c, linestyle=style, label=label)

    plt.rc('xtick', labelsize=fontsize)
    plt.rc('ytick', labelsize=fontsize)
    plt.rc('axes', labelsize=fontsize)

    plt.legend(loc=2)
    plt.grid(False)
    plt.xlabel('Accuracy', fontsize=fontsize)
    plt.ylabel('Probability', fontsize=fontsize)
    plt.xlim((min_val, max_val))

    # Name the file after the corpus that was actually plotted, not after the
    # notebook-level ``corpus`` variable.
    plt.savefig('figures/PDF_{}_{}@{}.eps'.format(param_name, corpus_name, sample_size), bbox_inches='tight')
    plt.show()

In [None]:
# one density figure per grid-search variable
for parameter in experiment_data['gs_params']:
    p_given(experiment_data, parameter, sample_size=max_sample_size, corpus_name=corpus)

The vast majority of the probability mass of $KDE(\theta_{size}=2)$ is between $0.6$ and $0.8$. In contrast, $KDE(\theta_{size}=8)$ and $KDE(\theta_{size}=24)$ perform better. But from this graph it is difficult to decide which of these is actually better. $KDE(\theta_{size}=8)$ has more probability mass between $0.7$ and $0.8$ but peaks higher at around $0.9$.