In [1]:
%matplotlib qt4
from __future__ import division

from models import tools, optimize, models, filters
from models.tests import PerformanceTest

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# The Naive Way

In [None]:
# Load the answers used by the "naive" descent experiments below.
# NOTE(review): `limit`/`offset` presumably select a window of answers in the
# underlying data set -- confirm against tools.load_data.
# `data` is rebound to a different slice later in this notebook.
data = tools.load_data(limit=40000, offset=2400000)

In [None]:
# Optimiser over `data`; its search_pfa / search_pfag methods are called in the
# following cells.
grad = optimize.NaiveDescent(data)

In [None]:
# Run 1 (green curve in the plot below): search_pfa with step_size=3.
# NOTE(review): the two positional arguments are presumably the initial gamma
# and delta -- confirm against NaiveDescent.search_pfa.
descent1 = grad.search_pfa(1.5, -2, step_size=3, maxiter=100, precision=0.005)

In [None]:
# Run 2: same search from a different starting pair and a smaller step size.
# NOTE(review): `descent2` is not referenced anywhere else in the visible
# notebook -- confirm whether this run is still needed.
descent2 = grad.search_pfa(5, 0.5, step_size=2.5, maxiter=100, precision=0.005)

In [None]:
# Run 3 (red curve in the plot below): search_pfag with step_size=20, capped at
# 36 iterations.
# NOTE(review): how search_pfag differs from search_pfa is not visible here --
# see models.optimize.
descent3 = grad.search_pfag(1.5, -2, step_size=20, maxiter=36, precision=0.005)

In [None]:
# Evaluate a PFA model with fixed gamma/delta on `data`.
# NOTE(review): the gamma/delta literals look like the outcome of an earlier
# search -- confirm their provenance and record it here.
elo = models.EloModel()
# The Elo model instance is handed to PFAModel as its first argument.
pfa = models.PFAModel(elo, gamma=2.99622612646, delta=-0.476090204636)
pfa_test = PerformanceTest(pfa, data)
# run() populates pfa_test.results, which the next cell displays.
pfa_test.run()

In [None]:
# Display the 'train' entry of the results collected by pfa_test.run().
pfa_test.results['train']

In [None]:
def annotate(descent, number, mark, xadd, yadd):
    """Label a single iteration of a descent run on the current pyplot axes.

    A text box showing the rounded gamma/delta of the chosen iteration is
    attached to the point ``(number, grads[number])`` and a marker is drawn
    at that point.

    Args:
        descent: search result exposing ``params`` (rows with ``gamma`` and
            ``delta`` attributes) and ``grads``; both are looked up with
            ``.loc[number]``.
        number: iteration label to annotate (also used as the x coordinate).
        mark: matplotlib format string for the marker, e.g. ``'go'``.
        xadd: horizontal offset of the text box, in data coordinates.
        yadd: vertical offset of the text box, in data coordinates.
    """
    params_at = descent.params.loc[number]
    grad_at = descent.grads.loc[number]

    # Parameters are shown with two decimals in the label.
    gamma = round(params_at.gamma, 2)
    delta = round(params_at.delta, 2)
    caption = r'$\gamma={}$, $\delta={}$'.format(gamma, delta)

    point = (number, grad_at)
    text_position = (number + xadd, grad_at + yadd)

    plt.annotate(caption,
                 xy=point, xycoords='data',
                 xytext=text_position, textcoords='data',
                 bbox={'boxstyle': "round", 'fc': "w"})
    plt.plot(number, grad_at, mark)
    
# Compare the error curves of the two naive runs (descent1 vs descent3).
# All calls below act on pyplot's implicit "current axes", so their relative
# order is kept exactly as is.

# Mark one iteration of the step-size-3 run with its gamma/delta values.
# The commented-out calls are alternative marker positions for the same run.
#annotate(descent1, 1, 'go', 0.8, -0.006)
#annotate(descent1, 10, 'go', 0.8, -0.006)
annotate(descent1, 34, 'go', -8, -0.009)

# Same for the step-size-20 run.
#annotate(descent3, 1, 'ro', 0.7, 0.004)
#annotate(descent3, 11, 'ro', 0.8, 0.004)
annotate(descent3, 20, 'ro', 0.8, 0.006)

plt.xlabel('number of iteration')
plt.ylabel('predicted - observed')

# Fixed viewport; set before the curves are drawn.
plt.xlim([0, 35])
plt.ylim([-0.08, 0.03])

# plt.plot returns a list of lines; the trailing comma unpacks the single handle.
line1, = plt.plot(descent1.grads[:35], 'g', label=r'step size = $3$')
line2, = plt.plot(descent3.grads[:36], 'r', label=r'step size = $20$')

# Only the two curves (not the annotation markers) appear in the legend.
plt.legend(handles=[line1, line2], loc='lower right')

plt.show()

# The Proper Way

In [22]:
# Development aid: re-import models.optimize so edits to the module are picked
# up without restarting the kernel.
# NOTE(review): bare `reload` is a builtin only on Python 2.
reload(optimize)

<module 'models.optimize' from '/home/pavel/Projects/thesis/models/optimize.pyc'>

In [3]:
# Rebind `data` to a different slice for the gradient-descent experiments
# (the recorded run of this cell reported "Loaded 275846 answers.").
data = tools.load_data(limit=300000, offset=1500000)

Loaded 275846 answers.


In [4]:
# Narrow `data` to whatever filters.sequentize selects, then report how many
# rows are left.
# NOTE(review): `data` is overwritten, so the unfiltered frame is only
# recoverable by re-running the load cell.
selector = filters.sequentize(data)
data = data[selector]
print(len(data))

68155


In [35]:
# A single GradientDescent instance over `data`, shared by every configuration
# listed below.
descent = optimize.GradientDescent(data)
# Maps a display name to (optimiser, extra keyword arguments); the search cell
# forwards the keyword arguments to search_staircase.
descents = {
    'Random Factor': (descent, {'random_factor': 1,
                                'random_chance': 20}),
    # 'No Random Factor': (descent, {}),
}

In [36]:
# Run a staircase search for every configuration in `descents` and store the
# result under the configuration's display name.
#
# `dresults` is created only when it does not exist yet: on a fresh kernel the
# cell no longer fails with NameError (its initialisation used to be commented
# out), while re-running the cell with a different `descents` selection still
# adds to the results gathered so far.
try:
    dresults
except NameError:
    dresults = {}

# The loop variable is deliberately not called `descent`, so the notebook-level
# `descent` object is not rebound as a side effect of iterating.
for name, (searcher, kwargs) in descents.items():
    tools.echo(name, clear=False)
    dresults[name] = searcher.search_staircase(
        init_learn_rate=0.015,
        number_of_iter=20,
        **kwargs
    )

Random Factor
   2.50000    0.80000        inf
  10.84222   15.21756    0.00004
   3.96198    3.51653   -0.00421
   2.49202    1.47933    0.00191
   2.10914    1.87744    0.00170
   2.71558    2.00644    0.00207
   2.23710    1.57726    0.00201
   1.30333    1.02334    0.00115
   1.47044    1.23090    0.00153
   1.60860    1.27317    0.00121
   1.64358    1.15131    0.00068
   1.70722    1.18367    0.00060
   1.73790    1.02023   -0.00017
   1.76808    1.20257    0.00034
   1.91400    1.21920    0.00056
   1.78801    1.14813    0.00064
   1.68225    1.10568    0.00111
   1.74443    1.08445    0.00086
   1.73182    1.05565    0.00131
   1.65636    1.03002    0.00150
   1.71638    1.08090    0.00048


In [37]:
# Draw one curve per stored search result and remember (name, handle, result)
# so that a legend can be built afterwards.
plots = []
for name, dresult in dresults.items():
    # dresult.plot() is unpacked as a one-element sequence (the curve handle).
    handle, = dresult.plot()
    plots.append((name, handle, dresult))

# A legend is only needed when there is more than one curve to tell apart.
if len(plots) > 1:
    # Raw string: "\g" and "\d" are not valid escape sequences, so the non-raw
    # form only worked by accident and triggers warnings on modern Python.
    # NOTE(review): the minus sign in front of delta is hard-coded; a negative
    # stored delta would be rendered as "--0.xxx" -- confirm the sign convention.
    gamma_delta = r' ($\gamma = {0[gamma]:.3f}, \delta = -{0[delta]:.3f}$)'
    legend_handles = [entry[1] for entry in plots]
    legend_labels = [entry[0] + gamma_delta.format(entry[2].best)
                     for entry in plots]
    plt.legend(legend_handles, legend_labels)

In [38]:
# Stage 1: load ten consecutive windows of 500000 answers each (offsets
# 1000000, 1500000, ...) and keep only what filters.sequentize selects.
train_data = []
for i in range(10):
    answers = tools.load_data(limit=500000,
                              offset=1000000 + i * 500000,
                              echo_loaded=False)
    answers = answers[filters.sequentize(answers)].copy()
    train_data.append(answers)
    tools.echo('[{}]: Loaded {} answers.'.format(i, len(answers)), clear=False)

tools.echo('Data loaded.', clear=False)

# Stage 2: run an independent staircase search on every window.
# NOTE(review): random_factor / random_chance suggest a stochastic search and
# no seed is set in this cell -- confirm whether results are reproducible.
results = []
for i, answers in enumerate(train_data):
    descent = optimize.GradientDescent(answers)
    res = descent.search_staircase(
        init_learn_rate=0.01,
        number_of_iter=25,
        echo_iterations=False,
        random_factor=1,
        random_chance=20,
    )
    results.append(res)
    tools.echo('[{}]: done!'.format(i), clear=False)

[0]: Loaded 91242 answers.
[1]: Loaded 146596 answers.
[2]: Loaded 106387 answers.
[3]: Loaded 76013 answers.
[4]: Loaded 52801 answers.
[5]: Loaded 37838 answers.
[6]: Loaded 26913 answers.
[7]: Loaded 50341 answers.
[8]: Loaded 40548 answers.
[9]: Loaded 32522 answers.
Data loaded.
[0]: done!
[1]: done!
[2]: done!
[3]: done!
[4]: done!
[5]: done!
[6]: done!
[7]: done!
[8]: done!
[9]: done!


In [48]:
# Average the final staircase of every training run and plot it with error
# bars against the mean time of each staircase interval.

# One row per run: x = mean time recorded for each interval, y = value stored
# for each interval in the run's last staircase.
x_matrix = []
y_matrix = []
for res in results:
    # Items of the last staircase, ordered by interval key.
    stairs = sorted(res.staircases[-1].items(), key=lambda item: item[0])
    staircase_times = res.model.metadata['staircase_times']

    # NOTE(review): x follows the order of res.intervals while y follows the
    # sorted key order of the staircase -- verify that both orders agree.
    x_matrix.append([np.mean(staircase_times[i]) for i in res.intervals])
    y_matrix.append([value for _interval, value in stairs])

# Column-wise aggregation across runs. The number of columns is taken from the
# first run, exactly as before.
n_runs = len(x_matrix)
x_axis = []
y_axis = []
e_vals = []
for col in range(len(x_matrix[0])):
    x_col = [x_matrix[run][col] for run in range(n_runs)]
    y_col = [y_matrix[run][col] for run in range(n_runs)]
    x_axis.append(np.mean(x_col))
    y_axis.append(np.mean(y_col))
    # Error bars show half of the standard deviation across runs.
    e_vals.append(np.std(y_col) / 2.)

plt.figure(num=None, figsize=(8, 6), dpi=160)
# Widths are given as numbers; matplotlib documents these parameters as floats.
plt.errorbar(x_axis, y_axis, e_vals,
             ecolor='orange', elinewidth=2,
             linestyle='--', linewidth=2,
             capthick=2, capsize=4,
             color='#02A5F4', marker='o')

plt.xscale('log')
plt.xlabel('Time from previous attempt in seconds')
plt.ylabel('Increase in memory activation')
plt.xlim([30, 1e6])

# The layout must be adjusted before the figure is shown; calling it after
# plt.show() is too late for a blocking backend.
plt.tight_layout()
plt.show()

In [45]:
# Spread and average of the last gamma/delta value of every training run.
final_gammas = [res.gammas[-1] for res in results]
final_deltas = [res.deltas[-1] for res in results]

gamma_std, delta_std = np.std(final_gammas), np.std(final_deltas)
gamma_mean, delta_mean = np.mean(final_gammas), np.mean(final_deltas)

std_line = 'std: gamma={:.3f}, delta={:.3f}'.format(gamma_std, delta_std)
avg_line = 'avg: gamma={:.3f}, delta={:.3f}'.format(gamma_mean, delta_mean)
tools.echo(std_line, clear=False)
tools.echo(avg_line, clear=False)

std: gamma=0.276, delta=0.095
avg: gamma=1.814, delta=0.827
