In [1]:
%matplotlib qt4
from __future__ import division

from models import tools, optimize, models, filters
from models.tests import PerformanceTest

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# The Naive Way

In [None]:
# Load the data set used by the naive-descent experiments in this section.
# limit/offset are forwarded to tools.load_data -- presumably a maximum row
# count and a starting row; confirm against tools.load_data.
data = tools.load_data(limit=40000, offset=2400000)

In [None]:
# Naive descent optimiser built over `data`; its search_pfa / search_pfag
# methods are called in the following cells.
grad = optimize.NaiveDescent(data)

In [None]:
# First PFA search. The two positional arguments are presumably the starting
# gamma and delta -- TODO confirm against optimize.NaiveDescent.search_pfa.
# The result exposes .params and .grads, which the plotting cell reads.
descent1 = grad.search_pfa(1.5, -2, step_size=3, maxiter=100, precision=0.005)

In [None]:
# Second PFA search with different initial arguments (5, 0.5) and a smaller
# step size (2.5) than descent1.
# NOTE(review): descent2 is not referenced by any other visible cell.
descent2 = grad.search_pfa(5, 0.5, step_size=2.5, maxiter=100, precision=0.005)

In [None]:
# Same initial arguments as descent1, but using search_pfag with a much larger
# step size (20) and at most 36 iterations. Its .grads / .params are plotted
# next to descent1 in the plotting cell.
descent3 = grad.search_pfag(1.5, -2, step_size=20, maxiter=36, precision=0.005)

In [None]:
# Run a performance test of a PFA model on `data`. The PFA model is constructed
# with an Elo model as its first argument and fixed gamma/delta values.
# NOTE(review): the gamma/delta literals are hard-coded -- presumably copied
# from the output of an earlier descent run; confirm their origin.
elo = models.EloModel()
pfa = models.PFAModel(elo, gamma=2.99622612646, delta=-0.476090204636)
pfa_test = PerformanceTest(pfa, data)
pfa_test.run()

In [None]:
# Display the 'train' entry of the results collected by pfa_test.run().
pfa_test.results['train']

In [None]:
def annotate(descent, number, mark, xadd, yadd):
    """Label one iteration of a descent run on the current pyplot axes.

    Looks up the parameters and the gradient recorded under ``number``
    (``descent.params.loc[number]`` and ``descent.grads.loc[number]``), draws
    a boxed text with gamma and delta rounded to two decimals, and plots a
    marker on the point itself.

    Args:
        descent: object exposing ``params`` and ``grads`` with ``.loc`` lookup;
            the looked-up params entry must have ``gamma`` and ``delta``
            attributes.
        number: iteration label; also used as the x coordinate of the point.
        mark: matplotlib format string for the marker, e.g. ``'go'``.
        xadd: x offset of the text box from the point, in data coordinates.
        yadd: y offset of the text box from the point, in data coordinates.
    """
    params_at_step = descent.params.loc[number]
    grad_at_step = descent.grads.loc[number]

    gamma = round(params_at_step.gamma, 2)
    delta = round(params_at_step.delta, 2)
    label = r'$\gamma={}$, $\delta={}$'.format(gamma, delta)

    point = (number, grad_at_step)
    text_position = (number + xadd, grad_at_step + yadd)
    plt.annotate(label,
                 xy=point, xycoords='data',
                 xytext=text_position, textcoords='data',
                 bbox=dict(boxstyle="round", fc="w"))
    # Marker on the annotated point itself.
    plt.plot(number, grad_at_step, mark)
    
# Compare the gradients of two naive descent runs (descent1 vs. descent3) and
# label one iteration on each curve with its gamma/delta values.
#
# Alternative annotation points (disabled):
#annotate(descent1, 1, 'go', 0.8, -0.006)
#annotate(descent1, 10, 'go', 0.8, -0.006)
#annotate(descent3, 1, 'ro', 0.7, 0.004)
#annotate(descent3, 11, 'ro', 0.8, 0.004)
annotated_points = [
    # (run, iteration, marker, text x-offset, text y-offset)
    (descent1, 34, 'go', -8, -0.009),
    (descent3, 20, 'ro', 0.8, 0.006),
]
for run, iteration, marker, dx, dy in annotated_points:
    annotate(run, iteration, marker, dx, dy)

plt.ylabel('predicted - observed')
plt.xlabel('number of iteration')

plt.ylim(-0.08, 0.03)
plt.xlim(0, 35)

# The curves are drawn after the markers, as before, so the stacking order of
# the artists is unchanged.
(line1,) = plt.plot(descent1.grads[:35], 'g', label=r'step size = $3$')
(line2,) = plt.plot(descent3.grads[:36], 'r', label=r'step size = $20$')
plt.legend(handles=[line1, line2], loc='lower right')

plt.show()

# The Proper Way

In [48]:
# Re-import models.filters so edits to the module are picked up without
# restarting the kernel. `reload` is the Python 2 builtin.
reload(filters)

<module 'models.filters' from '/home/pavel/Projects/thesis/models/filters.py'>

In [53]:
# Load a larger slice for the gradient-descent section.
# NOTE(review): this rebinds the `data` name used by the naive section above;
# re-running those earlier cells afterwards will operate on this slice instead.
data = tools.load_data(limit=500000, offset=1500000)

Loaded 474169 answers.


In [54]:
# Keep only the rows selected by filters.sequentize and report how many remain.
# NOTE(review): `data` is overwritten in place, so re-running this cell applies
# the filter again to the already-filtered frame.
data = data[filters.sequentize(data)]
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(len(data))

146596


In [55]:
# Gradient-descent optimiser over the filtered `data`.
descent = optimize.GradientDescent(data)

In [56]:
# Run the staircase search for 20 iterations. It prints one progress row per
# iteration plus an initial row (see the cell output).
# NOTE(review): the printed columns look like gamma, delta and an error/step
# measure -- confirm against optimize.GradientDescent.search_staircase.
descent_result = descent.search_staircase(init_learn_rate=0.015, number_of_iter=20)

   2.50000    0.80000        inf
   2.08562    0.50126    0.00253
   1.99518    0.64475    0.00089
   1.97997    0.76950    0.00112
   1.95207    0.82218    0.00128
   1.92511    0.83874    0.00137
   1.90404    0.83897    0.00141
   1.88922    0.83201    0.00143
   1.87962    0.82218    0.00145
   1.87397    0.81153    0.00145
   1.87118    0.80106    0.00146
   1.87035    0.79121    0.00146
   1.87084    0.78215    0.00146
   1.87218    0.77394    0.00146
   1.87402    0.76653    0.00145
   1.87614    0.75987    0.00145
   1.87837    0.75390    0.00145
   1.88061    0.74856    0.00145
   1.88278    0.74377    0.00145
   1.88484    0.73949    0.00145
   1.88675    0.73568    0.00145


In [57]:
# Plot the search result; the call returns a list of matplotlib Line2D objects,
# which is what the cell echoes as its output.
descent_result.plot()

[<matplotlib.lines.Line2D at 0x7f492d7cb090>]

In [17]:
# Plot the two result sets and build a legend showing each one's best
# gamma/delta values.
# NOTE(review): `classmates` and `non_classmates` are not defined in any visible
# cell -- this cell depends on kernel state created elsewhere.
classmates_p, = classmates.plot(color='orange')
non_classmates_p, = non_classmates.plot(color='#02A5F4')
# Raw string so that \gamma and \delta reach matplotlib's mathtext verbatim
# instead of relying on Python leaving the unknown escapes \g and \d untouched.
# NOTE(review): the '-' before delta is hard-coded; presumably `best['delta']`
# is stored as a positive magnitude -- confirm.
gamma_delta = r'($\gamma = {0[gamma]:.3f}, \delta = -{0[delta]:.3f}$)'
plt.legend([classmates_p, non_classmates_p],
           [('Classmates ' + gamma_delta).format(classmates.best),
            ('All the others ' + gamma_delta).format(non_classmates.best)])

<matplotlib.legend.Legend at 0x7fe4ca850350>

In [2]:
# Load ten consecutive 500,000-row windows (starting at offset 1,000,000),
# filter each with filters.sequentize, then fit a staircase search on every
# window. Produces `train_data` (filtered frames) and `results` (one search
# result per frame).
n_chunks = 10
chunk_size = 500000
first_offset = 1000000

train_data = []
for i in range(n_chunks):
    answers = tools.load_data(limit=chunk_size,
                              offset=first_offset + i * chunk_size,
                              echo_loaded=False)
    answers = answers[filters.sequentize(answers)]
    # Store an independent copy of the filtered frame.
    train_data.append(answers.copy())
    tools.echo('[{}]: Loaded {} answers.'.format(i, len(answers)), clear=False)

tools.echo('Data loaded.', clear=False)

results = []
for i, answers in enumerate(train_data):
    res = optimize.GradientDescent(answers).search_staircase(
        init_learn_rate=0.01, number_of_iter=25, echo_iterations=False)
    results.append(res)
    tools.echo('[{}]: done!'.format(i), clear=False)

[0]: Loaded 91242 answers.
[1]: Loaded 146596 answers.
[2]: Loaded 106387 answers.
[3]: Loaded 76013 answers.
[4]: Loaded 52801 answers.
[5]: Loaded 37838 answers.
[6]: Loaded 26913 answers.
[7]: Loaded 50341 answers.
[8]: Loaded 40548 answers.
[9]: Loaded 32522 answers.
Data loaded.
[0]: done!
[1]: done!
[2]: done!
[3]: done!
[4]: done!
[5]: done!
[6]: done!
[7]: done!
[8]: done!
[9]: done!


In [29]:
# Plot the final staircase values averaged over all fitted results, with error
# bars, against the mean time of each interval.

# One row per result: x = mean recorded time per interval, y = staircase value.
x_matrix = []
y_matrix = []
for res in results:
    # Last recorded staircase, ordered by its interval key.
    stairs = sorted(res.staircases[-1].items(), key=lambda x: x[0])
    staircase_times = res.model.metadata['staircase_times']

    # NOTE(review): x follows the order of res.intervals while y follows the
    # sorted key order -- assumes res.intervals is already sorted; confirm.
    xi_axis = [np.mean(staircase_times[i]) for i in res.intervals]
    yi_axis = [value for interval, value in stairs]

    x_matrix.append(xi_axis)
    y_matrix.append(yi_axis)

# Aggregate column-wise across results; the number of points is taken from the
# first result.
x_axis = []
y_axis = []
e_vals = []
for i in range(len(x_matrix[0])):
    x_column = [row[i] for row in x_matrix]
    y_column = [row[i] for row in y_matrix]
    x_axis.append(np.mean(x_column))
    y_axis.append(np.mean(y_column))
    # Error bars extend half a standard deviation on each side of the mean.
    e_vals.append(np.std(y_column) / 2.)

plt.figure(num=None, figsize=(8, 6), dpi=160)
# Widths are passed as numbers; string widths are not part of the documented
# errorbar interface.
plt.errorbar(x_axis, y_axis, e_vals,
             ecolor='orange', elinewidth=2,
             linestyle='--', linewidth=2,
             capthick=2, capsize=4,
             color='#02A5F4', marker='o')

plt.xscale('log')
plt.xlabel('Time from previous attempt in seconds.')
plt.ylabel('Increase in memory activation')
plt.xlim([30, 1e6])

# Adjust the layout before showing: after show() a blocking or inline backend
# has already rendered the figure, so a later tight_layout() has no effect.
plt.tight_layout()
plt.show()

In [None]:
# Average the final gamma and delta over all fitted results and echo them.
# NOTE(review): despite the `_std` suffix these hold np.mean values, not
# standard deviations -- presumably "std" stands for "standard"; confirm or
# rename.
gamma_std = np.mean([res.gammas[-1] for res in results])
delta_std = np.mean([res.deltas[-1] for res in results])
tools.echo('gamma={:.3f}, delta={:.3f}'.format(gamma_std, delta_std), clear=False)