In [None]:
%matplotlib inline
import glob
import json
import os

import numpy as np
import pandas as pd
from bokeh.charts import Bar, Scatter, output_file
from bokeh.models import BoxSelectTool, BoxZoomTool, HoverTool
from bokeh.plotting import ColumnDataSource, figure, output_notebook, show
from lightjob.cli import load_db
from lightjob.db import SUCCESS

def moving(l):
    """Return the exponentially smoothed version of the sequence ``l``.

    The running value starts at 0 and, for every element, is updated as
    ``running * 0.9999 + element * 0.0001``. The result is a list holding
    the running value after each element, so it has as many entries as
    ``l``. Because the running value starts at 0, the first outputs are
    pulled towards 0.
    """
    def _smoothed():
        # Running average carried from one element to the next.
        running = 0.
        for sample in l:
            running = running * 0.9999 + sample * 0.0001
            yield running

    return list(_smoothed())
# Tell bokeh to render the figures passed to show() inline in this notebook.
output_notebook()

def get_df(db_path='../students/.lightjob', jobs_dir='../students/jobs',
           batch_size=32, nb_train_examples=40000):
    """Collect one summary row per successful job into a DataFrame.

    For every job returned by ``db.jobs_with(state=SUCCESS)`` this reads
    ``stats.csv`` and ``valid.csv`` from the job's folder and records the
    smoothed training accuracy, the validation accuracy and a few
    bookkeeping fields, next to the job's own ``content`` entries (with the
    nested ``hypers`` dict flattened into the row).

    Jobs are skipped when either CSV file is missing or has no rows.

    Parameters
    ----------
    db_path : str
        Path handed to ``load_db``.
    jobs_dir : str
        Directory holding one sub-folder per job, named after
        ``job['summary']``.
    batch_size : int
        Number of examples per update; used to convert updates to epochs.
    nb_train_examples : int
        Number of examples per epoch; used to convert updates to epochs.

    Returns
    -------
    pandas.DataFrame
        One row per job that was kept. Added columns: ``train_acc``
        (smoothed accuracy sampled every ``nb_train_examples // batch_size``
        updates, 1250 with the defaults), ``valid_acc``, ``max_valid_acc``,
        ``last_train_acc``, ``max_train_acc``, ``id``, ``nb_updates``,
        ``start_time``, ``end_time`` and ``n_epochs``.
    """
    # Stride used to subsample the per-update accuracy curve; never 0 so the
    # slice below stays valid even for unusual parameter values.
    updates_per_epoch = max(1, nb_train_examples // batch_size)

    db = load_db(db_path)
    rows = []
    for job in db.jobs_with(state=SUCCESS):
        folder = os.path.join(jobs_dir, str(job['summary']))
        stats_path = os.path.join(folder, 'stats.csv')
        valid_path = os.path.join(folder, 'valid.csv')
        # Both files are needed to build a row; skip jobs that lack either.
        if not (os.path.exists(stats_path) and os.path.exists(valid_path)):
            continue

        df_stats = pd.read_csv(stats_path)
        # Only the first column of valid.csv is used.
        valid_acc = pd.read_csv(valid_path).iloc[:, 0].values
        # Nothing to summarise (and max / [-1] would fail) without data.
        if len(df_stats) == 0 or len(valid_acc) == 0:
            continue

        hypers = job['content']['hypers']
        life = job['life']
        start_time = life[0]['dt']
        # NOTE(review): assumes job['life'] is in chronological order so the
        # last entry carries the finishing timestamp - confirm with lightjob.
        end_time = life[-1]['dt']

        acc = moving(df_stats['acc'])

        # Flatten: job content without the nested dict, plus the hypers.
        row = job['content'].copy()
        del row['hypers']
        row.update(hypers)

        row['train_acc'] = acc[::updates_per_epoch]
        row['valid_acc'] = valid_acc.tolist()
        row['max_valid_acc'] = float(valid_acc.max())
        row['last_train_acc'] = row['train_acc'][-1]
        row['max_train_acc'] = np.max(row['train_acc'])
        row['id'] = os.path.basename(folder)
        row['nb_updates'] = len(df_stats)
        row['start_time'] = start_time
        row['end_time'] = end_time
        row['n_epochs'] = (len(df_stats) * batch_size) / nb_train_examples
        rows.append(row)

    return pd.DataFrame(rows)


In [None]:
# Build the per-job summary table; this reads the job database and the
# per-job CSV files from disk.
df = get_df()

In [None]:
# Show the jobs ranked by their best validation accuracy, highest first.
# The sorted copy is only displayed; `df` keeps its original order.
df.sort_values(by='max_valid_acc', ascending=False)

In [None]:
# Scatter of the `fc` column against the best validation accuracy of each
# job; hovering a point shows its best validation accuracy and its last
# (smoothed) training accuracy.
hover = HoverTool(tooltips=[('valid_acc', '@max_valid_acc'), ('train_acc', '@last_train_acc')])
p = figure(tools=[hover])
p.circle(x='fc', y='max_valid_acc', source=ColumnDataSource(df))
show(p)

In [None]:
# Bar chart with one bar per `data_source`, whose height is the median of
# `max_valid_acc` over the jobs using that data source.
# `Bar` and `show` are imported once, with the other imports at the top of
# the notebook, instead of being re-imported in the middle of it.
p = Bar(df, 'data_source', values='max_valid_acc', agg='median')
p.legend.location = (0,0)
show(p)

In [None]:
# Learning curves of the job with the highest `max_valid_acc`.
best = df.sort_values(by='max_valid_acc', ascending=False).iloc[0].to_dict()
print(best['data_source'])
p = figure(tools=[HoverTool(), BoxZoomTool()])
# Each curve gets an x array of its own length: `train_acc` and `valid_acc`
# come from different files and are not guaranteed to have the same number
# of points, and a line glyph needs x and y of equal length.
e = np.arange(len(best['train_acc']))
valid_e = np.arange(len(best['valid_acc']))
p.line(e, best['train_acc'], color='blue', legend='train acc')
p.line(valid_e, best['valid_acc'], color='orange', legend='valid acc')
p.legend.location = 'bottom_right'
show(p)