In [1]:
import re
import os
import pandas as pd
#from bkcharts import Scatter, output_file, show
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import row
from bokeh.models import ColumnDataSource, CategoricalColorMapper, Jitter

In [11]:
# define helpers for parsing files, extracting values
def parse_filename(f):
    """Pull the K and stride fields out of an underscore-delimited file name.

    The name is stripped of surrounding whitespace and split on "_".
    Field 1 is returned as K; field 3, truncated at its first ".", is
    returned as the stride. Both values are returned as strings.

    Raises IndexError if the name has fewer than four "_"-separated fields.
    """
    fields = f.strip().split("_")
    k_field = fields[1]
    # keep only what precedes the first "." (drops any extension-like suffix)
    stride_field, _, _ = fields[3].partition(".")
    return k_field, stride_field
    
def detect_macrobatch(line, macrobatch_detector):
    """Return the first captured group if the compiled pattern matches `line`.

    `macrobatch_detector` is a compiled regular expression; matching is
    anchored at the start of `line` (re.match semantics). Returns None when
    the pattern does not match.
    """
    hit = macrobatch_detector.match(line)
    if hit is None:
        return None
    return hit.groups()[0]

def detect_loss(line, loss_detector):
    """Return the first captured group if the compiled pattern matches `line`.

    `loss_detector` is a compiled regular expression; matching is anchored
    at the start of `line` (re.match semantics). Returns None when the
    pattern does not match.
    """
    hit = loss_detector.match(line)
    if hit is None:
        return None
    return hit.groups()[0]

In [12]:
# define the REs for extracting values
# Raw strings keep the backslashes for the regex engine; in a plain string
# literal "\ " and "\d" are invalid escape sequences and trigger warnings.
# Captures the digits that follow "macrobatch ".
macrobatch_detector = re.compile(r'.*macrobatch\ (\d+)')
# Captures a decimal number (digits, ".", digits) that follows "loss: ".
loss_detector = re.compile(r'.*loss: (\d+\.\d+)')

In [13]:
# get list of files
# os.listdir returns names in arbitrary order; sort them so the per-file
# frames are always built and concatenated in the same order on every run.
results_dir = os.path.expanduser("~/projects/embedding/results/debug_output")
txt_files = sorted(f for f in os.listdir(results_dir) if f.endswith('filtered.txt'))

# build df in tidy-data fashion
dfs = []

In [14]:
# Reset the accumulator here so that re-running this cell does not append a
# second copy of every file's frame to an already-populated list.
dfs = []
debug_dir = os.path.expanduser("~/projects/embedding/results/debug_output")

for fname in txt_files:
    K, stride = parse_filename(fname)
    loss_list = []
    macrobatch_list = []

    # scrape the data; the context manager closes the file even if parsing raises
    with open(os.path.join(debug_dir, fname), encoding='utf-8') as my_f:
        for line in my_f:
            line = line.strip()
            m = detect_macrobatch(line, macrobatch_detector)
            if m is not None:
                macrobatch_list.append(int(m))
            loss = detect_loss(line, loss_detector)
            if loss is not None:
                loss_list.append(float(loss))

    # Every column needs exactly one entry per row. Fail with the offending
    # file name rather than pandas' generic length-mismatch message.
    n_rows = len(loss_list)
    if len(macrobatch_list) != n_rows:
        raise ValueError(
            "{}: found {} macrobatch values but {} loss values".format(
                fname, len(macrobatch_list), n_rows))

    # A jump of more than one between consecutive macrobatch numbers bumps the
    # epoch counter; the comparison starts from macrobatch 0 in epoch 0.
    epoch_list = []
    epoch = 0
    prev = 0
    for m in macrobatch_list:
        if abs(m - prev) > 1:
            epoch += 1
        epoch_list.append(epoch)
        prev = m

    # DataFrame.from_items was removed in pandas 1.0; build from a dict and
    # pin the column order explicitly.
    this_df = pd.DataFrame(
        {
            'K': [int(K) for _ in range(n_rows)],
            'stride': [int(stride) for _ in range(n_rows)],
            'epoch': epoch_list,
            'macrobatch': macrobatch_list,
            'loss': loss_list,
        },
        columns=['K', 'stride', 'epoch', 'macrobatch', 'loss'],
    )
    dfs.append(this_df)

In [15]:
# Stack the per-file frames into a single tidy frame.
my_df = pd.concat(dfs)

# Select the rows with K == 8, wrap them for bokeh, and preview the first rows.
is_eight_mer = my_df['K'] == 8
eight_mers = my_df.loc[is_eight_mer]
eightmer_cds = ColumnDataSource(eight_mers)
eight_mers.head()

Unnamed: 0,K,stride,epoch,macrobatch,loss
0,8,1,0,0,11.090357
1,8,1,0,1,11.090354
2,8,1,0,2,11.090347
3,8,1,0,3,11.090337
4,8,1,0,4,11.090348


In [16]:
# Select the rows with K == 6, wrap them for bokeh, and preview the first rows.
is_six_mer = my_df['K'] == 6
six_mers = my_df.loc[is_six_mer]
sixmer_cds = ColumnDataSource(six_mers)
six_mers.head()

Unnamed: 0,K,stride,epoch,macrobatch,loss
0,6,1,0,0,11.090319
1,6,1,0,1,11.090282
2,6,1,0,2,11.090195
3,6,1,0,3,11.089995
4,6,1,0,4,11.089683


In [17]:
# One shared colour per stride value so both panels use the same encoding.
color_mapper = CategoricalColorMapper(
    factors=[1, 2, 4],
    palette=['red', 'blue', 'green'])


def _loss_by_epoch_figure(title, source):
    """Build one loss-vs-epoch scatter figure, coloured by the 'stride' column.

    `title` is the figure title; `source` is the ColumnDataSource whose
    'epoch', 'loss' and 'stride' columns are plotted.
    """
    fig = figure(title=title)
    fig.title.align = "center"
    fig.xaxis.axis_label = "Epoch"
    fig.yaxis.axis_label = "NCE logistic loss / macrobatch"
    fig.circle(x='epoch', y='loss',
               color={'field': 'stride', 'transform': color_mapper},
               legend='stride', fill_alpha=0.3, size=9, source=source)
    return fig


p8 = _loss_by_epoch_figure("8-mers with various strides", eightmer_cds)
p6 = _loss_by_epoch_figure("6-mers with various strides", sixmer_cds)

output_notebook()

# show the two panels side by side
show(row(p8, p6))

In [18]:
from bkcharts import Scatter, output_notebook as boutput_nbk, show as bshow


def _stride_scatter_chart(frame, title):
    """Build a bkcharts Scatter of loss vs. epoch with a centred title.

    Colour and marker are both keyed on the 'stride' column of `frame`.
    """
    chart = Scatter(frame, x='epoch', y='loss', color='stride', marker='stride',
                    title=title, legend="top_right",
                    xlabel="Epoch", ylabel="NCE logistic loss")
    chart.title.align = "center"
    return chart


p8s = _stride_scatter_chart(eight_mers, "8-mers with various strides")
p6s = _stride_scatter_chart(six_mers, "6-mers with various strides")

# show the two charts side by side
boutput_nbk()
bshow(row(p8s, p6s))