In [1]:
import time
import ast
import os
import numpy as np
import pandas as pd
import shutil
from pprint import pprint

In [2]:
from utils import *

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [3]:
def merge_confs(ids_list):
    """
    Build a single configuration out of the saved configurations of several experiments.

    The configuration of each experiment is read from
    results/exp_<id>/exp_<id>_conf.csv. For every known key, the values of all
    the experiments are pooled, de-duplicated and sorted. A key that ends up
    with exactly one value is collapsed to that value, unless it is one of the
    list-valued keys. The result receives a new timestamp-based 'id' and its
    'parent_id' is set to the merged (sorted, de-duplicated) ids.

    Args:
        ids_list : ids of the experiments whose configurations are merged

    Returns:
        dict with the merged configuration
    """

    all_keys = ['id','ModelClass','OptimizerClass','training_set','validation_set',
                'epochs','learning_rate','batch_size','num_workers','device','parent_id',
                'N_vec','phases','xscale']
    # keys whose csv text is parsed back into python objects
    non_string_keys = ['training_set','validation_set','epochs','learning_rate',
                       'batch_size','num_workers','parent_id','N_vec','phases']
    # keys that stay lists even when a single value is left
    list_keys = ['training_set','validation_set','N_vec','phases']

    # the new id is taken before anything is read from disk
    identifier = str(int(time.time()))

    def load_conf(exp_id):
        # one csv row per key; empty cells become empty strings
        table = pd.read_csv(
            f"results/exp_{exp_id}/exp_{exp_id}_conf.csv", index_col=0
        ).T.replace(np.nan, '').to_dict(orient='list')
        conf = {key: column[0] for key, column in table.items()}
        for key in conf:
            if key in non_string_keys and conf[key]:
                conf[key] = ast.literal_eval(conf[key])
        # a non-list parent id is always kept as text
        if not isinstance(conf['parent_id'], list):
            conf['parent_id'] = str(conf['parent_id'])
        return conf

    confs = [load_conf(exp_id) for exp_id in ids_list]

    # pool the values of every key over all the experiments
    pooled = dict()
    for conf in confs:
        for key in all_keys:
            bucket = pooled.setdefault(key, [])
            value = conf[key]
            if isinstance(value, list):
                bucket.extend(value)
            else:
                bucket.append(value)

    merged = dict()
    for key, values in pooled.items():
        unique_values = sorted(set(values))
        if len(unique_values) == 1 and key not in list_keys:
            merged[key] = unique_values[0]
        else:
            merged[key] = unique_values

    # the merged experiments become the parents of the new one
    merged['parent_id'] = merged['id']
    merged['id'] = identifier

    return merged

In [4]:
def get_model_src_name(ids_list,N):
    """
    Return the path of the most recent saved model for neighborhood size N.

    The experiments are scanned from the last id to the first one, and the path
    results/exp_<id>/exp_<id>_<N as 3 digits>_model.pt of the first experiment
    that has such a file is returned.

    Args:
        ids_list : experiments ids given in order
        N : size of neighborhood

    Returns:
        path of the model file of the last experiment in ids_list that has one

    Raises:
        IndexError : if no experiment in ids_list has a model file for N
            (same exception type as before, now with an explicit message)
    """
    ordered_ids = list(ids_list)
    for i in reversed(ordered_ids):
        model_name = f"results/exp_{i}/exp_{i}_{N:03d}_model.pt"
        if os.path.isfile(model_name):
            return model_name

    raise IndexError(
        f"no model file exp_<id>_{N:03d}_model.pt found in results/ for experiments {ordered_ids}")

def get_min_valid_loss_model_src_name(id_of_min_loss,N):
    """
    Path of the minimum-validation-loss model saved by an experiment.

    Args:
        id_of_min_loss : id of the experiment that holds the model
        N : size of neighborhood (written with 3 digits in the file name)
    """
    prefix = f"exp_{id_of_min_loss}"
    return f"results/{prefix}/{prefix}_{N:03d}_min_valid_loss_model.pt"

In [5]:
def get_model_dst_name(identifier,N):
    """
    Path where the model of experiment `identifier` for neighborhood size N is stored.

    Args:
        identifier : id of the destination experiment
        N : size of neighborhood (written with 3 digits in the file name)
    """
    prefix = f"exp_{identifier}"
    return f"results/{prefix}/{prefix}_{N:03d}_model.pt"

def get_min_valid_loss_model_dst_name(identifier,N):
    """
    Path where the minimum-validation-loss model of experiment `identifier` is stored.

    Args:
        identifier : id of the destination experiment
        N : size of neighborhood (written with 3 digits in the file name)
    """
    prefix = f"exp_{identifier}"
    return f"results/{prefix}/{prefix}_{N:03d}_min_valid_loss_model.pt"


In [6]:
def merge_N_data(ids_list,N,phase):
    """
    Concatenate the per-epoch values of several experiments for one N and one phase.

    The ids in ids_list must be given in order. Experiments that do not have a
    results/exp_<id>/exp_<id>_<N>_<phase>_values.csv file are skipped.

    Args:
        ids_list : experiments ids given in order
        N : N data to merge
        phase : phase to merge

    Returns:
        final_df : the tables of the experiments stacked in order, re-indexed
            from 0 with the index named "epoch"
        id_of_min_loss : id of the experiment with the smallest 'mlp' value;
            on ties the id that comes last in ids_list is chosen
    """

    kept_ids = []
    frames = []
    lowest_values = []
    for exp_id in ids_list:
        path = f"results/exp_{exp_id}/exp_{exp_id}_{N:03d}_{phase}_values.csv"
        if not os.path.isfile(path):
            continue
        frame = pd.read_csv(path, index_col=0)
        kept_ids.append(exp_id)
        frames.append(frame)
        lowest_values.append(min(frame['mlp'].values))

    # argmin returns the first minimum, so searching the reversed lists
    # favours the most recent experiment when several share the minimum
    newest_first_ids = list(reversed(kept_ids))
    newest_first_values = list(reversed(lowest_values))
    id_of_min_loss = newest_first_ids[np.argmin(newest_first_values)]

    final_df = pd.concat(frames, axis=0, ignore_index=True)
    final_df.index.name = "epoch"

    return final_df , id_of_min_loss

In [7]:
def init_empty_data(phases):
    """
    Create the empty container of values for the given phases.

    Args:
        phases : names of the phases

    Returns:
        dict mapping every phase to its own {"mlp": [], "static": [], "cabac": []}
    """
    return {phase: dict(mlp=[], static=[], cabac=[]) for phase in phases}

In [8]:


def merge_ids(ids_list):
    """
    Merge experiments that continue one another into a new experiment.

    The configurations are merged with merge_confs and a results/exp_<new id>
    folder is created. For every neighborhood size of the merged "N_vec", the
    values of each phase are concatenated with merge_N_data and handed to
    save_N_data; the 'epochs' entry passed along is the number of rows of the
    merged table of the last phase processed. When 'train' is one of the phases
    and N>0, the latest saved model is copied into the new folder. At the end,
    the last 'mlp', 'static' and 'cabac' value of every phase and neighborhood
    size is handed to save_final_data.

    Args:
        ids_list : experiments ids given in order
    """

    merged_conf = merge_confs(ids_list)
    new_id = merged_conf["id"]
    phases = merged_conf["phases"]

    os.makedirs(f'results/exp_{new_id}')

    final_values = init_empty_data(phases)

    for context_size in merged_conf["N_vec"]:

        print(f"--------------------- context size : {context_size} ---------------------")

        values_for_N = init_empty_data(phases)

        for phase in phases:
            merged_frame, id_of_min_loss = merge_N_data(ids_list, context_size, phase)
            values_for_N[phase] = merged_frame.to_dict(orient='list')
            if (phase == 'valid') and ('train' in phases) and (context_size>0):
                # only reported for now; copying the corresponding checkpoint is disabled:
                #   shutil.copy2(
                #       get_min_valid_loss_model_src_name(id_of_min_loss,N),
                #       get_min_valid_loss_model_dst_name(configs["id"],N))
                print(id_of_min_loss)

        # the merged table of the last phase of the loop above gives the epoch count
        n_epochs = len(merged_frame.index.values)
        conf_for_N = {key: (n_epochs if key == 'epochs' else value)
                      for key, value in merged_conf.items()}
        save_N_data(conf_for_N, context_size, values_for_N)

        if ('train' in phases) and (context_size>0):
            shutil.copy2(
                get_model_src_name(ids_list, context_size),
                get_model_dst_name(new_id, context_size))

        # keep the value at the last epoch of every curve
        for phase in phases:
            for curve in ("mlp", "static", "cabac"):
                final_values[phase][curve].append(values_for_N[phase][curve][-1])

    save_final_data(merged_conf, final_values)

In [9]:
def append_graph(fig,xvalues,values,linestyle,color,label,marker):
    """
    Draw one more curve on the single axes of `fig` and add it to the legend.

    Args:
        fig : figure with exactly one axes
        xvalues : x coordinates of the new curve
        values : y coordinates of the new curve
        linestyle, color, label, marker : forwarded to the plot call
    """
    (axes,) = fig.axes
    # handles are collected before plotting, the new line is then added by hand
    legend_handles, _ = axes.get_legend_handles_labels()
    (new_line,) = axes.plot(
        xvalues, values, linestyle=linestyle, color=color, label=label, marker=marker)
    legend_handles.append(new_line)
    axes.legend(handles=legend_handles, loc="upper right")

In [10]:
def replace_labels(fig,old_labels,new_labels):
    """
    Rename legend entries of the single axes of `fig`.

    The replacements are applied one after the other, each one on the first
    legend label equal to the old text.

    Args:
        fig : figure with exactly one axes
        old_labels : labels currently shown in the legend
        new_labels : replacement for each entry of old_labels, in the same order

    Raises:
        ValueError : if an old label is not among the legend labels, or if the
            figure does not have exactly one axes
    """
    ax, = fig.axes
    handles, labels = ax.get_legend_handles_labels()
    for olb,nlb in zip(old_labels,new_labels):
        labels[labels.index(olb)] = nlb
    # pass the handles that belong to these labels: with labels alone the
    # pairing with the plotted artists is only implied by their order
    ax.legend(handles=handles, labels=labels)

In [12]:
# merge_ids(["1649217265","1649245701"])

# Merging (graph for very large N - up to 170)

1646351538: training on page 3; validation on page 5; model MLP_N_64N_32N_1; neighborhood sizes (N) 0, 2, 4, 10, 26, 67, 170; 420 epochs; learning rate 1e-05; batch size 2048; parent experiments 1646255547, 1646256140 and 1646334695.


In [11]:
# ids_list = ["1646108338", "1646138205" , "1646147306", "1646182637"] #-> 1646255547

In [12]:
# d = merge_N_data(ids_list,170,"train")

In [13]:
# d.to_dict(orient='list')

In [12]:
# import matplotlib.pyplot as plt
# plt.plot(d.index,d['mlp'].values)

In [13]:
# merge_ids(ids_list)

In [14]:
# import filecmp

In [15]:
# filecmp.cmp("results/exp_1646255547_170_model.pt" , "results/exp_1646147306_170_model.pt")

In [16]:
# ids_list = ["1646255547","1646256140","1646334695"] #-> 1646351538

In [17]:
# merge_ids(ids_list)

# Abandoned Experiment : limiting the hidden units for N=67 and 170

1646351538: training on page 3; validation on page 5; model MLP_N_64N_32N_1; neighborhood sizes (N) 0, 2, 4, 10, 26, 67, 170; 420 epochs; learning rate 1e-05; batch size 2048; parent experiments 1646255547, 1646256140 and 1646334695.

1646397720: training on page 3; validation on page 5; model MLP_N_2048_1024_1; neighborhood sizes (N) 67, 170; 420 epochs; learning rate 1e-05; batch size 2048.

1646523973: training on pages 1 and 3; validation on page 5; models MLP_N_64N_32N_1 and MLP_N_2048_1024_1; neighborhood sizes (N) 0, 2, 4, 10, 26, 67, 170; 420 epochs; learning rate 1e-05; batch size 2048; parent experiments 1646427815, 1646449472 and 1646504279.

In [18]:
# ids_list = ["1646427815","1646449472","1646504279"] #-> 1646523973

In [20]:
# merge_ids(ids_list)

In [83]:
# Load the train/valid values tables of two experiments (both described in the notes above):
# only the 'mlp' column of 1646397720, and every column of 1646351538.
new_data = dict()
new_data["train"] = dict() 
new_data["train"]["mlp"] = pd.read_csv( f"results/exp_1646397720_train_values.csv" ,index_col=0)['mlp'].values
new_data["valid"] = dict() 
new_data["valid"]["mlp"] =pd.read_csv( f"results/exp_1646397720_valid_values.csv" ,index_col=0)['mlp'].values
data = dict()
data['train'] = pd.read_csv( f"results/exp_1646351538_train_values.csv" ,index_col=0).to_dict(orient="list")
data['valid'] = pd.read_csv( f"results/exp_1646351538_valid_values.csv" ,index_col=0).to_dict(orient="list")

In [87]:
# Plot the 1646351538 values against context size (one figure per phase) and overlay
# the 'mlp' values of 1646397720 at N=67 and N=170 as a dashed blue 'mlp*' curve.
fig_t = plot_comparison([0,2,4,10,26,67,170],data['train'],"context size",xscale = "symlog")
append_graph(fig_t,[67,170],new_data["train"]["mlp"],'dashed','blue','mlp*','o')
fig_v = plot_comparison([0,2,4,10,26,67,170],data['valid'],"context size",xscale = "symlog")
append_graph(fig_v,[67,170],new_data["valid"]["mlp"],'dashed','blue','mlp*','o')

In [88]:
# fig_t

In [89]:
# fig_v

In [66]:
# Overwrite the last two 'mlp' entries of `data` with the 1646397720 values.
# NOTE(review): the last two entries presumably correspond to N=67 and N=170
# (the x values used in the plots) - confirm. This mutates `data` in place.
data['train']['mlp'][-2:] = new_data["train"]["mlp"]
data['valid']['mlp'][-2:] = new_data["valid"]["mlp"]

In [90]:
# Rebuild the training figure from the current content of `data`.
fig_t = plot_comparison([0,2,4,10,26,67,170],data['train'],"context size",xscale = "symlog")

In [91]:
# Rebuild the validation figure from the current content of `data`.
fig_v = plot_comparison([0,2,4,10,26,67,170],data['valid'],"context size",xscale = "symlog")

In [77]:
# Load the train/valid values tables of experiment 1646523973 (described in the notes above).
other_data = dict()
other_data['train'] = pd.read_csv( f"results/exp_1646523973_train_values.csv" ,index_col=0).to_dict(orient="list")
other_data['valid'] = pd.read_csv( f"results/exp_1646523973_valid_values.csv" ,index_col=0).to_dict(orient="list")

In [78]:
# Overlay the 1646523973 training curves on fig_t; the last two 'cabac' entries
# are left out, so that curve is only drawn up to N=26.
append_graph(fig_t,[0,2,4,10,26,67,170],other_data["train"]["mlp"],'dashed','blue','mlp*','o')
append_graph(fig_t,[0,2,4,10,26],other_data["train"]["cabac"][:-2],'dashed','green','cabac*','^')

In [80]:
# Overlay the 1646523973 validation curves on fig_v; the last two 'cabac' entries
# are left out, so that curve is only drawn up to N=26.
append_graph(fig_v,[0,2,4,10,26,67,170],other_data["valid"]["mlp"],'dashed','blue','mlp*','o')
append_graph(fig_v,[0,2,4,10,26],other_data["valid"]["cabac"][:-2],'dashed','green','cabac*','^')

In [92]:
# fig_t

In [93]:
# fig_v

# Impact of the amount of pages in the training set

1646735060: training on pages 1 and 3; validation on page 5; model MLP_N_64N_32N_1; neighborhood sizes (N) 0, 2, 4, 10, 26, 67, 170; 420 epochs; learning rate 1e-05; batch size 2048; parent experiments 1646449472 and 1646527898.

1646351538: training on page 3; validation on page 5; model MLP_N_64N_32N_1; neighborhood sizes (N) 0, 2, 4, 10, 26, 67, 170; 420 epochs; learning rate 1e-05; batch size 2048; parent experiments 1646255547, 1646256140 and 1646334695.

linestyles : https://matplotlib.org/2.1.2/api/_as_gen/matplotlib.pyplot.plot.html

In [52]:
# ids_list = ["1646449472","1646527898"] # -> 1646735060

In [53]:
# merge_ids(ids_list)

In [83]:
# Base figures of this section: values of experiment 1646351538
# (trained on page 3 according to the notes above) against context size.
data3 = dict()
data3['train'] = pd.read_csv( f"results/exp_1646351538_train_values.csv" ,index_col=0).to_dict(orient="list")
data3['valid'] = pd.read_csv( f"results/exp_1646351538_valid_values.csv" ,index_col=0).to_dict(orient="list")
fig_t = plot_comparison([0,2,4,10,26,67,170],data3['train'],"context size",xscale = "symlog")
fig_v = plot_comparison([0,2,4,10,26,67,170],data3['valid'],"context size",xscale = "symlog")

In [67]:
# Values of experiment 1646735060 (trained on pages 1 and 3 according to the notes above).
data13 = dict()
data13['train'] = pd.read_csv( f"results/exp_1646735060_train_values.csv" ,index_col=0).to_dict(orient="list")
data13['valid'] = pd.read_csv( f"results/exp_1646735060_valid_values.csv" ,index_col=0).to_dict(orient="list")

In [68]:
# Add the pages 1,3 training curves to fig_t ('cabac' without its last two entries).
append_graph(fig_t,[0,2,4,10,26,67,170],data13["train"]["mlp"],'dashed',(0.2,0,0.8),'mlp (pp.1,3)','o')
append_graph(fig_t,[0,2,4,10,26],data13["train"]["cabac"][:-2],'dashed',(0.2,0.8,0),'cabac (pp.1,3)','^')

In [69]:
# Add the pages 1,3 validation curves to fig_v ('cabac' without its last two entries).
append_graph(fig_v,[0,2,4,10,26,67,170],data13["valid"]["mlp"],'dashed',(0.2,0,0.8),'mlp (pp.1,3)','o')
append_graph(fig_v,[0,2,4,10,26],data13["valid"]["cabac"][:-2],'dashed',(0.2,0.8,0),'cabac (pp.1,3)','^')

In [70]:
# Values of experiment 1647694682 (plotted below with the 'pp.1,2,3' labels).
data123 = dict()
data123['train'] = pd.read_csv( f"results/exp_1647694682_train_values.csv" ,index_col=0).to_dict(orient="list")
data123['valid'] = pd.read_csv( f"results/exp_1647694682_valid_values.csv" ,index_col=0).to_dict(orient="list")

In [71]:
# Add the pages 1,2,3 training curves to fig_t ('cabac' without its last two entries).
append_graph(fig_t,[0,2,4,10,26,67,170],data123["train"]["mlp"],'dotted',(0.4,0,0.6),'mlp (pp.1,2,3)','o')
append_graph(fig_t,[0,2,4,10,26],data123["train"]["cabac"][:-2],'dotted',(0.4,0.6,0),'cabac (pp.1,2,3)','^')

In [72]:
# Add the pages 1,2,3 validation curves to fig_v ('cabac' without its last two entries).
append_graph(fig_v,[0,2,4,10,26,67,170],data123["valid"]["mlp"],'dotted',(0.4,0,0.6),'mlp (pp.1,2,3)','o')
append_graph(fig_v,[0,2,4,10,26],data123["valid"]["cabac"][:-2],'dotted',(0.4,0.6,0),'cabac (pp.1,2,3)','^')

In [73]:
# Values of experiment 1647894746 (plotted below with the 'pp.1,2,3,4' labels).
data1234 = dict()
data1234['train'] = pd.read_csv( f"results/exp_1647894746_train_values.csv" ,index_col=0).to_dict(orient="list")
data1234['valid'] = pd.read_csv( f"results/exp_1647894746_valid_values.csv" ,index_col=0).to_dict(orient="list")

In [74]:
# Add the pages 1,2,3,4 training curves to fig_t ('cabac' without its last two entries).
append_graph(fig_t,[0,2,4,10,26,67,170],data1234["train"]["mlp"],'dashdot',(0.6,0,0.4),'mlp (pp.1,2,3,4)','o')
append_graph(fig_t,[0,2,4,10,26],data1234["train"]["cabac"][:-2],'dashdot',(0.6,0.4,0),'cabac (pp.1,2,3,4)','^')

In [75]:
# Add the pages 1,2,3,4 validation curves to fig_v ('cabac' without its last two entries).
append_graph(fig_v,[0,2,4,10,26,67,170],data1234["valid"]["mlp"],'dashdot',(0.6,0,0.4),'mlp (pp.1,2,3,4)','o')
append_graph(fig_v,[0,2,4,10,26],data1234["valid"]["cabac"][:-2],'dashdot',(0.6,0.4,0),'cabac (pp.1,2,3,4)','^')

In [76]:
# Rename the legend entries of the base curves so that they name their training page,
# like the curves added afterwards.
replace_labels(fig_t,['mlp','cabac'],['mlp (p.3)','cabac (p.3)'])
replace_labels(fig_v,['mlp','cabac'],['mlp (p.3)','cabac (p.3)'])

In [79]:
# fig_t

In [80]:
# fig_v

In [81]:
# save_fig(f"results/exp_1646351538_1646735060_1647694682_1647894746_train_graph",fig_t)

In [82]:
# save_fig(f"results/exp_1646351538_1646735060_1647694682_1647894746_valid_graph",fig_v)

# Seeing data

In [36]:
# pd.read_csv( f"results/exp_1646396496_valid_values.csv",index_col=0)

In [37]:
# pd.read_csv( f"results/exp_1646735060_valid_values.csv" ,index_col=0)

# Varying pages

In [None]:
import pandas as pd
from perceptronac.utils import *

In [None]:
# Experiments compared in this section, one per x position (1 to 10) of the
# "number of pages" plots below.
# NOTE(review): presumably the i-th experiment was trained on i pages - confirm.
ids_list = ["1646351538","1646735060","1647694682","1647894746","1648672366",
            "1648693742","1648724675","1648767269","1648810180","1648845111"]

In [None]:
# Containers for the 'mlp' value of each experiment, per phase.
# Note: this rebinds the name `data` used in the previous sections.
data = dict()
data['train'] = dict()
data['valid'] = dict()
data['train']['mlp'] = []
data['valid']['mlp'] = []

In [None]:
# Collect, for every experiment, the 'mlp' value stored at index label 67
# (shown as 'mlp N=67' in the legend of the plots below).
# Re-running this cell appends the values again.
for i in ids_list:
    data['train']['mlp'].append(pd.read_csv(f"results/exp_{i}_train_values.csv",index_col=0).loc[67,'mlp'])
    data['valid']['mlp'].append(pd.read_csv(f"results/exp_{i}_valid_values.csv",index_col=0).loc[67,'mlp'])

In [None]:
data

In [None]:
# Plot the collected values against the number of pages (1 to 10), one figure per phase,
# with one tick per page count and a legend that names the context size.
fig_t = plot_comparison(np.arange(1,11),data['train'],"number of pages",xscale = "linear")
fig_t.axes[0].set_xticks(np.arange(1,11))
fig_t.axes[0].legend(['mlp N=67'])
fig_v = plot_comparison(np.arange(1,11),data['valid'],"number of pages",xscale = "linear")
fig_v.axes[0].set_xticks(np.arange(1,11))
fig_v.axes[0].legend(['mlp N=67'])

In [None]:
fig_v

In [None]:
# Save the validation "number of pages" figure; the training figure (fig_t) is not saved here.
save_fig(f"results/exp_1646351538_to_1648845111_067_valid_graph",fig_v)

# Average graphs

In [None]:
import pandas as pd
import numpy as np
from perceptronac.utils import plot_comparison
from perceptronac.utils import save_fig

In [None]:
# Experiments whose validation values are averaged by average_experiments below.
ids_to_avg = [
  "1648504002",  "1648505291",  "1648506606",  "1648508102",  "1648509733",
  "1648504655",  "1648505962",  "1648507460",  "1648508751",  "1648510660",
]

In [None]:
def average_experiments(ids, phase="valid"):
    """
    Average, entry by entry, the values tables of several experiments.

    The table of each experiment is read from
    results/exp_<id>/exp_<id>_<phase>_values.csv. All the tables must have the
    same row labels and the same columns; entries are matched by label, so the
    rows and columns may come in a different order in each file.

    Args:
        ids : ids of the experiments to average
        phase : phase of the values files to read (defaults to "valid")

    Returns:
        DataFrame with the rows and columns of the first experiment, holding
        the mean of every entry over the experiments

    Raises:
        AssertionError : if ids is empty or the tables do not share the same
            row labels and columns
    """

    dfs = [
        pd.read_csv( f"results/exp_{i_d}/exp_{i_d}_{phase}_values.csv" ,index_col=0)
        for i_d in ids
    ]

    assert len(set([len(df.index.values) for df in dfs])) == 1, \
        "the experiments must have the same number of rows (and ids must not be empty)"
    assert len(set([v for df in dfs for v in df.index.values]) ) == len(dfs[0].index.values), \
        "the experiments must have the same row labels"

    assert len(set([len(df.columns) for df in dfs])) == 1, \
        "the experiments must have the same number of columns"
    assert len(set([v for df in dfs for v in df.columns]) ) == len(dfs[0].columns), \
        "the experiments must have the same columns"

    rows = dfs[0].index.values
    columns = dfs[0].columns

    # accumulate whole tables at once, in the order of `ids`; reindex lines up
    # every table with the labels of the first one before adding
    total = np.zeros((len(rows),len(columns)))
    for df in dfs:
        total = total + df.reindex(index=rows, columns=columns).to_numpy()

    return pd.DataFrame(data=total/len(dfs), columns=columns, index=rows)

In [None]:
# Entry-wise mean of the validation values tables of the experiments in ids_to_avg.
new_df=average_experiments(ids_to_avg)

In [None]:
# fig_v = plot_comparison(new_df.index.values,new_df.to_dict(orient="list"),"context size",xscale = "symlog")

In [None]:
# Build the figure from the averaged values right before saving it: without this,
# `fig_v` still refers to the "number of pages" figure created earlier in the
# notebook, and that figure would be saved under the averaged-graph name.
fig_v = plot_comparison(new_df.index.values,new_df.to_dict(orient="list"),"context size",xscale = "symlog")
save_fig(f"results/exp_1648504002_to_1648510660_valid_graph",fig_v)

In [None]:
# new_df[::-1]

In [None]:
# new_df

In [None]:
# pd.read_csv( f"results/exp_1648083609/exp_1648083609_train_values.csv" ,index_col=0)[::-1]