## Imports

In [1]:
import json
import os
import sys
from collections import defaultdict
import glob
from pprint import pprint
from os import PathLike
from statistics import mean
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

# Put the notebook's parent directory on the import path (once) so that
# modules located there can be imported by the statements that follow.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from g_index import Experiment
from node_utils import node_divergence
from utils import *
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets


# Experiment settings
EXP_ROOT = '../experiments/'
# Sorted so that positional access such as EXP_FILES[0] selects the same file
# on every run; glob returns its matches in arbitrary order.
EXP_FILES = sorted(glob.glob(os.path.join(EXP_ROOT, "*.json")))
DD_CACHE = cache_dd()
DOMAIN_LENGTHS = get_domain_lengths(AVAILABLE_DOMAINS)

# Plot settings
# ALPHA is the marker opacity and S the marker size passed to the scatter
# plots further down the notebook.
ALPHA, S = 0.90, 240
plt.rcParams.update({
    "figure.figsize": (16, 9),
    "font.size": 24,
    "lines.color": "black",
    "patch.edgecolor": "white",
    "axes.edgecolor": "0.15",
    "axes.linewidth": 1.25,
    "axes.grid": True,
    "savefig.facecolor": "white",
    "savefig.edgecolor": "white",
    "xtick.major.size": 20,
    'legend.fontsize': 20,
    'legend.handlelength': 2
    })

colors = ["#375E97", "#FB6542", "#FFBB00", "#3F681C"]
models = ['gpt2-774M', 'gpt2-345M', 'gpt2-1.5B', 'gptneo-2.7B']
# Display name -> colour, paired position by position.
model_colormap = dict(zip(models, colors))
# Model identifier (as looked up by the plotting cells) -> display name used
# as a key of model_colormap.
name_correction = {'gpt2-medium': 'gpt2-345M',
                   'gpt2-large': 'gpt2-774M',
                   'gpt2-xl': 'gpt2-1.5B',
                   'EleutherAI/gpt-neo-2.7B': 'gptneo-2.7B'}

## 1. Domains

### What are domains

TODO

### List of Available Domains

In [None]:
# Print each available domain next to its position in AVAILABLE_DOMAINS.
for position, domain_name in enumerate(AVAILABLE_DOMAINS):
    print(position, " ", domain_name)

### What is Domain Distance

TODO

### Calculating Domain Distance between two domains

TODO

In [None]:
# Compare two domains identified by name; verbose=True is forwarded to
# calculate_dd, and the call's return value is shown as the cell output.
domain_1, domain_2 = 'telegram-2-reply', 'telegram-3-reply'

calculate_dd(
    domain_1,
    domain_2,
    verbose=True,
)

### Calculating Domain Distance between two JSON files

TODO

In [None]:
# Same call as for named domains, but the two arguments are paths to domain
# JSON files.
domain_1, domain_2 = (
    '../domains/youtube-pause/youtube-pause-6e696d.json',
    '../domains/facebook/facebook-b4ee9a.json',
)

calculate_dd(
    domain_1,
    domain_2,
    verbose=True,
)

### Plotting the Domain Distance Matrix

In [None]:
# Use a taller figure for the domain-distance matrix.
# NOTE: sns.set() changes matplotlib's global configuration, so it also
# affects figures drawn by later cells.
dd_matrix_rc = {'figure.figsize': (16, 12)}
sns.set(rc=dd_matrix_rc)
generate_dd_matrix()

## 2. Experiments

### What is an Experiment? 

TODO

### Components of an experiment

The major components of an experiment are:
1. Intelligent System(IS): 
2. Curricula Domains: 
3. Task Domains:
4. Experience(E):
5. Generalization Difficulty(GD): 

In [None]:
# Display the experiment JSON paths that were globbed from EXP_ROOT.
EXP_FILES

In [None]:
# Load the first discovered experiment file and query its components with
# truncated domain lists and member printing enabled.
exp_file = EXP_FILES[0]
exp = Experiment(exp_file)
exp.get_exp_components(
    truncate_domains=True,
    print_members=True,
)

## 3. g-index

### What is g-index?

TODO

### Calculating g-index from an experiment file

TODO

In [None]:
# Compute the g-index for the first discovered experiment file.
# EXP_FILES is the list defined in the settings cell; the lowercase name
# previously used here is not defined anywhere in this notebook.
# NOTE(review): `Benchmark` is not imported explicitly (only `Experiment`
# is) -- confirm it is provided by `from utils import *`, or port this cell
# to the `Experiment` API.
experiment_benchmark = Benchmark(EXP_FILES[0])
# Now let's get the g-index
experiment_benchmark.calculate_g_index()

### Calculating g-index from Values

In [2]:
# Inputs for a simulated g-index; the Experiment is constructed without an
# experiment file. The same variables seed the interactive cell below.
n_tasks_domain, n_curricula_domain = 5, 40
sim_GD, sim_P, sim_E, sim_PTheta = 1.0, 1e-4, 10, 1.0
exp = Experiment()
exp.simulate_g_index(
    n_tasks_domain=n_tasks_domain,
    n_curricula_domain=n_curricula_domain,
    sim_GD=sim_GD,
    sim_P=sim_P,
    sim_E=sim_E,
    sim_PTheta=sim_PTheta,
)


30121.205

In [3]:
# Expose the simulation through ipywidgets controls, starting from the values
# defined in the previous cell. The trailing semicolon keeps the return value
# of interact() out of the cell output.
interact(
    exp.simulate_g_index,
    n_tasks_domain=n_tasks_domain,
    n_curricula_domain=n_curricula_domain,
    sim_GD=sim_GD,
    sim_P=sim_P,
    sim_E=sim_E,
    sim_PTheta=sim_PTheta,
);

interactive(children=(IntSlider(value=5, description='n_tasks_domain', max=15, min=-5), IntSlider(value=40, de…

## 4. Reproducing results mentioned in the paper

### Section-3 Simulation Plots

In [None]:
# Plot parameters for the Section-3 figures; the rcParams keys below are
# (re)applied to matplotlib's global configuration.
figsize = (16, 9)
alpha, s = 0.90, 240
fontsize, labelsize = 24, 20
section_rc = {
    "font.size": fontsize,
    "lines.color": "black",
    "patch.edgecolor": "white",
    "axes.edgecolor": "0.15",
    "axes.linewidth": 1.25,
    "axes.grid": True,
    "savefig.facecolor": "white",
    "savefig.edgecolor": "white",
    "xtick.major.size": 20,
}
plt.rcParams.update(section_rc)

#### g-index vs Training Samples ( for varying $\theta$ ) 

#### g-index vs Compute ( for varying $\theta$ ) 

#### g-index vs $\theta$ ( for varying $\Omega$ ) HeatMap

### Section-4 Experiment Plots

#### $\theta$ vs Training Samples

In [None]:
# Collect one record per experiment file: average performance (theta), total
# number of curricula samples, intelligent-system name and g-index.
# NOTE(review): `Benchmark` is not imported explicitly in this notebook --
# confirm it is provided by `from utils import *`, or port this cell to the
# `Experiment` API.
results = {"theta": [], "total_samples": [], "IS": [], "g_index": []}
for exp_file in tqdm(EXP_FILES):

    exp_benchmark = Benchmark(exp_file)
    theta = exp_benchmark.get_avg_perf()
    total_samples = exp_benchmark.get_curricula_domains(return_total=True)
    IS = exp_benchmark.get_IS_name()
    g_index = exp_benchmark.GetExperimentIndices().GIndex

    results['theta'].append(theta)
    results['total_samples'].append(total_samples)
    # Store the name fetched above; the loop previously appended a variable
    # that is never assigned in this cell.
    results['IS'].append(IS)
    results['g_index'].append(g_index)

In [None]:
# Turn the collected results into per-point rows for the scatter plot:
# [total_samples, theta, IS name, marker scale].
gi = results['g_index']
# Compute the g-index range once instead of once per element. `default`
# keeps an empty result set from raising; the comprehension below then
# produces an empty list without calling scale_dots.
gi_min, gi_max = min(gi, default=None), max(gi, default=None)
g_i_dots_scales = [scale_dots(gi_min, gi_max, c) for c in gi]
data_list = [
    list(row)
    for row in zip(results['total_samples'], results['theta'], results['IS'], g_i_dots_scales)
]

In [None]:
# Scatter plot of theta against the number of training samples, one point per
# experiment, coloured by model.
ylabel = "$\\theta$"
xlabel = "Training Samples"

fig, axes = plt.subplots()
axes.set_xlabel(xlabel)
axes.set_ylabel(ylabel)
axes.tick_params(labelsize=20)
axes.set_ylim(0, 1)
# Each data_list row is [total_samples, theta, IS name, marker scale].
# NOTE(review): the marker scale is computed upstream but every point is drawn
# with the fixed size S -- confirm whether per-point sizing was intended.
for total_samples, theta, is_name, _marker_scale in data_list:
    color = model_colormap[name_correction[is_name]]
    sns.scatterplot(x=[total_samples], y=[theta], color=color, s=S, alpha=ALPHA, ax=axes)

fig.tight_layout()
# Proxy artists so the legend shows exactly one entry per model colour.
markers = [plt.Line2D([0, 0], [0, 0], color=color, marker='o', linestyle='') for color in model_colormap.values()]
axes.legend(markers, model_colormap.keys(), fontsize=20)
axes.set_title(f"{ylabel} vs {xlabel}", loc="center", fontsize=22)
# fig.savefig(f'../plots/final/{xlabel}_theta_scatterplot_varying_sizes.jpg', bbox_inches='tight')
plt.show()

#### Program Size vs $\theta$

In [None]:
# Build a (domain_name, IS)-indexed frame holding the mean domain length and
# the mean theta (1 - divergence) across all experiment files.
domains_considered = AVAILABLE_DOMAINS

results_pst = {"IS": [], "domain_name": [], "length": [], "theta": []}
for exp_file in EXP_FILES:
    exp = Experiment(exp_file)
    # The IS name does not depend on the domain, so fetch it once per
    # experiment instead of once per domain.
    IS = exp.get_exp_components().IS
    for domain in domains_considered:
        length = DOMAIN_LENGTHS[domain]
        # NOTE(review): assumes Experiment objects can be subscripted like the
        # raw experiment JSON -- confirm.
        # next() raises StopIteration if the experiment has no entry for
        # this domain.
        domain_details = next(item for item in exp["performance"]["domains"] if item["name"] == domain)

        results_pst["IS"].append(IS)
        results_pst["domain_name"].append(domain)
        results_pst["length"].append(length)
        results_pst["theta"].append(1 - domain_details['divergence'])

df = pd.DataFrame(results_pst).groupby(by=['domain_name', 'IS']).mean()
# Show the domain lengths ordered from shortest to longest. This uses the
# DOMAIN_LENGTHS mapping already read above; the name previously used on
# this line is not defined in this notebook.
dict(sorted(DOMAIN_LENGTHS.items(), key=lambda item: item[1]))

In [None]:
# Scatter plot of theta against program size (domain length): one point per
# (domain, model) pair, coloured by model. Reads `df` and `domains_considered`
# from the preceding cell, whose frame is indexed by (domain_name, IS) and has
# the columns 'length' and 'theta'.
xlabel = "Program size"
ylabel = "$\\theta$"

fig, axes = plt.subplots()
axes.set_xlabel(xlabel)
axes.set_ylabel(ylabel)
axes.set_ylim(0, 1)

# NOTE(review): assumes the IS values stored in `df` are the keys of
# name_correction -- confirm.
model_ids = list(name_correction)
for domain in domains_considered:
    # Rows for this domain, one per model; reindex so that a model without
    # data appears as a NaN row instead of raising on lookup.
    tdf = df.loc[domain].reindex(model_ids)
    for model in model_ids:
        x = tdf.loc[model, 'length']
        y = tdf.loc[model, 'theta']
        color = model_colormap[name_correction[model]]
        sns.scatterplot(x=[x], y=[y], color=color, s=S, alpha=ALPHA, ax=axes)

fig.tight_layout()
# Proxy artists so the legend shows exactly one entry per model colour.
markers = [plt.Line2D([0, 0], [0, 0], color=color, marker='o', linestyle='') for color in model_colormap.values()]
axes.legend(markers, model_colormap.keys(), fontsize=20)
axes.set_title(f"{ylabel} vs {xlabel}", loc="center", fontsize=22)
# fig.savefig(f'../plots/final/{xlabel}_theta_scatterplot_varying_sizes.jpg', bbox_inches='tight')
plt.show()

#### Program Size vs Skill level

In [None]:
# Build a (temp_name, model_name)-indexed frame holding the mean length and the
# mean 'perfects' value across all experiment files.
# The dict keys must be exactly the ones appended to in the loop; they
# previously differed, which raised KeyError on the first append.
# NOTE(review): `templates_considered` and `temp_lengths` are not defined in
# this notebook (AVAILABLE_DOMAINS / DOMAIN_LENGTHS look like their current
# counterparts), and the "templates" / "perfects" keys follow the experiment
# JSON layout this cell was written for -- confirm against current files.
results_def = {"model_name": [], "temp_name": [], "inflated_length": [], "perfects": []}

for exp_file in EXP_FILES:
    # Context manager so the file handle is closed after parsing.
    with open(exp_file) as exp_fh:
        exp = json.load(exp_fh)
    for temp in templates_considered:
        # Resolve the entry first so a missing template cannot leave the
        # result lists with unequal lengths; the loop variable is no longer
        # overwritten with the entry.
        temp_details = next(item for item in exp["performance"]["templates"] if item["name"] == temp)
        results_def["model_name"].append(exp["model"]["train_params"]["model_name"])
        results_def["inflated_length"].append(temp_lengths[temp])
        results_def["temp_name"].append(temp)
        results_def["perfects"].append(temp_details['perfects'])
df = pd.DataFrame(results_def).groupby(by=['temp_name', 'model_name']).mean()
# Show the lengths ordered from shortest to longest.
{k: v for k, v in sorted(temp_lengths.items(), key=lambda item: item[1])}

In [None]:
# Scatter plot of skill level ('perfects') against program size
# ('inflated_length'): one point per (template, model) pair, coloured by
# model. Reads `df` from the preceding cell.
# NOTE(review): `templates_considered` is not defined in this notebook --
# confirm where it comes from.
xlabel = "Program size"
ylabel = "Skill level"

fig, axes = plt.subplots()
axes.set_xlabel(xlabel)
axes.set_ylabel(ylabel)
axes.set_ylim(0, 1)

model_ids = list(name_correction)
for temp in templates_considered:
    # Rows for this template, one per model; reindex so that a model without
    # data appears as a NaN row instead of raising on lookup.
    tdf = df.loc[temp].reindex(model_ids)
    for model in model_ids:
        x = tdf.loc[model, 'inflated_length']
        y = tdf.loc[model, 'perfects']
        color = model_colormap[name_correction[model]]
        sns.scatterplot(x=[x], y=[y], color=color, s=S, alpha=ALPHA, ax=axes)

fig.tight_layout()
# Proxy artists so the legend shows exactly one entry per model colour.
markers = [plt.Line2D([0, 0], [0, 0], color=color, marker='o', linestyle='') for color in model_colormap.values()]
axes.legend(markers, model_colormap.keys(), fontsize=20)
axes.set_title(f"{ylabel} vs {xlabel}", loc="center", fontsize=22)
# fig.savefig(f'../plots/final/{xlabel}_{ylabel}_scatterplot_varying_sizes.jpg', bbox_inches='tight')
plt.show()

#### Compute vs $\theta$

## 5. Request the data

To get the data, email us at [humans@mayahq.com](mailto:humans@mayahq.com) with a brief description of your use case.

## 6. Cite us!

TODO