## Imports

In [None]:
import json
import os
import sys
from collections import defaultdict
import glob
from pprint import pprint
from os import PathLike
from statistics import mean
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

# Add the parent directory to the import search path (only once).
# Presumably this lets the project-local imports below resolve when the
# notebook is run from its own folder -- verify against the repo layout.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from g_index import Experiment
from node_utils import node_divergence
from utils import *
from ipywidgets import interact,FloatLogSlider,IntSlider,FloatSlider
import ipywidgets as widgets


# Experiment settings
EXP_ROOT = '../experiments/'
# glob returns matches in arbitrary (filesystem-dependent) order; sorting keeps
# EXP_FILES[0], which later cells rely on, stable across runs and machines.
# os.path.join avoids the doubled separator that EXP_ROOT + "/*.json" produced.
EXP_FILES = sorted(glob.glob(os.path.join(EXP_ROOT, "*.json")))
# Both helpers and AVAILABLE_DOMAINS come from the star-import of `utils`.
DD_CACHE = cache_dd()
DOMAIN_LENGTHS = get_domain_lengths(AVAILABLE_DOMAINS)

## Plot settings
# ALPHA and S are passed as `alpha=` and `s=` to the scatter plots further down.
ALPHA = 0.90
S = 240

# Global matplotlib defaults applied to every figure created after this cell.
plt.rcParams.update({
    # figure and saving
    "figure.figsize": (16, 9),
    "savefig.facecolor": "white",
    "savefig.edgecolor": "white",
    # text sizes
    "font.size": 24,
    "axes.titlesize": 22,
    "legend.fontsize": 20,
    "legend.handlelength": 2,
    # lines, patches and axes
    "lines.color": "black",
    "patch.edgecolor": "white",
    "axes.edgecolor": "0.15",
    "axes.linewidth": 1.25,
    "axes.grid": True,
    "xtick.major.size": 20,
})

# One colour per model label; the two lists are paired position by position.
colors = ["#375E97", "#FB6542", "#FFBB00", "#3F681C"]
models = ['gpt2-774M', 'gpt2-345M', 'gpt2-1.5B', 'gptneo-2.7B']
model_colormap = dict(zip(models, colors))

# Maps a model name (used as a lookup key on the experiments' `IS` values
# in the plotting cells) to the label used in `models`.
model_name_correction = dict([
    ('gpt2-medium', 'gpt2-345M'),
    ('gpt2-large', 'gpt2-774M'),
    ('gpt2-xl', 'gpt2-1.5B'),
    ('EleutherAI gpt-neo-2.7B', 'gptneo-2.7B'),
])

## 1. Domains

### What are domains

TODO

### List of Available Domains

In [None]:
# Print each available domain next to its position in AVAILABLE_DOMAINS.
for i, domain in enumerate(AVAILABLE_DOMAINS):
    print(f"{i}   {domain}")

### What is Domain Distance

TODO

### Calculating Domain Distance between two domains

TODO

In [None]:
# Domain distance between two domains referred to by name.
domain_1, domain_2 = 'telegram-2-reply', 'telegram-3-reply'

# Last expression of the cell, so whatever calculate_dd returns is displayed.
calculate_dd(domain_1, domain_2, verbose=True)

### Calculating Domain Distance between two JSON files

TODO

In [None]:
# Same call as the by-name example, but each argument is a path to a JSON file.
domain_1, domain_2 = (
    '../domains/youtube-pause/youtube-pause-6e696d.json',
    '../domains/facebook/facebook-b4ee9a.json',
)

calculate_dd(domain_1, domain_2, verbose=True)

### Plotting the Domain Distance Matrix

In [None]:
# Use a taller 16x12 figure for the matrix, then draw it.
# NOTE(review): sns.set(...) may also apply seaborn's default theme and thereby
# override the rcParams configured in the plot-settings cell -- confirm intended.
dd_matrix_rc = {'figure.figsize': (16, 12)}
sns.set(rc=dd_matrix_rc)
generate_dd_matrix()

## 2. Experiments

### What is an Experiment? 

TODO

### Components of an experiment

The Major Components of an experiment are:
1. Intelligent System(`IS`): 
2. Curricula Domains: 
3. Task Domains:
4. Experience(`E`):
5. Generalization Difficulty( `GD` ): 
6. Performance Details 

In [None]:
# Display the experiment JSON paths collected by the glob in the settings cell.
EXP_FILES

In [None]:
# Load the first experiment file in EXP_FILES and show its components.
exp_file = EXP_FILES[0]
exp = Experiment(exp_file)
exp.get_exp_components(
    print_members=True,
    truncate_domains=True,
)

## 3. g-index

### What is `g-index`?

TODO

### Calculating `g-index` from an experiment file

Steps:
1. Load an experiment file into the `Experiment` class
2. Call its `calculate_g_index` function

In [None]:
# Step 1: load an experiment file (the first one found) into an Experiment.
exp_file = EXP_FILES[0]
exp = Experiment(exp_file)
exp.get_exp_components(
    truncate_domains=True,
    print_members=True,
)

In [None]:
# Step 2: call `calculate_g_index` on the experiment loaded in the previous cell.
# It is the cell's last expression, so its return value is displayed.
exp.calculate_g_index()

### Calculating `g-index` from Values

`g-index` can be calculated in either of two ways:
- Using the ipywidget given below, OR
- Creating a dummy experiment and passing the desired values to its `simulate_g_index` function

In [None]:
# Dummy Experiment built without an experiment file; the widget cell below
# calls its `simulate_g_index` method.
exp = Experiment()

In [None]:
# Interactive sliders that feed `exp.simulate_g_index`; `exp` is the dummy
# Experiment created in the previous cell.
# NOTE(review): per the ipywidgets documentation, FloatLogSlider takes `value`
# as the actual value while `min`/`max` are exponents of the base, so value=-3
# and value=1 lie outside the sliders' ranges -- confirm the intended starting
# values (10**-3 and 10**1?).
style = {'description_width': '100px'}
interact(exp.simulate_g_index,
         n_tasks_domain=IntSlider(value=5,min=1,max=100,step=1,description="Samples per Task domain",style=style),
         n_curricula_domain=IntSlider(value=5,min=1,max=100,step=1,description="Samples per Curricula domain",style=style),
         sim_GD=FloatSlider(value=0.0,min=0.0,max=1.00,step=0.01,description="Generalization Difficulty",style=style),
         sim_P=FloatLogSlider(value=-3,min=-10,max=10,step=1,description="Prior",style=style),
         sim_E=FloatLogSlider(value=1,min=1,max=12,step=1,description="Experience",style=style),
         # The label previously read "Performance Î¸" (mis-decoded UTF-8 theta).
         sim_PTheta=FloatSlider(value=0.1,min=0.00,max=1.00,step=0.01,description="Performance θ",style=style));

## 4. Reproducing results mentioned in the paper

### Section-3 Simulation Plots

#### g-index vs Training Samples ( for varying $\theta$ ) 

#### g-index vs Compute ( for varying $\theta$ ) 

#### g-index vs $\theta$ ( for varying $\Omega$ ) HeatMap

### Section-4 Experiment Plots

#### Average Performance ( $\theta$ )  vs Training Samples

In [None]:
# For every experiment file collect the system name, its average performance,
# the total number of curricula samples and the g-index, one list per column.
results = {"IS": [], "avg_perf": [], "total_samples": [], "g_index": []}
for exp_file in EXP_FILES:
    exp = Experiment(exp_file)
    exp_c = exp.get_exp_components()
    # Sum of `num_samples` over all curricula domains of this experiment.
    total_samples = sum(entry['num_samples'] for entry in exp_c.CurriculaDomains)
    row = {
        "IS": exp_c.IS,
        "avg_perf": exp_c.AveragePerformance,
        "total_samples": total_samples,
        "g_index": exp.calculate_g_index(),
    }
    for column, value in row.items():
        results[column].append(value)

In [None]:
# Pass each g-index through scale_dots (from utils) together with the min and
# max g-index, then assemble rows of
# [total_samples, avg_perf, IS, scaled g-index].
gi = results['g_index']
g_i_dots_scales = [scale_dots(min(gi), max(gi), value) for value in gi]
data_list = [
    list(row)
    for row in zip(results['total_samples'], results['avg_perf'], results['IS'], g_i_dots_scales)
]

In [None]:
# Scatter of average performance against total training samples: one point per
# row of `data_list`, coloured by model. All points use the fixed size S; the
# scaled g-index stored in each row is not used here.
fig, axes = plt.subplots()
xlabel = "Training Samples"
ylabel = "$\\theta$"
axes.set_xlabel(xlabel)
axes.set_ylabel(ylabel)
axes.set_ylim(0, 1)

for n_samples, perf, raw_name, _scaled in data_list:
    # Translate the raw IS name to its display label, then to that label's colour.
    point_color = model_colormap[model_name_correction[raw_name]]
    sns.scatterplot(x=[n_samples], y=[perf], color=point_color, s=S, alpha=ALPHA)

fig.tight_layout()
# Invisible proxy artists, one per model colour, used only to build the legend.
legend_handles = [
    plt.Line2D([0, 0], [0, 0], color=c, marker='o', linestyle='')
    for c in model_colormap.values()
]
axes.legend(legend_handles, model_colormap.keys())
axes.set_title(f"{ylabel} vs {xlabel}", loc="center")
plt.show()

#### Average Domain Performance ( $\theta$ ) vs Program Size

In [None]:
# Build one row per (experiment, domain) holding the system name, the domain's
# program size and the performance recorded for that domain, then average the
# numeric columns per (domain_name, IS).
domains_considered = AVAILABLE_DOMAINS
results_pst = {"IS":[],"domain_name":[],"program_size":[],"theta":[]}

for exp_file in EXP_FILES:
    exp = Experiment(exp_file)
    # The components do not depend on the domain, so fetch them once per
    # experiment instead of once per (experiment, domain) pair.
    exp_c = exp.get_exp_components()
    IS = exp_c.IS
    all_perf_details = exp_c.PerformanceDetails
    for domain in domains_considered:
        # First performance entry whose "name" equals this domain.
        perf_details = next( item for item in all_perf_details if item["name"] == domain )
        program_size = DOMAIN_LENGTHS[domain]

        results_pst["IS"].append(IS)
        results_pst["domain_name"].append(domain)
        results_pst["program_size"].append(program_size)
        results_pst["theta"].append( perf_details['performance'])
df = pd.DataFrame(results_pst).groupby(by=['domain_name','IS']).mean()

In [None]:
# Scatter of mean theta against program size: one point per (domain, model),
# coloured by model. Reads `df` produced by the preceding cell.
fig,axes = plt.subplots()
xlabel = "Program size"
ylabel = "$\\theta$"
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.ylim(0,1)

model_order = list(model_name_correction.keys())
for domain in domains_considered:
    # Rows for this domain, indexed by IS. Reindexing to the known model names
    # adds a NaN row for any model without data, so the lookups below cannot
    # fail on a missing label.
    tdf = df.loc[domain].reindex(model_order)
    for model in model_order:
        x = tdf.loc[model, 'program_size']
        y = tdf.loc[model, 'theta']
        color = model_colormap[model_name_correction[model]]
        sns.scatterplot(x=[x],y=[y],color=color,s=S,alpha=ALPHA)

plt.tight_layout()
# Invisible proxy artists, one per model colour, used only to build the legend.
markers = [plt.Line2D([0,0],[0,0],color=color, marker='o', linestyle='') for color in model_colormap.values()]
plt.legend(markers,model_colormap.keys())
plt.title(f"{ylabel} vs {xlabel}",loc="center")
plt.show()

#### Skill level vs Program Size 

In [None]:
# Build one row per (experiment, domain) holding the system name, the domain's
# program size and the `perfects` value recorded for that domain, then average
# the numeric columns per (domain_name, IS).
# Relies on `domains_considered` defined in the theta-vs-program-size cell.
results_def = {"IS":[],"domain_name":[],"program_size":[],"skill_level":[]}
for exp_file in EXP_FILES:
    exp = Experiment(exp_file)
    # The components do not depend on the domain, so fetch them once per
    # experiment instead of once per (experiment, domain) pair.
    exp_c = exp.get_exp_components()
    IS = exp_c.IS
    all_perf_details = exp_c.PerformanceDetails
    for domain in domains_considered:
        # First performance entry whose "name" equals this domain.
        perf_details = next( item for item in all_perf_details if item["name"] == domain )
        program_size = DOMAIN_LENGTHS[domain]
        results_def["IS"].append(IS)
        results_def["domain_name"].append(domain)
        results_def["program_size"].append(program_size)
        results_def["skill_level"].append(perf_details['perfects'])
df = pd.DataFrame(results_def).groupby(by=['domain_name','IS']).mean()

In [None]:
# Scatter of mean skill level against program size: one point per
# (domain, model), coloured by model. Reads `df` produced by the preceding cell.
fig,axes = plt.subplots()
xlabel = "Program size"
ylabel = "Skill level"
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.ylim(0,1)

# A concrete list, matching how the theta-vs-program-size cell reindexes.
model_order = list(model_name_correction.keys())
for domain in domains_considered:
    # Rows for this domain, indexed by IS. Reindexing to the known model names
    # adds a NaN row for any model without data, so the lookups below cannot
    # fail on a missing label.
    tdf = df.loc[domain].reindex(model_order)
    for model in model_order:
        x = tdf.loc[model, 'program_size']
        y = tdf.loc[model, 'skill_level']
        color = model_colormap[model_name_correction[model]]
        sns.scatterplot(x=[x],y=[y],color=color,s=S,alpha=ALPHA)

plt.tight_layout()
# Invisible proxy artists, one per model colour, used only to build the legend.
markers = [plt.Line2D([0,0],[0,0],color=color, marker='o', linestyle='') for color in model_colormap.values()]
plt.legend(markers,model_colormap.keys())
plt.title(f"{ylabel} vs {xlabel}",loc="center")
plt.show()

#### Compute Vs Average Performance ( $\theta$ ) 

In [None]:
# For every experiment file collect the corrected model label, the average
# performance, the `E` component (requested with return_raw_experience=True)
# and the g-index.
results_ct = {"IS": [], "avg_perf": [], "compute": [], "g_index": []}
for exp_file in EXP_FILES:
    exp = Experiment(exp_file)
    exp_c = exp.get_exp_components(return_raw_experience=True)
    g_index = exp.calculate_g_index()
    # Same order as the keys of results_ct.
    record = (
        model_name_correction[exp_c.IS],
        exp_c.AveragePerformance,
        exp_c.E,
        g_index,
    )
    for column, value in zip(results_ct, record):
        results_ct[column].append(value)

In [None]:
# Take the min/max bounds from this section's own g-index values. The cell
# previously read `gi`, a variable defined only by the training-samples
# section, and so failed with NameError unless that cell had been run first.
ct_g_index = results_ct['g_index']
g_i_dots_scales = [ scale_dots( min(ct_g_index),max(ct_g_index), c) for c in ct_g_index ]
# Rows of [compute, avg_perf, scaled g-index, model label].
data_list = [ [x,y,marker_size,model_name] for (x,y,marker_size,model_name) in zip(results_ct['compute'],results_ct['avg_perf'],g_i_dots_scales,results_ct['IS']) ]

In [None]:
# Scatter of average performance against compute: one point per row of
# `data_list`, coloured by model and sized by the scaled g-index.
fig, axes = plt.subplots()
xlabel = "Compute"
ylabel = "Performance"
axes.set_xlabel(xlabel)
axes.set_ylabel(ylabel)
axes.set_ylim(0, 1.01)

for compute, perf, dot_size, model_label in data_list:
    sns.scatterplot(
        x=[compute],
        y=[perf],
        color=model_colormap[model_label],
        s=dot_size,
        alpha=0.85,
    )

# Invisible proxy artists, one per model colour, used only to build the legend.
legend_handles = [
    plt.Line2D([0, 0], [0, 0], color=c, marker='o', linestyle='')
    for c in model_colormap.values()
]
axes.legend(legend_handles, model_colormap.keys())
fig.tight_layout()
axes.set_title(f"{ylabel} vs {xlabel}", loc="center")
plt.show()

## 5. Request the data

You can send us an email at [humans@mayahq.com](mailto:humans@mayahq.com) briefly describing your use case to get the data.

## 6. Cite us!

TODO