In [8]:
import nbformat as nbf

In [9]:
# Notebook to instrument (path without extension); the toy original is the pandas-only control.
nb_id = "airbnb/airbnb"
nb = nbf.read(f"{nb_id}.ipynb", as_version=4)

# Prepend a cell that imports Lux.
code = "import lux"
nb["cells"].insert(0, nbf.v4.new_code_cell(code))

# Baseline condition: append an explicit compute_meta_recs() call to every cell
# tagged "# {{NO LUX}}", leaving the first three cells of the notebook untouched.
for cell in nb["cells"][3:]:
    if "# {{NO LUX}}" in cell["source"]:
        cell["source"] += "\nairbnb.compute_meta_recs()"

nbf.write(nb, f'{nb_id}_baseline.ipynb')

In [12]:
# Variant "o1": prepend a cell that imports Lux and turns on lazy_maintain.
code = (
    "import lux\n"
    "lux.config.lazy_maintain = True"
)
nb = nbf.read(f"{nb_id}.ipynb", as_version=4)
nb["cells"].insert(0, nbf.v4.new_code_cell(code))
nbf.write(nb, f'{nb_id}_o1.ipynb')

In [13]:
# Variant "o1o2": prepend a cell that imports Lux and turns on
# lazy_maintain and early_pruning.
code = (
    "import lux\n"
    "lux.config.lazy_maintain = True\n"
    "lux.config.early_pruning = True"
)
nb = nbf.read(f"{nb_id}.ipynb", as_version=4)
nb["cells"].insert(0, nbf.v4.new_code_cell(code))
nbf.write(nb, f'{nb_id}_o1o2.ipynb')

In [14]:
# Variant "o1o2o3": prepend a cell that imports Lux and turns on
# lazy_maintain, early_pruning and streaming.
code = (
    "import lux\n"
    "lux.config.lazy_maintain = True\n"
    "lux.config.early_pruning = True\n"
    "lux.config.streaming = True"
)
nb = nbf.read(f"{nb_id}.ipynb", as_version=4)
nb["cells"].insert(0, nbf.v4.new_code_cell(code))
nbf.write(nb, f'{nb_id}_o1o2o3.ipynb')

------------------------------------------------------------------------------------------

In [35]:
import glob
import time
import papermill as pm
import pandas as pd
import numpy as np
import json
# Per-cell timing records: [condition, nPts, cell_type, duration].
trial = []

# Output notebooks are matched by name. Splitting the path on "_" yields either
#   airbnb/airbnb_<condition>_<nPts>_output.ipynb  -> 4 parts
#   airbnb/airbnb_<nPts>_output.ipynb              -> 3 parts (recorded as "pandas")
for nb_name in glob.glob("airbnb/airbnb_*_output.ipynb"):
    snb = nb_name.split("_")
    if len(snb) == 4:
        _, condition, nPts, _ = snb
    elif len(snb) == 3:
        condition = "pandas"
        _, nPts, _ = snb
    else:
        # An unexpected name used to fall through and reuse the condition/nPts
        # of the previous file (or raise NameError on the first one). Skip it.
        print(f"Skipping {nb_name}: unexpected file name format")
        continue

    # Notebook files are UTF-8 JSON; do not rely on the platform default encoding.
    with open(nb_name, encoding="utf-8") as json_file:
        data = json.load(json_file)

    for cell in data["cells"]:
        source = cell["source"]
        # `source` may be stored as one string or as a list of lines, and an
        # empty cell has no lines at all; pick the first line in every case.
        if isinstance(source, str):
            source = source.splitlines(keepends=True)
        label = source[0] if source else ""

        # Classify the cell by the tag on its first line; untagged cells stay None.
        cell_type = None
        if "# {{NO LUX}}" in label:
            cell_type = "python"
        elif "# {{PRINT SERIES}}" in label:
            cell_type = "print_series"
        elif "# {{PRINT DF}}" in label:
            cell_type = "print_dataframe"

        # Duration as recorded in the cell's papermill metadata.
        duration = cell["metadata"]["papermill"]["duration"]
        trial.append([condition, nPts, cell_type, duration])

In [36]:
# Turn the collected timing records into a DataFrame with named columns.
timing_columns = ["condition", "nPts", "cell_type", "time"]
trial = pd.DataFrame(trial, columns=timing_columns)

In [37]:
# Non-null entries per column for each cell type.
trial.groupby(by="cell_type").count()

Unnamed: 0_level_0,condition,nPts,time
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
print_dataframe,280,280,271
print_series,140,140,136
python,340,340,334


--------------

In [38]:
import pandas as pd
import altair as alt

In [39]:
# Total time per condition. Select "time" explicitly: the remaining columns
# (nPts, cell_type) hold strings/None and must not take part in the sum.
overall = trial.groupby("condition")["time"].sum().reset_index()

In [41]:
# Display the per-condition totals.
overall

Unnamed: 0,condition,time
0,baseline,5363.754085
1,o1,1198.268738
2,o1o2,692.022234
3,o1o2o3,510.325552
4,pandas,172.361008


In [42]:
# Bar chart of the summed time per condition, drawn on a log-scaled y axis.
log_time_axis = alt.Y("time", scale=alt.Scale(type="log"), title="log(total time)")
alt.Chart(overall).mark_bar().encode(x="condition", y=log_time_axis)

In [47]:
# Total time per (condition, nPts). Select "time" explicitly so the non-numeric
# cell_type column (strings/None) is not included in the sum.
overall = trial.groupby(["condition","nPts"])["time"].sum().reset_index()
# One panel per nPts value, one bar per condition, log-scaled y axis.
chart = alt.Chart(overall,title = "Overall").mark_bar().encode(
    x = "condition",
    y = alt.Y("time",scale=alt.Scale(type='log'),title="log(total time)"),
    column="nPts",
    color="condition"
)
chart = chart.configure_title(fontSize=16, offset=5, orient='top', anchor='middle')
chart

In [45]:
# Mean time per (condition, nPts, cell_type), restricted to the "python" cells.
overall = trial.groupby(["condition","nPts","cell_type"]).mean().reset_index()
pdf = overall[overall["cell_type"] =="python"]
chart = alt.Chart(pdf,title = "Non-Lux Python Operations").mark_bar().encode(
    x = "condition",
    # The plotted value is a mean (see the groupby above), so the axis title
    # says "mean" rather than "total".
    y = alt.Y("time",scale=alt.Scale(type='log'),title="log(mean time)"),
    column="nPts",
    color="condition"
)
chart = chart.configure_title(fontSize=16, offset=5, orient='top', anchor='middle')
chart

In [28]:
# Mean time per (condition, nPts, cell_type), restricted to the "print_series" cells.
overall = trial.groupby(["condition","nPts","cell_type"]).mean().reset_index()
pdf = overall[overall["cell_type"] =="print_series"]
chart = alt.Chart(pdf,title = "Print Series").mark_bar().encode(
    x = "condition",
    # The plotted value is a mean (see the groupby above), so the axis title
    # says "mean" rather than "total".
    y = alt.Y("time",scale=alt.Scale(type='log'),title="log(mean time)"),
    column="nPts",
    color="condition"
)
chart = chart.configure_title(fontSize=16, offset=5, orient='top', anchor='middle')
chart

In [30]:
# Mean time per (condition, nPts, cell_type), restricted to the "print_dataframe" cells.
overall = trial.groupby(["condition","nPts","cell_type"]).mean().reset_index()
pdf = overall[overall["cell_type"] =="print_dataframe"]
chart = alt.Chart(pdf,title = "Print Dataframe").mark_bar().encode(
    x = "condition",
    # The plotted value is a mean (see the groupby above), so the axis title
    # says "mean" rather than "total".
    y = alt.Y("time",scale=alt.Scale(type='log'),title="log(mean time)"),
    column="nPts",
    color="condition"
)
chart = chart.configure_title(fontSize=16, offset=5, orient='top', anchor='middle')
chart

------------

### Communities post-processing

In [49]:
import glob
import time
import papermill as pm
import pandas as pd
import numpy as np
import json
# Per-cell timing records: [condition, nPts, cell_type, duration].
trial = []

# Output notebooks are matched by name. Splitting the path on "_" yields either
#   communities/communities_<condition>_<nPts>_output.ipynb  -> 4 parts
#   communities/communities_<nPts>_output.ipynb              -> 3 parts (recorded as "pandas")
for nb_name in glob.glob("communities/communities_*_output.ipynb"):
    snb = nb_name.split("_")
    if len(snb) == 4:
        _, condition, nPts, _ = snb
    elif len(snb) == 3:
        condition = "pandas"
        _, nPts, _ = snb
    else:
        # An unexpected name used to fall through and reuse the condition/nPts
        # of the previous file (or raise NameError on the first one). Skip it.
        print(f"Skipping {nb_name}: unexpected file name format")
        continue

    # Notebook files are UTF-8 JSON; do not rely on the platform default encoding.
    with open(nb_name, encoding="utf-8") as json_file:
        data = json.load(json_file)

    for cell in data["cells"]:
        source = cell["source"]
        # `source` may be stored as one string or as a list of lines, and an
        # empty cell has no lines at all; pick the first line in every case.
        if isinstance(source, str):
            source = source.splitlines(keepends=True)
        label = source[0] if source else ""

        # Classify the cell by the tag on its first line; untagged cells stay None.
        cell_type = None
        if "# {{NO LUX}}" in label:
            cell_type = "python"
        elif "# {{PRINT SERIES}}" in label:
            cell_type = "print_series"
        elif "# {{PRINT DF}}" in label:
            cell_type = "print_dataframe"

        # Duration as recorded in the cell's papermill metadata.
        duration = cell["metadata"]["papermill"]["duration"]
        trial.append([condition, nPts, cell_type, duration])

# Turn the collected records into a DataFrame with named columns.
trial = pd.DataFrame(trial,columns=["condition","nPts","cell_type","time"])

In [50]:
# Total time per (condition, nPts). Select "time" explicitly so the non-numeric
# cell_type column (strings/None) is not included in the sum.
overall = trial.groupby(["condition","nPts"])["time"].sum().reset_index()
# One panel per nPts value, one bar per condition, log-scaled y axis.
chart = alt.Chart(overall,title = "Overall").mark_bar().encode(
    x = "condition",
    y = alt.Y("time",scale=alt.Scale(type='log'),title="log(total time)"),
    column="nPts",
    color="condition"
)
chart = chart.configure_title(fontSize=16, offset=5, orient='top', anchor='middle')
chart

In [51]:
# Mean time per (condition, nPts, cell_type), restricted to the "python" cells.
overall = trial.groupby(["condition","nPts","cell_type"]).mean().reset_index()
pdf = overall[overall["cell_type"] =="python"]
chart = alt.Chart(pdf,title = "Non-Lux Python Operations").mark_bar().encode(
    x = "condition",
    # The plotted value is a mean (see the groupby above), so the axis title
    # says "mean" rather than "total".
    y = alt.Y("time",scale=alt.Scale(type='log'),title="log(mean time)"),
    column="nPts",
    color="condition"
)
chart = chart.configure_title(fontSize=16, offset=5, orient='top', anchor='middle')
chart

In [52]:
# Mean time per (condition, nPts, cell_type), restricted to the "print_series" cells.
overall = trial.groupby(["condition","nPts","cell_type"]).mean().reset_index()
pdf = overall[overall["cell_type"] =="print_series"]
chart = alt.Chart(pdf,title = "Print Series").mark_bar().encode(
    x = "condition",
    # The plotted value is a mean (see the groupby above), so the axis title
    # says "mean" rather than "total".
    y = alt.Y("time",scale=alt.Scale(type='log'),title="log(mean time)"),
    column="nPts",
    color="condition"
)
chart = chart.configure_title(fontSize=16, offset=5, orient='top', anchor='middle')
chart

In [53]:
# Mean time per (condition, nPts, cell_type), restricted to the "print_dataframe" cells.
overall = trial.groupby(["condition","nPts","cell_type"]).mean().reset_index()
pdf = overall[overall["cell_type"] =="print_dataframe"]
chart = alt.Chart(pdf,title = "Print Dataframe").mark_bar().encode(
    x = "condition",
    # The plotted value is a mean (see the groupby above), so the axis title
    # says "mean" rather than "total".
    y = alt.Y("time",scale=alt.Scale(type='log'),title="log(mean time)"),
    column="nPts",
    color="condition"
)
chart = chart.configure_title(fontSize=16, offset=5, orient='top', anchor='middle')
chart

----------------

In [6]:
# Load the macrobenchmark results from CSV.
macrobenchmark_path = "macrobenchmark.csv"
df = pd.read_csv(macrobenchmark_path)

In [7]:
# Cast the nPts column to integers.
df["nPts"] = df["nPts"].astype(int)

In [8]:
# Map each benchmark notebook file name to its condition label.
condition_by_notebook = {
    "toy_o1.ipynb": "o1",
    "toy_o1o2o3.ipynb": "o1o2o3",
    "toy_o1o2.ipynb": "o1o2",
    "toy.ipynb": "pandas",
    "toy_baseline.ipynb": "no opt",
}
df["condition"] = df["nb_name"].replace(condition_by_notebook)

In [9]:
# One panel per nPts value, one bar per condition, log-scaled y axis.
log_time_axis = alt.Y("time", scale=alt.Scale(type="log"), title="log(total time)")
alt.Chart(df).mark_bar().encode(
    x="condition",
    y=log_time_axis,
    column="nPts",
    color="condition",
)

In [10]:
# # Vega-Lite cannot draw a stacked bar chart on a logarithmic scale, since the result would be misleading
# alt.Chart(df).mark_bar().encode(
#     x = alt.X("nPts",type="ordinal"),
#     y = alt.Y("time"),#,scale=alt.Scale(type='log'),title="log(total time)"),
#     color="condition"
# )

In [26]:
# Remove the raw notebook file name now that "condition" has been derived from it.
# errors="ignore" keeps this cell re-runnable: without it, a second execution
# raises KeyError because the column is already gone.
df = df.drop(columns="nb_name", errors="ignore")

In [53]:
# Split the results into the three conditions that are compared below.
condition_col = df["condition"]
pdonly = df.loc[condition_col.eq("pandas")]
best = df.loc[condition_col.eq("o1o2o3")]
worst = df.loc[condition_col.eq("no opt")]

In [None]:
import numpy as np

In [68]:
def pctChange(s1,s2):
    """Return the element-wise relative change of s1["time"] with respect to s2["time"].

    The result is a fraction, (s1 - s2) / s2, not multiplied by 100. Values are
    paired by position, so both inputs must list their times in the same order.
    """
    compared = np.asarray(s1["time"])
    reference = np.asarray(s2["time"])
    difference = compared - reference
    return difference / reference

In [69]:
# Times recorded for the "o1o2o3" condition.
best["time"]

3      33.051949
8      24.736191
13     42.343533
18    197.560367
Name: time, dtype: float64

In [70]:
# Fractional change in time of "o1o2o3" relative to "pandas".
pctChange(best,pdonly)

array([0.52858007, 0.16597767, 0.74957558, 2.80147911])

In [71]:
# Fractional change in time of "no opt" relative to "pandas".
pctChange(worst,pdonly)

array([ 3.77364417,  4.78507636, 15.92258916, 53.6757692 ])

In [73]:
# Fractional change in time of "no opt" relative to "o1o2o3".
pctChange(worst,best)

array([ 2.12292714,  3.96156705,  8.67239677, 13.38276199])