In [2]:
from src.distances import distances_df, merged_distances
from src.tokens import load_tokens
import plotly.express as px
import pandas as pd
import statsmodels.api as sm

In [6]:
# Gallery and distance metric used by the plots in this notebook.
gallery = "movies"
distance = "cd"

# Two-stage median: first per (notation, from_spec), then across the specs of
# each notation, so each spec contributes exactly one value to the final median.
df = (
    distances_df(gallery=gallery)
    .groupby(["notation", "from_spec"])[[distance, "from_length"]]
    .median()
    .reset_index()
    # Select the value columns explicitly. After reset_index() the frame also
    # carries "from_spec"; taking its median relied on the deprecated
    # numeric_only default (FutureWarning today, an error once the default
    # becomes False).
    .groupby(["notation"])[[distance, "from_length"]]
    .median()
    .reset_index()
)

# One labelled point per notation: median size on x, median remoteness on y.
fig = px.scatter(
    df,
    x="from_length",
    y=distance,
    color="notation",
    text="notation",
    labels={
        distance: f"Specification Remoteness ({distance})",
        "from_length": "Size in bytes",
    },
    height=750,
)
# Anchor both axes at zero so relative magnitudes are not exaggerated.
fig.update_yaxes(rangemode="tozero")
fig.update_xaxes(rangemode="tozero")
# Last expression returns the figure, which is what the notebook renders.
fig.update_traces(textposition="top center")



The default value of numeric_only in DataFrameGroupBy.median is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [None]:
# Vocabulary size per notation: number of distinct tokens seen in this gallery.
# `gallery` and `distance` are expected to be defined by an earlier cell.
df1 = (
    load_tokens()
    # Reference the Python variable with `@` instead of splicing it into the
    # expression string, so the filter does not depend on quoting.
    .query("gallery == @gallery")
    .groupby(["notation"])["token"]
    .nunique()
    .reset_index()
)

# Median distance and median size per notation, taken over all rows.
df2 = (
    distances_df(gallery=gallery)
    .groupby(["notation"])[[distance, "from_length"]]
    .median()
    .reset_index()
)

fig = px.scatter(
    # Join on the only shared column, stated explicitly.
    pd.merge(df1, df2, on="notation").reset_index(),
    x="token",
    y="from_length",
    color="notation",
    text="notation",
    height=750,
    labels={
        # Not plotted in this figure; only applies if `distance` is added to it.
        distance: "Dispersion",
        "token": "Vocabulary Size",
        # The y axis previously showed the raw column name; use the same
        # label as the remoteness-vs-size figure.
        "from_length": "Size in bytes",
    },
)
# Anchor both axes at zero.
fig.update_yaxes(rangemode="tozero")
fig.update_xaxes(rangemode="tozero")
# Last expression returns the figure, which is what the notebook renders.
fig.update_traces(textposition="top center")

In [4]:
# The two notations compared against each other in this cell.
notation = "ggplot2"
notation2 = "vega-lite"

merged_in = merged_distances(
    gallery, notation, "from_length", notation2, "from_length"
)

# The third and fourth columns of the merged frame hold the two series to plot.
x, y = map(str, (merged_in.columns[2], merged_in.columns[3]))

# Collapse to one row per spec using the median of each series.
per_spec = merged_in.groupby("from_spec")[[x, y]]
merged = per_spec.median().reset_index()

# Shared extent for both series: lower bound pinned at 0 (rather than the
# data minimum), upper bound padded by 10% of the span.
# NOTE(review): mn / mx / s are not used further in this cell.
mn = 0
peak = max(merged[x].max(), merged[y].max())
s = 0.1 * (peak - mn)
mx = peak + s

scatter_labels = {x: f"{x} remoteness", y: f"{y} remoteness"}
fig = px.scatter(merged, x=x, y=y, width=500, height=500, labels=scatter_labels)
fig.update_traces(hoverinfo="none", hovertemplate="<extra></extra>")
fig.update_layout(showlegend=False)

# Overlay the one-component PCA projection of the points as a grey line.
projection = sm.PCA(merged[[x, y]]).project(ncomp=1)
trend = px.line(projection, x=x, y=y)
trend.update_traces(
    line_color="grey", hoverinfo="skip", hovertemplate="<extra></extra>"
)
# Last expression returns the figure, which is what the notebook renders.
fig.add_traces(trend.data)