In [1]:
import os
import sys

import pandas as pd
import plotnine as pn
from altk.effcomm.tradeoff import interpolate_data
from altk.effcomm.analysis import (
    get_dataframe,
    pearson_analysis,
    trade_off_means,
    trade_off_ttest,
)
from misc.file_util import load_languages, load_configs, set_seed
from modals.modal_language import uegaki

In [2]:
# Project root that every config/data path in this notebook is resolved against.
# Defaults to the original checkout location; set MODALS_EFFCOMM_ROOT to run the
# notebook from a different machine or directory without editing this cell.
# Joining with "" guarantees the trailing separator that the
# `prefix + relative_path` concatenations in later cells rely on.
prefix = os.path.join(
    os.environ.get(
        "MODALS_EFFCOMM_ROOT",
        "/Users/nathanielimel/clms/projects/modals-effcomm/",
    ),
    "",
)

In [3]:
# Read the experiment configuration and hand its random seed to set_seed
config_fn = f"{prefix}configs/cogsci/dev_large.yml"
configs = load_configs(config_fn)
set_seed(configs["random_seed"])

# Never truncate columns when pandas renders a frame
pd.set_option("display.max_columns", None)

# Resolve the language files named under "file_paths" in the config
langs_fn, nat_langs_fn, dom_langs_fn = (
    prefix + configs["file_paths"][kind]
    for kind in (
        "artificial_languages",
        "natural_languages",
        "dominant_languages",
    )
)

In [4]:
# Resolve every analysis output path listed in the config against the project root
analysis_fns = configs["file_paths"]["analysis"]
(
    df_fn,
    pareto_df_fn,
    plot_fn,
    correlations_fn,
    means_fn,
    ttest_natural_fn,
    ttest_dlsav_fn,
) = (
    prefix + analysis_fns[key]
    for key in (
        "data",
        "pareto_data",
        "plot",
        "correlations",
        "means",
        "ttest_natural",
        "ttest_dlsav",
    )
)

In [5]:
# Read the sampled and the dominant language files, then pull out the
# "languages" entry of each loaded result.
result_sampled, result_dominant = (
    load_languages(fn) for fn in (langs_fn, dom_langs_fn)
)
langs, dom_langs = result_sampled["languages"], result_dominant["languages"]

In [None]:
# Keep every observation (duplicates are left in) so the statistical
# analyses see the full sample.
subset = ["complexity", "comm_cost"]
kwargs = dict(subset=subset, duplicates="leave")

# Same options for both pools: sampled languages first, then dominant ones
data, pareto_data = (
    get_dataframe(pool, **kwargs) for pool in (langs, dom_langs)
)

## Plot

In [None]:
naturalness = configs["universal_property"]

# Add counts only for plot: one row per distinct (complexity, comm_cost)
# point, annotated with how many observations share that point.
vcs = data.value_counts(subset=subset, sort=False)
plot_data = (
    data.drop_duplicates(subset=subset)  # drop dupes from original
    .sort_values(by=subset)
    # Attach the counts by key instead of by position, so the result does not
    # depend on the row order value_counts returns or on both frames having
    # the same number of rows.
    .join(vcs.rename("counts"), on=subset)
)

### Inspect data

In [None]:
# Display the observation frame built by get_dataframe (duplicates left in)
data

In [None]:
# smooth pareto curve again: interpolate the dominant languages'
# (comm_cost, complexity) points and put the result back into a frame
pareto_df = pareto_data[["comm_cost", "complexity"]]
pareto_points = pareto_df.to_records(index=False).tolist()
pareto_points = interpolate_data(pareto_points)
pareto_smoothed = pd.DataFrame(pareto_points, columns=["comm_cost", "complexity"])

# aesthetics for all data
# Kept under its own name: `kwargs` already holds the get_dataframe options,
# and rebinding it here would make the dataframe cell fail if re-run later.
point_aes = {
    # "color": naturalness,
    "shape": "uegaki",
    "size": "uegaki",
}

# if counts:
#     point_aes["size"] = "counts"

plot = (
    # Set data and the axes
    pn.ggplot(data=data, mapping=pn.aes(x="complexity", y="comm_cost"))
    + pn.scale_y_continuous(limits=[0, 1])
    + pn.geom_point(  # all langs
        stroke=0,
        alpha=1,
        mapping=pn.aes(**point_aes),
    )
    # The line layer draws the interpolated frontier from its own frame
    + pn.geom_line(size=1, data=pareto_smoothed)
    + pn.xlab("Complexity")
    + pn.ylab("Communicative cost")
    + pn.scale_color_cmap("cividis")
    + pn.theme_classic()
)

In [None]:
# Bare last expression: the notebook renders the figure through its own display
# machinery instead of relying on what str(plot) happens to do.
plot

In [None]:
# Write the figure to disk as a PNG with the given width, height and dpi
save_options = {"width": 10, "height": 10, "dpi": 300}
plot.save("prior_skewed.png", **save_options)