In [None]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import scipy
from teeplot import teeplot as tp


In [None]:
# Load the dataset (parquet format) directly from its OSF download URL.
df = pd.read_parquet(
    path="https://osf.io/gk2ty/download",
)


In [None]:
# Display the column labels of the loaded frame for reference.
df.columns


In [None]:
# Attach an integer "Components" value to every row, looked up from the
# row's task name. Task names missing from the table map to NaN.
# NOTE(review): presumably a per-task complexity score -- confirm.
df["Components"] = df["Task"].map(
    dict(
        NOT=1,
        NAND=1,
        AND=2,
        ORNOT=2,
        OR=3,
        ANDNOT=3,
        NOR=4,
        XOR=4,
        EQUALS=5,
    ),
)


In [None]:
# Within each (site, lineage generation, treatment, run) group, total the
# "Is Task Coding Site" values and broadcast that total back onto every row
# of the group.
df["codes for num tasks"] = (
    df.groupby(
        by=["Site", "Lineage Generation Index", "Treatment", "Run ID"],
        observed=True,
    )["Is Task Coding Site"]
    .transform("sum")
)


In [None]:
# Same per-group total as "codes for num tasks", but summing the
# "Prev Is Task Coding Site" column instead.
df["prev codes for num tasks"] = (
    df.groupby(
        by=["Site", "Lineage Generation Index", "Treatment", "Run ID"],
        observed=True,
    )["Prev Is Task Coding Site"]
    .transform("sum")
)


In [None]:
# Boolean flag: True wherever the per-group "prev codes for num tasks" total
# is nonzero.
df["prev codes for tasks"] = (
    df["prev codes for num tasks"].astype("bool")
)


In [None]:
# Per (site, lineage generation, treatment, run) group, sum
# "Prev Is Task Coding Site Cumulative Count" across the group's rows and
# reduce the broadcast total to a True/False "was nonzero" flag.
df["prev coded for tasks"] = (
    df.groupby(
        by=["Site", "Lineage Generation Index", "Treatment", "Run ID"],
        observed=True,
    )["Prev Is Task Coding Site Cumulative Count"]
    .transform("sum")
    .astype(bool)
)


In [None]:
# For each (treatment, run, task), find the smallest "Generation Born" among
# rows where "has task" is nonzero, and broadcast it to every row of that
# group. Rows with "has task" == 0 are blanked to NaN first so they do not
# take part in the minimum.
df["First Task Generation"] = (
    df["Generation Born"]
    .where(df["has task"].ne(0))
    .groupby(
        by=[df["Treatment"], df["Run ID"], df["Task"]],
        observed=True,
    )
    .transform("min")
)


In [None]:
# Flag every row of a (lineage generation, treatment, run) group when at
# least one row in that group has a truthy "SLIP_INSERTION_BOOL_MASK".
df["SLIP_INSERTION_BOOL_MASK any"] = (
    df.groupby(
        by=["Lineage Generation Index", "Treatment", "Run ID"],
        observed=True,
    )["SLIP_INSERTION_BOOL_MASK"]
    .transform("any")
)


In [None]:
# Keep only "Slip-duplicate" rows where "delta has task" equals 1 and the
# row's birth generation equals the first generation recorded for the task
# in that run; renumber the index from zero.
dff = df.loc[
    df["delta has task"].eq(1)
    & df["Generation Born"].eq(df["First Task Generation"])
    & df["Treatment"].eq("Slip-duplicate")
    # & df["Is Task Coding Site"]
].reset_index(drop=True)
dff


In [None]:
# Display the column labels available on the filtered frame.
dff.columns


In [None]:
# Bar plot with one panel per "Components" value. Rows flagged
# "SLIP_INSERTION_BOOL_MASK any" are grouped by (treatment, run, task,
# nonzero "Prev Slip Insertion Cumulative Count"); each group contributes
# its mean "Is Task Coding Site".
tp.tee(
    sns.catplot,
    data=(
        dff.loc[dff["SLIP_INSERTION_BOOL_MASK any"]]
        .astype({"Prev Slip Insertion Cumulative Count": bool})
        .groupby(
            by=[
                "Treatment",
                "Run ID",
                "Task",
                "Prev Slip Insertion Cumulative Count",
            ],
            observed=True,
        )
        .agg({"Is Task Coding Site": "mean", "Components": "first"})
    ),
    x="Prev Slip Insertion Cumulative Count",
    y="Is Task Coding Site",
    col="Components",
    kind="bar",
    sharey=False,
)


In [None]:
# Mean "Is Task Coding Site" over each (treatment, run, task) group,
# broadcast back to row level so it can be stored alongside the raw values.
dfy = (
    dff.groupby(
        by=["Treatment", "Run ID", "Task"],
        observed=True,
    )["Is Task Coding Site"]
    .transform("mean")
)
dff["Mean Is Task Coding Site"] = dfy

print(dfy)


In [None]:
# One row per (treatment, run, task, nonzero prior slip-insertion count):
# the group's mean "Is Task Coding Site" plus the first value of the
# columns carried along with it.
dfp = (
    dff.astype({"Prev Slip Insertion Cumulative Count": bool})
    .groupby(
        by=["Treatment", "Run ID", "Task", "Prev Slip Insertion Cumulative Count"],
        observed=True,
    )
    .agg(
        {
            "Mean Is Task Coding Site": "first",
            "Is Task Coding Site": "mean",
            "Components": "first",
            "SLIP_INSERTION_BOOL_MASK any": "first",
        },
    )
)

# Express each group's mean relative to the (treatment, run, task) mean.
dfp["Is Task Coding Site"] = (
    dfp["Is Task Coding Site"] / dfp["Mean Is Task Coding Site"]
)
dfp = dfp.reset_index()
dfp


In [None]:
# Split violin plot of the normalised "Is Task Coding Site" values against
# "Components", coloured by nonzero prior slip-insertion count. Only rows
# flagged "SLIP_INSERTION_BOOL_MASK any" are drawn; the extra teeplot
# attribute tags the saved figure accordingly.
tp.tee(
    sns.catplot,
    data=dfp.loc[dfp["SLIP_INSERTION_BOOL_MASK any"]],
    hue="Prev Slip Insertion Cumulative Count",
    y="Is Task Coding Site",
    x="Components",
    kind="violin",
    density_norm="width",
    cut=0,
    split=True,
    gap=0.1,
    aspect=1.4,
    teeplot_outattrs={"slipgain": "only"},
)


In [None]:
# Restrict to the rows flagged "SLIP_INSERTION_BOOL_MASK any" once, instead
# of re-filtering for each summary below.
dfp_slipgain = dfp[dfp["SLIP_INSERTION_BOOL_MASK any"]]

# Perform Kruskal-Wallis test within each "Components" level, comparing the
# normalised "Is Task Coding Site" values of rows with vs. without a nonzero
# prior slip-insertion count.
# include_groups=False: the lambda never reads the grouping column, and
# include_groups=True is deprecated since pandas 2.2 (only False is allowed
# going forward), so the result is unchanged while staying forward-compatible.
kruskal_results = dfp_slipgain.groupby("Components").apply(
    lambda grp: scipy.stats.kruskal(
        grp.loc[
            grp["Prev Slip Insertion Cumulative Count"],
            "Is Task Coding Site",
        ],
        grp.loc[
            ~grp["Prev Slip Insertion Cumulative Count"],
            "Is Task Coding Site",
        ],
    ),
    include_groups=False,
)

# Create a DataFrame to store the results (one row per "Components" level)
kruskal_df = pd.DataFrame(
    kruskal_results.tolist(),
    index=kruskal_results.index,
    columns=["H-statistic", "p-value"]
)

print(kruskal_df)
# Calculate means and standard deviations for all groups
group_stats = (
    dfp_slipgain.groupby(
        ["Components", "Prev Slip Insertion Cumulative Count"]
    )["Is Task Coding Site"]
    .agg(["mean", "std"])
    .reset_index()
)

# Print the group statistics
print(group_stats)
# Calculate group sizes
group_sizes = (
    dfp_slipgain.groupby(
        ["Components", "Prev Slip Insertion Cumulative Count"]
    )
    .size()
    .reset_index(name='size')
)

# Print the group sizes
print(group_sizes)


In [None]:
# Split violin plot of the normalised "Is Task Coding Site" values against
# "Components", coloured by nonzero prior slip-insertion count, using every
# row of dfp (no slip-gain filter).
tp.tee(
    sns.catplot,
    data=dfp,
    hue="Prev Slip Insertion Cumulative Count",
    y="Is Task Coding Site",
    x="Components",
    kind="violin",
    density_norm="width",
    cut=0,
    split=True,
    gap=0.1,
    aspect=1.4,
)


In [None]:
# Perform Kruskal-Wallis test within each "Components" level of dfp,
# comparing the normalised "Is Task Coding Site" values of rows with vs.
# without a nonzero prior slip-insertion count.
# include_groups=False: the lambda never reads the grouping column, and
# leaving the argument at its pandas 2.2 default (True) emits a
# DeprecationWarning because that behaviour is being removed.
kruskal_results = dfp.groupby("Components").apply(
    lambda grp: scipy.stats.kruskal(
        grp.loc[
            grp["Prev Slip Insertion Cumulative Count"],
            "Is Task Coding Site",
        ],
        grp.loc[
            ~grp["Prev Slip Insertion Cumulative Count"],
            "Is Task Coding Site",
        ],
    ),
    include_groups=False,
)

# Create a DataFrame to store the results (one row per "Components" level)
kruskal_df = pd.DataFrame(
    kruskal_results.tolist(),
    index=kruskal_results.index,
    columns=["H-statistic", "p-value"]
)

print(kruskal_df)
# Calculate means and standard deviations for all groups
group_stats = (
    dfp.groupby(
        ["Components", "Prev Slip Insertion Cumulative Count"]
    )["Is Task Coding Site"]
    .agg(["mean", "std"])
    .reset_index()
)

# Print the group statistics
print(group_stats)
# Calculate group sizes
group_sizes = (
    dfp.groupby(["Components", "Prev Slip Insertion Cumulative Count"])
    .size()
    .reset_index(name='size')
)

# Print the group sizes
print(group_sizes)


In [None]:
# Per-group summary computed only over rows whose "Prev Is Viability Site"
# value is falsy. The mask is cast to bool before negation so that `~` is a
# logical NOT even if the column is stored as 0/1 integers.
dfpx = dff[
    ~dff["Prev Is Viability Site"].astype(bool)
].astype(
    {"Prev Slip Insertion Cumulative Count": bool},
).groupby(
    ["Treatment", "Run ID", "Task", "Prev Slip Insertion Cumulative Count"],
    observed=True,
).agg(
    {
        "Mean Is Task Coding Site": "first",
        "Is Task Coding Site": "mean",
        "Components": "first",
        "SLIP_INSERTION_BOOL_MASK any": "first",
    },
)

# Express each group's mean relative to the stored
# "Mean Is Task Coding Site" value (computed before the viability filter).
dfpx["Is Task Coding Site"] /= dfpx["Mean Is Task Coding Site"]
dfpx = dfpx.reset_index()
print(dfpx)

tp.tee(
    sns.catplot,
    data=dfpx,
    hue="Prev Slip Insertion Cumulative Count",
    y="Is Task Coding Site",
    x="Components",
    # row="Treatment",
    kind="violin",
    density_norm="width",
    cut=0,
    split=True,
    gap=0.1,
    # notch=True,
    aspect=1.4,
    # This figure previously reused {"slipgain": "only"} with the same plot
    # arguments as the slip-gain violin plot, so both resolved to the same
    # output file and this one overwrote it. dfpx is not filtered on
    # "SLIP_INSERTION_BOOL_MASK any"; it excludes previous viability sites,
    # so tag the output with that instead.
    # NOTE(review): confirm the attribute name against any downstream
    # figure paths.
    teeplot_outattrs={"viability": "exclude"},
)


In [None]:
# Perform Kruskal-Wallis test within each "Components" level of dfpx,
# comparing the normalised "Is Task Coding Site" values of rows with vs.
# without a nonzero prior slip-insertion count.
# include_groups=False: the lambda never reads the grouping column, and
# leaving the argument at its pandas 2.2 default (True) emits a
# DeprecationWarning because that behaviour is being removed.
kruskal_results = dfpx.groupby("Components").apply(
    lambda grp: scipy.stats.kruskal(
        grp.loc[
            grp["Prev Slip Insertion Cumulative Count"],
            "Is Task Coding Site",
        ],
        grp.loc[
            ~grp["Prev Slip Insertion Cumulative Count"],
            "Is Task Coding Site",
        ],
    ),
    include_groups=False,
)

# Create a DataFrame to store the results (one row per "Components" level)
kruskal_df = pd.DataFrame(
    kruskal_results.tolist(),
    index=kruskal_results.index,
    columns=["H-statistic", "p-value"]
)

print(kruskal_df)
# Calculate means and standard deviations for all groups
group_stats = (
    dfpx.groupby(
        ["Components", "Prev Slip Insertion Cumulative Count"]
    )["Is Task Coding Site"]
    .agg(["mean", "std"])
    .reset_index()
)

# Print the group statistics
print(group_stats)
# Calculate group sizes
group_sizes = (
    dfpx.groupby(["Components", "Prev Slip Insertion Cumulative Count"])
    .size()
    .reset_index(name='size')
)

# Print the group sizes
print(group_sizes)


In [None]:
# Count histogram, one panel per "Components" value, of how many
# (treatment, run, task, nonzero prior slip-insertion count) groups fall on
# each side of the nonzero-prior-count split.
tp.tee(
    sns.displot,
    data=(
        dff.astype({"Prev Slip Insertion Cumulative Count": bool})
        .groupby(
            by=[
                "Treatment",
                "Run ID",
                "Task",
                "Prev Slip Insertion Cumulative Count",
            ],
            observed=True,
        )
        .agg(
            {
                "Is Task Coding Site": "mean",
                "Components": "first",
                "Task": "first",
            },
        )
    ),
    x="Prev Slip Insertion Cumulative Count",
    col="Components",
    kind="hist",
    stat="count",
)


In [None]:
# Swarm plot with one point per (treatment, run, task) group: the fraction
# of rows with a nonzero prior slip-insertion count against "Components",
# coloured by the group's mean "Is Task Coding Site".
tp.tee(
    sns.catplot,
    data=(
        dff.astype({"Prev Slip Insertion Cumulative Count": bool})
        .groupby(
            by=["Treatment", "Run ID", "Task"],
            observed=True,
        )
        .agg(
            {
                "Prev Slip Insertion Cumulative Count": "mean",
                "Is Task Coding Site": "mean",
                "Components": "first",
                "Task": "first",
            },
        )
    ),
    y="Prev Slip Insertion Cumulative Count",
    x="Components",
    hue="Is Task Coding Site",
    kind="swarm",
)


In [None]:
# Draw the same "Components" histogram twice, once normalised ("fill") and
# once as raw stacked counts ("stack"), coloured by whether the prior
# slip-insertion count is nonzero.
for multiple in ("fill", "stack"):
    tp.tee(
        sns.displot,
        data=dff.astype(
            {
                "Prev Slip Insertion Cumulative Count": bool,
                "Components": "category",
            },
        ),
        hue="Prev Slip Insertion Cumulative Count",
        x="Components",
        kind="hist",
        multiple=multiple,
        shrink=0.8,
    )


In [None]:
# Bar plot of mean "Is Task Coding Site" by "Prev Is Viability Site"
# (cast to bool), one panel per "Components" value with independent y axes.
tp.tee(
    sns.catplot,
    data=dff.astype({"Prev Is Viability Site": bool}),
    x="Prev Is Viability Site",
    y="Is Task Coding Site",
    col="Components",
    kind="bar",
    sharey=False,
)


In [None]:
# Bar plot of the fraction of rows with a nonzero prior slip-insertion count
# by "Prev Is Viability Site", one panel per "Components" value.
tp.tee(
    sns.catplot,
    data=dff.astype({"Prev Slip Insertion Cumulative Count": bool}),
    x="Prev Is Viability Site",
    y="Prev Slip Insertion Cumulative Count",
    col="Components",
    kind="bar",
    sharey=False,
)


In [None]:
# Bar plot of mean "Is Task Coding Site" by the "prev codes for tasks" flag,
# one panel per "Components" value with independent y axes.
tp.tee(
    sns.catplot,
    data=dff.astype({"Prev Slip Insertion Cumulative Count": bool}),
    x="prev codes for tasks",
    y="Is Task Coding Site",
    col="Components",
    kind="bar",
    sharey=False,
)


In [None]:
# Bar plot of the mean "prev codes for tasks" flag by whether the prior
# slip-insertion count is nonzero, one panel per "Components" value.
tp.tee(
    sns.catplot,
    data=dff.astype({"Prev Slip Insertion Cumulative Count": bool}),
    x="Prev Slip Insertion Cumulative Count",
    y="prev codes for tasks",
    col="Components",
    kind="bar",
    sharey=False,
)


In [None]:
# Bar plot of mean "Is Task Coding Site" by "SLIP_INSERTION_BOOL_MASK",
# one panel per "Components" value with independent y axes.
tp.tee(
    sns.catplot,
    data=dff.astype({"Prev Slip Insertion Cumulative Count": bool}),
    x="SLIP_INSERTION_BOOL_MASK",
    y="Is Task Coding Site",
    col="Components",
    kind="bar",
    sharey=False,
)


In [None]:
# Normalised ("fill") histogram of "Is Task Coding Site", coloured by
# "prev codes for num tasks", laid out with a column per "Components" value
# and a row per treatment. Both plotted columns are cast to categorical.
tp.tee(
    sns.displot,
    data=dff.astype(
        {
            "Components": "category",
            "Is Task Coding Site": "category",
        },
    ),
    multiple="fill",
    x="Is Task Coding Site",
    col="Components",
    kind="hist",
    hue="prev codes for num tasks",
    row="Treatment",
)


In [None]:
# Notched box plot: among task-coding sites, the number of rows per
# (lineage generation, treatment, run, components) group whose
# "prev codes for tasks" flag is set.
tp.tee(
    sns.catplot,
    data=(
        dff.loc[dff["Is Task Coding Site"]]
        .groupby(
            by=["Lineage Generation Index", "Treatment", "Run ID", "Components"],
            observed=True,
        )["prev codes for tasks"]
        .sum()
        .reset_index()
    ),
    hue="Treatment",
    y="prev codes for tasks",
    x="Components",
    kind="box",
    notch=True,
)


In [None]:
# Notched box plot: among task-coding sites, the number of rows per
# (lineage generation, treatment, run, components) group whose
# "prev coded for tasks" flag is set.
tp.tee(
    sns.catplot,
    data=(
        dff.loc[dff["Is Task Coding Site"]]
        .groupby(
            by=["Lineage Generation Index", "Treatment", "Run ID", "Components"],
            observed=True,
        )["prev coded for tasks"]
        .sum()
        .reset_index()
    ),
    hue="Treatment",
    y="prev coded for tasks",
    x="Components",
    kind="box",
    notch=True,
)


In [None]:
# Notched box plot: among task-coding sites, the fraction of rows per
# (lineage generation, treatment, run, components) group whose
# "prev codes for tasks" flag is set.
tp.tee(
    sns.catplot,
    data=dff[
        dff["Is Task Coding Site"]
    ].groupby(
        ["Lineage Generation Index", "Treatment", "Run ID", "Components"],
        observed=True,
    )['prev codes for tasks'].mean().reset_index(),
    hue="Treatment",
    y="prev codes for tasks",
    x="Components",
    kind="box",
    notch=True,
    # The plot arguments are otherwise identical to the sum-based box plot
    # of the same column, so without a distinguishing attribute both figures
    # are written to the same output file and this one overwrites the other.
    teeplot_outattrs={"agg": "mean"},
)


In [None]:
# Notched box plot: among task-coding sites, the fraction of rows per
# (lineage generation, treatment, run, components) group whose
# "prev coded for tasks" flag is set.
tp.tee(
    sns.catplot,
    data=dff[
        dff["Is Task Coding Site"]
    ].groupby(
        ["Lineage Generation Index", "Treatment", "Run ID", "Components"],
        observed=True,
    )['prev coded for tasks'].mean().reset_index(),
    hue="Treatment",
    y="prev coded for tasks",
    x="Components",
    kind="box",
    notch=True,
    # The plot arguments are otherwise identical to the sum-based box plot
    # of the same column, so without a distinguishing attribute both figures
    # are written to the same output file and this one overwrites the other.
    teeplot_outattrs={"agg": "mean"},
)


In [None]:
# Helper column of ones so that summing it counts rows.
dff["one"] = 1

# Notched box plot of the number of task-coding-site rows per
# (lineage generation, treatment, run, components) group.
tp.tee(
    sns.catplot,
    data=(
        dff.loc[dff["Is Task Coding Site"]]
        .groupby(
            by=["Lineage Generation Index", "Treatment", "Run ID", "Components"],
            observed=True,
        )["one"]
        .sum()
        .reset_index()
    ),
    hue="Treatment",
    y="one",
    x="Components",
    kind="box",
    notch=True,
)
