In [None]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

from pylib.analyze_agnostic import assay_agnostic_naive
from pylib.analyze_epistasis import (
    describe_skeletons,
    skeletonize_naive,
)
from pylib.modelsys_explicit import GenomeExplicit
from pylib.modelsys_explicit import (
    GenomeExplicit,
    CalcKnockoutEffectsAdditive,
    CalcKnockoutEffectsEpistasis,
    create_additive_array,
    create_epistasis_matrix_disjoint,
    describe_additive_array,
    describe_epistasis_matrix,
)


In [None]:
# Seed NumPy's legacy global generator so that code drawing from
# `np.random.*` is reproducible after a kernel restart.
np.random.seed(seed=1234)


## Create Sample Genome


In [None]:
num_sites = 10000


def distn(x):
    """Return `x` draws from `np.random.rand`, each scaled by 0.7."""
    return 0.7 * np.random.rand(x)


# Build the additive and epistatic layers over the same number of sites.
# NOTE(review): the meaning of the 0.04 and (40, 4) arguments is defined by the
# pylib helpers and is not visible from this notebook -- confirm before tuning.
additive_array = create_additive_array(num_sites, 0.04, distn)
epistasis_matrix = create_epistasis_matrix_disjoint(num_sites, 40, 4)

# The genome is assembled from both knockout-effect calculators, additive
# first and epistasis second.
additive_effects = CalcKnockoutEffectsAdditive(additive_array)
epistasis_effects = CalcKnockoutEffectsEpistasis(
    epistasis_matrix,
    effect_size=(0.7, 1.6),
)
genome = GenomeExplicit([additive_effects, epistasis_effects])


## Describe and Inspect Genome


In [None]:
dfa = describe_additive_array(additive_array)
dfb = describe_epistasis_matrix(epistasis_matrix)

# Join the two per-site descriptions on "site", then label every site by
# folding its two membership flags into a small integer code
# (additive contributes 1, epistasis contributes 2) and mapping code -> name.
df_genome = dfa.merge(dfb, on="site").assign(
    **{
        "site type": lambda df: (
            df["additive site"].astype(int)
            + 2 * df["epistasis site"].astype(int)
        ).map(
            {
                0: "neutral",
                1: "additive",
                2: "epistasis",
                3: "both",
            }
        ),
    },
)

df_genome


How many of each kind of site are in the genome?


In [None]:
# Distribution of site types, drawn with a log-scaled count axis.
sns.displot(df_genome["site type"])
plt.gca().set_yscale("log")

# Print the same tallies as text, followed by the number of sites whose
# label is anything other than "neutral".
print(df_genome["site type"].value_counts())
print("non-neutral", df_genome["site type"].ne("neutral").sum())


## Perform Skeletonizations


In [None]:
num_skeletonizations = 5

# Run the naive skeletonization repeatedly against the same genome,
# collecting one result per replicate (tqdm reports progress).
replicate_skeletons = []
for replicate in tqdm(range(num_skeletonizations)):
    replicate_skeletons.append(
        skeletonize_naive(num_sites, genome.test_knockout),
    )

# Stack the replicate results row-wise into a single array.
skeletons = np.vstack(replicate_skeletons)


Example skeleton.


In [None]:
# Skeleton entries are truthy where a site was knocked out; invert the mask so
# that True marks the sites retained by the first skeleton.
retained_sites = np.logical_not(skeletons[0].astype(bool))

# Draw one tick at the index of every retained site.
sns.rugplot(np.flatnonzero(retained_sites), height=0.5)
retained_sites


## Describe Skeletons


In [None]:
# Per-site summaries across all skeletonizations (means taken down the rows).
# NOTE(review): other cells invert `skeletons.astype(bool)` to obtain retained
# sites, so "skeleton frequency" here may be the fraction of skeletons in which
# a site was knocked out rather than retained -- confirm the intended meaning.
site_summary = {
    "skeleton order": skeletons.mean(axis=0),
    "skeleton frequency": skeletons.astype(bool).mean(axis=0),
}
sns.scatterplot(site_summary, x="skeleton order", y="skeleton frequency")
plt.show()

# Tabulate the skeletons using the genome's knockout test.
df_skeletons = describe_skeletons(skeletons, genome.test_knockout)

df_skeletons


How many unique sites are in any skeleton?


In [None]:
# A site is counted when at least one skeleton retains it, i.e. when it is
# NOT marked knocked-out (truthy) in every skeleton.
np.logical_not(skeletons.astype(bool).all(axis=0)).sum()


## Estimate Number of Functional Sites

Estimate using the mark-recapture method.


In [None]:
# Run the agnostic assay on the skeleton summary table; as the cell's last
# expression, whatever it returns is shown as the cell output.
# NOTE(review): the return type/format is defined in pylib.analyze_agnostic
# and is not visible from this notebook.
assay_agnostic_naive(df_skeletons)
