# Explore real data to help with simulating test data

In [None]:
import malariagen_data
import allel
import numpy as np
import plotly.express as px
import plotly.io as pio

pio.templates.default = "plotly_dark"

In [None]:
ag3 = malariagen_data.Ag3(
    "simplecache::gs://vo_agam_release",
    simplecache=dict(cache_storage="../gcs_cache"),
)
ag3

In [None]:
af1 = malariagen_data.Af1(
    "simplecache::gs://vo_afun_release",
    simplecache=dict(cache_storage="../gcs_cache"),
)
af1

## Site annotations

In [None]:
# Open the Ag3 site annotations and display the returned object.
ann = ag3.open_site_annotations()
ann

In [None]:
# Iterating `ann` yields the names of its members (presumably the annotation
# fields of a zarr group — confirm against the object displayed above).
list(ann)

In [None]:
# Print the hierarchy of the annotations store.
print(ann.tree())

In [None]:
# Each annotation field below is read for contig "3L" over the 1,000,000
# elements at indices 10,000,000-11,000,000 and plotted as a histogram.
codon_degeneracy = ann["codon_degeneracy"]["3L"][10_000_000:11_000_000]
px.histogram(codon_degeneracy)

In [None]:
# Fraction of sites per value. np.bincount only accepts non-negative integers,
# so the values are shifted by +1 (a value of -1 is counted in bin 0).
np.bincount(codon_degeneracy + 1) / codon_degeneracy.size

In [None]:
codon_nonsyn = ann["codon_nonsyn"]["3L"][10_000_000:11_000_000]
px.histogram(codon_nonsyn)

In [None]:
# No shift here: this only works if codon_nonsyn has no negative values.
np.bincount(codon_nonsyn) / codon_nonsyn.size

In [None]:
codon_position = ann["codon_position"]["3L"][10_000_000:11_000_000]
px.histogram(codon_position)

In [None]:
# Shifted by +1 for the same reason as codon_degeneracy.
np.bincount(codon_position + 1) / codon_position.size

In [None]:
seq_cls = ann["seq_cls"]["3L"][10_000_000:11_000_000]
px.histogram(seq_cls)

In [None]:
# Fraction of sites per seq_cls value (values assumed non-negative).
np.bincount(seq_cls) / seq_cls.size

In [None]:
# These fields are plotted with 20 histogram bins rather than tabulated with
# np.bincount as the fields above were.
seq_flen = ann["seq_flen"]["3L"][10_000_000:11_000_000]
px.histogram(seq_flen, nbins=20)

In [None]:
seq_relpos_start = ann["seq_relpos_start"]["3L"][10_000_000:11_000_000]
px.histogram(seq_relpos_start, nbins=20)

In [None]:
seq_relpos_stop = ann["seq_relpos_stop"]["3L"][10_000_000:11_000_000]
px.histogram(seq_relpos_stop, nbins=20)

In [None]:
# Candidate simulated distribution: 100,000 draws from Beta(0.4, 4) scaled to
# the range 0-40,000, plotted with the same binning for visual comparison.
# NOTE(review): presumably meant to mimic one of the histograms above — confirm which.
x = np.random.beta(a=0.4, b=4, size=100_000) * 40_000
px.histogram(x, nbins=20)

In [None]:
# Same listing as earlier in this section, repeated for reference.
list(ann)

## SNP calls

In [None]:
# SNP calls for contig 3L in sample set AG1000G-BF-A.
ds = ag3.snp_calls(region="3L", sample_sets="AG1000G-BF-A")
ds

In [None]:
# Work on a 100,000-variant subset, selected by index along the `variants`
# dimension. All later `ds2[...]` cells use this subset.
ds2 = ds.isel(variants=slice(10_500_000, 10_600_000))
ds2

In [None]:
# Genotype calls as an in-memory array (`.values` loads the data).
gt = ds2["call_genotype"].values
gt

In [None]:
# Wrap in a scikit-allel GenotypeArray to use its counting helpers
# (count_missing, count_hom_ref, count_hom_alt) further down.
gt2 = allel.GenotypeArray(gt)
gt2

In [None]:
np.bincount(gt.flatten() + 1) / gt.size

In [None]:
missing_fraction = gt2.count_missing() / (gt2.n_variants * gt2.n_samples)
missing_fraction

In [None]:
allele_counts = np.bincount(gt.flatten() + 1)[1:]
allele_counts

In [None]:
allele_fractions = allele_counts / np.sum(allele_counts)
allele_fractions

In [None]:
gt_sim = np.random.choice(4, size=gt.shape, replace=True, p=allele_fractions)
gt_sim

In [None]:
n_calls = gt_sim.shape[0] * gt_sim.shape[1]
n_calls

In [None]:
gt_sim_flat = gt_sim.reshape(-1, 2)
gt_sim_flat

In [None]:
loc_missing = np.random.choice(
    n_calls,
    size=int(missing_fraction * n_calls),
    replace=False,
)
loc_missing

In [None]:
gt_sim_flat[loc_missing] = -1

In [None]:
# gt_sim after the in-place assignment of -1 through gt_sim_flat.
gt_sim

In [None]:
# Value fractions (shifted by +1 so -1 is bin 0): simulated ...
np.bincount(gt_sim.flatten() + 1) / gt_sim.size

In [None]:
# ... versus real.
np.bincount(gt.flatten() + 1) / gt.size

In [None]:
# Side-by-side genotype-level counts, real first and simulated second.
allel.GenotypeArray(gt).count_missing()

In [None]:
allel.GenotypeArray(gt_sim).count_missing()

In [None]:
allel.GenotypeArray(gt).count_hom_ref()

In [None]:
allel.GenotypeArray(gt_sim).count_hom_ref()

In [None]:
allel.GenotypeArray(gt).count_hom_alt()

In [None]:
allel.GenotypeArray(gt_sim).count_hom_alt()

In [None]:
gq = ds2["call_GQ"].values

In [None]:
px.histogram(gq.flatten()[:100_000])

In [None]:
gqf = gq.flatten()
n_gq = np.bincount(gqf[gqf >= 0])
n_gq

In [None]:
p_gq = np.bincount(gqf[gqf >= 0]) / gqf.size
p_gq

In [None]:
px.bar(p_gq)

In [None]:
# MQ values for the 100,000-variant subset.
mq = ds2["call_MQ"].values
mq

In [None]:
mqf = mq.flatten()

In [None]:
# Histogram of the first 100,000 flattened values only.
px.histogram(mqf[:100_000])

In [None]:
# Fraction of entries per MQ value. Negative entries are excluded from the
# counts (np.bincount needs non-negative integers) but kept in the
# denominator, so the fractions sum to less than 1 when any are present.
p_mq = np.bincount(mqf[mqf >= 0]) / mqf.size
p_mq

In [None]:
px.bar(p_mq)

In [None]:
# AD values for the 100,000-variant subset; the last axis is indexed 0..3
# below, one column per allele.
ad = ds2["call_AD"].values
ad

In [None]:
# First AD column, flattened (presumably the reference allele — confirm).
ad0 = ad[:, :, 0].flatten()
ad0

In [None]:
ad1 = ad[:, :, 1].flatten()
ad1

In [None]:
ad2 = ad[:, :, 2].flatten()
ad3 = ad[:, :, 3].flatten()

In [None]:
# Histograms of the first 10,000 values of each column (ad3 is not plotted).
px.histogram(ad0[:10_000])

In [None]:
px.histogram(ad1[:10_000])

In [None]:
px.histogram(ad2[:10_000])

In [None]:
p_ad0 = np.bincount(ad0[ad0 >= 0]) / ad0.size
px.bar(p_ad0)

In [None]:
p_ad1 = np.bincount(ad1[ad1 >= 2]) / ad1.size
px.bar(p_ad1)

In [None]:
p_ad2 = np.bincount(ad2[ad2 >= 2]) / ad1.size
px.bar(p_ad2)

In [None]:
p_ad3 = np.bincount(ad3[ad3 >= 2]) / ad1.size
px.bar(p_ad3)

In [None]:
# Variant positions for the 100,000-variant subset.
pos = ds2["variant_position"].values

In [None]:
pos

In [None]:
# Position against variant index, for the first 100,000 entries.
px.line(pos[:100_000])

In [None]:
# Alleles for the first 10 variants only; column 0 is used as the reference
# allele in the cells that follow.
alleles = ds2["variant_allele"].values[:10]
alleles

In [None]:
ref = alleles[:, 0]

In [None]:
alt_sim = np.empty(shape=(alleles.shape[0], alleles.shape[1] - 1), dtype="S1")
alt_sim[ref == b"A"] = np.array([b"C", b"T", b"G"])
alt_sim[ref == b"C"] = np.array([b"A", b"T", b"G"])
alt_sim[ref == b"T"] = np.array([b"A", b"C", b"G"])
alt_sim[ref == b"G"] = np.array([b"A", b"C", b"T"])
alt_sim

In [None]:
ds2

In [None]:
pass_gc = ds2["variant_filter_pass_gamb_colu"].values
p_pass_gc = np.sum(pass_gc) / pass_gc.size
p_pass_gc

In [None]:
pass_a = ds2["variant_filter_pass_arab"].values
p_pass_a = np.sum(pass_a) / pass_a.size
p_pass_a

In [None]:
pass_gca = ds2["variant_filter_pass_gamb_colu_arab"].values
p_pass_gca = np.sum(pass_gca) / pass_gca.size
p_pass_gca

## Sequence composition

In [None]:
# Load the full 3L reference sequence into memory (`.compute()` suggests the
# accessor returns a lazy dask array — confirm).
ag_seq = ag3.genome_sequence("3L").compute()
ag_seq

In [None]:
from collections import Counter

In [None]:
# Count occurrences of each distinct element of the sequence.
# NOTE(review): Counter iterates the array element by element in Python;
# np.unique(ag_seq, return_counts=True) would likely be much faster on a
# whole-contig sequence.
ag_seq_count = Counter(ag_seq)
ag_seq_count

In [None]:
# Af1 site filter for the "funestus" mask over a region of contig 3RL.
filter_pass = af1.site_filters(region="3RL:10,000,000-11,000,000", mask="funestus")
filter_pass

In [None]:
# Fraction of sites in the region that pass the filter.
np.sum(filter_pass).compute() / filter_pass.size

In [None]:
# First 60,000,000 elements of the Af1 3RL reference sequence, loaded into memory.
af_seq = af1.genome_sequence("3RL")[:60_000_000].compute()
af_seq

In [None]:
# Alphabet used to tabulate and simulate sequence composition: lower-case
# a/c/g/t/n followed by upper-case A/C/G/T/N, stored as single-byte strings.
bases = np.array(list("acgtnACGTN"), dtype="S1")
bases

In [None]:
# Count occurrences of each distinct element of the Af1 sequence slice.
af_seq_count = Counter(af_seq)
af_seq_count

In [None]:
# Fraction of the Af1 slice made up of each entry of `bases`; Counter returns
# 0 for any base that never occurs.
p_bases_af = {b: af_seq_count[b] / af_seq.size for b in bases}
p_bases_af

In [None]:
# NOTE(review): this recomputes p_ad0 exactly as in the SNP calls section and
# is unrelated to sequence composition — looks like a stray duplicate cell.
p_ad0 = np.bincount(ad0[ad0 >= 0]) / ad0.size
px.bar(p_ad0)

In [None]:
# Same per-base fractions for the Ag3 3L sequence, as a dict for display.
p_bases_ag = {b: ag_seq_count[b] / ag_seq.size for b in bases}
p_bases_ag

In [None]:
# Rebinds p_bases_ag to an array ordered like `bases`, the form needed as the
# `p` argument of np.random.choice below.
p_bases_ag = np.array([ag_seq_count[b] for b in bases]) / ag_seq.size
p_bases_ag

In [None]:
# Sanity check: this is 1 only if every element of ag_seq is one of `bases`;
# np.random.choice requires the probabilities to sum to 1.
p_bases_ag.sum()

In [None]:
# Simulated sequence of the same length as ag_seq, with each position drawn
# independently from the empirical base frequencies.
seq_sim = np.random.choice(bases, size=ag_seq.size, replace=True, p=p_bases_ag)
seq_sim

In [None]:
af1.sample_sets()

In [None]:
ds = af1.snp_calls(
    region="3RL:10_000_000-10_500_000", sample_sets="1230-VO-GA-CF-AYALA-VMF00045"
)
ds

In [None]:
gt = ds["call_genotype"].values
gt

In [None]:
allele_counts = np.bincount(gt.flatten() + 1)[1:]
allele_counts

In [None]:
allele_counts / np.sum(allele_counts)

In [None]:
np.sum(gt < 0) / gt.size