In [1]:
from ag3 import release_data
# Handle for the release data; later cells call v3.load_variants and v3.load_mask on it.
v3 = release_data()

In [2]:
import pandas as pd
import numpy as np

Summarize the fraction of accessible sites within each of the genomic regions listed below, and across each whole contig.

In [3]:
# Whitespace-delimited feature table, one region per line.
# Columns (as named when the table is parsed): id, type, seq_id, start, stop.
features =  """
2La     inv     2L      20528089        42165182
2Rb     inv     2R      19444433        26313071
2Rj     inv     2R      3262186 15750717
2Rc     inv     2R      26780000        31450000
2Ru     inv     2R      31480000        35500000
2Rd     inv     2R      33575891        41360919
2Rk     inv     2R      25146360        30717395
CHX     chro    X       20009764        24393108
CH2R    chro    2R      58984778        61545105
CH2L    chro    2L      1       2431617
PEU2L   chro    2L      2487770 5042389
IH2L    chro    2L      5078962 5788875
IH3R    chro    3R      38988757        41860198
CH3R    chro    3R      52161877        53200684
CH3L    chro    3L      1       1815119
PEU3L   chro    3L      1896830 4235209
IH3L    chro    3L      4264713 5031692
"""
# Feature table obtained via Thiago Antao.

In [4]:
# Parse the feature table (one region per line, whitespace-separated fields)
# and store the start/stop coordinates as 32-bit integers.
features_df = pd.DataFrame(
    data=[line.split() for line in features.strip().split("\n")],
    columns=["id", "type", "seq_id", "start", "stop"],
).astype({"start": np.int32, "stop": np.int32})

In [5]:
# Expand the abbreviated feature types to their full names.
# `replace` (rather than `map`) leaves already-expanded values untouched, so
# re-running this cell is a no-op instead of turning the whole column into NaN.
features_df["type"] = features_df["type"].replace(
    {"inv": "inversion", "chro": "chromatin"}
)

In [6]:
# Contig names; these match the seq_id values used in the feature table.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
chromosomes = ["2R", "2L", "3R", "3L", "X"]

In [7]:
# Mask names passed to v3.load_mask in the summary loop; one output table is written per mask.
avail_masks = ["gamb_colu", "arab", "gamb_colu_arab"]

In [8]:
# Group features by contig so that positions are loaded once per contig in the summary loop.
features_by_chrom = features_df.groupby("seq_id")

In [9]:
import allel

In [10]:
# NOTE(review): empty frame with the feature-table columns; no cell visible here
# references it (results are collected in `out_list` instead) — confirm whether it can be dropped.
out = pd.DataFrame(columns=features_df.columns)

In [11]:
# Collects one row per (mask, feature) plus one whole-contig row per (mask, contig).
# Row layout: [mask_id, chrom, feature_name, type, start, stop, frac_accessible].
out_list = []

for chrom, df in features_by_chrom:
    
    # Load the positions once per contig; they are reused for every mask below.
    pos = allel.SortedIndex(v3.load_variants(chrom).compute())
    
    for mask_name in avail_masks:
        
        mask = v3.load_mask(chrom, mask_name)
        # The mask must line up element-for-element with `pos`, since index
        # ranges located in `pos` are applied directly to the mask.
        assert mask.shape == pos.shape
        
        for _, row in df.iterrows():
            
            # Index range of the positions lying between the feature's start and stop.
            ix = pos.locate_range(row.start, row.stop)
            
            # Mean of the mask over the feature; presumably the mask is boolean,
            # so this is the accessible fraction — TODO confirm.
            access = mask[ix].mean().compute()
            out_list.append([mask_name, chrom, row.id, row.type, row.start, row.stop, access])
            
        # Whole-contig summary row: spans 1 to the last position, using the mean of the full mask.
        out_list.append([mask_name, chrom, chrom, "contig", 1, pos[-1], mask.mean().compute()])

In [12]:
# Assemble the collected rows into a single summary table.
result_df = pd.DataFrame(
    data=out_list,
    columns=[
        "mask_id",
        "chrom",
        "feature_name",
        "type",
        "start",
        "stop",
        "frac_accessible",
    ],
)

In [20]:
# Make `type` an ordered categorical so that sorting on it lists contigs first,
# then inversions, then chromatin features.
result_df["type"] = pd.Categorical(
    result_df["type"],
    categories=["contig", "inversion", "chromatin"],
    ordered=True,
)

In [21]:
# Order rows by mask, then feature type (following the categorical order),
# then contig and start coordinate. The sorted frame is rebound to the same
# name rather than sorted with inplace=True, which keeps the cell chainable
# and avoids mutating an object other references may still point to.
result_df = result_df.sort_values(["mask_id", "type", "chrom", "start"])

In [22]:
# Preview the first rows of the sorted summary table.
result_df.head()

Unnamed: 0,mask_id,chrom,feature_name,type,start,stop,frac_accessible
9,arab,2L,2L,contig,1,49364325,0.725773
30,arab,2R,2R,contig,1,61545105,0.73597
46,arab,3L,3L,contig,1,41963435,0.696811
56,arab,3R,3R,contig,1,53200684,0.697803
63,arab,X,X,contig,1,24393108,0.533184


In [24]:
# Write one CSV per mask; the mask name is carried by the file name, so the
# `mask_id` column is dropped from each table before writing.
for label, df in result_df.groupby("mask_id"):
    csv_path = f"../content/tables/accessibility_features_{label}.csv"
    df.drop(columns="mask_id").to_csv(csv_path, float_format="%.3f", index=False)