In [1]:
from ag3 import release_data
# Release handle; later cells call v3.load_variants(...) and v3.load_mask(...) on it.
v3 = release_data()

In [2]:
import pandas as pd
import numpy as np

## For given genomic regions, summarize accessibility

### What about "N" sites?

N sites are currently NOT included. 

If a feature contains many N sites, these are not considered in the % calculation.


In [3]:
# Genomic features to summarise, one per line, whitespace-separated fields:
#   id  type  seq_id  start  stop
# (these column names are assigned when the text is parsed into `features_df`;
# the short `inv` / `chro` type codes are expanded to full labels afterwards).
features =  """
2La     inv     2L      20528089        42165182
2Rb     inv     2R      19444433        26313071
2Rj     inv     2R      3262186 15750717
2Rc     inv     2R      26780000        31450000
2Ru     inv     2R      31480000        35500000
2Rd     inv     2R      33575891        41360919
2Rk     inv     2R      25146360        30717395
CHX     chro    X       20009764        24393108
CH2R    chro    2R      58984778        61545105
CH2L    chro    2L      1       2431617
PEU2L   chro    2L      2487770 5042389
IH2L    chro    2L      5078962 5788875
IH3R    chro    3R      38988757        41860198
CH3R    chro    3R      52161877        53200684
CH3L    chro    3L      1       1815119
PEU3L   chro    3L      1896830 4235209
IH3L    chro    3L      4264713 5031692
"""
# via Thiago Antao

In [4]:
# Split the feature table into one list of string fields per line.
feature_records = [line.split() for line in features.strip().split("\n")]

# Every field is parsed as a string, so the two coordinate columns are cast to
# 32-bit integers in a single step.
features_df = pd.DataFrame(
    feature_records,
    columns=["id", "type", "seq_id", "start", "stop"],
).astype({"start": np.int32, "stop": np.int32})

In [5]:
# Expand the short type codes into readable labels. `replace` is used rather
# than `map` because it leaves values that are not keys of the mapping
# untouched: re-running this cell is then a no-op instead of turning the
# already-expanded labels into NaN.
features_df["type"] = features_df["type"].replace({"inv": "inversion", "chro": "chromatin"})

In [6]:
# Chromosome arms iterated over by the exome / whole-genome summary, in this order.
chromosomes = [
    "2R",
    "2L",
    "3R",
    "3L",
    "X",
]

In [7]:
# Mask names passed to v3.load_mask(chrom, mask_name) in the loops below.
avail_masks = [
    "gamb_colu",
    "arab",
    "gamb_colu_arab",
]

In [8]:
# Group the features by the sequence they lie on, so that each chromosome's
# data only has to be loaded once when the features are summarised.
features_by_chrom = features_df.groupby(by="seq_id")

In [9]:
import allel

In [10]:
# Empty frame with the same columns as `features_df`.
# NOTE(review): no cell visible in this notebook reads `out` (results are
# collected in `out_list` instead) - looks like a leftover; confirm before removing.
out = pd.DataFrame(columns=features_df.columns)

In [11]:
# One record per (mask, feature), plus one whole-contig record per (mask, chromosome).
# Field order: mask name, chromosome, feature name, type, start, stop, accessible fraction.
out_list = []

for chrom, chrom_features in features_by_chrom:

    # Positions for this chromosome, wrapped so that coordinate ranges can be
    # translated into index ranges.
    pos = allel.SortedIndex(v3.load_variants(chrom).compute())

    for mask_name in avail_masks:

        mask = v3.load_mask(chrom, mask_name)
        # The mask must line up one-to-one with the positions.
        assert mask.shape == pos.shape

        for _, feature in chrom_features.iterrows():

            feature_span = pos.locate_range(feature["start"], feature["stop"])

            # Taking the mean of the mask over the feature is fine here: as the
            # original note puts it, this silently handles Ns, which are ignored.
            frac_accessible = mask[feature_span].mean().compute()

            feature_record = [
                mask_name,
                chrom,
                feature["id"],
                feature["type"],
                feature["start"],
                feature["stop"],
                frac_accessible,
            ]
            out_list.append(feature_record)

        # Whole-contig summary: from 1 to the last position, over the full mask.
        contig_record = [mask_name, chrom, chrom, "contig", 1, pos[-1], mask.mean().compute()]
        out_list.append(contig_record)

In [12]:
# Column labels, in the same order as the fields of each `out_list` record.
result_columns = ["mask_id", "chrom", "feature_name", "type", "start", "stop", "frac_accessible"]
result_df = pd.DataFrame(out_list, columns=result_columns)

In [13]:
# Make "type" an ordered categorical so that sorting on it follows this
# explicit order rather than alphabetical order.
type_order = ["contig", "inversion", "chromatin"]
result_df["type"] = pd.Categorical(result_df["type"], categories=type_order, ordered=True)

In [14]:
# Order rows by mask, then feature type, then chromosome and start coordinate.
result_df = result_df.sort_values(by=["mask_id", "type", "chrom", "start"])

In [15]:
# Preview the first five rows of the sorted table.
result_df.head(n=5)

Unnamed: 0,mask_id,chrom,feature_name,type,start,stop,frac_accessible
9,arab,2L,2L,contig,1,49364325,0.725773
30,arab,2R,2R,contig,1,61545105,0.73597
46,arab,3L,3L,contig,1,41963435,0.696811
56,arab,3R,3R,contig,1,53200684,0.697803
63,arab,X,X,contig,1,24393108,0.533184


In [16]:
# Write one CSV per mask. The mask name is encoded in the file name, so the
# "mask_id" column is dropped from each table before writing.
for label, per_mask_rows in result_df.groupby("mask_id"):
    csv_path = f"../content/tables/accessibility/accessibility_features_{label}.csv"
    per_mask_table = per_mask_rows.drop(columns="mask_id")
    per_mask_table.to_csv(csv_path, float_format="%.3f", index=False)

In [17]:
# Display the full sorted table (the rendered output elides the middle rows).
result_df

Unnamed: 0,mask_id,chrom,feature_name,type,start,stop,frac_accessible
9,arab,2L,2L,contig,1,49364325,0.725773
30,arab,2R,2R,contig,1,61545105,0.735970
46,arab,3L,3L,contig,1,41963435,0.696811
56,arab,3R,3R,contig,1,53200684,0.697803
63,arab,X,X,contig,1,24393108,0.533184
...,...,...,...,...,...,...,...
48,gamb_colu_arab,3L,PEU3L,chromatin,1896830,4235209,0.678704
49,gamb_colu_arab,3L,IH3L,chromatin,4264713,5031692,0.211377
57,gamb_colu_arab,3R,IH3R,chromatin,38988757,41860198,0.335847
58,gamb_colu_arab,3R,CH3R,chromatin,52161877,53200684,0.190784


## Over the exome

In [23]:
# Per-(mask, chromosome) base counts, filled in by the loop below:
# `exon_data` for exonic positions only, `wg_data` for the whole chromosome.
exon_data, wg_data = {}, {}

In [24]:
# Parse the annotation once up front. The file does not depend on the
# chromosome, so re-reading it on every iteration of the loop was wasted work.
annotation = allel.gff3_to_dataframe("geneset.gff3.gz", attributes=["ID"])

count_labels = ["n_accessible", "n_total_bases"]

for chrom in chromosomes:

    # Exon records on this chromosome. `gene_features` is kept as an alias of
    # `exons`, as before, in case other cells read it.
    exons = gene_features = annotation.query("(seqid == @chrom) & (type == 'exon')")

    pos = allel.SortedIndex(v3.load_variants(chrom).compute())

    # Boolean selection over `pos` marking the entries that fall inside an exon
    # range. It depends only on the chromosome, not on the mask, so it is
    # computed once here rather than once per mask.
    locv, _ = pos.locate_intersection_ranges(exons.start, exons.end)
    n_exon_bases = locv.sum()

    for mask_name in avail_masks:

        mask = v3.load_mask(chrom, mask_name).compute()

        # Mask values restricted to the exonic positions.
        v = np.compress(locv, mask, axis=0)

        # Exome: accessible exonic positions vs. all exonic positions.
        exon_data[mask_name, chrom] = pd.Series([v.sum(), n_exon_bases], index=count_labels, dtype=np.int32)

        # Whole chromosome: accessible positions vs. all positions in the mask.
        wg_data[mask_name, chrom] = pd.Series([mask.sum(), mask.shape[0]], index=count_labels, dtype=np.int32)

## whole exome

In [25]:
# One row per (mask_id, chrom) key of `exon_data`; name the two index levels.
exon_counts = (
    pd.DataFrame.from_dict(exon_data, orient="index")
    .rename_axis(index=["mask_id", "chrom"])
)
exon_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,n_accessible,n_total_bases
mask_id,chrom,Unnamed: 2_level_1,Unnamed: 3_level_1
gamb_colu,2R,8326769,9434770
arab,2R,8251192,9434770
gamb_colu_arab,2R,7796125,9434770
gamb_colu,2L,6718168,7423546
arab,2L,6578902,7423546
gamb_colu_arab,2L,6284272,7423546
gamb_colu,3R,6221607,7043977
arab,3R,6096759,7043977
gamb_colu_arab,3R,5810909,7043977
gamb_colu,3L,4899622,5729235


In [26]:
# Sum the per-chromosome exon counts for each mask (index level 0).
# `.sum()` is used instead of `.agg(sum)`: passing the Python builtin `sum` to
# `.agg` is deprecated in recent pandas releases and emits a FutureWarning.
whole_exome_sum = exon_counts.groupby(level=0).sum()

# Accessible fraction of the exome = accessible exonic bases / all exonic bases.
whole_exome_sum["frac_accessible"] = whole_exome_sum["n_accessible"] / whole_exome_sum["n_total_bases"]
whole_exome_sum

Unnamed: 0_level_0,n_accessible,n_total_bases,frac_accessible
mask_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
arab,28356983,32901366,0.861879
gamb_colu,28891634,32901366,0.878129
gamb_colu_arab,26773478,32901366,0.81375


## whole genome

In [27]:
# One row per (mask_id, chrom) key of `wg_data`; name the two index levels.
wg_counts = (
    pd.DataFrame.from_dict(wg_data, orient="index")
    .rename_axis(index=["mask_id", "chrom"])
)
wg_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,n_accessible,n_total_bases
mask_id,chrom,Unnamed: 2_level_1,Unnamed: 3_level_1
gamb_colu,2R,44439759,60132453
arab,2R,44255681,60132453
gamb_colu_arab,2R,40561667,60132453
gamb_colu,2L,36005131,48525747
arab,2L,35218697,48525747
gamb_colu_arab,2L,32529983,48525747
gamb_colu,3R,37199402,52226568
arab,3R,36443830,52226568
gamb_colu_arab,3R,33390862,52226568
gamb_colu,3L,28707856,40758473


In [28]:
# Sum the per-chromosome whole-genome counts for each mask (index level 0).
# `.sum()` is used instead of `.agg(sum)`: passing the Python builtin `sum` to
# `.agg` is deprecated in recent pandas releases and emits a FutureWarning.
whole_genome_sum = wg_counts.groupby(level=0).sum()

# Accessible fraction of the genome = accessible bases / all bases in the mask.
whole_genome_sum["frac_accessible"] = whole_genome_sum["n_accessible"] / whole_genome_sum["n_total_bases"]
whole_genome_sum

Unnamed: 0_level_0,n_accessible,n_total_bases,frac_accessible
mask_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
arab,156787860,225028590,0.696746
gamb_colu,162714957,225028590,0.723086
gamb_colu_arab,143569766,225028590,0.638007
