In [1]:
import ag2, ag3

ag2_release = ag2.release_data()
ag3_release = ag3.release_data()

  import pandas.util.testing as tm


In [2]:
import pandas as pd
import numpy as np

In [3]:
def load_accessibility(chrom, mask_id='gamb_colu'):
    
    if mask_id == "phase2":

        phase2_is_accessible = ag2_release.load_is_accessible(chrom)
        filter_n = ag2_release.load_filter_n(chrom)
        phase2_is_accessible_nonN = da.compress(
            ~filter_n, phase2_is_accessible, axis=0)

        # Compute chunk sizes avoids this error when c
        # concatenating arrays: ValueError: Arrays chunk sizes are unknown: (nan,)
        phase2_is_accessible_nonN.compute_chunk_sizes() 
        return phase2_is_accessible_nonN
    
    else:
        return ag3_release.load_mask(chrom, mask_id=mask_id)

## For given genomic regions, summarize accessibility

### What about "N" sites?

N sites are currently NOT included. 

If a feature contains many N sites- these are not considered in the % calculation.


In [4]:
features =  """
2La     inv     2L      20528089        42165182
2Rb     inv     2R      19444433        26313071
2Rj     inv     2R      3262186 15750717
2Rc     inv     2R      26780000        31450000
2Ru     inv     2R      31480000        35500000
2Rd     inv     2R      33575891        41360919
2Rk     inv     2R      25146360        30717395
CHX     chro    X       20009764        24393108
CH2R    chro    2R      58984778        61545105
CH2L    chro    2L      1       2431617
PEU2L   chro    2L      2487770 5042389
IH2L    chro    2L      5078962 5788875
IH3R    chro    3R      38988757        41860198
CH3R    chro    3R      52161877        53200684
CH3L    chro    3L      1       1815119
PEU3L   chro    3L      1896830 4235209
IH3L    chro    3L      4264713 5031692
"""
# via Thiago Antao

In [5]:
features_df = pd.DataFrame(
    [x.split() for x in features.strip().split("\n")], columns=["id", "type", "seq_id", "start", "stop"])

features_df.start = features_df.start.astype(np.int32)
features_df.stop = features_df.stop.astype(np.int32)
    
    #dtype={"start": np.int32, "stop": np.int32})

In [6]:
features_df["type"] = features_df.type.map({"inv": "inversion", "chro": "chromatin"})

In [7]:
chromosomes = ["2R", "2L", "3R", "3L", "X"]

In [8]:
avail_masks = ["gamb_colu", "arab", "gamb_colu_arab", "phase2"]

In [9]:
features_by_chrom = features_df.groupby("seq_id")

In [10]:
import allel

In [11]:
out = pd.DataFrame(columns=features_df.columns)

In [12]:
out_list = []

for chrom, df in features_by_chrom:
    
    pos = allel.SortedIndex(ag3_release.load_variants(chrom).compute())
    
    for mask_name in avail_masks:
        
        mask = load_accessibility(chrom, mask_name)
        assert mask.shape == pos.shape
        
        for _, row in df.iterrows():
            
            ix = pos.locate_range(row.start, row.stop)
            
            # computing the mean is ok- as it allows us to silently handle Ns, which are ignored.
            access = mask[ix].mean().compute()
            out_list.append([mask_name, chrom, row.id, row.type, row.start, row.stop, access])
            
        out_list.append([mask_name, chrom, chrom, "contig", 1, pos[-1], mask.mean().compute()])

NameError: name 'da' is not defined

In [None]:
result_df = pd.DataFrame(out_list, columns=["mask_id", "chrom", "feature_name", "type", "start", "stop", "frac_accessible"])

In [None]:
result_df["type"] = pd.Categorical(result_df.type, ["contig", "inversion", "chromatin"], ordered=True)

In [None]:
result_df.sort_values(["mask_id", "type", "chrom", "start"], inplace=True)

In [None]:
result_df.head()

In [None]:
for label, df in result_df.groupby("mask_id"):
    df.drop("mask_id", axis=1).to_csv(f"../content/tables/accessibility/accessibility_features_{label}.csv", float_format="%.3f", index=False)

In [None]:
result_df

## Over the exome

In [None]:
exon_data = {}
wg_data = {}

In [None]:
gff_geneset = "../../Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3.gz"

In [None]:
for chrom in chromosomes:
    
    exons = gene_features = allel.gff3_to_dataframe(
        gff_geneset, 
        attributes=["ID"]).query("(seqid == @chrom) & (type == 'exon')")
    
    pos = allel.SortedIndex(ag3_release.load_variants(chrom).compute())    
    
    for mask_name in avail_masks:
        
        mask = load_accessibility(chrom, mask_name).compute()
        
        locv, _ = pos.locate_intersection_ranges(exons.start, exons.end)
        
        v = np.compress(locv, mask, axis=0)
        
        exon_data[mask_name, chrom] = pd.Series([v.sum(), locv.sum()], index=["n_accessible", "n_total_bases"], dtype=np.int32)
        
        wg_data[mask_name, chrom] = pd.Series([mask.sum(), mask.shape[0]], index=["n_accessible", "n_total_bases"], dtype=np.int32)

## whole exome

In [None]:
exon_counts = pd.DataFrame.from_dict(exon_data, orient="index")
exon_counts.index.names = ["mask_id", "chrom"]
exon_counts

In [None]:
whole_exome_sum = exon_counts.groupby(level=0).agg(sum)
whole_exome_sum["frac_accessible"] = whole_exome_sum["n_accessible"] / whole_exome_sum["n_total_bases"]
whole_exome_sum

## whole genome

In [None]:
wg_counts = pd.DataFrame.from_dict(wg_data, orient="index")
wg_counts.index.names = ["mask_id", "chrom"]
wg_counts

In [None]:
whole_genome_sum = wg_counts.groupby(level=0).agg(sum)
whole_genome_sum["frac_accessible"] = whole_genome_sum["n_accessible"] / whole_genome_sum["n_total_bases"]
whole_genome_sum