In [1]:
import numpy as np
import pandas as pd
import pyranges as pr

In [2]:
from modality.contig_dataset import ContigDataset
from modality.datasets import load_biomodal_dataset
from modality.annotation import (
    get_genes,
    get_transcription_end_region,
    get_tss_region,
    get_exons,
    get_introns,
    get_five_prime_utrs,
    get_three_prime_utrs,
    get_transcripts,
    get_cpg_islands,
)
def load_data(dataset: str = "") -> ContigDataset:
    """Load a ContigDataset and pool all of its samples into one sample.

    Args:
        dataset: Path passed to ``ContigDataset.from_zarrz``. When left as the
            empty string, ``load_biomodal_dataset()`` is called instead.

    Returns:
        The dataset after dropping two per-sample metadata variables, summing
        over ``sample_id``, re-adding a length-one ``sample_id`` dimension
        (axis 1) labelled ``"sample_0"``, and assigning fractions of
        ``num_modc`` / ``num_mc`` / ``num_hmc`` over ``num_total_c``.
    """
    if dataset != "":
        # e.g. ContigDataset.from_zarrz("../ES-E14.zarrz")
        loaded = ContigDataset.from_zarrz(dataset)
    else:
        loaded = load_biomodal_dataset()

    # Summing over sample_id removes that dimension, so it is added back as a
    # single pooled sample with an explicit coordinate label.
    pooled = (
        loaded
        .drop_vars(["Input DNA Quantity (ng/sample)", "tech_replicate_number"])
        .sum(dim="sample_id", keep_attrs=True)
        .expand_dims(dim="sample_id", axis=1)
        .assign_coords(sample_id=["sample_0"])
    )

    # NOTE(review): min_coverage=10 presumably masks low-coverage positions —
    # confirm against the modality documentation.
    pooled.assign_fractions(
        numerators=["num_modc", "num_mc", "num_hmc"],
        denominator="num_total_c",
        min_coverage=10,
        inplace=True,
    )
    return pooled

In [3]:
# Load the default dataset (empty path) with all samples pooled into one.
ds = load_data(dataset="")

It can be added using the `assign_coords` method, e.g. `ds.assign_coords(sample_id = ("sample_id", ["sample1", "sample2"]))`
It can be added using the `assign_coords` method, e.g. `ds.assign_coords(sample_id = ("sample_id", ["sample1", "sample2"]))`


In [6]:
# Restrict the gene annotation to HAVANA protein-coding genes.
gene_filter = dict(gene_type="protein_coding", source="HAVANA")

# mm10 gene records as a PyRanges object.
genes = get_genes(reference="mm10", as_pyranges=True, filterby=gene_filter)

# All mm10 transcripts (no contig/start/end restriction), returned without
# the PyRanges wrapper so they can be grouped by gene_id later.
transcripts = get_transcripts(
    reference="mm10",
    contig=None,
    start=None,
    end=None,
    as_pyranges=False,
)

def select_transcript_based_on_tag(df):
    """Pick one transcript per gene according to the priority of its tag string.

    Tag priority, best first (an ``exp_conf`` tag, i.e. experimentally
    confirmed, outranks the same combination without it):

        1. 'basic,appris_principal_1,exp_conf,CCDS'
        2. 'basic,appris_principal_1,CCDS'
        3. 'basic,appris_principal_1,exp_conf'
        4. 'basic,appris_principal_1'
        5. 'basic,exp_conf,CCDS'
        6. 'basic,CCDS'
        7. 'basic,exp_conf'
        8. 'basic'

    Tag strings are matched exactly; any other string gets no priority (NaN)
    and sorts after all ranked tags.

    Args:
        df: DataFrame with at least ``gene_id``, ``transcript_id`` and ``tag``
            columns. It is not modified.

    Returns:
        DataFrame with columns ``gene_id`` and ``transcript_id`` holding one
        row per distinct ``gene_id``: the row with the best-ranked tag, ties
        resolved in favour of the row that appears first in ``df``.
    """
    tag_priorities = {
        'basic,appris_principal_1,exp_conf,CCDS': 1,
        'basic,appris_principal_1,CCDS': 2,
        'basic,appris_principal_1,exp_conf': 3,
        'basic,appris_principal_1': 4,
        'basic,exp_conf,CCDS': 5,
        'basic,CCDS': 6,
        'basic,exp_conf': 7,
        'basic': 8,
    }

    # assign() returns a new frame, so the caller's DataFrame (a groupby
    # chunk in this notebook) does not gain a 'tag_priority' column.
    ranked = df.assign(tag_priority=df["tag"].map(tag_priorities))

    # A stable sort keeps the input order among equal priorities, which makes
    # the selection deterministic; NaN priorities are placed last by default.
    ranked = ranked.sort_values(by="tag_priority", kind="stable")

    # The first row per gene is now the best-ranked transcript.
    best_per_gene = ranked.drop_duplicates(subset="gene_id", keep="first")

    return best_per_gene[["gene_id", "transcript_id"]]

2024-07-30 14:31:59 | INFO | [modality/annotation.py:437] Removing readthrough_gene transcripts for gff (https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.basic.annotation.gff3.gz)
2024-07-30 14:32:06 | INFO | [modality/annotation.py:437] Removing readthrough_gene transcripts for gff (https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.basic.annotation.gff3.gz)


In [7]:
# Choose one transcript per gene: the selector is applied to each gene_id
# group and the group index is discarded afterwards.
per_gene_selection = transcripts.groupby('gene_id').apply(select_transcript_based_on_tag)
selected_transcripts = per_gene_selection.reset_index(drop=True)

# Lookup table: gene_id -> selected transcript_id
transcript_by_gene = selected_transcripts.set_index('gene_id')['transcript_id']
gene_to_transcription = transcript_by_gene.to_dict()
print(f"There are {len(gene_to_transcription)} genes with unique transcript ids")

def map_transcription_id(gene_id, gene_to_transcription):
    """Return the transcript id stored for ``gene_id``, or ``None`` if absent.

    Args:
        gene_id: Key to look up.
        gene_to_transcription: Mapping of gene id -> transcript id.
    """
    if gene_id in gene_to_transcription:
        return gene_to_transcription[gene_id]
    return None

# Map transcription_id 
# Attach the selected transcript id to every gene record.
genes_df = genes.df
genes_df['Transcript_id'] = genes_df['Gene_id'].map(
    lambda gid: map_transcription_id(gid, gene_to_transcription)
)
genes_clean = pr.PyRanges(genes_df)

# Genes for which no transcript was selected.
annotated_genes = genes_clean.df
lacks_transcript = annotated_genes['Transcript_id'].isna()
missing_gene_ids = annotated_genes.loc[lacks_transcript, 'Gene_id'].tolist()

# Report which tags those genes carry.
filtered_genes = annotated_genes[annotated_genes['Gene_id'].isin(missing_gene_ids)]
print(f"Tags for {len(filtered_genes)} missing genes:")
print(filtered_genes['Tag'].value_counts())

# Keep only the genes that received a transcript id.
genes_clean = genes_clean[genes_clean.df['Transcript_id'].notna()]

print(f"all genes: {len(genes)}")
print(f"all genes with annotated transcripts: {len(genes_clean)}")
genes_clean


There are 21541 genes with unique transcript ids
Tags for 132 missing genes:
Tag
                                                 87
overlapping_locus                                13
reference_genome_error                           10
fragmented_locus                                  9
ncRNA_host,fragmented_locus                       2
ncRNA_host                                        2
overlapping_locus,reference_genome_error          2
ncRNA_host,reference_genome_error                 2
ncRNA_host,overlapping_locus                      2
fragmented_locus,reference_genome_error           1
fragmented_locus,overlapping_locus                1
ncRNA_host,fragmented_locus,overlapping_locus     1
Name: count, dtype: int64
all genes: 21673
all genes with annotated transcripts: 21541


Unnamed: 0,Chromosome,Source,Type,Start,End,Score,Strand,Phase,Id,Gene_id,Gene_type,Gene_name,Level,Mgi_id,Havana_gene,Tag,Ranges_ID,Transcript_id
0,1,HAVANA,gene,4807787,4848409,.,+,.,ENSMUSG00000025903.14,ENSMUSG00000025903.14,protein_coding,Lypla1,2,MGI:1344588,OTTMUSG00000021562.4,overlapping_locus,0,ENSMUST00000027036.10
1,1,HAVANA,gene,4807891,4886769,.,+,.,ENSMUSG00000104217.1,ENSMUSG00000104217.1,protein_coding,Gm37988,2,MGI:5611216,OTTMUSG00000050100.1,overlapping_locus,1,ENSMUST00000155020.1
2,1,HAVANA,gene,4857813,4897908,.,+,.,ENSMUSG00000033813.15,ENSMUSG00000033813.15,protein_coding,Tcea1,2,MGI:1196624,OTTMUSG00000042348.1,overlapping_locus,2,ENSMUST00000081551.13
3,1,HAVANA,gene,5070017,5162528,.,+,.,ENSMUSG00000033793.12,ENSMUSG00000033793.12,protein_coding,Atp6v1h,2,MGI:1914864,OTTMUSG00000050145.9,,3,ENSMUST00000192847.5
4,1,HAVANA,gene,5588465,5606130,.,+,.,ENSMUSG00000025905.14,ENSMUSG00000025905.14,protein_coding,Oprk1,2,MGI:97439,OTTMUSG00000034734.3,,4,ENSMUST00000160777.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21536,Y,HAVANA,gene,78835720,78838055,.,-,.,ENSMUSG00000094739.2,ENSMUSG00000094739.2,protein_coding,Gm20806,2,MGI:5434162,OTTMUSG00000046577.2,,21668,ENSMUST00000190349.1
21537,Y,HAVANA,gene,79148788,79151121,.,-,.,ENSMUSG00000095867.2,ENSMUSG00000095867.2,protein_coding,Gm20917,2,MGI:5434273,OTTMUSG00000046619.2,,21669,ENSMUST00000188706.1
21538,Y,HAVANA,gene,84562571,84564906,.,-,.,ENSMUSG00000094660.2,ENSMUSG00000094660.2,protein_coding,Gm21394,2,MGI:5434749,OTTMUSG00000045415.1,,21670,ENSMUST00000189463.1
21539,Y,HAVANA,gene,85528516,85530907,.,-,.,ENSMUSG00000095650.2,ENSMUSG00000095650.2,protein_coding,Gm20854,2,MGI:5434210,OTTMUSG00000042966.1,,21671,ENSMUST00000181549.1


Splitting the ContigDataset by strand

In [22]:
import xarray as xr
from modality.contig_dataset import set_contig_slices, cast_result


# Build one boolean mask per strand. The comparison is evaluated eagerly with
# .compute() and wrapped in a DataArray that reuses the dims and coords of the
# 'strand' variable so it can be passed to ds.where().
strand_var = ds['strand']

plus_strand_mask = (strand_var.data == '+').compute()
plus_strand_mask_da = xr.DataArray(
    plus_strand_mask, dims=strand_var.dims, coords=strand_var.coords
)
plus_strand = ds.where(plus_strand_mask_da, drop=True)

minus_strand_mask = (strand_var.data == '-').compute()
minus_strand_mask_da = xr.DataArray(
    minus_strand_mask, dims=strand_var.dims, coords=strand_var.coords
)
minus_strand = ds.where(minus_strand_mask_da, drop=True)

def refresh_slices(subset_data):
    """Drop stale ``slice*`` attributes from a subset and recompute contig slices.

    Args:
        subset_data: Object exposing ``pos`` (with a ``size``) and a mutable
            ``attrs`` mapping. Its ``attrs`` are modified in place.

    Returns:
        Whatever ``set_contig_slices(subset_data)`` returns.

    Raises:
        ValueError: If the subset contains no positions.
    """
    if subset_data.pos.size == 0:
        raise ValueError(
            "Subset is empty. Please check the coverage values and method used."
        )

    # Collect the keys first so attrs is not modified while being iterated.
    stale_keys = [key for key in subset_data.attrs if key.startswith("slice")]
    for key in stale_keys:
        del subset_data.attrs[key]

    # NOTE: no rechunking is done here. An earlier draft rechunked the subset
    # (rechunk_dataset with axis1_chunk_size=subset_data.sample_id.size) to
    # resolve the irregular chunk sizes that result from subsetting.
    return set_contig_slices(subset_data)

# Recompute the slice attributes on the underlying dataset (.ds) of each
# strand subset, then pass the result through cast_result.
# NOTE(review): cast_result presumably wraps the result as a ContigDataset
# again — confirm.
plus_refreshed = refresh_slices(plus_strand.ds)
plus_strand = cast_result(plus_refreshed)

minus_refreshed = refresh_slices(minus_strand.ds)
minus_strand = cast_result(minus_refreshed)

2024-07-30 14:46:57 | INFO | [modality/contig_slices.py:25] Computing available chromosomes.
2024-07-30 14:46:57 | INFO | [modality/contig_slices.py:44] Found available contigs: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT.
2024-07-30 14:46:57 | INFO | [modality/contig_slices.py:25] Computing available chromosomes.
2024-07-30 14:46:58 | INFO | [modality/contig_slices.py:44] Found available contigs: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT.


Getting the TSS and TES regions

In [8]:
# Keyword arguments shared by the region-extraction calls.
default_args = dict(
    contig=None,
    start=None,
    end=None,
    reference="mm10",
    as_pyranges=True,
    protein_coding=True,
    filterby=None,
)

# NOTE(review): start_offset=-200 with span=200 presumably gives the 200 bp
# window immediately upstream of each TSS — confirm.
before_tss = get_tss_region(start_offset=-200, span=200, **default_args)

print(f"all before_tss regions: {len(before_tss)}")

# Attach the selected transcript id to every region, then keep only regions
# whose gene has one.
before_tss_df = before_tss.df
before_tss_df['Transcript_id'] = before_tss_df['Gene_id'].map(
    lambda gid: map_transcription_id(gid, gene_to_transcription)
)
before_tss = pr.PyRanges(before_tss_df)
before_tss = before_tss[before_tss.df['Transcript_id'].notna()]

print(f"before_tss regions with transcripts: {len(before_tss)}")

2024-07-30 14:32:48 | INFO | [modality/annotation.py:437] Removing readthrough_gene transcripts for gff (https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.basic.annotation.gff3.gz)


all before_tss regions: 21673
before_tss regions with transcripts: 21541


In [10]:
# NOTE(review): start_offset=0 with span=1000 presumably gives the 1 kb window
# starting at each transcription end site — confirm.
after_tes = get_transcription_end_region(
    start_offset=0,
    span=1000,
    **default_args,
)

print(f"all after_tes regions: {len(after_tes)}")

# Attach the selected transcript id to every TES region.
after_tes_df = after_tes.df
after_tes_df['Transcript_id'] = after_tes_df['Gene_id'].map(
    lambda gene_id: map_transcription_id(gene_id, gene_to_transcription)
)

# Rebuild the ranges from the TES frame annotated just above. Using the TSS
# frame here would make every "after_tes" range a TSS window, while the
# printed counts would look identical.
after_tes = pr.PyRanges(after_tes_df)

# Keep only regions whose gene has a selected transcript.
after_tes = after_tes[~after_tes.df['Transcript_id'].isna()]

print(f"after_tes regions with transcripts: {len(after_tes)}")

2024-07-30 14:33:34 | INFO | [modality/annotation.py:437] Removing readthrough_gene transcripts for gff (https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.basic.annotation.gff3.gz)


all after_tes regions: 21673
after_tes regions with transcripts: 21541


In [30]:
#split strands
before_tss_strand1 = pr.PyRanges(before_tss.df[before_tss.df["Strand"]=="+"])
before_tss_strand2 = pr.PyRanges(before_tss.df[before_tss.df["Strand"]=="-"])
after_tes_strand1 = pr.PyRanges(after_tes.df[after_tes.df["Strand"]=="+"])
after_tes_strand2 = pr.PyRanges(after_tes.df[after_tes.df["Strand"]=="-"])


before_tss_strand1 = before_tss_strand1.unstrand()
before_tss_strand2 = before_tss_strand2.unstrand()
after_tes_strand1 = after_tes_strand1.unstrand()
after_tes_strand2 = after_tes_strand2.unstrand()


In [26]:
# Spot check: look for gene ENSMUSG00000000184.12 among the plus-strand TSS regions.
before_tss_strand1.df.loc[lambda frame: frame['Gene_id'] == "ENSMUSG00000000184.12"]

Unnamed: 0,Chromosome,Source,Type,Start,End,Score,Strand,Phase,Id,Gene_id,Gene_type,Gene_name,Level,Mgi_id,Havana_gene,Tag,Ranges_ID,Transcript_id


In [27]:
# Spot check: look for gene ENSMUSG00000000184.12 among the minus-strand TSS regions.
before_tss_strand2.df.loc[lambda frame: frame['Gene_id'] == "ENSMUSG00000000184.12"]

Unnamed: 0,Chromosome,Source,Type,Start,End,Score,Strand,Phase,Id,Gene_id,Gene_type,Gene_name,Level,Mgi_id,Havana_gene,Tag,Ranges_ID,Transcript_id
3685,6,HAVANA,gene,127152192,127152392,.,-,.,ENSMUSG00000000184.12,ENSMUSG00000000184.12,protein_coding,Ccnd2,2,MGI:88314,OTTMUSG00000056347.1,,7559,ENSMUST00000000188.11


In [33]:
# Region sets keyed by region name, one dictionary per annotated strand.
regions_dict_strand1 = {
    "before_tss": before_tss_strand1,
    "after_tes": after_tes_strand1,
}

regions_dict_strand2 = {
    "before_tss": before_tss_strand2,
    "after_tes": after_tes_strand2,
}

# Label every range with the name of its region and, best effort, drop the
# columns listed in `to_drop`.
# NOTE(review): `to_drop` is not defined in the cells visible here. If it is
# undefined, the resulting NameError is reported below and no columns are
# dropped; define `to_drop` explicitly in an earlier cell if columns should
# be removed.
for strand_regions in (regions_dict_strand1, regions_dict_strand2):
    for region in strand_regions:
        strand_regions[region].Region = region
        try:
            strand_regions[region] = strand_regions[region].drop(to_drop)
        except Exception as exc:
            # Still best effort, but the failure is reported, not silently
            # swallowed (and KeyboardInterrupt is no longer caught).
            print(f"Columns not dropped for region {region!r}: {exc!r}")


In [23]:
# Display the plus-strand subset to inspect its variables and chunking.
plus_strand

Unnamed: 0,Array,Chunk
Bytes,248.66 MiB,0.95 MiB
Shape,"(13037140,)","(50000,)"
Dask graph,261 chunks in 3 graph layers,261 chunks in 3 graph layers
Data type,,
"Array Chunk Bytes 248.66 MiB 0.95 MiB Shape (13037140,) (50000,) Dask graph 261 chunks in 3 graph layers Data type",13037140  1,

Unnamed: 0,Array,Chunk
Bytes,248.66 MiB,0.95 MiB
Shape,"(13037140,)","(50000,)"
Dask graph,261 chunks in 3 graph layers,261 chunks in 3 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140,)","(50000,)"
Dask graph,261 chunks in 3 graph layers,261 chunks in 3 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 99.47 MiB 390.62 kiB Shape (13037140,) (50000,) Dask graph 261 chunks in 3 graph layers Data type int64 numpy.ndarray",13037140  1,

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140,)","(50000,)"
Dask graph,261 chunks in 3 graph layers,261 chunks in 3 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140,)","(50000,)"
Dask graph,261 chunks in 3 graph layers,261 chunks in 3 graph layers
Data type,,
"Array Chunk Bytes 99.47 MiB 390.62 kiB Shape (13037140,) (50000,) Dask graph 261 chunks in 3 graph layers Data type",13037140  1,

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140,)","(50000,)"
Dask graph,261 chunks in 3 graph layers,261 chunks in 3 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 11 graph layers,261 chunks in 11 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 99.47 MiB 390.62 kiB Shape (13037140, 1) (50000, 1) Dask graph 261 chunks in 11 graph layers Data type float64 numpy.ndarray",1  13037140,

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 11 graph layers,261 chunks in 11 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 11 graph layers,261 chunks in 11 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 99.47 MiB 390.62 kiB Shape (13037140, 1) (50000, 1) Dask graph 261 chunks in 11 graph layers Data type float64 numpy.ndarray",1  13037140,

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 11 graph layers,261 chunks in 11 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 11 graph layers,261 chunks in 11 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 99.47 MiB 390.62 kiB Shape (13037140, 1) (50000, 1) Dask graph 261 chunks in 11 graph layers Data type float64 numpy.ndarray",1  13037140,

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 11 graph layers,261 chunks in 11 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 11 graph layers,261 chunks in 11 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 99.47 MiB 390.62 kiB Shape (13037140, 1) (50000, 1) Dask graph 261 chunks in 11 graph layers Data type float64 numpy.ndarray",1  13037140,

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 11 graph layers,261 chunks in 11 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 11 graph layers,261 chunks in 11 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 99.47 MiB 390.62 kiB Shape (13037140, 1) (50000, 1) Dask graph 261 chunks in 11 graph layers Data type float64 numpy.ndarray",1  13037140,

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 11 graph layers,261 chunks in 11 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 11 graph layers,261 chunks in 11 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 99.47 MiB 390.62 kiB Shape (13037140, 1) (50000, 1) Dask graph 261 chunks in 11 graph layers Data type float64 numpy.ndarray",1  13037140,

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 11 graph layers,261 chunks in 11 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 11 graph layers,261 chunks in 11 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 99.47 MiB 390.62 kiB Shape (13037140, 1) (50000, 1) Dask graph 261 chunks in 11 graph layers Data type float64 numpy.ndarray",1  13037140,

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 11 graph layers,261 chunks in 11 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 20 graph layers,261 chunks in 20 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 99.47 MiB 390.62 kiB Shape (13037140, 1) (50000, 1) Dask graph 261 chunks in 20 graph layers Data type float64 numpy.ndarray",1  13037140,

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 20 graph layers,261 chunks in 20 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 20 graph layers,261 chunks in 20 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 99.47 MiB 390.62 kiB Shape (13037140, 1) (50000, 1) Dask graph 261 chunks in 20 graph layers Data type float64 numpy.ndarray",1  13037140,

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 20 graph layers,261 chunks in 20 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 20 graph layers,261 chunks in 20 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 99.47 MiB 390.62 kiB Shape (13037140, 1) (50000, 1) Dask graph 261 chunks in 20 graph layers Data type float64 numpy.ndarray",1  13037140,

Unnamed: 0,Array,Chunk
Bytes,99.47 MiB,390.62 kiB
Shape,"(13037140, 1)","(50000, 1)"
Dask graph,261 chunks in 20 graph layers,261 chunks in 20 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [34]:
# plus strand methylation vs plus strand regions = same strand
same_strand_ds_plus = plus_strand.reduce_byranges(
    ranges=list(regions_dict_strand1.values()), 
    var=["num_mc", "num_hmc", "num_modc", "num_total_c"]
    )

# plus strand methylation vs minus strand regions = opposite strand
opposite_strand_ds_plus = plus_strand.reduce_byranges(
    ranges=list(regions_dict_strand2.values()), 
    var=["num_mc", "num_hmc", "num_modc", "num_total_c"]
    )



# minus strand methylation vs minus strand regions = same strand
same_strand_ds_minus = minus_strand.reduce_byranges(
    ranges=list(regions_dict_strand2.values()), 
    var=["num_mc", "num_hmc", "num_modc", "num_total_c"]
    )
# minus strand methylation vs plus strand regions = opposite strand
opposite_strand_ds_minus = minus_strand.reduce_byranges(
    ranges=list(regions_dict_strand1.values()), 
    var=["num_mc", "num_hmc", "num_modc", "num_total_c"]
    )


In [48]:
# Strand-agnostic comparison: reduce the full (unsplit) dataset over the
# unstranded TSS and TES regions.
regions_dict_strand = dict(
    before_tss=before_tss.unstrand(),
    after_tes=after_tes.unstrand(),
)
unstranded_ranges = list(regions_dict_strand.values())
unstrand = ds.reduce_byranges(
    ranges=unstranded_ranges,
    var=["num_mc", "num_hmc", "num_modc", "num_total_c"],
)

In [49]:
# Spot check: gene ENSMUSG00000000184.12 in the strand-agnostic reduction.
unstrand_gene_mask = unstrand["Gene_id"] == "ENSMUSG00000000184.12"
unstrand.where(unstrand_gene_mask, drop=True)

In [41]:
# Spot check: gene ENSMUSG00000000184.12 in the minus-strand, same-strand reduction.
same_minus_gene_mask = same_strand_ds_minus["Gene_id"] == "ENSMUSG00000000184.12"
same_strand_ds_minus.where(same_minus_gene_mask, drop=True)

In [40]:
# Spot check: gene ENSMUSG00000000184.12 in the plus-strand, opposite-strand reduction.
opposite_plus_gene_mask = opposite_strand_ds_plus["Gene_id"] == "ENSMUSG00000000184.12"
opposite_strand_ds_plus.where(opposite_plus_gene_mask, drop=True)