#### Devreg xtin states
Goals:
- Parse the chromatin states from the Filion et al. Cell paper and assign to Drosophila gene regions

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import scipy.stats as stats
import pickle

sys.path.append('../scripts')
from plot_helpers import *

import gffutils
import HTSeq
from collections import defaultdict

%load_ext autoreload
%autoreload 2

In [None]:
# Open the gffutils feature database holding the FlyBase v6.28 annotations.
# NOTE(review): `gffutils_db` is not defined in the visible cells of this notebook;
# presumably it is provided by the `from plot_helpers import *` star import -- confirm.
db = gffutils.FeatureDB(gffutils_db)

In [None]:
# Load the H3K27me3 binding sites from modENCODE L3. They have been mapped from r5->r6 by NCBI tool
bind_gff = '/Users/mk/Desktop/Davislab_old/C3.1_stability_pathway_analysis/modencode/remapped_table2409.gff3'
# merge_strategy='create_unique' is required because the file contains repeated IDs, e.g.
# ValueError: Duplicate ID _H3K27me3__Abcam_lot3__D_mel_3rd_Instar_Larvae_Nuclei_Solexa_.gff_ID008018
bind_db = gffutils.create_db(bind_gff, ':memory:', merge_strategy='create_unique')
sites = bind_db.features_of_type('binding_site')

# Build an unstranded GenomicArrayOfSets that tags every position covered by an H3K27me3 site.
cluster_gas = HTSeq.GenomicArrayOfSets("auto", stranded=False)
p = 0  # number of binding sites added; stays 0 if the file has no 'binding_site' features
for p, site in enumerate(sites, start=1):
    # GFF (and therefore gffutils) coordinates are 1-based with the end included, whereas
    # HTSeq.GenomicInterval is 0-based with the end excluded. Shift the start so the first
    # base of each site is not dropped -- the same conversion applied to the gene intervals.
    site_iv = HTSeq.GenomicInterval(site.chrom, site.start - 1, site.end)
    cluster_gas[site_iv] += 'h3k27me3'
print('num sites %s' % p)

In [None]:
# For every annotated gene, record which marks overlap the gene body and the
# flanking_bp windows directly upstream and downstream of it (strand-aware).
# Gene IDs should be from the current annotations, i.e. r6.28
xtin_dict = {region: defaultdict(set) for region in ('up', 'down', 'gene')}
flanking_bp = 1000
for gene in db.features_of_type('gene'):
    # The mt genome is skipped: it has no histones and circular intervals aren't dealt with
    if gene.chrom == 'mitochondrion_genome':
        continue

    # Convert to 0-based, end-excluded coordinates; the left flank is clamped at position 0
    body_start = gene.start - 1
    body_end = gene.end
    left_flank = HTSeq.GenomicInterval(gene.chrom, max(0, body_start - flanking_bp), body_start)
    right_flank = HTSeq.GenomicInterval(gene.chrom, body_end, body_end + flanking_bp)

    # "Upstream" is the left flank on the + strand and the right flank otherwise
    if gene.strand == '+':
        upstream, downstream = left_flank, right_flank
    else:
        upstream, downstream = right_flank, left_flank

    regions = {
        'up': upstream,
        'down': downstream,
        'gene': HTSeq.GenomicInterval(gene.chrom, body_start, body_end),
    }
    for label, region_iv in regions.items():
        # A flank has zero length when the gene sits on the left edge of the chromosome;
        # report the gene ID and move on to the next region.
        if (region_iv.end - region_iv.start) <= 0:
            print(gene.id)
            continue
        # Union of the mark sets of every step overlapping this region
        marks = set.union(*(step_marks for _, step_marks in cluster_gas[region_iv].steps()))
        for mark in marks:
            xtin_dict[label][mark].add(gene.id)

In [None]:
# Persist the per-region gene sets (the genes' methylation status) as a pickle,
# creating the output directory first if it does not exist yet.
outdir = '../Figures/genesets/'
os.makedirs(outdir, exist_ok=True)
pickle_path = os.path.join(outdir, 'h3k27me3_mapped.p')
with open(pickle_path, 'wb') as handle:
    pickle.dump(xtin_dict, handle)

In [None]:
# Summarise the mapping. The sets stored in xtin_dict contain gene IDs, so every number
# printed here is a count of genes carrying H3K27me3 in that region, not a count of sites.
up_genes = xtin_dict['up']['h3k27me3']
down_genes = xtin_dict['down']['h3k27me3']
body_genes = xtin_dict['gene']['h3k27me3']
print('num genes with h3k27me3 upstream %s' % len(up_genes))
print('num genes with h3k27me3 downstream %s' % len(down_genes))
print('num genes with h3k27me3 both down/up %s' % len(up_genes & down_genes))
print('num genes with h3k27me3 in gene %s' % len(body_genes))
# Genes marked in all three regions: upstream flank, downstream flank and gene body
updowngene = up_genes & down_genes & body_genes
print('num genes with h3k27me3 up/down/gene %s' % len(updowngene))