In [1]:
%load_ext autoreload
%autoreload 2
from exon_evolution import *

In [2]:
# Input/output locations for this analysis.
# NOTE(review): these are absolute paths on one user's machine; the notebook
# will not run elsewhere without editing them.
# Directory holding the annotation file (it is concatenated with `filename` below).
dirct = '/home/msarrias/projects/exon_dups_analysis/code/00-fasta-files/pieris_napi/data/'
# GFF annotation file name, relative to `dirct`.
filename = 'Pieris_napi_brakerProt_rename_agat.gff'
# Path of the database file opened with gffutils.FeatureDB in the next cell.
db_direct = '/home/msarrias/dbs/pieris_napi.db'

# Run only once: presumably builds the database at `db_direct` from the GFF
# file (create_database comes from the star import; confirm its behavior there).
# create_database(dirct + filename, db_direct)

In [3]:
# Open the existing annotation database with the gffutils library.
# keep_order=True is a gffutils option; per its documentation it preserves the
# order of feature attributes on returned features.
db = gffutils.FeatureDB(db_direct, keep_order=True)
# Build the annotation dictionary consumed by basic_stat() in the next cell.
# NOTE(review): get_annotations_dict comes from the star import; its exact
# return structure is not visible here.
annot_dict = get_annotations_dict(db)

In [4]:
# Summary statistics table: one row per feature type found in the database,
# plus a trailing 'Total' row label.
stats_records = basic_stat(annot_dict)
row_labels = [*db.featuretypes(), 'Total']
df_stats = pd.DataFrame(stats_records, index=row_labels)
df_stats

Unnamed: 0,Number,Size total (kb),Size mean (bp)
CDS,123638,26884.55,217.45
exon,123638,26966.5,218.11
five_prime_UTR,22,62.28,2831.09
gene,32898,220931.03,6715.64
intron,105783,102665.38,970.53
mRNA,17894,129761.45,7251.67
start_codon,17841,35.68,2.0
stop_codon,17844,35.69,2.0
three_prime_UTR,17,19.63,1154.82
transcript,16964,114183.29,6730.92


In [5]:
# Build a nested mapping: gene id -> mRNA id -> list of feature annotations.
# Each annotation is a dict with the feature's interval ('coord'), 'id',
# 'strand' and 'type'. Only children of 'mRNA' features are collected.
# NOTE(review): P is presumably the `portion` interval library exposed by the
# star import; P.open(start, end) would then exclude both endpoints - confirm
# this is the intended convention for the GFF coordinates.
CHILD_FEATURE_TYPES = ['CDS', 'exon', 'intron', 'five_prime_UTR', 'three_prime_UTR']

gene_hierarchy_dict = {}
for gene in db.features_of_type('gene'):
    transcripts = {}
    for mrna in db.children(gene.id, featuretype='mRNA', order_by='start'):
        annotations = [
            {'coord': P.open(feat.start, feat.end),
             'id': feat.id,
             'strand': feat.strand,
             'type': feat.featuretype}
            for feat in db.children(mrna.id,
                                    featuretype=CHILD_FEATURE_TYPES,
                                    order_by='start')
        ]
        # Order by interval start, breaking ties by interval end.
        annotations.sort(key=lambda ann: (ann['coord'].lower, ann['coord'].upper))
        transcripts[mrna.id] = annotations
    gene_hierarchy_dict[gene.id] = transcripts

In [6]:
# Check possible overlaps between features of different types within a transcript.
#
# intersections_counter[a][b] counts the pairs in which a feature of type `a`
# overlaps a feature of type `b` that comes later in the transcript's interval
# dictionary. Overlaps between two features of the same type are not counted.
# NOTE(review): transcript_interval_dict comes from the star import; it appears
# to map transcript id -> {interval: annotation}. If so, features sharing
# identical coordinates would collapse into a single key - confirm.
feature_types = ['exon', 'intron', 'CDS', 'five_prime_UTR', 'three_prime_UTR']
inters_type_counter = {ftype: 0 for ftype in feature_types}
intersections_counter = {ftype: dict(inters_type_counter) for ftype in feature_types}

for gene_id, gene_dict in gene_hierarchy_dict.items():
    for transcript_id, transcript_dict in transcript_interval_dict(gene_dict).items():
        entries = list(transcript_dict.items())
        for pos, (interval, annot) in enumerate(entries):
            # Compare each feature only with the ones after it, so every
            # unordered pair is visited exactly once.
            for later_interval, later_annot in entries[pos + 1:]:
                if not interval.overlaps(later_interval):
                    continue
                if annot['type'] != later_annot['type']:
                    intersections_counter[annot['type']][later_annot['type']] += 1

In [7]:
# Print each feature type followed by its per-type overlap counts and a
# spacer line made of three blanks.
for feature_type, overlap_counts in intersections_counter.items():
    print(feature_type, overlap_counts, '   ', sep='\n')

exon
{'exon': 0, 'intron': 3, 'CDS': 33, 'five_prime_UTR': 3, 'three_prime_UTR': 0}
   
intron
{'exon': 14, 'intron': 0, 'CDS': 0, 'five_prime_UTR': 0, 'three_prime_UTR': 0}
   
CDS
{'exon': 6, 'intron': 0, 'CDS': 0, 'five_prime_UTR': 0, 'three_prime_UTR': 0}
   
five_prime_UTR
{'exon': 19, 'intron': 0, 'CDS': 0, 'five_prime_UTR': 0, 'three_prime_UTR': 0}
   
three_prime_UTR
{'exon': 0, 'intron': 0, 'CDS': 0, 'five_prime_UTR': 0, 'three_prime_UTR': 0}
   
