### Simulate genes with exon duplications to compare tool outcomes

In [1]:
%load_ext autoreload
%autoreload 2
from fun import *

In [2]:
# Open an existing gffutils annotation database and derive the annotation
# dictionary that the following cells work from.
# NOTE(review): absolute, user-specific path; this cell only runs on a machine
# where this file exists.
db_filename = '/home/msarrias/dbs/homo_1.db'
db = gffutils.FeatureDB(db_filename, keep_order=True)
# get_annotations_dict is not defined in this notebook (presumably provided by
# `from fun import *`); its return structure is not visible here.
annot_dict = get_annotations_dict(db)

In [3]:
# Provenance: the pickle loaded at the bottom of this cell was presumably
# produced once with the two calls below (same output path); they are kept
# commented out so the file can be regenerated if needed.
# parent_dic, parent_child_dic = create_parent_child_dic(annot_dict)
# dump_parent_child_dict(parent_dic, parent_child_dic,'../files/parent_child_dic_human_01.pkl')


# gene (parent) -> mRNA (child) annotations
parent_dic_gene, parent_child_dic_gene_mRNA = create_parent_child_dic(
    annot_dict,
    parent_type=['gene'],
    child_type=['mRNA'],
)

# Pre-computed parent/child annotations read back from disk; later cells use
# the 'parent_child_dic' and 'parent_dic' entries of the loaded object.
# NOTE(review): pickle.load can execute arbitrary code - only load pickles
# that were generated locally.
with open('../files/parent_child_dic_human_01.pkl', 'rb') as pkl_file:
    merged_dict = pickle.load(pkl_file)

In [4]:
# Per-transcript coverage information derived from the pickled annotations.
# NOTE(review): the original comment described this step as "get genes intron
# regions"; get_exon_overlaps is defined outside this notebook - confirm what
# each of the two returned objects holds.
mRNA_parent_dic, parents_exon_coverage = get_exon_overlaps(
    merged_dict['parent_child_dic'],
    merged_dict['parent_dic'],
)

# gene -> {transcript -> coverage entry}.
# Each coverage entry is a *shallow* copy of parents_exon_coverage[transcript],
# so nested objects are still shared with parents_exon_coverage.
gene_hierarchy_dict = {
    gene_id: {
        transcript_id: parents_exon_coverage[transcript_id].copy()
        for transcript_id, _ in gene_entry['mRNA'].items()
    }
    for gene_id, gene_entry in parent_child_dic_gene_mRNA.items()
}

In [5]:
# Parse the chromosome FASTA into {record id: sequence string}.
# The file is opened in a `with` block so the handle is closed as soon as
# parsing is done (previously the handle passed to SeqIO.parse was never closed).
with open('/home/msarrias/data/Homo_sapiens.GRCh38.dna.chromosome.1.fa') as fasta_handle:
    fasta_GRCh38_seq = {
        record.id: str(record.seq)
        for record in SeqIO.parse(fasta_handle, 'fasta')
    }

# Every coordinate below is resolved against the record with id '1'.
chrom_seq = fasta_GRCh38_seq['1']


def _subseq(coord):
    """Return the slice of `chrom_seq` selected by a (start, end) pair."""
    start, end = coord
    # NOTE(review): this yields end - start characters, which agrees with the
    # 'len' value shown in the example output below (1481 for
    # coord (11857464, 11858945)). If the coordinates are 1-based inclusive
    # (as in GFF), the last base is dropped - TODO confirm the convention.
    return chrom_seq[start - 1:end - 1]


# Sample: genes that have exactly one transcript, and that transcript has
# exactly 3 exons. Each selected transcript gets a 'seq' entry on its parent,
# on every exon and on every intron.
sample_genes_3exons = {}
for gene_id, transcripts in gene_hierarchy_dict.items():
    # single-transcript genes only, to avoid choosing between transcripts
    if len(transcripts) != 1:
        continue
    (transcript_annot,) = transcripts.values()
    if len(transcript_annot['exons']) != 3:
        continue

    transcript_annot['parent']['seq'] = _subseq(transcript_annot['parent']['coord'])
    for exon_annot in transcript_annot['exons'].values():
        exon_annot['seq'] = _subseq(exon_annot['coord'])
    for intron_annot in transcript_annot['introns'].values():
        intron_annot['seq'] = _subseq(intron_annot['coord'])

    # NOTE(review): .copy() is shallow, so the 'seq' entries added above are
    # also visible through gene_hierarchy_dict. Kept as in the original; use a
    # deep copy if the sample must be independent of gene_hierarchy_dict.
    sample_genes_3exons[gene_id] = transcripts.copy()

In [6]:
# number of sampled genes
len(sample_genes_3exons)

53

In [7]:
# gene ids selected for the sample (displayed as the cell output)
sample_genes_3exons.keys()

dict_keys(['gene:ENSG00000186092', 'gene:ENSG00000197921', 'gene:ENSG00000120937', 'gene:ENSG00000116726', 'gene:ENSG00000279804', 'gene:ENSG00000204480', 'gene:ENSG00000204479', 'gene:ENSG00000204478', 'gene:ENSG00000158748', 'gene:ENSG00000117318', 'gene:ENSG00000116329', 'gene:ENSG00000243749', 'gene:ENSG00000044012', 'gene:ENSG00000197273', 'gene:ENSG00000159596', 'gene:ENSG00000123091', 'gene:ENSG00000116157', 'gene:ENSG00000182183', 'gene:ENSG00000162377', 'gene:ENSG00000143001', 'gene:ENSG00000154007', 'gene:ENSG00000143032', 'gene:ENSG00000143125', 'gene:ENSG00000134200', 'gene:ENSG00000143067', 'gene:ENSG00000163191', 'gene:ENSG00000182898', 'gene:ENSG00000159450', 'gene:ENSG00000215853', 'gene:ENSG00000197915', 'gene:ENSG00000143631', 'gene:ENSG00000143520', 'gene:ENSG00000143536', 'gene:ENSG00000163220', 'gene:ENSG00000163221', 'gene:ENSG00000197364', 'gene:ENSG00000186440', 'gene:ENSG00000162728', 'gene:ENSG00000248485', 'gene:ENSG00000143185', 'gene:ENSG00000143184', 'gene

In [8]:
# Example: the full entry (parent, exons, introns and their sequences) stored
# for one sampled gene.
sample_genes_3exons['gene:ENSG00000120937']

{'transcript:ENST00000376468': {'parent': {'source': 'ensembl_havana',
   'len': 1481,
   'coord': (11857464, 11858945),
   'seq': 'TAAAGCTTATAATGTTGACTTTATTTCACCGTGGAAATTTTGTGCTCAAAGGTAAGAAACCATCTTATATAAAACAATCAAATAAATACATAAATACATTAAAAAAATGAGTCACTTCAAAGGCGGCCACAGGGTTGAGGAAAAAGCCCCTTGTGGAATCAGAAGCAGGTGTCTGCAGCCAGGACTTCCTCTTAATGCCGCCTCAGCACTGTCAGGGAAAGAGAGAGGGTGATGATGGTTAGGGTGGGAGATGGAGGCAGGGGCTGAGCTTACCTCATCGTGTGCCACCCACCACCCTGTTAGTCATCAGACGTTTGAGGCTTAATGCAACTCTCTGAGCCTCAGTTTCCTCATCTGTAAATTGGGGATTATCATTGCTCTGGTGATCCTGCTCTCTGCCTTTGAGACTATAGACAGTTTAGAGGAAACCAGGAGGAAATGTTTGGTTCTCTTTCTGCACCACTGGGGGGCTGCCAAATGATAAACAGACCCCCAAAGGAGATTCTGCCCCTTTGATAAGAAGTAGGAGGTGGGGAGAAGGTATTGTGGGCATGGTAATGAAATAAGCCCACATTTACTAATTCCACAAAGGCACCCCTTGGCCCTGAAGGCTGTTAACAAGAGGAAGCGATGTCCAGGTGACCTTTTCTCAAAGAGTGTGGTTCCCAGAGACAACAAACCCCAAAGTGACTCTAACAGTGTCACACACTGGAATGGGGGAAGGCGGCCGGGGTGGCAGGGGGTGCTTACCTTTGCAGCCCAGGCCACTGGAGGAGCTGATCCGGTCCATCTTCCTCCCAAAGCAGCCAGACCCTTGCACCATCTTGGGGCTTCGTGGTGCCCGCAGGGTGTAGAGGACCATTTTGCG

In [None]:
# simulate genes with exon duplications