In [1]:
%load_ext autoreload
%autoreload 2
from fun import *

![image.png](attachment:image.png)

## Simulate chromosome with exon duplications to compare tool outcomes

In [2]:
# One-off step, kept for reference: presumably builds the gffutils database
# from the Ensembl GFF3 annotation of chromosome 1 (helper comes from `fun`).
# Uncomment only when the .db file below does not exist yet.
# create_database('/home/msarrias/data/Homo_sapiens.GRCh38.105.chromosome.1.gff3',
#                 '/home/msarrias/dbs/homo_1.db')

# Open the existing annotation database with the gffutils library.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
db_filename = '/home/msarrias/dbs/homo_1.db'
db = gffutils.FeatureDB(db_filename, keep_order=True)


We will work with human chromosome 1 (GRCh38 assembly).

In [3]:
# Load the chromosome 1 sequence and index it by record id. For every record we
# keep both the forward ('+') strand and its reverse complement ('-') as plain
# strings. The file is opened in a `with` block so the handle is closed as soon
# as the records have been consumed (the original left it open).
with open('/home/msarrias/data/Homo_sapiens.GRCh38.dna.chromosome.1.fa') as fasta_handle:
    fasta_GRCh38_seq = {
        record.id: {'+': str(record.seq),
                    '-': str(record.seq.reverse_complement())}
        for record in SeqIO.parse(fasta_handle, 'fasta')
    }

In [4]:
# Build the per-gene annotation hierarchy from the gffutils database.
# It is indexed by gene id (e.g. 'gene:ENSG...') in the cells that follow.
gene_hierarchy_dict = generate_gene_hierarchy_dict(db)

### Compute regions of coding exons:

Let's start with an example:

In [5]:
# Example gene and one of its transcripts, used to illustrate the computation.
gene_ex_id = 'gene:ENSG00000186092'
gene_ex_transcript = 'transcript:ENST00000641515'
gene_example = gene_hierarchy_dict[gene_ex_id]
# Feature intervals of the example gene, looked up by transcript id below.
trans_interval_dict = transcript_interval_dict(gene_example)
# NOTE(review): overlaping_dict is not used anywhere else in the visible
# notebook — confirm it is still needed.
overlaping_dict = get_overlaping_dict(trans_interval_dict[gene_ex_transcript])
# Intervals of the same transcript after the coding-exon computation.
example_coding_exons = get_coords_with_coding_exons(trans_interval_dict[gene_ex_transcript])

We will use the intervals of the UTR and CDS features to compute the coding-exon regions of a gene:

We have:

In [6]:
# Input: feature intervals of the example transcript, keyed by (start, end).
trans_interval_dict[gene_ex_transcript]

{(65419,65433): {'id': 'five_prime_UTR_1', 'type': 'five_prime_UTR'},
 (65434,65519): {'id': 'intron_1', 'type': 'intron'},
 (65520,65564): {'id': 'five_prime_UTR_2', 'type': 'five_prime_UTR'},
 (65520,65573): {'id': 'exon_39', 'type': 'exon'},
 (65565,65573): {'id': 'CDS:ENSP00000493376', 'type': 'CDS'},
 (65574,69036): {'id': 'intron_2', 'type': 'intron'},
 (69037,70008): {'id': 'CDS:ENSP00000493376_1', 'type': 'CDS'},
 (69037,71585): {'id': 'exon_40', 'type': 'exon'},
 (70009,71585): {'id': 'three_prime_UTR_1', 'type': 'three_prime_UTR'}}

We want:

In [7]:
# Output of get_coords_with_coding_exons for the example transcript.
example_coding_exons

{(65419,65433): {'id': 'five_prime_UTR_1', 'type': 'five_prime_UTR'},
 (65434,65519): {'id': 'intron_1', 'type': 'intron'},
 (65520,65564): {'id': 'five_prime_UTR_2', 'type': 'five_prime_UTR'},
 (65565,65573): {'id': 'exon_39', 'type': 'coding_exon'},
 (65574,69036): {'id': 'intron_2', 'type': 'intron'},
 (69037,70008): {'id': 'exon_40', 'type': 'coding_exon'},
 (70009,71585): {'id': 'three_prime_UTR_1', 'type': 'three_prime_UTR'}}

In [8]:
# Apply the coding-exon computation to every gene in the hierarchy.
# NOTE(review): the helper returns two dicts; only gene_hierarchy_dict_with_ce
# is used by the simulation cells below — confirm the role of gene_transcript_dict.
gene_transcript_dict, gene_hierarchy_dict_with_ce = get_gene_hierarchy_dict_with_coding_exons(gene_hierarchy_dict)

In [9]:
# with open('../files/gene_transcript_breakdown_dict_with_is.pkl', 'wb') as handle:
#     pickle.dump(gene_transcript_dict, handle)

#### Get a sample of genes with 3 coding exons:

In [10]:
# Fix the seed so that the same 20 genes are drawn on every run.
random.seed(10)

# Candidate genes: on the + strand, with a single transcript, fewer than 10
# annotated features (visually easier to validate) and exactly 3 coding exons.
sample_genes_3coding_exons = {}
for gene_id, gene_hierarchy_dict_ in gene_hierarchy_dict_with_ce.items():
    if db[gene_id].strand != '+' or len(gene_hierarchy_dict_) != 1:
        continue
    # the gene has a single transcript: take its feature dict
    trapt_hier_dict = next(iter(gene_hierarchy_dict_.values()))
    if len(trapt_hier_dict) >= 10:
        continue
    n_coding_exons = sum(1 for annot in trapt_hier_dict.values()
                         if annot['type'] == 'coding_exon')
    # we aim to simulate exon duplications on genes with 3 coding exons
    if n_coding_exons == 3:
        # A single deep copy is enough to decouple the sample from
        # gene_hierarchy_dict_with_ce; the original additionally deep-copied the
        # whole hierarchy and every inspected transcript.
        sample_genes_3coding_exons[gene_id] = copy.deepcopy(trapt_hier_dict)

# Draw 20 genes; random.sample raises ValueError if fewer candidates exist.
sample_20 = {key: sample_genes_3coding_exons[key]
             for key in random.sample(list(sample_genes_3coding_exons), 20)}

### Insert coding exon duplication before the original coding exon

In [11]:
# Simulate the duplications on the sampled genes, placing each copy 'before'
# the original exon.
# NOTE(review): the first argument (2) presumably selects the 2nd coding exon,
# as stated in the FASTA description written later — confirm against `fun`.
simulated_genes = simulate_genes_with_coding_exons_duplications(2, 'before', sample_20,
                                                                gene_hierarchy_dict_with_ce,
                                                                fasta_GRCh38_seq['1']['+'], db)

# with open('../files/simulated_genes_breakdown_dict_with_is.pkl', 'wb') as handle:
#     pickle.dump(simulated_genes, handle)

In [12]:
# Order the simulated genes by the (lower, upper) bounds of their 'coord'
# interval and store an independent deep copy of every gene entry.
sorted_gene_sample = {
    gene_key: copy.deepcopy(simulated_genes[gene_key])
    for gene_key in sorted(
        simulated_genes,
        key=lambda gene_key: (simulated_genes[gene_key]['coord'].lower,
                              simulated_genes[gene_key]['coord'].upper),
    )
}

In [13]:
# Build the chromosome sequence that contains the simulated duplications.
# len_dup is compared against the length difference in the next cell, so it is
# presumably the total number of inserted bases — TODO confirm.
new_seq, len_dup = generate_new_genome_seq(sorted_gene_sample,
                                           gene_hierarchy_dict_with_ce,
                                           fasta_GRCh38_seq['1']['+'], db)

In [14]:
# Sanity check: removing len_dup from the length of the new sequence must give
# back the length of the original chromosome.
len(new_seq) - len_dup == len(fasta_GRCh38_seq['1']['+'])

True

In [15]:
# For every simulated gene, pair each feature name with its interval and keep
# only the features whose name contains 'exon'.
coding_exons_interv = {
    gene_id: {
        feature_name: feature_interval
        for feature_name, feature_interval in zip(transcr_dict['features_order'],
                                                  transcr_dict['features_intervals'])
        if 'exon' in feature_name
    }
    for gene_id, transcr_dict in sorted_gene_sample.items()
}

In [16]:
# Compute the intervals of the simulated genes in the new chromosome sequence.
# The result has a 'coding_exons_dups' entry per gene (see the check below).
sim_genes_interv_in_chr = get_simulated_new_intervals(fasta_GRCh38_seq['1']['+'],
                                                      new_seq,
                                                      sorted_gene_sample,
                                                      coding_exons_interv)

In [17]:
# Sanity check - intron signals. Introns start with GT and end with AG, so the
# two bases right before the first interval of a duplication entry must read AG
# (end of the upstream intron piece) and the two bases right after it must read
# GT (start of the downstream intron piece).
# NOTE(review): annot[0] is presumably the interval of the inserted copy — confirm.
for gene_id, gene_dict in sim_genes_interv_in_chr.items():
    for id_, annot in gene_dict['coding_exons_dups'].items():
        if len(annot) != 2:
            continue
        dup_start, dup_end = annot[0]
        upstream_tail = new_seq[(dup_start-2):(dup_start)]
        downstream_head = new_seq[dup_end:(dup_end+2)]
        print(gene_id, 'split intron ends with: ', upstream_tail,
              '   ', 'starts with: ', downstream_head)

gene:ENSG00000173673 split intron ends with:  AG     starts with:  GT
gene:ENSG00000116726 split intron ends with:  AG     starts with:  GT
gene:ENSG00000120952 split intron ends with:  AG     starts with:  GT
gene:ENSG00000270601 split intron ends with:  AG     starts with:  GT
gene:ENSG00000237700 split intron ends with:  AG     starts with:  GT
gene:ENSG00000204479 split intron ends with:  AG     starts with:  GT
gene:ENSG00000184454 split intron ends with:  AG     starts with:  GT
gene:ENSG00000116329 split intron ends with:  AG     starts with:  GT
gene:ENSG00000273274 split intron ends with:  AG     starts with:  GT
gene:ENSG00000121905 split intron ends with:  AG     starts with:  GT
gene:ENSG00000044012 split intron ends with:  AG     starts with:  GT
gene:ENSG00000123091 split intron ends with:  AG     starts with:  GT
gene:ENSG00000116157 split intron ends with:  AG     starts with:  GT
gene:ENSG00000182183 split intron ends with:  AG     starts with:  GT
gene:ENSG00000143001

In [18]:
# Persist the chromosome-level coordinates of the simulated duplications.
with open('../files/GRCh38_duplicated_coding_exons_coords_with_is.pkl', 'wb') as handle:
    pickle.dump(sim_genes_interv_in_chr, handle)

# Write the simulated chromosome as a single-record FASTA file.
# Adjacent string literals are concatenated as-is, so every fragment needs its
# own trailing space; the original description read "...coding exoninserted...".
record = SeqRecord(
    Seq(new_seq),
    id='+',
    description="first chromosome in which 20 genes on + strand "
                "have duplications of the 2nd coding exon "
                "inserted in the 1st intron")
with open("GRCh38_with_coding_exon_dup_with_is.fa", "w") as handle:
    SeqIO.write(record, handle, "fasta")

### Insert coding exon duplication after the original coding exon

In [19]:
# Same simulation as the 'before' section, but each copy is placed 'after' the
# original exon.
# NOTE(review): sample_20 was already passed to the 'before' run; if the helper
# mutates its input this run starts from modified data — confirm against `fun`.
simulated_genes_after = simulate_genes_with_coding_exons_duplications(2, 'after', sample_20,
                                                                      gene_hierarchy_dict_with_ce,
                                                                      fasta_GRCh38_seq['1']['+'], db)

# Persist the per-gene breakdown of the 'after' simulation.
with open('../files/simulated_genes_breakdown_dict_with_is_after.pkl', 'wb') as handle:
    pickle.dump(simulated_genes_after, handle)

In [20]:
# Order the 'after' simulation by the (lower, upper) bounds of each gene's
# 'coord' interval and store an independent deep copy of every entry.
# Fix: the values must come from simulated_genes_after; the original copied
# them from simulated_genes (the 'before' simulation), so everything derived
# from this dict silently reproduced the 'before' scenario.
sorted_gene_sample_after = {
    gene_key: copy.deepcopy(simulated_genes_after[gene_key])
    for gene_key in sorted(
        simulated_genes_after,
        key=lambda gene_key: (simulated_genes_after[gene_key]['coord'].lower,
                              simulated_genes_after[gene_key]['coord'].upper),
    )
}

# Build the chromosome sequence that contains the 'after' duplications.
new_seq_after, len_dup_after = generate_new_genome_seq(sorted_gene_sample_after,
                                                       gene_hierarchy_dict_with_ce,
                                                       fasta_GRCh38_seq['1']['+'], db)

In [21]:
# Sanity check: removing len_dup_after from the length of the new sequence must
# give back the length of the original chromosome.
len(new_seq_after) - len_dup_after == len(fasta_GRCh38_seq['1']['+'])

True

In [22]:
# For every gene of the 'after' simulation, pair each feature name with its
# interval and keep only the features whose name contains 'exon'.
coding_exons_interv_after = {
    gene_id: {
        feature_name: feature_interval
        for feature_name, feature_interval in zip(transcr_dict['features_order'],
                                                  transcr_dict['features_intervals'])
        if 'exon' in feature_name
    }
    for gene_id, transcr_dict in sorted_gene_sample_after.items()
}

# Compute the intervals of those genes in the 'after' chromosome sequence.
sim_genes_interv_in_chr_after = get_simulated_new_intervals(
    fasta_GRCh38_seq['1']['+'],
    new_seq_after,
    sorted_gene_sample_after,
    coding_exons_interv_after,
)

In [23]:
# Sanity check - intron signals for the 'after' simulation. Introns start with
# GT and end with AG, so the two bases right before the first interval of a
# duplication entry must read AG and the two bases right after it must read GT.
# Fix: the coordinates in sim_genes_interv_in_chr_after refer to new_seq_after;
# the original sliced new_seq (the 'before' sequence), so the check did not
# test the 'after' sequence at all.
for gene_id, gene_dict in sim_genes_interv_in_chr_after.items():
    for id_, annot in gene_dict['coding_exons_dups'].items():
        if len(annot) == 2:
            s, e = annot[0]
            print(gene_id, 'split intron ends with: ', new_seq_after[(s-2):(s)],
                  '   ', 'starts with: ', new_seq_after[e:(e+2)])

gene:ENSG00000173673 AG     GT
gene:ENSG00000116726 AG     GT
gene:ENSG00000120952 AG     GT
gene:ENSG00000270601 AG     GT
gene:ENSG00000237700 AG     GT
gene:ENSG00000204479 AG     GT
gene:ENSG00000184454 AG     GT
gene:ENSG00000116329 AG     GT
gene:ENSG00000273274 AG     GT
gene:ENSG00000121905 AG     GT
gene:ENSG00000044012 AG     GT
gene:ENSG00000123091 AG     GT
gene:ENSG00000116157 AG     GT
gene:ENSG00000182183 AG     GT
gene:ENSG00000143001 AG     GT
gene:ENSG00000143125 AG     GT
gene:ENSG00000280778 AG     GT
gene:ENSG00000143184 AG     GT
gene:ENSG00000181873 AG     GT
gene:ENSG00000116574 AG     GT


In [24]:
# Persist the chromosome-level coordinates of the 'after' simulation.
# Fix: the original dumped sim_genes_interv_in_chr and wrote new_seq, i.e. the
# 'before' objects, into the '_after' files, so both output sets were identical.
with open('../files/GRCh38_duplicated_coding_exons_coords_with_is_after.pkl', 'wb') as handle:
    pickle.dump(sim_genes_interv_in_chr_after, handle)

# Write the 'after' chromosome as a single-record FASTA file.
# The description now has the missing space between the literals and states
# the actual insertion position instead of repeating the 'before' text.
record = SeqRecord(
    Seq(new_seq_after),
    id='+',
    description="first chromosome in which 20 genes on + strand "
                "have duplications of the 2nd coding exon "
                "inserted after the original exon")
with open("GRCh38_with_coding_exon_dup_with_is_after.fa", "w") as handle:
    SeqIO.write(record, handle, "fasta")

### Let's remove the original coding exon and leave the duplication

## Collect the protein sequences of the genes with duplications

In [20]:
## Retrieve a fasta file with the protein sequences of all genes in human.
#!wget http://ftp.ensembl.org/pub/release-107/fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz
#!gunzip *.gz

In [21]:
# Index the human protein sequences by gene: the space-separated token of the
# FASTA description that contains 'gene:' is the key, and each gene maps
# protein record id -> sequence string. Records without such a token are
# skipped. The file is opened in a `with` block so the handle is closed after
# parsing (the original left it open).
human_protein_seqs = {}
with open("Homo_sapiens.GRCh38.pep.all.fa") as pep_handle:
    for fasta in SeqIO.parse(pep_handle, 'fasta'):
        gene = [field for field in fasta.description.split(' ') if 'gene:' in field]
        if gene:
            human_protein_seqs.setdefault(gene[0], {})[fasta.id] = str(fasta.seq)

In [30]:
# Write the original protein sequences of every simulated gene to a FASTA file
# and record, without version suffixes, which gene each protein belongs to.
proteins_simulated_genes = {}
protein_gene_dict = {}
with open("genes_with_dup_original_prot_seq_.fa", "w") as handle:
    for gene_id in simulated_genes:
        # keys of human_protein_seqs that contain the gene id as a substring
        matching_gene_keys = [gene_key for gene_key in human_protein_seqs
                              if gene_id in gene_key]
        if not matching_gene_keys:
            continue
        matched_gene_key = matching_gene_keys[0]
        gene_proteins = copy.deepcopy(human_protein_seqs[matched_gene_key])
        for prot_id, seq in gene_proteins.items():
            record = SeqRecord(Seq(seq),
                               id=str(prot_id),
                               description=matched_gene_key)
            SeqIO.write(record, handle, "fasta")
            # drop everything after the first '.' from both ids
            protein_gene_dict[prot_id.split('.')[0]] = matched_gene_key.split('.')[0]
        proteins_simulated_genes[matched_gene_key] = gene_proteins

In [33]:
# Persist the protein id -> gene id mapping built in the previous cell.
with open('../files/protein_gene_corresp.pkl', 'wb') as handle:
    pickle.dump(protein_gene_dict, handle)