In [1]:
%load_ext autoreload
%autoreload 2
from simulate_genome import *
from alignment_tools_comparison import *
from global_var import *

In [2]:
# Locations of the Pieris napi input files. Nothing is opened or read in this
# cell: it only builds path values from the directory constants DB_DIRECT,
# PKL_DIRECT and FASTA_DIRECT (brought in by the star imports above).
# NOTE(review): the paths are built with plain `+`, so each directory constant
# presumably already ends with a path separator -- confirm.

# annotation database (the original note says it is read with the gffutils library)
db_direct = DB_DIRECT + 'pieris_napi.db'
# pickle file holding the gene hierarchy dictionary (going by the file name)
gene_hierarchy_direct = PKL_DIRECT + 'gene_hierarchy_dict_pieris_napi.pkl'
# soft-masked genome FASTA (going by the file name)
genome_direct = FASTA_DIRECT + 'pieris_napi/Pieris_napi-GCA_905231885.1-softmasked.fa'
# protein FASTA; not referenced by any other cell visible in this part of the notebook
prot_direct = FASTA_DIRECT + "pieris_napi/data/Pnap.GCA_905231885.1-softmasked.brakerProt_rename_agat.prot.fa"

In [3]:
# Build the genome-simulation object from the annotation database path, the
# gene hierarchy pickle path and the genome FASTA path.
# NOTE(review): `seed=20` presumably fixes the random state used for sampling
# so that re-runs are reproducible -- confirm in `genome_constructor`.
Pieris_napi_sim_genome = genome_constructor(
    db_direct,
    gene_hierarchy_direct,
    genome_direct,
    seed=20,
)

## Simulate a genome with genes containing exon duplicates

In [4]:
# Candidate genes selected by the three filters passed to `collect_genes`
# (coding-exon count, maximum exon length, minimum intron length).
genes_sample = Pieris_napi_sim_genome.collect_genes(
    n_coding_exons=3,
    max_exon_len=300,
    min_intron_len=100,
)

# Sub-sample of the candidates.
# NOTE(review): the second argument (20) is presumably the sample size -- confirm.
genes_sample_20 = Pieris_napi_sim_genome.get_sample(genes_sample, 20)


def _gene_span(gene_id):
    """Return the closed interval [start, end] of `gene_id` from the annotation db.

    The feature is fetched from `Pieris_napi_sim_genome.db` a single time and
    both coordinates are read from that one object, rather than performing a
    separate database lookup for `start` and for `end`.
    """
    feature = Pieris_napi_sim_genome.db[gene_id]
    return P.closed(feature.start, feature.end)


# Coordinates of every sampled gene as recorded in the annotation database,
# keyed by gene ID.
genes_loc = {gene_id: _gene_span(gene_id) for gene_id in genes_sample_20}

# Gene IDs in the insertion order of `genes_loc`.
genes_order = list(genes_loc)

### We duplicate the 2nd exon of each sampled gene and insert the copy before/after the original exon

In [5]:
# --- Duplicate inserted to the left of ('before') the original exon ---

# Simulated genes: exon 2 is duplicated and placed in the 'before' position.
simulated_sample_20_before = Pieris_napi_sim_genome.simulate_genes_with_exons_dup(
    2,
    'before',
    genes_sample_20,
)

# Simulated genome built from those genes.
# NOTE(review): 'genome_test.fa' is presumably the FASTA file that gets written;
# the 'after' run uses 'genome_test_after.fa', so the two names do not follow
# the same pattern -- confirm whether that is intended.
simulated_genome_before = Pieris_napi_sim_genome.simulate_genome(
    simulated_sample_20_before,
    'before',
    'genome_test.fa',
)

sanity check: passed


In [6]:
# Coding-exon intervals for the genes simulated with the 'before' insertion.
ce_intervals = Pieris_napi_sim_genome.get_coding_exons_interv(
    simulated_sample_20_before
)

# Exon intervals expressed in the simulated 'before' genome.
new_exons_intervals_before = Pieris_napi_sim_genome.get_new_exon_intervals(
    simulated_genome_before,
    ce_intervals,
    simulated_sample_20_before,
)

# Flatten the two-level mapping (outer key: chromosome, inner key: ID) into
# {ID: first element of that entry's 'seq' value}.
# NOTE(review): that first element is presumably the gene's interval in the
# simulated genome -- confirm against `get_new_exon_intervals`.
genes_loc_before = {
    gene_id: entry['seq'][0]
    for per_chrom in new_exons_intervals_before.values()
    for gene_id, entry in per_chrom.items()
}

In [7]:
# Intron-signal sanity check on the simulated 'before' genome.
# NOTE(review): the meaning of the leading 20 is not visible here (it matches
# both the sample size and the seed used above) -- confirm.
Pieris_napi_sim_genome.intron_signal_sanity_check(
    20,
    'before',
    simulated_genome_before,
    new_exons_intervals_before,
)

sanity check: passed


In [8]:
# --- Duplicate inserted to the right of ('after') the original exon ---

# Simulated genes: exon 2 is duplicated and placed in the 'after' position.
simulated_sample_20_after = Pieris_napi_sim_genome.simulate_genes_with_exons_dup(
    2,
    'after',
    genes_sample_20,
)

# Simulated genome built from those genes.
# NOTE(review): 'genome_test_after.fa' is presumably the FASTA file that gets
# written -- confirm in `simulate_genome`.
simulated_genome_after = Pieris_napi_sim_genome.simulate_genome(
    simulated_sample_20_after,
    'after',
    'genome_test_after.fa',
)

sanity check: passed


In [9]:
# Coding-exon intervals for the genes simulated with the 'after' insertion.
ce_intervals_after = Pieris_napi_sim_genome.get_coding_exons_interv(
    simulated_sample_20_after
)

# Exon intervals expressed in the simulated 'after' genome.
new_exons_intervals_after = Pieris_napi_sim_genome.get_new_exon_intervals(
    simulated_genome_after,
    ce_intervals_after,
    simulated_sample_20_after,
)

# Flatten the two-level mapping (outer key: chromosome, inner key: ID) into
# {ID: first element of that entry's 'seq' value}.
# NOTE(review): that first element is presumably the gene's interval in the
# simulated genome -- confirm against `get_new_exon_intervals`.
genes_loc_after = {
    gene_id: entry['seq'][0]
    for per_chrom in new_exons_intervals_after.values()
    for gene_id, entry in per_chrom.items()
}

In [10]:
# Intron-signal sanity check on the simulated 'after' genome.
# NOTE(review): the meaning of the leading 20 is not visible here (it matches
# both the sample size and the seed used above) -- confirm.
Pieris_napi_sim_genome.intron_signal_sanity_check(
    20,
    'after',
    simulated_genome_after,
    new_exons_intervals_after,
)

sanity check: passed


In [11]:
# Hand the sampled genes, their original coordinates and the 'before'/'after'
# simulation results to `dump_pkl_file` as one dictionary, together with the
# target path under PKL_DIRECT. Each key is the name of the notebook variable
# it stores.
Pieris_napi_sim_genome.dump_pkl_file(
    PKL_DIRECT + 'gene_sample_20.pkl',
    dict(
        genes_sample_20=genes_sample_20,
        genes_loc=genes_loc,
        genes_order=genes_order,
        genes_loc_before=genes_loc_before,
        genes_loc_after=genes_loc_after,
        simulated_sample_20_before=simulated_sample_20_before,
        simulated_sample_20_after=simulated_sample_20_after,
        new_exons_intervals_before=new_exons_intervals_before,
        new_exons_intervals_after=new_exons_intervals_after,
    ),
)