#### Directories etc

In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import random
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
import random
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from scipy.stats import chi2
from Comparative_Analysis import ORF_Functions as orffn
from Comparative_Analysis import Alignment as alignfn
from random import sample
from Bio.Align.Applications import MuscleCommandline
import subprocess
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Root folder for this analysis: the reciprocal-best-hit CSVs and pickled lookup
# tables are read from here, and one results sub-folder per locus is created here.
project_dir = 'F:/Project_Data/Intergenic_Region_Comparative_Analysis_Downstream'
# Dataset roots for the two comparison sets (judging by the paths: 1 = Mycobacteriaceae,
# 2 = Actinobacteria). seq_dir_1 also holds the reference annotation (genomic.gbff).
seq_dir_1 = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data'
seq_dir_2 = 'F:/Datasets/NCBI_Refseq_Actinobacteria_All_Levels/data'
# Directory listings of the two dataset roots (not referenced again in the cells shown here).
sequence_dirs_1 = util.list_dirs(seq_dir_1)
sequence_dirs_2 = util.list_dirs(seq_dir_2)
# Accession of the reference genome; lookup keys are built as '<tb_species>@<locus_tag>'.
tb_species = 'NC_000962.3' 
# Sub-folder of seq_dir_1 that contains the reference genome's genomic.gbff.
tb_annotation_dirname = 'GCF_000195955.2'
# Parallelism settings: core_numbers is [1, ..., num_cores].
# NOTE(review): neither value is used in the cells shown here - confirm they are still needed.
num_cores = 16
core_numbers = list(range(1, num_cores+1))
# Path to the Muscle executable handed to MuscleCommandline.
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'
# NOTE(review): not referenced in the cells shown here; presumably toggles a full rebuild - confirm.
full_build = False

In [3]:
def reverse_complement(seq_string):
    """Return the reverse complement of a nucleotide sequence as a string.

    A, C, G, T and N are complemented in the usual way.  The ambiguity codes
    S, R, Y, K, M and W are NOT mapped to their IUPAC complements: each one is
    replaced by 'A'.  Any other character (including lower-case letters)
    raises ``KeyError``.
    """
    pairing = {
        'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N',
        # Ambiguity codes are deliberately collapsed to 'A' (note that this includes S).
        'S': 'A', 'R': 'A', 'Y': 'A', 'K': 'A', 'M': 'A', 'W': 'A',
    }
    return ''.join([pairing[base] for base in reversed(seq_string)])

#### Function to generate FASTA file containing downstream intergenic regions in orthologous species and run Muscle / R-scape

In [4]:
def generate_alignment(locus, offset, comparison_set):
    """Align the region just past the end of ``locus`` across orthologous species.

    The reciprocal-best-hit table of the chosen comparison set is restricted to
    hits whose query is the reference locus.  For every remaining species the
    sequence running from 3 nt before the end of the orthologous CDS to
    ``offset`` nt beyond it is extracted (reverse-complemented for CDSs whose
    strand is not 1), written to FASTA and aligned with Muscle.  For comparison
    set 2 the alignment is also converted to Stockholm format and passed to
    R-scape.

    Parameters
    ----------
    locus : str
        Locus tag in the reference genome; combined with ``tb_species`` as
        ``'<tb_species>@<locus>'`` to form the lookup key.
    offset : int
        Number of nucleotides beyond the CDS end to include.
    comparison_set : int
        1 = Mycobacteria (``reciprocal_best_hits.csv``, ``locations_dict_1``,
        ``sequence_dict_1``); any other value selects the Actinobacteria
        tables (``reciprocal_best_hits_2.csv``, ``locations_dict_2``,
        ``sequence_dict_2``).

    Raises
    ------
    KeyError
        If the reference locus has no entry in the selected location table.

    Notes
    -----
    * Relies on the module-level ``tb_species``, ``project_dir``,
      ``muscle_exe`` and the ``locations_dict_*`` / ``sequence_dict_*`` tables.
    * Output is written to ``<project_dir>/<locus>/``.  The alignment files
      are named ``upstream_region_...`` even though the extracted region lies
      after the end of the CDS.
    * Nothing is written unless more than 10 species survive the filters.
    * If Muscle fails, the failure is reported and R-scape is skipped, so it
      never runs on a missing or stale alignment file.
    """
    target_locus = tb_species + '@' + locus
    results_dir = project_dir + '/' + locus
    set_tag = str(comparison_set)

    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    if comparison_set == 1:     # 1 = Mycobacteria, 2 = Actinobacteria
        rbh = pd.read_csv(project_dir + '/reciprocal_best_hits.csv')
        locations_dict = locations_dict_1
        sequence_dict = sequence_dict_1
    else:
        rbh = pd.read_csv(project_dir + '/reciprocal_best_hits_2.csv')
        locations_dict = locations_dict_2
        sequence_dict = sequence_dict_2

    # Fail early when the reference locus has no entry in the location table.
    if target_locus not in locations_dict:
        raise KeyError(target_locus)

    # .copy() so the column assignments below act on an independent frame.
    rbh = rbh[rbh['query_ref'] == target_locus].copy()
    rbh['target_loc'] = rbh['target_ref'].map(locations_dict)
    rbh['cds_count'] = rbh.groupby('target_species_name')['query_ref'].transform('size')
    rbh['min_pct_id'] = rbh.groupby('target_species_name')['percent_identical_matches'].transform('min')

    # Keep only species that contribute exactly one hit for this locus.
    rbh = rbh[rbh['cds_count'] == 1].copy()
    for i, r in rbh.iterrows():
        (start, stop, strand) = r['target_loc']
        rbh.at[i, 'start'] = start
        rbh.at[i, 'stop'] = stop
        rbh.at[i, 'strand'] = strand

    # Drop weak hits, then require enough species for a meaningful alignment.
    rbh = rbh[rbh['min_pct_id'] >= 60]
    if len(rbh) <= 10:
        return

    intergenic_regions = []
    query_regions = []
    for i, r in rbh.iterrows():
        species_label = r['target_species_name'].replace(" ", "_")
        target_species_accession = r['target_species']
        coords = r['target_loc']    # (start, stop, strand)
        genome = sequence_dict[target_species_accession]
        if coords[2] == 1:
            # Strand 1: last 3 nt of the CDS plus `offset` nt after its stop coordinate.
            intergenic_region = (coords[1] - 3, coords[1] + offset)
            intergenic_sequence = genome[intergenic_region[0]: intergenic_region[1]]
        else:
            # Other strand: the mirror-image window before the start coordinate,
            # reverse-complemented so every sequence reads in the CDS direction.
            intergenic_region = (coords[0] - offset, coords[0] + 3)
            intergenic_sequence = reverse_complement(genome[intergenic_region[0]: intergenic_region[1]])

        intergenic_regions.append([species_label, intergenic_sequence])
        if target_species_accession == tb_species:
            query_regions.append([species_label, intergenic_sequence])

    alignment_name = 'upstream_region_' + locus + '_alignment_' + set_tag
    intergenic_fasta = results_dir + '/intergenic_regions_' + set_tag + '.fasta'
    query_fasta = results_dir + '/query_regions_' + set_tag + '.fasta'
    alignment_fasta = results_dir + '/' + alignment_name + '.fasta'

    util.produce_fasta_file(intergenic_regions, intergenic_fasta)
    util.produce_fasta_file(query_regions, query_fasta)

    cline = MuscleCommandline(muscle_exe, input=intergenic_fasta, out=alignment_fasta)
    try:
        cline()
    except Exception as e:
        # Best effort: report the failure and skip the downstream R-scape step
        # rather than aborting the whole batch.
        print('Muscle failed for ' + locus + ' (comparison set ' + set_tag + '): ' + str(e))
        return

    # NOTE: an alternative alignment with the columns gapped in the reference
    # sequence removed was sketched here at one point; it is not implemented.

    if comparison_set == 2:
        blastfn.convert_fasta_to_stockholm(results_dir, alignment_name + '.fasta', alignment_name + '.sto')
        blastfn.run_rscape(results_dir, alignment_name + '.sto', 'rscape_output_' + set_tag)


#### Run for intergenic regions

In [5]:
# Load the pickled lookup tables for both comparison sets from project_dir.
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
with open(f'{project_dir}/locations_dict_1.pkl', 'rb') as f:
    locations_dict_1 = pickle.load(f)

with open(f'{project_dir}/locations_dict_2.pkl', 'rb') as f:
    locations_dict_2 = pickle.load(f)

with open(f'{project_dir}/sequence_dict_1.pkl', 'rb') as f:
    sequence_dict_1 = pickle.load(f)

with open(f'{project_dir}/sequence_dict_2.pkl', 'rb') as f:
    sequence_dict_2 = pickle.load(f)

In [6]:
# Collect the locus tag of every CDS feature in the reference annotation,
# in the order the features appear in the GenBank file.
tb_loci = []
tb_genbank_path = seq_dir_1 + '/' + tb_annotation_dirname + '/genomic.gbff'
for genome_record in SeqIO.parse(tb_genbank_path, "genbank"):
    for feature in genome_record.features:
        if feature.type != 'CDS':
            continue
        # A CDS without a locus_tag qualifier raises TypeError here (.get returns None).
        tb_loci.append(feature.qualifiers.get("locus_tag")[0])

In [9]:
# Show the loci at indices 2340 and 2341; the driver loop below skips every index <= 2341.
tb_loci[2340:2342]

['Rv2353c', 'Rv2357c']

In [10]:
# Loci with index <= resume_after_index are skipped.
# NOTE(review): presumably these were handled by an earlier, interrupted run - confirm
# before reusing this cell; set to -1 to process everything.
resume_after_index = 2341
# The last locus has no following CDS to measure the gap against, so it is always skipped.
last_index = len(tb_loci) - 1
# Regions shorter than this many nt are not worth aligning.
min_offset = 30

# tqdm wraps the list itself (not the enumerate object) so it knows the total
# and can show a percentage and ETA instead of a bare iteration counter.
for n, locus in enumerate(tqdm(tb_loci)):
    if n >= last_index or n <= resume_after_index:
        continue
    target_locus = tb_species + '@' + locus
    location = locations_dict_1[target_locus]    # (start, stop, strand)
    if location[2] == 1:
        # Strand 1: gap between this CDS's stop and the start of the next CDS in the list.
        downstream_cds_start = locations_dict_1[tb_species + '@' + tb_loci[n + 1]][0]
        offset = downstream_cds_start - location[1]
    else:
        if n == 0:
            # No preceding CDS: tb_loci[-1] would silently wrap to the last locus.
            continue
        # Other strand: gap between the end of the previous CDS in the list and this CDS's start.
        downstream_cds_start = locations_dict_1[tb_species + '@' + tb_loci[n - 1]][1]
        offset = location[0] - downstream_cds_start
    if offset < min_offset:
        continue
    # The offset is measured on the set-1 coordinates and reused for both comparison sets.
    generate_alignment(locus, offset, 1)
    generate_alignment(locus, offset, 2)

0it [00:00, ?it/s]
100%|██████████| 214/214 [00:00<00:00, 214219.82it/s]

100%|██████████| 1/1 [00:00<?, ?it/s][A

100%|██████████| 491/491 [00:00<00:00, 47798.61it/s]

100%|██████████| 1/1 [00:00<?, ?it/s][A
2344it [00:55, 42.27it/s]
100%|██████████| 102/102 [00:00<00:00, 104729.26it/s]

100%|██████████| 1/1 [00:00<00:00, 1009.95it/s]

100%|██████████| 61/61 [00:00<00:00, 63740.05it/s]

100%|██████████| 1/1 [00:00<?, ?it/s][A
2345it [01:10, 30.58it/s]
100%|██████████| 133/133 [00:00<?, ?it/s]A

100%|██████████| 1/1 [00:00<?, ?it/s][A

100%|██████████| 178/178 [00:00<?, ?it/s]A

100%|██████████| 1/1 [00:00<?, ?it/s][A
2356it [01:32, 18.35it/s]
100%|██████████| 197/197 [00:00<00:00, 195522.45it/s]

100%|██████████| 1/1 [00:00<?, ?it/s][A

100%|██████████| 129/129 [00:00<00:00, 126063.66it/s]

100%|██████████| 1/1 [00:00<00:00, 978.83it/s]
2357it [01:49, 11.83it/s]
100%|██████████| 215/215 [00:00<00:00, 215169.50it/s]

100%|██████████| 1/1 [00:00<?, ?it/s][A

100%|██████████| 393/

#### Extract covariation information to identify potentially significant regions

In [None]:
# Scan every per-locus results folder for the R-scape covariation output of
# comparison set 2 and list the loci whose covarying pairs are jointly significant.
seq_ids = util.list_dirs(project_dir)
out_list = []
for seq_id in seq_ids:
    cov_path = project_dir + '/' + str(seq_id) + '/rscape_output_2.cov'
    if not os.path.exists(cov_path):
        continue

    e_values = []
    with open(cov_path, 'r') as f:
        for l in f:
            if '#' in l:    # header / comment line
                continue
            a = l.split()
            # NOTE(review): assumes fields 1 and 2 are the two alignment positions of a
            # pair and field 4 its E-value - confirm against the R-scape .cov format.
            # Only pairs more than 10 positions apart are counted.
            if len(a) > 6 and abs(int(a[2]) - int(a[1])) > 10:
                e_values.append(float(a[4]))

    num_pairs = len(e_values)
    if e_values:
        # Fisher-style combination: -2 * sum(ln E) referred to a chi-square with 2k
        # degrees of freedom.  chi2.sf is used instead of 1 - chi2.cdf because the
        # subtraction rounds very small tail probabilities to 0.
        # NOTE: an E-value of exactly 0 makes math.log raise ValueError.
        tot = sum(math.log(x) for x in e_values)
        combined_e_value = chi2.sf(-2 * tot, 2 * num_pairs)
    else:
        combined_e_value = 999    # sentinel: no qualifying pairs

    if combined_e_value < 1e-5 and num_pairs > 10:
        out_list.append((seq_id, num_pairs, combined_e_value))
out_list.sort(key=lambda x: x[0])
out_list

In [None]:
# Read the reference genome sequence and translate one hand-picked coordinate window.
reference_species = 'GCF_000195955.2'
refseq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data'
reference_genbank = f'{refseq_dir}/{reference_species}/genomic.gbff'
for record in SeqIO.parse(reference_genbank, "genbank"):
    # If the file holds several records, the last one wins.
    full_sequence = str(record.seq)
region_start, region_end = 4400215, 4400777
tr = util.Translator()
tr.translate_sequence(full_sequence[region_start:region_end], 1, 0)