#### Import packages, set directories and parameters

In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
import math
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from scipy.stats import chi2
from Comparative_Analysis import Alignment as alignfn
from Bio.Align.Applications import MuscleCommandline
import subprocess
pd.options.mode.chained_assignment = None  # default='warn'

In [101]:
# Output folder for this project and the two RefSeq annotation downloads.
project_dir = 'F:/Project_Data/mabR_Project'
mycobacteria_seq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_Complete_Annot_20230511/data'
actinomycetes_seq_dir = 'F:/Datasets/NCBI_Refseq_Actinomycetes_Complete_Annot_20230511/data'

# Reference genome: accession.version used in '<accession>@<locus>' keys,
# and the name of its annotation directory.
tb_species = 'NC_000962.3'
tb_annotation_dirname = 'GCF_000195955.2'

# The directory lists are split into num_cores chunks, numbered from 1.
num_cores = 16
core_numbers = [*range(1, num_cores + 1)]

# Executable handed to MuscleCommandline.
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'

# True: rebuild every dataset from the raw annotations.
# False: later cells load the previously pickled / CSV results instead.
full_build = False

#### Only use assemblies whose directory contains a genomic.gbff annotation file holding exactly one sequence record

In [9]:
def _dirs_with_single_record(seq_dir):
    """Return the sub-directories of seq_dir whose genomic.gbff holds exactly one record."""
    single_record_dirs = []
    for dirname in tqdm(util.list_dirs(seq_dir)):
        gbff_path = seq_dir + '/' + dirname + '/genomic.gbff'
        if not os.path.exists(gbff_path):
            continue
        # Parsing stops at the second record: that is enough to reject the file.
        record_count = 0
        for _ in SeqIO.parse(gbff_path, "genbank"):
            record_count += 1
            if record_count > 1:
                break
        if record_count == 1:
            single_record_dirs.append(dirname)
    return single_record_dirs


if full_build == True:
    mycobacteria_dirs = _dirs_with_single_record(mycobacteria_seq_dir)
    actinomycetes_dirs = _dirs_with_single_record(actinomycetes_seq_dir)
    # Cache both lists so later runs can skip the scan.
    with open(project_dir + '/mycobacteria_dirs.pkl', 'wb') as f:
        pickle.dump(mycobacteria_dirs, f)
    with open(project_dir + '/actinomycetes_dirs.pkl', 'wb') as f:
        pickle.dump(actinomycetes_dirs, f)

100%|██████████| 222/222 [00:53<00:00,  4.17it/s]
100%|██████████| 722/722 [07:11<00:00,  1.68it/s]


#### Create files with all CDS for both the mycobacteria and actinomycetes reference sets, and build BLAST databases for the TB CDS and both reference sets (used for reciprocal best hits)

In [32]:
def generate_protein_dataset(num_subsets, subset_num, dir_list, seqdir):
    """Extract translated CDS, names, locations and sequences for one chunk of directories.

    The chunk is chosen with util.chunk_list(dir_list, num_subsets, subset_num)
    and each directory's genomic.gbff under seqdir is parsed as GenBank.

    Returns a 5-tuple of lists:
        all_cds     -- [accession@locus_tag, translation] per CDS that has a translation
        all_tb_cds  -- the same entries, only for the directory named tb_annotation_dirname
        names       -- [accession.version, organism] per record
        locations   -- [accession@locus_tag, (start, end, strand)] per such CDS
        sequences   -- [accession.version, full sequence string] per record
    """
    all_cds, all_tb_cds = [], []
    names, sequences, locations = [], [], []
    for dirname in util.chunk_list(dir_list, num_subsets, subset_num):
        gbff_path = seqdir + '/' + dirname + '/genomic.gbff'
        for genome_record in SeqIO.parse(gbff_path, "genbank"):
            annotations = genome_record.annotations
            accession_ver = annotations['accessions'][0] + '.' + str(annotations['sequence_version'])
            names.append([accession_ver, annotations['organism']])
            sequences.append([accession_ver, str(genome_record.seq)])
            for feature in genome_record.features:
                if feature.type != 'CDS':
                    continue
                qualifiers = feature.qualifiers
                if qualifiers.get("translation") is None:
                    # CDS features without a translation are not recorded.
                    continue
                accession_locus = accession_ver + '@' + qualifiers.get("locus_tag")[0]
                translation = qualifiers.get("translation")[0]
                position = (
                    int(feature.location.start),
                    int(feature.location.end),
                    int(feature.location.strand),
                )
                locations.append([accession_locus, position])
                cds_entry = [accession_locus, translation]
                all_cds.append(cds_entry)
                if dirname == tb_annotation_dirname:
                    # Separate list object, matching the entry stored in all_cds.
                    all_tb_cds.append([accession_locus, translation])
    return (all_cds, all_tb_cds, names, locations, sequences)

In [99]:
def generate_upstream_sequence_dataset(num_subsets, subset_num, dir_list, seqdir):
    """Collect the feature-free region immediately upstream of each CDS.

    The directories to process are chosen with
    util.chunk_list(dir_list, num_subsets, subset_num); each directory's
    genomic.gbff under seqdir is parsed as GenBank.

    For every record, all features other than 'gene' and 'source' are
    considered. A plus-strand CDS gets the stretch between the furthest end of
    any feature starting before it and its own start; a minus-strand CDS gets
    the stretch between its own end and the lowest start of any feature ending
    after it, reverse-complemented with util.reverse_complement.

    The scan is done per record, so features are always sliced against the
    sequence of the record they belong to (previously the features of all
    records in a file were pooled and sliced against the last record only).

    Returns a list of
    [accession@locus_tag, region_start, region_end, strand, sequence].
    The key is '' for a CDS without a locus_tag qualifier.
    """
    sequence_dirs = util.chunk_list(dir_list, num_subsets, subset_num)
    upstream_cds_regions = []
    for dirname in sequence_dirs:
        for genome_record in SeqIO.parse(seqdir + '/' + dirname + '/genomic.gbff', "genbank"):
            accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])
            full_sequence = str(genome_record.seq)

            all_features = []
            for feature in genome_record.features:
                if feature.type in ('gene', 'source'):
                    continue
                locus_tags = feature.qualifiers.get("locus_tag")
                if locus_tags is not None:
                    accession_locus = accession_ver + '@' + locus_tags[0]
                else:
                    accession_locus = ''
                (start, stop, strand) = (int(feature.location.start), int(feature.location.end), int(feature.location.strand))
                all_features.append([accession_locus, feature.type, start, stop, strand])

            # Positive strand upstream: walk features by ascending start and
            # track the furthest end seen so far.
            all_features.sort(key=lambda x: x[2])
            max_stop = 0
            for (accession_locus, feature_type, start, stop, strand) in all_features:
                # Gaps of 100000 or more are skipped.
                # Avoid joins where biopython interprets inconsistently
                if max_stop < start and feature_type == 'CDS' and strand == 1 and start - max_stop < 100000:
                    upstream_cds_regions.append([accession_locus, max_stop, start, strand, full_sequence[max_stop: start]])
                max_stop = max(max_stop, stop)

            # Negative strand upstream: walk features by descending end and
            # track the lowest start seen so far.
            all_features.sort(key=lambda x: x[3], reverse=True)
            # NOTE(review): starting at len - 1 leaves the final base out of the
            # first region; kept as in the original - confirm whether intended.
            min_start = len(full_sequence) - 1
            for (accession_locus, feature_type, start, stop, strand) in all_features:
                if stop < min_start and feature_type == 'CDS' and strand == -1 and min_start - stop < 100000:
                    upstream_cds_regions.append([accession_locus, stop, min_start, strand, util.reverse_complement(full_sequence[stop: min_start])])
                min_start = min(min_start, start)
    return upstream_cds_regions

In [33]:
if full_build == True:
    # One generate_protein_dataset call per chunk of the mycobacteria directories.
    parallel_output = Parallel(n_jobs=-1)(
        delayed(generate_protein_dataset)(num_cores, core_number, mycobacteria_dirs, mycobacteria_seq_dir)
        for core_number in core_numbers
    )
    all_cds_1, all_tb_cds_1 = [], []
    names_dict_1, locations_dict_1, sequence_dict_1 = {}, {}, {}
    # Merge the per-chunk results; the name/location/sequence lists hold
    # [key, value] pairs and become dictionaries.
    for (chunk_cds, chunk_tb_cds, chunk_names, chunk_locations, chunk_sequences) in parallel_output:
        all_cds_1.extend(chunk_cds)
        all_tb_cds_1.extend(chunk_tb_cds)
        names_dict_1.update(chunk_names)
        locations_dict_1.update(chunk_locations)
        sequence_dict_1.update(chunk_sequences)
    with open(project_dir + '/names_dict_1.pkl', 'wb') as f:
        pickle.dump(names_dict_1, f)

In [34]:
if full_build == True:
    # One generate_protein_dataset call per chunk of the actinomycetes directories.
    parallel_output = Parallel(n_jobs=-1)(
        delayed(generate_protein_dataset)(num_cores, core_number, actinomycetes_dirs, actinomycetes_seq_dir)
        for core_number in core_numbers
    )
    all_cds_2, all_tb_cds_2 = [], []
    names_dict_2, locations_dict_2, sequence_dict_2 = {}, {}, {}
    # Merge the per-chunk results; the name/location/sequence lists hold
    # [key, value] pairs and become dictionaries.
    for (chunk_cds, chunk_tb_cds, chunk_names, chunk_locations, chunk_sequences) in parallel_output:
        all_cds_2.extend(chunk_cds)
        all_tb_cds_2.extend(chunk_tb_cds)
        names_dict_2.update(chunk_names)
        locations_dict_2.update(chunk_locations)
        sequence_dict_2.update(chunk_sequences)
    with open(project_dir + '/names_dict_2.pkl', 'wb') as f:
        pickle.dump(names_dict_2, f)

In [100]:
if full_build == True:
    mycobacteria_upstream_dict = {}
    actinomycetes_upstream_dict = {}

    # Mycobacteria: key each upstream region by its accession@locus string.
    parallel_output = Parallel(n_jobs=-1)(
        delayed(generate_upstream_sequence_dataset)(num_cores, core_number, mycobacteria_dirs, mycobacteria_seq_dir)
        for core_number in core_numbers
    )
    for chunk in parallel_output:
        for (accession_locus, region_start, region_end, strand, sequence) in chunk:
            mycobacteria_upstream_dict[accession_locus] = [region_start, region_end, strand, sequence]

    # Actinomycetes: same layout.
    parallel_output = Parallel(n_jobs=-1)(
        delayed(generate_upstream_sequence_dataset)(num_cores, core_number, actinomycetes_dirs, actinomycetes_seq_dir)
        for core_number in core_numbers
    )
    for chunk in parallel_output:
        for (accession_locus, region_start, region_end, strand, sequence) in chunk:
            actinomycetes_upstream_dict[accession_locus] = [region_start, region_end, strand, sequence]

    with open(project_dir + '/mycobacteria_upstream_dict.pkl', 'wb') as f:
        pickle.dump(mycobacteria_upstream_dict, f)
    with open(project_dir + '/actinomycetes_upstream_dict.pkl', 'wb') as f:
        pickle.dump(actinomycetes_upstream_dict, f)

In [37]:
if full_build == True:
    util.produce_fasta_file(all_cds_1, project_dir + '/mycobacteria_cds.fasta')
    util.produce_fasta_file(all_tb_cds_1, project_dir + '/tb_cds.fasta')
    util.produce_fasta_file(all_cds_2, project_dir + '/actinomycetes_cds.fasta')
    blastfn.build_blast_db(project_dir, 'mycobacteria_cds.fasta', 'Mycobacteria', project_dir + '/BLAST/Mycobacteria')
    blastfn.build_blast_db(project_dir, 'tb_cds.fasta', 'all_tb_cds', project_dir + '/BLAST/Tb')
    blastfn.build_blast_db(project_dir, 'actinomycetes_cds.fasta', 'Actinomycetes', project_dir + '/BLAST/Actinomycetes')

100%|██████████| 276716/276716 [00:23<00:00, 11732.16it/s]
100%|██████████| 3906/3906 [00:00<00:00, 46801.61it/s]
100%|██████████| 2365638/2365638 [03:24<00:00, 11554.74it/s]


In [38]:
if full_build == True:
    blastfn.run_blastp(project_dir + '/BLAST/Mycobacteria', 'tb_cds.fasta', 'Mycobacteria', 'tb_mycobacteria_hits.csv', e_value = 1e-10)
    blastfn.run_blastp(project_dir + '/BLAST/Tb', 'mycobacteria_cds.fasta', 'all_tb_cds', 'mycobacteria_tb_hits.csv', e_value = 1e-10)
    a = blastfn.process_blast_output(project_dir + '/BLAST/Mycobacteria/tb_mycobacteria_hits.csv', names_dict_1, top_hit_only = False)
    a = a.loc[a.groupby(['query_ref','target_species_name'])['bit_score'].idxmax()]     # Utility top hit method uses accession ver which can differ if multiple sets exist per species in fragmented annotations
    b = blastfn.process_blast_output(project_dir + '/BLAST/Tb/mycobacteria_tb_hits.csv', names_dict_1, top_hit_only = False)
    b = b.loc[b.groupby(['query_ref','target_species_name'])['bit_score'].idxmax()] 
    rbh =  blastfn.keep_reciprocal_best_hits(a, b)
    rbh.to_csv(project_dir + '/tb_mycobacteria_reciprocal_best_hits.csv')

In [39]:
if full_build == True:
    blastfn.run_blastp(project_dir + '/BLAST/Actinomycetes', 'tb_cds.fasta', 'Actinomycetes', 'tb_actinomycetes_hits.csv', e_value = 1e-10)
    blastfn.run_blastp(project_dir + '/BLAST/Tb', 'actinomycetes_cds.fasta', 'all_tb_cds', 'actinomycetes_tb_hits.csv', e_value = 1e-10)
    a = blastfn.process_blast_output(project_dir + '/BLAST/Actinomycetes/tb_actinomycetes_hits.csv', names_dict_2, top_hit_only = False)
    a = a.loc[a.groupby(['query_ref','target_species_name'])['bit_score'].idxmax()]     # Utility top hit method uses accession ver which can differ if multiple sets exist per species in fragmented annotations
    b = blastfn.process_blast_output(project_dir + '/BLAST/Tb/actinomycetes_tb_hits.csv', names_dict_2, top_hit_only = False)
    b = b.loc[b.groupby(['query_ref','target_species_name'])['bit_score'].idxmax()] 
    rbh_2 =  blastfn.keep_reciprocal_best_hits(a, b)
    rbh_2.to_csv(project_dir + '/tb_actinomycetes_reciprocal_best_hits.csv')

#### Function to generate FASTA file containing intergenic regions in orthologous species and run Muscle / R-scape

In [102]:
if full_build != True:
    # Not rebuilding: reload the artefacts written by a previous full build.

    # accession.version -> organism name, one dictionary per reference set
    with open(project_dir + '/names_dict_1.pkl', 'rb') as pkl_file:
        names_dict_1 = pickle.load(pkl_file)
    with open(project_dir + '/names_dict_2.pkl', 'rb') as pkl_file:
        names_dict_2 = pickle.load(pkl_file)

    # annotation directories that passed the single-record filter
    with open(project_dir + '/mycobacteria_dirs.pkl', 'rb') as pkl_file:
        mycobacteria_dirs = pickle.load(pkl_file)
    with open(project_dir + '/actinomycetes_dirs.pkl', 'rb') as pkl_file:
        actinomycetes_dirs = pickle.load(pkl_file)

    # accession@locus -> [region_start, region_end, strand, sequence]
    with open(project_dir + '/mycobacteria_upstream_dict.pkl', 'rb') as pkl_file:
        mycobacteria_upstream_dict = pickle.load(pkl_file)
    with open(project_dir + '/actinomycetes_upstream_dict.pkl', 'rb') as pkl_file:
        actinomycetes_upstream_dict = pickle.load(pkl_file)

    # reciprocal best hit tables
    tb_mycobacteria_rbh = pd.read_csv(project_dir + '/tb_mycobacteria_reciprocal_best_hits.csv')
    tb_actinomycetes_rbh = pd.read_csv(project_dir + '/tb_actinomycetes_reciprocal_best_hits.csv')

In [105]:
# Align the upstream regions of the mycobacterial orthologues of one TB locus.
intergenic_regions = []
locus = 'Rv2243'
results_dir = project_dir + '/' + locus
os.makedirs(results_dir, exist_ok=True)
target_locus = tb_species + '@' + locus

# Reciprocal best hits of the target locus with more than 70% identical matches.
hits = tb_mycobacteria_rbh[tb_mycobacteria_rbh['query_ref'] == target_locus]
hits = hits[hits['percent_identical_matches'] > 70]
for i, r in hits.iterrows():
    target_ref = r['target_ref']
    if target_ref in mycobacteria_upstream_dict:
        # Element 3 of the stored entry is the upstream sequence string.
        upstream_region = mycobacteria_upstream_dict[target_ref][3]
        # Regions of 8 nt or fewer are left out.
        if len(upstream_region) > 8:
            intergenic_regions.append([target_ref, upstream_region])

# Only align when more than 10 orthologous regions were found.
if len(intergenic_regions) > 10:
    util.produce_fasta_file(intergenic_regions, results_dir +'/intergenic_regions.fasta')
    cline = MuscleCommandline(muscle_exe, input= results_dir +'/intergenic_regions.fasta', out = results_dir + '/upstream_region_'+locus + '_alignment.fasta')
    exception = 0
    try:
        stdout, stderr = cline()
    except Exception as e:
        # Record the failure (the original compared with '==' and never set
        # the flag) and report it instead of swallowing it silently.
        exception = 1
        print('MUSCLE alignment failed for ' + locus + ': ' + str(e))

NZ_AP022613.1@G6N66_RS16115 [3489660, 3489800, 1, 'TGATTTAGCCAACAGATCGCGACGCGATATCAAACACGTAGCTTACGGGCCTGTTTTGTATACTCCATACAAAAACCTAAGACGAGGTTCATAATCTGTTACACCCGCCAAAACCGTTTCCACAGTGTTCTCTTAAACAC']
NZ_AP022569.1@G6N27_RS06415 [1325893, 1326021, 1, 'CAGATCGCGGCTTGATCTCGAATACGTCGCATCGAGCGCGTTTTTGTGGGGTTTCCACAAAAACCTAAGACAAGGTTCATAATCTGTTACACCGCGCAAAACCGTCTTCACAGTGTTCCCTTAAGAAC']
NZ_AP022605.1@G6N07_RS12225 [2492989, 2493117, 1, 'GAGGTCGCGGCTGGATCTCGCCGGAGTTGCATCACATGCAGTTTTGTGGACTTGCTACAAAAACATAAGATGAGGTTCATAATCTCTTACACGGCGCGAAGTCGTCTTCACAGTGTTCTCTTAGAGAC']
NZ_AP022576.1@G6N55_RS00230 [44233, 44373, -1, 'TGATCCAGGCAACAGATCGCGGCGCGGTATCAAATACGTAGCTTGTCGAGCGGTTTTGTGGAGTACCTACAAAAACCTAAGACGAGGTTCATAATCTGTTACACCCGCCAAAACCGTTTCCACAGTGTTCTCTTAGACAC']
NZ_CP011883.2@B586_RS12405 [2644652, 2644792, 1, 'TGATCCAGATCGCAGGTCGCGGGATGATATCAAACCCGTAGCTTACCGAACTGTTTTGTAGGTTACATACAAAAACCTAAGACGAGGTTCATAATCTGTTACACCCCGCAAAACCGTCTTCACAGTGTTCTCTTAGACAC']
NZ_AP022615.1@G6N25_RS18885 [4035255, 4035395, -1, 'TGATCC

In [98]:
# Spot check: look up the stored upstream entry for TB locus Rv2245.
# The recorded run raised KeyError, i.e. the dictionary had no entry under this key.
mycobacteria_upstream_dict['NC_000962.3@Rv2245']

KeyError: 'NC_000962.3@Rv2245'

In [None]:
def generate_alignment(locus, offset, comparison_set):
    """Align the region upstream of a TB locus across its orthologues and run R-scape.

    Parameters:
        locus          -- TB locus tag; also the name of the results folder
                          created under project_dir.
        offset         -- number of nucleotides upstream of the CDS to extract.
        comparison_set -- 1 uses the mycobacteria tables and dictionaries,
                          any other value the actinomycetes ones.

    For every reciprocal best hit of the locus (one CDS per species, at least
    60% identity) the region covering `offset` nt upstream plus the first three
    CDS bases is extracted, reverse-complemented for minus-strand genes, and
    written to FASTA. If more than 10 hits remain, the regions are aligned
    with MUSCLE, a second alignment without the columns where the TB sequence
    has a gap is written, and both are converted to Stockholm and passed to
    R-scape.

    Returns None. Raises KeyError if the locus is not in the locations
    dictionary. Nothing is written when 10 or fewer hits remain; processing
    stops after a failed MUSCLE run.
    """
    target_locus = tb_species + '@' + locus
    results_dir = project_dir + '/' + locus
    set_tag = str(comparison_set)

    os.makedirs(results_dir, exist_ok=True)

    # NOTE(review): the build cells of this notebook write
    # 'tb_mycobacteria_reciprocal_best_hits.csv' and
    # 'tb_actinomycetes_reciprocal_best_hits.csv'; the names read here differ -
    # confirm which files are intended.
    # NOTE(review): locations_dict_* / sequence_dict_* are only created when
    # full_build is True (they are not pickled) - confirm they are available.
    if comparison_set == 1:     # 1 = Mycobacteria, 2 = Actinobacteria
        rbh = pd.read_csv(project_dir + '/reciprocal_best_hits.csv')
        locations_dict = locations_dict_1
        sequence_dict = sequence_dict_1
    else:
        rbh = pd.read_csv(project_dir + '/reciprocal_best_hits_2.csv')
        locations_dict = locations_dict_2
        sequence_dict = sequence_dict_2

    # Fail fast, as the original lookup did, when the locus is unknown.
    if target_locus not in locations_dict:
        raise KeyError(target_locus)

    # Work on a copy so the new columns are not assigned onto a slice.
    rbh = rbh[rbh['query_ref'] == target_locus].copy()
    rbh['target_loc'] = rbh['target_ref'].map(locations_dict)
    rbh['cds_count'] = rbh.groupby('target_species_name')['query_ref'].transform('size')
    rbh['min_pct_id'] = rbh.groupby('target_species_name')['percent_identical_matches'].transform('min')

    # One hit per species, at least 60% identity, and a known CDS location.
    rbh = rbh[(rbh['cds_count'] == 1) & (rbh['min_pct_id'] >= 60) & rbh['target_loc'].notna()]
    if len(rbh) <= 10:
        return

    intergenic_regions = []
    query_regions = []
    for _, r in rbh.iterrows():
        target_species = r['target_species_name']
        target_species_accession = r['target_species']
        (cds_start, cds_stop, strand) = r['target_loc']
        genome = sequence_dict[target_species_accession]
        if strand == 1:
            # Clamp at 0: a negative slice start would silently return the wrong region.
            region_start = max(cds_start - offset, 0)
            intergenic_sequence = genome[region_start: cds_start + 3]
        else:
            intergenic_sequence = util.reverse_complement(genome[cds_stop - 3: cds_stop + offset])

        species_label = target_species.replace(" ", "_")
        intergenic_regions.append([species_label, intergenic_sequence])
        if target_species_accession == tb_species:
            query_regions.append([species_label, intergenic_sequence])

    util.produce_fasta_file(intergenic_regions, results_dir + '/intergenic_regions_' + set_tag + '.fasta')
    util.produce_fasta_file(query_regions, results_dir + '/query_regions_' + set_tag + '.fasta')

    alignment_name = 'upstream_region_' + locus + '_alignment_' + set_tag
    no_insertion_name = 'upstream_region_no_insertions' + locus + '_alignment_' + set_tag

    cline = MuscleCommandline(muscle_exe, input=results_dir + '/intergenic_regions_' + set_tag + '.fasta', out=results_dir + '/' + alignment_name + '.fasta')
    try:
        cline()
    except Exception as e:
        # The original wrote 'exception == 1' (a comparison), so a failed run
        # still went on to read an alignment file that was never produced.
        print('MUSCLE alignment failed for ' + locus + ' (set ' + set_tag + '): ' + str(e))
        return

    temp = util.read_fasta_to_array(results_dir + '/' + alignment_name + '.fasta')
    alignment_names = temp[0]
    aligned_sequences = temp[1]

    # Row of the TB reference sequence in the alignment.
    tb_loc = None
    for n, name in enumerate(alignment_names):
        if name == 'Mycobacterium_tuberculosis_H37Rv':
            tb_loc = n
            break

    if tb_loc is None:
        print('TB reference sequence not found in alignment for ' + locus + '; skipping no-insertion alignment')
    else:
        # Alignment columns where TB has a gap, i.e. insertions relative to TB.
        insertions = {col for col, nt in enumerate(aligned_sequences[tb_loc]) if nt == '-'}
        sequences = [
            [alignment_names[row], ''.join(nt for col, nt in enumerate(sequence) if col not in insertions)]
            for row, sequence in enumerate(aligned_sequences)
        ]
        util.produce_fasta_file(sequences, results_dir + '/' + no_insertion_name + '.fasta')

    blastfn.convert_fasta_to_stockholm(results_dir, alignment_name + '.fasta', alignment_name + '.sto')
    blastfn.run_rscape(results_dir, alignment_name + '.sto', 'rscape_output_' + set_tag)

    if tb_loc is not None:
        blastfn.convert_fasta_to_stockholm(results_dir, no_insertion_name + '.fasta', no_insertion_name + '.sto')
        blastfn.run_rscape(results_dir, no_insertion_name + '.sto', 'rscape_output_no_insertions' + set_tag)

#### Subset based on loci of interest, and filter orthologues based on percent identity, synteny

In [None]:
# Locus tags of every CDS feature in the TB annotation, in file order.
tb_loci = []
# The original referenced 'seq_dir_1', which is not defined anywhere in this
# notebook; the TB annotation directory is read from the mycobacteria set.
tb_gbff_path = mycobacteria_seq_dir + '/' + tb_annotation_dirname + '/genomic.gbff'
for genome_record in SeqIO.parse(tb_gbff_path, "genbank"):
    for feature in genome_record.features:
        a = feature.qualifiers
        if feature.type == 'CDS':
            # NOTE(review): CDS without a translation are included here but are
            # absent from the locations dictionary built by
            # generate_protein_dataset - confirm downstream lookups cope.
            tb_loci.append(a.get("locus_tag")[0])

In [None]:
# Display the locus tag stored at index 3646 of tb_loci.
tb_loci[3646]

In [None]:
# Keep only the loci from index 3646 onwards.
# NOTE(review): presumably used to resume an interrupted run - confirm before re-running.
tb_loci = tb_loci[3646:]

In [None]:
# For each TB locus, measure the gap to the neighbouring CDS on its upstream
# side and, when the gap is at least 30 nt, build alignments against both
# reference sets.
for n, locus in tqdm(enumerate(tb_loci), total=len(tb_loci)):
    # The first locus has no preceding neighbour (index n-1 would wrap around).
    if n == 0:
        continue
    target_locus = tb_species + '@' + locus
    location = locations_dict_1[target_locus]
    if location[2] == 1:
        # Plus strand: upstream neighbour is the previous CDS in the list.
        upstream_cds_stop = locations_dict_1[tb_species + '@' + tb_loci[n-1]][1]
        offset = location[0] - upstream_cds_stop
    else:
        # Minus strand: upstream neighbour is the next CDS in the list.
        # The last locus has none, and tb_loci[n+1] would raise IndexError.
        if n + 1 >= len(tb_loci):
            continue
        upstream_cds_start = locations_dict_1[tb_species + '@' + tb_loci[n+1]][0]
        offset = upstream_cds_start - location[1]

    if offset < 30:
        continue

    print(locus)
    generate_alignment(locus, offset, 1)
    generate_alignment(locus, offset, 2)

#### Extract covariation information to identify potentially significant regions

In [None]:
# Summarise the R-scape covariation output (comparison set 2) of every
# results folder under project_dir.
seq_ids = util.list_dirs(project_dir)
out_list = []
for seq_id in seq_ids:
    cov_path = project_dir + '/' + str(seq_id) + '/rscape_output_2.cov'
    if not os.path.exists(cov_path):
        continue

    e_values = []
    with open(cov_path, 'r') as f:
        for l in f:
            if '#' in l:
                continue
            a = l.split()
            # NOTE(review): fields 1 and 2 are read as the two alignment
            # positions and field 4 as the pair's E-value - confirm against
            # the R-scape .cov format.
            # Exclude covarying pairs 10 or fewer positions apart.
            if len(a) > 6 and abs(int(a[2]) - int(a[1])) > 10:
                e_values.append(float(a[4]))
    num_pairs = len(e_values)

    if num_pairs == 0:
        combined_e_value = 999      # sentinel: no qualifying pairs
    elif any(x <= 0 for x in e_values):
        # log(0) is undefined; the combined value tends to 0 in that case.
        combined_e_value = 0.0
    else:
        # Fisher's method: -2 * sum(log(e)) against chi-squared with 2k degrees of freedom.
        tot = sum(math.log(x) for x in e_values)
        # sf(x) equals 1 - cdf(x) but keeps precision for very small values.
        combined_e_value = chi2.sf(-2 * tot, 2 * num_pairs)

    # NOTE(review): a chi-squared tail probability is at most 1, so '< 88'
    # only rejects the 999 sentinel - confirm the intended threshold.
    if combined_e_value < 88 and num_pairs > 4:
        out_list.append((seq_id, num_pairs, combined_e_value))
out_list.sort(key=lambda x: x[0])
out_list

In [None]:
# Translate one fixed window of the reference genome.
reference_species = 'GCF_000195955.2'
refseq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data'

# full_sequence ends up holding the sequence of the last record in the file.
reference_records = SeqIO.parse(refseq_dir + '/'+reference_species+'/genomic.gbff', "genbank")
for record in reference_records:
    full_sequence = str(record.seq)

tr = util.Translator()
region_of_interest = full_sequence[4400215:4400777]
tr.translate_sequence(region_of_interest, 1, 0)