##### Set up packages and directories

In [1]:
# Notebook-level switch. No cell visible in this part of the notebook reads it;
# presumably it is meant to gate expensive recomputation - TODO confirm.
full_run = True

In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
import math
from tqdm.auto import tqdm
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
import random
import copy
from joblib import Parallel, delayed
import os
import shutil
import subprocess
from Bio.Align.Applications import MuscleCommandline
# Path to the MUSCLE 3.8.31 Windows executable on the author's machine.
# Not referenced by any cell visible in this part of the notebook.
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'

In [83]:
# Project locations (absolute paths on the F: drive).
project_dir = 'F:/Project_Data/Project_10'
output_dir = f'{project_dir}/Output'

# Input datasets.
literature_datasets_dir = 'F:/Datasets/Data_From_Publications'
refseq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data'
ecoli_dir = 'F:/Datasets/E.coli'

# Number of workers and their 1-based ids.
num_cores = 8
core_numbers = [core_index + 1 for core_index in range(num_cores)]

In [128]:
# Assembly accessions. Each one doubles as the name of that genome's directory:
# the cells below build paths of the form <dir>/<accession>/genomic.gbff.
reference_species = 'GCF_000195955.2'  # reference genome for the self-BLAST runs below
marinum_species = 'GCF_016745295.1'
shiju_species = 'GCF_010730055.1'
ecoli_species = 'GCF_000005845.2'  # read from ecoli_dir rather than refseq_dir in the calls below
#all_species = [reference_species, marinum_species, shiju_species]
# util.list_dirs is defined outside this notebook; presumably it returns the
# sub-directory names of refseq_dir (one per assembly) - TODO confirm.
all_species = util.list_dirs(refseq_dir)

In [85]:
# Reciprocal-best-hit table, turned into a lookup from each 'target_ref' to its
# 'query_ref'. When a target_ref occurs on several rows the last row wins.
rbh = pd.read_csv('F:/Project_Data/Intergenic_Region_Comparative_Analysis/reciprocal_best_hits.csv')
rbh_dict = dict(zip(rbh['target_ref'], rbh['query_ref']))

In [129]:
# Build, for every GenBank record of every genome, a start-sorted list of regions:
#   (locus_tag, ortholog_locus_tag, product, feature_type, start, stop)   for annotated features
#   [prev:next locus, prev:next ortholog, prev:next product, 'Intergenic', start, stop]   for gaps
# keyed by the record's accession.version.
annotated_regions_dict = {}
for species in tqdm(all_species):
    for record in SeqIO.parse(refseq_dir + '/'+species+'/genomic.gbff', "genbank"):
        accession_ver = record.annotations['accessions'][0] + '.' + str(record.annotations['sequence_version'])

        # Pass 1: collect the annotated features of this record.
        annotated_regions = []
        for feature in record.features:
            if feature.type in ['source','gene']:
                continue
            feature_start = int(feature.location.start)
            feature_stop = int(feature.location.end)
            # Skip empty/reversed spans and anything 1,000,000 bases or longer.
            if feature_start >= feature_stop or feature_stop - feature_start >= 1000000:
                continue
            qualifiers = feature.qualifiers
            product_values = qualifiers.get("product")
            product = '' if product_values is None else product_values[0]
            locus_values = qualifiers.get("locus_tag")
            if locus_values is None:
                locus_tag = ''
                ortholog_locus_tag = ''
            else:
                # Locus tags are qualified with the record accession, matching the rbh_dict keys.
                locus_tag = accession_ver + '@' + locus_values[0]
                ortholog_locus_tag = rbh_dict.get(locus_tag, '')
            annotated_regions.append((locus_tag, ortholog_locus_tag, product, feature.type, feature_start, feature_stop))
        annotated_regions.sort(key = lambda region: region[4])

        # Pass 2: every stretch between the furthest stop seen so far and the next
        # feature start becomes an 'Intergenic' entry labelled with both neighbours.
        intergenic_regions = []
        prev_locus = prev_ortholog_locus = prev_product = ''
        max_stop = 0
        for locus, ortholog_locus, product, feature_type, start, stop in annotated_regions:
            if start > max_stop:
                intergenic_regions.append([prev_locus+':'+locus, prev_ortholog_locus + ':' + ortholog_locus, prev_product + ':' + product, 'Intergenic',max_stop, start])
            if stop > max_stop:
                # Only a feature that extends the covered span becomes the "previous" neighbour.
                prev_locus, prev_ortholog_locus, prev_product = locus, ortholog_locus, product
                max_stop = stop

        annotated_regions.extend(intergenic_regions)
        annotated_regions.sort(key = lambda region: region[4])
        annotated_regions_dict[accession_ver] = annotated_regions

  0%|          | 0/215 [00:00<?, ?it/s]

In [130]:
# Cache switch for annotated_regions_dict. It is left off, so this cell always
# reloads the dictionary from disk, replacing whatever is currently in memory.
# Turn it on to overwrite the pickle with the in-memory dictionary instead.
# NOTE(review): the notebook also defines full_run; consider whether this cell
# should be driven by that flag rather than a local constant.
overwrite_annotated_regions_pickle = False
annotated_regions_pickle = project_dir + '/' + 'annotated_regions_dict.pkl'
if overwrite_annotated_regions_pickle:
    with open(annotated_regions_pickle, 'wb') as pickle_file:
        pickle.dump(annotated_regions_dict, pickle_file)
else:
    with open(annotated_regions_pickle, 'rb') as pickle_file:
        annotated_regions_dict = pickle.load(pickle_file)

In [104]:
# Map every record accession.version to the genome directory it was read from.
# Only the record annotations are needed here, so the sequence is deliberately
# not converted to a string (that copy was unused and costly over all genomes).
species_dir_dict = {}
for spec in all_species:
    for record in SeqIO.parse(refseq_dir + '/'+spec+'/genomic.gbff', "genbank"):
        accession_ver = record.annotations['accessions'][0] + '.' + str(record.annotations['sequence_version'])
        species_dir_dict[accession_ver] = spec

In [87]:
# Length threshold used by produce_full_sequence_and_intergenic_region_fasta:
# a gap is written out only when the next CDS starts more than min_nts bases
# after the furthest feature stop seen so far.
min_nts = 30

In [99]:
def produce_full_sequence_and_intergenic_region_fasta(refseq_dir, species, compare_to_all = False):
    """Write the FASTA inputs for a self-BLAST of one genome.

    Two files are written into the module-level ``project_dir``:

    * ``full_<species>_sequence.fasta`` - the sequences the BLAST database is
      built from: every record of ``species`` or, when ``compare_to_all`` is
      true, every record of every genome in the module-level ``all_species``
      (all read from ``refseq_dir``).
    * ``<species>_intergenic_regions.faa`` - each gap that precedes a CDS and is
      longer than the module-level ``min_nts``, named ``<gap start>_<gap end>``.

    Gaps are computed record by record and sliced from that record's own
    sequence, so multi-record genomes yield correct sequences. The names hold
    record-relative coordinates only, so they can repeat across records.

    Parameters
    ----------
    refseq_dir : directory that contains ``<species>/genomic.gbff``.
    species : name of the genome directory.
    compare_to_all : if true, build the full-sequence file from all genomes.
    """
    species_records = []
    std_annotation_intergenic_regions = []
    for record in SeqIO.parse(refseq_dir + '/'+species+'/genomic.gbff', "genbank"):
        accession_ver = record.annotations['accessions'][0] + '.' + str(record.annotations['sequence_version'])
        full_sequence = str(record.seq)
        species_records.append([accession_ver, full_sequence])

        # Feature spans of this record only; coordinates are relative to it.
        loci = []
        for feature in record.features:
            if feature.type in ['source','gene']:
                continue
            start, stop = int(feature.location.start), int(feature.location.end)
            if start < stop and (stop - start) < 1000000:
                loci.append((feature.type, start, stop))
        loci.sort(key = lambda x: x[1])

        # A gap is kept only when the feature that closes it is a CDS.
        max_stop = 0
        for (feature_type, start, stop) in loci:
            if start > max_stop + min_nts and feature_type == 'CDS':
                std_annotation_intergenic_regions.append([str(max_stop)+'_'+str(start), full_sequence[max_stop: start]])
            max_stop = max(max_stop, stop)

    if compare_to_all:
        database_records = []
        for spec in all_species:
            for record in SeqIO.parse(refseq_dir + '/'+spec+'/genomic.gbff', "genbank"):
                accession_ver = record.annotations['accessions'][0] + '.' + str(record.annotations['sequence_version'])
                database_records.append([accession_ver, str(record.seq)])
    else:
        database_records = species_records
    util.produce_fasta_file(database_records, project_dir + '/full_'+species+'_sequence.fasta')
    util.produce_fasta_file(std_annotation_intergenic_regions, project_dir + '/'+species+'_intergenic_regions.faa')

In [100]:
def make_blast_sequence_db(species):
    """Build a nucleotide BLAST database for ``species`` and file it away.

    Runs ``makeblastdb`` inside the module-level ``project_dir`` on
    ``full_<species>_sequence.fasta``, then moves every file in ``project_dir``
    whose name contains ``species`` into
    ``F:/Datasets/BLAST/Self_Blast/<species>/`` (created if missing).

    Raises
    ------
    RuntimeError
        If ``makeblastdb`` exits with a non-zero status. Failures used to be
        ignored, which let later steps run against a missing or stale database.
    """
    # cwd= replaces the old os.chdir / "cd ... &" combination, so the notebook's
    # working directory is never changed, even if the command fails.
    completed = subprocess.run(
        ['makeblastdb', '-in', project_dir + '/full_'+species+'_sequence.fasta',
         '-dbtype', 'nucl', '-out', 'full_'+species+'_sequence_nt'],
        cwd = project_dir, capture_output = True)
    if completed.returncode != 0:
        raise RuntimeError('makeblastdb failed for ' + species + ': ' + completed.stderr.decode(errors = 'replace'))

    blast_dir = 'F:/Datasets/BLAST/Self_Blast/' + species
    os.makedirs(blast_dir, exist_ok = True)
    for file_name in util.list_files(project_dir):
        if species in file_name:
            shutil.move(project_dir+'/'+file_name, blast_dir +'/' + file_name)

In [142]:
def run_self_blast(refseq_dir, species, reference_accession = 'NC_000962.3'):
    """BLAST the intergenic regions of ``species`` against its database and annotate the hits.

    Runs ``blastn`` inside ``F:/Datasets/BLAST/Self_BLAST/<species>`` and writes
    ``hits.csv`` there. Only queries with more than one hit are kept. Each
    remaining hit gets an ``annot_features`` list of
    ``[locus, ortholog_locus, product, feature_type, overlap]`` entries for the
    regions of the target record (looked up in the module-level
    ``annotated_regions_dict``) that overlap the hit; ``overlap`` is the
    fraction of the hit covered by that region. Queries with any hit that
    overlaps a region whose type contains 'repeat' or 'mobile' are dropped.
    The result is saved as ``processed_hits.xlsx`` in the same directory.

    Parameters
    ----------
    refseq_dir : unused; kept so existing calls keep working.
    species : name of the genome directory under Self_BLAST.
    reference_accession : record accession whose hits are counted per query in
        the ``ref_count`` column (previously hard-coded to 'NC_000962.3').

    Raises
    ------
    RuntimeError
        If ``blastn`` exits with a non-zero status. Failures used to be
        ignored, so a stale ``hits.csv`` from an earlier run could be read.
    KeyError
        If a hit's target record is missing from ``annotated_regions_dict``.
    """
    blast_dir = 'F:/Datasets/BLAST/Self_BLAST/' + species
    # cwd= replaces the old os.chdir / "cd ... &" combination, so the notebook's
    # working directory is never changed, even if the command fails.
    blast_command = [
        'blastn', '-query', species + '_intergenic_regions.faa', '-db', 'full_'+species+'_sequence_nt',
        '-out', 'hits.csv', '-evalue', '10',
        '-outfmt', '10 qaccver saccver qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore',
        '-num_threads', '16']
    completed = subprocess.run(blast_command, cwd = blast_dir, capture_output = True)
    if completed.returncode != 0:
        raise RuntimeError('blastn failed for ' + species + ': ' + completed.stderr.decode(errors = 'replace'))

    blast_results = pd.read_csv(blast_dir + '/hits.csv', header = None)
    blast_results.columns = ['query_ref', 'target_ref', 'query_length', 'subject_length', 'percent_identical_matches','alignment_length', 'number_mismatches', 'number_of_gap_openings', 
                             'query_start_alignment', 'query_end_alignment', 'target_start_alignment', 'target_end_alignment', 'e_value', 'bit_score']
    blast_results['hit_count'] = blast_results.groupby('query_ref')['target_ref'].transform('count')
    # Count hits to the reference record per query, then spread that count to every row of the query.
    blast_results['ref_count'] = blast_results[blast_results['target_ref'] == reference_accession].groupby('query_ref')['target_ref'].transform('count')
    blast_results['ref_count'] = blast_results.groupby('query_ref')['ref_count'].transform('max')
    # Explicit copy: the frame is modified below and must not be a view of the unfiltered one.
    blast_results = blast_results[blast_results['hit_count'] > 1].copy()

    repeat_regions = set()
    blast_results['annot_features']=''
    for i, r in blast_results.iterrows():
        # Hits on the minus strand are reported with start > end; normalise.
        start1 = min(r['target_start_alignment'],r['target_end_alignment'])
        end1 = max(r['target_start_alignment'],r['target_end_alignment'])
        feature_matches = []
        for (locus, ortholog_locus, product, feature, start, stop) in annotated_regions_dict[r['target_ref']]:
            if start< end1 and stop > start1:
                overlap = (min(end1, stop) - max(start1, start))/ (end1-start1)
                feature_matches.append([locus, ortholog_locus, product, feature, overlap])
                if ('repeat' in feature) or ('mobile' in feature):
                    repeat_regions.add(r['query_ref'])
        blast_results.at[i,'annot_features'] = feature_matches
    blast_results = blast_results[~blast_results['query_ref'].isin(repeat_regions)]
    blast_results.to_excel(blast_dir + '/processed_hits.xlsx', sheet_name = 'Sheet_1', index = False)

In [53]:
# Run the three-step self-BLAST pipeline (FASTA export, database build, BLAST and
# annotation) for each genome in turn. The E. coli genome lives in its own directory.
# NOTE(review): run_self_blast looks every target record up in annotated_regions_dict,
# which is built from refseq_dir only - confirm it holds the E. coli record.
self_blast_jobs = [
    (refseq_dir, reference_species),
    (refseq_dir, marinum_species),
    (refseq_dir, shiju_species),
    (ecoli_dir, ecoli_species),
]
for genome_dir, genome in self_blast_jobs:
    produce_full_sequence_and_intergenic_region_fasta(genome_dir, genome)
    make_blast_sequence_db(genome)
    run_self_blast(genome_dir, genome)

100%|██████████| 1/1 [00:00<00:00, 10.17it/s]
100%|██████████| 2108/2108 [00:00<00:00, 84264.23it/s]
100%|██████████| 1/1 [00:00<00:00,  9.10it/s]
100%|██████████| 3159/3159 [00:00<00:00, 78655.81it/s]
100%|██████████| 1/1 [00:00<00:00, 11.01it/s]
100%|██████████| 2366/2366 [00:00<00:00, 77112.20it/s]
100%|██████████| 1/1 [00:00<00:00, 10.48it/s]
100%|██████████| 2503/2503 [00:00<00:00, 71024.97it/s]


In [132]:
# Same pipeline for the reference genome, but with the BLAST database built
# from the records of every genome in all_species.
produce_full_sequence_and_intergenic_region_fasta(refseq_dir, reference_species, compare_to_all = True)
make_blast_sequence_db(reference_species)
run_self_blast(refseq_dir, reference_species)

In [143]:
# Repeat only the BLAST and annotation step for the reference genome; the
# FASTA files and database from the previous cell are reused.
run_self_blast(refseq_dir, reference_species)

In [124]:
# Peek at the regions stored for a single record.
# NOTE(review): the saved output below (execution count 124) lists loci tagged with
# other record accessions (NZ_LQOX01000001.1, NZ_LQOX01000005.1, ...) under this key,
# and it predates the In [129] run of the cell that builds the dictionary - re-run
# to confirm it is stale, and check that the pickle on disk is not equally stale.
annotated_regions_dict['NZ_LQOX01000055.1']

[[':NZ_LQOX01000001.1@AWC07_RS00005',
  ':NC_000962.3@Rv2079',
  ':alpha/beta hydrolase',
  'Intergenic',
  0,
  444],
 [':NZ_LQOX01000001.1@AWC07_RS00005',
  ':NC_000962.3@Rv2079',
  ':alpha/beta hydrolase',
  'Intergenic',
  0,
  444],
 [':NZ_LQOX01000001.1@AWC07_RS00005',
  ':NC_000962.3@Rv2079',
  ':alpha/beta hydrolase',
  'Intergenic',
  0,
  444],
 [':NZ_LQOX01000001.1@AWC07_RS00005',
  ':NC_000962.3@Rv2079',
  ':alpha/beta hydrolase',
  'Intergenic',
  0,
  444],
 ('NZ_LQOX01000005.1@AWC07_RS00900', '', 'PPE family protein', 'CDS', 0, 1144),
 [':NZ_LQOX01000001.1@AWC07_RS00005',
  ':NC_000962.3@Rv2079',
  ':alpha/beta hydrolase',
  'Intergenic',
  0,
  444],
 ('NZ_LQOX01000006.1@AWC07_RS00905',
  '',
  'type VII secretion system ESX-5 protein EsxL',
  'CDS',
  0,
  93),
 [':NZ_LQOX01000001.1@AWC07_RS00005',
  ':NC_000962.3@Rv2079',
  ':alpha/beta hydrolase',
  'Intergenic',
  0,
  444],
 ('NZ_LQOX01000007.1@AWC07_RS28300',
  '',
  'hypothetical protein',
  'CDS',
  0,
  167),
 

In [18]:
# Print every locus of record NZ_CP058277.1 whose reciprocal best hit is Rv0650.
for target_locus, query_locus in rbh_dict.items():
    if query_locus != 'NC_000962.3@Rv0650' or 'NZ_CP058277.1' not in target_locus:
        continue
    print(target_locus, query_locus)

NZ_CP058277.1@HXW97_RS11190 NC_000962.3@Rv0650


In [64]:
# print_sequence is defined outside the part of the notebook shown here.
# The saved output is 74 characters long, i.e. the difference of the two coordinates.
print_sequence(reference_species, 1182316, 1182390)

CCGCGCGAGCAGACGCAAAATCGCCCATTTTCGTGTCGAAATGGGGGCTTTTGCGTCTGCTCGCGGGTAGAAAG


In [72]:
# Second locus for comparison; its saved output starts with the same
# CGCGAGCAGACGCAAAATCGCCC stretch seen in the other sequences printed in this section.
print_sequence(reference_species, 3724552, 3724613)

'CGCGAGCAGACGCAAAATCGCCCATTTCGGCACGAAATTGGGCGATTTTGCGTCTGCTCGC'

In [71]:
# Slice the returned string so the first locus lines up with the 61-character
# sequence shown for 3724552-3724613 (requires print_sequence to return a str).
print_sequence(reference_species, 1182315, 1182390)[4:65]

'CGCGAGCAGACGCAAAATCGCCCATTTTCGTGTCGAAATGGGGGCTTTTGCGTCTGCTCGC'

In [76]:
# Third locus; the saved output again contains CGCGAGCAGACGCAAAATCGCCC.
print_sequence(reference_species, 3590617, 3590691)

'TCGCCTATGTTGGCGCGAGCAGACGCAAAATCGCCCGAAACCGATGGCTTTCGGGCGATTTTGCGTCTGTCGCG'

In [77]:
# Fourth locus; the saved output again starts with CGCGAGCAGACGCAAAATCGCCC.
print_sequence(reference_species, 3707570, 3707632)

'CGCGAGCAGACGCAAAATCGCCCGAAAACCAGTGGTTTTGGGCGATTTTGCGTCTGCTCGCG'