In [1]:
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
import os
import subprocess
import pandas as pd
from io import StringIO
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Sequence_Analysis_Routines as sar
import math
from tqdm.auto import tqdm
import re
import numpy as np
import statistics
import matplotlib.pyplot as plt
from Bio.Blast import NCBIWWW, NCBIXML
from Comparative_Analysis import Alignment as align
from joblib import Parallel, delayed

In [2]:
# Root folder of the project; every other location below is derived from it.
project_dir = 'D:/Project_Data/Project_6'

# Input data locations.
genome_datasets_dir = f'{project_dir}/Datasets/NCBI_Datasets'
literature_datasets_dir = f'{project_dir}/Datasets/Data_From_Publications'

# Output and scratch locations.
output_dir = f'{project_dir}/Output'
temp_fileloc = f'{project_dir}/Temp_Files'

# Location of the ortholog groups of one named SonicParanoid run.
sonic_paranoid_run_name = 'Run_Without_Outgroup'
sonic_paranoid_output_loc = f'{output_dir}/Sonic_Paranoid_Output'
ortholog_dir = f'{sonic_paranoid_output_loc}/runs/{sonic_paranoid_run_name}/ortholog_groups'

# Identifiers of the reference and outgroup genomes; they double as the names
# of the per-genome sub-directories under genome_datasets_dir.
reference_species = 'GCF_000195955.2'
outgroup_species = 'GCF_000696675.2'

# Contact address set on the NCBIWWW module before any BLAST request is made.
NCBIWWW.email = "nicholas.underhill@sky.com"

In [3]:
def write_fasta(sequence, name, file, line_length=60):
    """Write a single sequence to ``file`` in FASTA format.

    Parameters
    ----------
    sequence : str
        Sequence text to write; it is wrapped at ``line_length`` characters.
    name : str
        Record name, written after ``>`` on the header line.
    file : str or path-like
        Destination path. An existing file is overwritten.
    line_length : int, optional
        Maximum number of sequence characters per line (default 60).

    Raises
    ------
    ValueError
        If ``line_length`` is smaller than 1.

    Notes
    -----
    An empty ``sequence`` produces a file that contains only the header line.
    """
    if line_length < 1:
        raise ValueError("line_length must be a positive integer")
    lines = [">" + name + "\n"]
    for offset in range(0, len(sequence), line_length):
        lines.append(sequence[offset:offset + line_length] + "\n")
    # newline='' switches off newline translation so "\n" is written verbatim
    # on every platform.
    with open(file, 'w', newline='') as outfile:
        outfile.write(''.join(lines))

In [4]:
# Lookup table used by translate_sequence, read from a fixed-width text file.
# On every line, characters 1-3 (0-based) are used as the key (the codon) and
# character 5 as the value (presumably the one-letter amino-acid code).
codon_dict = {}
with open('D:/Project_Data/Project_3/Datasets/Reference_Tables/Standard_Code.txt') as codon_table:
    for table_row in codon_table:
        translated_symbol = table_row[5]
        codon_dict[str(table_row[1:4])] = translated_symbol
        
def translate_sequence(input_seq, strand, rf):
    """Translate a nucleotide sequence codon by codon using ``codon_dict``.

    Parameters
    ----------
    input_seq : str
        Nucleotide sequence to translate.
    strand : int
        ``1`` translates ``input_seq`` as given; any other value translates
        its reverse complement (via ``align.reverse_complement``).
    rf : int
        Number of leading characters skipped before the first codon.

    Returns
    -------
    str
        One character per complete codon; codons that are not keys of
        ``codon_dict`` are rendered as ``'X'``. Trailing characters that do
        not fill a whole codon are ignored.
    """
    template = input_seq if strand == 1 else align.reverse_complement(input_seq)
    seq = template[rf:]
    return ''.join(
        codon_dict.get(seq[pos:pos + 3], 'X')
        for pos in range(0, len(seq) - 2, 3)
    )

In [5]:
# Every genome directory except the reference and the outgroup is a query genome.
excluded_species = [reference_species, outgroup_species]
query_list = [
    species_dir
    for species_dir in util.list_dirs(genome_datasets_dir)
    if species_dir not in excluded_species
]

In [6]:
# Export the sequence of every query genome and of the reference genome as a
# FASTA file in the scratch directory. Only the first record of each GenBank
# file is read. The loop variable is deliberately not called `id`: at notebook
# scope that name would replace the built-in id() for all later cells.
for genome_id in tqdm(query_list + [reference_species]):
    genome_record = next(SeqIO.parse(genome_datasets_dir + '/' + genome_id + '/genomic.gbff', "genbank"))
    # Not used by this cell; kept because it was a notebook-level name.
    organism_name = genome_record.annotations['organism']
    full_sequence = genome_record.seq
    write_fasta(str(full_sequence), genome_id, temp_fileloc + '/' + genome_id + '.fasta')

  0%|          | 0/11 [00:00<?, ?it/s]

In [7]:
# Disabled on purpose: the condition is always false, so nothing below runs.
# When enabled, each query genome is aligned against the reference with promer
# (run through WSL) and the show-coords table is saved as
# '<project_dir>/mummer_coords_<query_id>_.csv'. The following cell reads
# those CSV files, so they must already exist on disk.
if 1==0:
    for query_id in tqdm(query_list):
        # promer writes its result to 'promer.delta' (prefix given with -p) in the mummer directory.
        subprocess.run('wsl cd ~; cd mummer4/mummer-4.0.0rc1; promer -p promer '+util.wslname(temp_fileloc + '/'+ reference_species +'.fasta ')+ util.wslname(temp_fileloc + '/'+  query_id +'.fasta ') , shell=True)
        # Capture the show-coords report for that delta file as text.
        temp = subprocess.run('wsl cd ~; cd mummer4/mummer-4.0.0rc1; show-coords -r -k -c -l -L 30 -I 50 -T promer.delta' , shell=True, capture_output=True).stdout.decode('utf-8')
        # Column names are supplied here because the table is read with header=None.
        column_names =[ 'S1', 'E1', 'S2', 'E2', 'LEN 1', 'LEN 2', '% IDY', '% SIM', '% STP', 'LEN R', 'LEN Q', 'COV R', 'COV Q', 'FRM_1', 'FRM_2', 'TAGS_1', 'TAGS_2']
        # The first four lines of the report are skipped before parsing.
        temp_df = pd.read_table(StringIO(temp), skiprows=4, index_col=False, header=None, names=column_names)
        temp_df.to_csv(project_dir + '/mummer_coords_'+query_id+'_.csv')

In [8]:
# One coordinates table per query genome, in the same order as query_list.
query_dfs = [
    pd.read_csv(project_dir + '/mummer_coords_'+query_id+'_.csv')
    for query_id in query_list
]

In [9]:
# Parse the reference GenBank file once (first record only) and derive both
# the genome length and the protein_id -> locus_tag lookup from it.
genome_record = next(SeqIO.parse(genome_datasets_dir + '/'+reference_species + '/genomic.gbff', "genbank"))
reference_species_len = len(genome_record.seq)

reference_protein_dict = {}
for feature in genome_record.features:
    if feature.type != 'CDS':
        continue
    protein_ids = feature.qualifiers.get("protein_id")
    locus_tags = feature.qualifiers.get("locus_tag")
    # A CDS feature without one of these qualifiers is skipped rather than
    # raising TypeError on None[0].
    if protein_ids and locus_tags:
        reference_protein_dict[protein_ids[0]] = locus_tags[0]

In [10]:
# For every reference position, count how many rows of the query coordinate
# tables cover it. S1/E1 are treated as 1-based inclusive reference
# coordinates; they are swapped when FRM_1 is not positive so that
# start <= end is expected either way.
conservation_counts = np.zeros(reference_species_len)
for df in tqdm(query_dfs):
    # itertuples avoids building a Series per row; S1, E1 and FRM_1 are valid
    # identifiers, so they keep their names on the row tuples.
    for r in df.itertuples(index=False):
        if r.FRM_1 > 0:
            start, end = r.S1, r.E1
        else:
            start, end = r.E1, r.S1
        # A single vectorised slice increment replaces the per-base Python loop.
        conservation_counts[start-1:end] += 1

  0%|          | 0/10 [00:00<?, ?it/s]

In [11]:
# Build (locus, start, stop, strand) tuples for every CDS row of the
# Mycobrowser table. 'Start' is shifted down by one while 'Stop' is kept, so
# the pair can be used directly as a Python slice; strand is +1 for '+' and
# -1 for anything else.
mycobrowser_df = pd.read_excel(literature_datasets_dir+'/Mycobrowser_Release_4.xlsx')
temp = mycobrowser_df.loc[mycobrowser_df['Feature'] == 'CDS', ['Locus','Start','Stop','Strand']]
actual_cds_boundaries = [
    (cds_row['Locus'], cds_row['Start']-1, cds_row['Stop'], 1 if cds_row['Strand'] == '+' else -1)
    for _, cds_row in temp.iterrows()
]

In [12]:
# Load the ortholog groups, then the sequence data for the orthologs across
# every genome directory (single_copy=False).
orthologs = sar.Ortholog_Grouping(ortholog_dir)
all_species_dirs = list(util.list_dirs(genome_datasets_dir))
all_copy_seq_data = sar.Ortholog_Sequence_Dataset(
    orthologs,
    genome_datasets_dir,
    all_species_dirs,
    50,
    reference_species,
    single_copy = False,
)

100%|██████████| 75075/75075 [00:03<00:00, 23240.01it/s]
100%|██████████| 16/16 [00:00<00:00, 82.90it/s]


In [13]:
def run_blast(id_list, num_subsets, subset_num,
              entrez_query="all [filter] NOT(txid77643[ORGN]) AND txid85007[ORGN]"):
    """BLAST the translated reference CDSs of one subset against NCBI ``nr``.

    The subset is selected with ``util.chunk_list(id_list, num_subsets,
    subset_num)``. For each entry the reference nucleotide slice is translated
    with ``translate_sequence`` (reading frame 0), the final residue is
    dropped, and the protein is submitted with ``NCBIWWW.qblast``. HSPs whose
    e-value is below 0.04 are written to
    ``<project_dir>/<locus_tag>_blast_results_TB_genes_df.csv``; a file is
    written for every entry, even when no HSP passes the threshold.

    Parameters
    ----------
    id_list : sequence
        Entries indexable as ``(locus_tag, start, end, strand)``; ``start`` and
        ``end`` are used as a Python slice into the reference sequence.
    num_subsets : int
        Passed through to ``util.chunk_list``.
    subset_num : int
        Passed through to ``util.chunk_list``.
    entrez_query : str, optional
        Entrez restriction forwarded to ``qblast``. The default is the query
        this notebook has always used.
        NOTE(review): the recorded run of this notebook shows NCBI rejecting
        that default with "... is not supported"; pass a different query here
        once the accepted syntax has been confirmed.

    Returns
    -------
    None
    """
    E_VALUE_THRESH = 0.04
    # A flat list of names: a nested list would build MultiIndex columns.
    result_columns = ['locus_tag', 'locus_start', 'locus_end', 'locus_strand', 'title', 'accession',
                      'length', 'e_value', 'identities', 'query_start', 'query_end', 'subject_start',
                      'subject_end', 'subject_strand', 'score']
    ids = util.chunk_list(id_list, num_subsets, subset_num)
    tb_seq = str(next(SeqIO.parse(genome_datasets_dir + '/'+reference_species + '/genomic.gbff', "genbank")).seq)
    for cds in ids:
        locus_tag, start, end, strand = cds[0], cds[1], cds[2], cds[3]
        protein = translate_sequence(tb_seq[start:end], strand, 0)
        # The last translated residue is not submitted (presumably the stop codon).
        result_handle = NCBIWWW.qblast("blastp", "nr", protein[:-1], entrez_query=entrez_query)
        try:
            blast_record = NCBIXML.read(result_handle)
        finally:
            # Release the response buffer even when parsing fails.
            result_handle.close()
        blast_results_list = [
            [locus_tag, start, end, strand, alignment.title, alignment.accession, alignment.length,
             hsp.expect, hsp.identities, hsp.query_start, hsp.query_end, hsp.sbjct_start, hsp.sbjct_end,
             hsp.strand, hsp.score]
            for alignment in blast_record.alignments
            for hsp in alignment.hsps
            if hsp.expect < E_VALUE_THRESH
        ]
        blast_results_TB_genes_df = pd.DataFrame(blast_results_list, columns=result_columns)
        blast_results_TB_genes_df.to_csv(project_dir + '/'+locus_tag+'_blast_results_TB_genes_df.csv')

In [14]:
# Split the CDS list into num_cores subsets (numbered from 1) and run one
# run_blast job per subset. n_jobs=-1 leaves the number of workers to joblib.
num_cores = 16
core_numbers = [subset_index + 1 for subset_index in range(num_cores)]
blast_jobs = (
    delayed(run_blast)(actual_cds_boundaries, num_cores, core_number)
    for core_number in tqdm(core_numbers)
)
par = Parallel(n_jobs=-1)(blast_jobs)

  0%|          | 0/16 [00:00<?, ?it/s]

ValueError: Error message from NCBI: Entrez Query: all [filter] NOT(txid77643[ORGN]) AND txid85007[ORGN] is not supported

In [None]:
# Bind the ortholog sequence table first. The lookups below used to run before
# this assignment and therefore depended on whatever `temp` an earlier cell
# had left behind.
temp = all_copy_seq_data.sequence_data

# Spot check: rows that share an ortholog group with Rv3121. The expression
# is not the last one of the cell, so it is evaluated but not rendered.
temp_df = temp[temp.locus_tag == 'Rv3121']
group_id = temp_df.iloc[0]['group_id']
temp[temp.group_id == group_id]

# Per CDS: locus, slice start, slice stop, mean coverage count over the CDS,
# and the size of its ortholog group minus one (presumably to leave out the
# reference gene itself). Loci missing from the table are reported with 0.
for i in actual_cds_boundaries:
    temp_df = temp[temp.locus_tag == i[0]]
    if len(temp_df) > 0:
        group_id = temp_df.iloc[0]['group_id']
        num_orthologs = len(temp[temp.group_id == group_id]) -1
    else:
        num_orthologs = 0
    print(i[0], i[1], i[2], statistics.mean(conservation_counts[i[1]:i[2]]), num_orthologs)

In [None]:
# Smooth the per-position counts with a 10,000-position rolling mean.
# The first 9,999 entries of the result are NaN (incomplete windows).
d = pd.Series(conservation_counts)
rolling_windows = d.rolling(window=10000)
a = rolling_windows.mean()

In [None]:
# Positions whose rolling mean is below 0.5 (NaN entries never satisfy this).
temp = [i for i, v in enumerate(a) if v < 0.5]
# First position of every new run of low positions after the first run: a
# position is kept when it does not directly follow the previous low position.
# Pairing each element with its predecessor also covers the first two
# elements, which the earlier `i > 1` guard never compared.
z = [current for previous, current in zip(temp, temp[1:]) if current - previous > 1]
z

In [None]:
# Rolling-mean coverage along the reference, skipping the first 200 entries.
fig, ax = plt.subplots()
ax.plot(a[200:])
ax.set_title('Rolling mean of conservation counts')
ax.set_xlabel('Reference position (0-based index)')
ax.set_ylabel('Mean conservation count (10,000-position window)')
plt.show()

In [None]:
# Raw per-position counts for one region of the reference. The x axis is the
# offset within the plotted slice, not the absolute coordinate.
fig, ax = plt.subplots()
ax.plot(conservation_counts[1650000: 1713090])
ax.set_title('Conservation counts, reference positions 1,650,000-1,713,090')
ax.set_xlabel('Offset from position 1,650,000')
ax.set_ylabel('Conservation count')
plt.show()

##### Read alignments output

In [None]:
# Capture the text report of `show-aligns` for the pair of sequence ids in
# id_list[0] and id_list[1], using the promer.delta file in the mummer directory.
# NOTE(review): `id_list` is not assigned in any visible cell of this notebook;
# it has to be defined before this cell is run.
temp = subprocess.run('wsl cd ~; cd mummer4/mummer-4.0.0rc1; show-aligns promer.delta '+id_list[0]+' '+ id_list[1] , shell=True, capture_output=True).stdout.decode('utf-8')

In [None]:
def get_alignments_from_ids(ids=None):
    """Run ``show-aligns`` on ``promer.delta`` and parse every alignment block.

    Parameters
    ----------
    ids : sequence of str, optional
        ``ids[0]`` and ``ids[1]`` are passed to ``show-aligns`` as the two
        sequence ids. When omitted, the notebook-level ``id_list`` is used,
        which is what this function always did before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        One row per alignment with the string columns ``ref_direction``,
        ``ref_start``, ``ref_end``, ``query_direction``, ``query_start``,
        ``query_end`` and ``alignment_string`` (the text between the BEGIN and
        END delimiters). The frame is empty when no block matches.
    """
    if ids is None:
        ids = id_list
    alignments = subprocess.run('wsl cd ~; cd mummer4/mummer-4.0.0rc1; show-aligns promer.delta '+ids[0]+' '+ ids[1] , shell=True, capture_output=True).stdout.decode('utf-8')
    # Note that no sorting is done by default for the output of `show-aligns`, so we _may_ assume
    # that the order of the matches is the same as their order of appearance in the deltafile

    # Raw string literals keep the regex escapes (\[, \|, \s, ...) away from
    # Python's own string-escape processing, which warns about them otherwise.
    # "Beginning delimiter" of every alignment in the `show-aligns` output
    begin_alignment_regex = (
        r'-- BEGIN alignment \[ (?P<ref_direction>[+\-])1 (?P<ref_start>[0-9]+) - (?P<ref_end>[0-9]+) \|'
        r' (?P<query_direction>[+\-])1 (?P<query_start>[0-9]+) - (?P<query_end>[0-9]+) \]\n\n'
    )
    # "End delimiter" of every alignment in the `show-aligns` output
    end_alignment_regex = r'\n\n--\s+END alignment \[ [+\-]1 [0-9]+ - [0-9]+ \| [+\-]1 [0-9]+ - [0-9]+ \]'

    # Goal is to capture everything between the begin alignment strings and the end alignment strings.
    # (?s) at the beginning makes '.' match new lines as well.
    # See:    https://stackoverflow.com/questions/42302482/python-find-a-string-between-two-strings-repeatedly#comment116031644_42302556
    parse_regex = '(?s)' + begin_alignment_regex + '(?P<alignment_string>.*?)' + end_alignment_regex
    parsed_alignments = [match.groupdict() for match in re.finditer(parse_regex, alignments)]

    return pd.DataFrame(parsed_alignments)

In [None]:
# Display the parsed alignments as the cell output.
# NOTE(review): called without arguments, the function reads the notebook-level
# `id_list`, which is not assigned in any visible cell of this notebook.
get_alignments_from_ids()

In [6]:
# Count the records of a protein FASTA file: every line that starts with '>'
# is one record header.
with open('D:/protein.faa', 'r') as f:
    ct = sum(1 for l in f if l.startswith('>'))
print(ct)

189
