#### Directories etc

In [None]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.fft import fft, fftfreq
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Align.Applications import MuscleCommandline
from scipy.stats import binom
import math
import random
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
import random
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from Comparative_Analysis import ORF_Functions as orffn
from random import sample
import ete3;

In [None]:
# Folder that receives every intermediate and result file written by this notebook.
project_dir = 'F:/Project_Data/Project_9'

# Folder containing the GenBank genome files that are parsed with SeqIO below.
seq_dir = 'F:/Datasets/Actinobacteria_Ref_Rep_Lev_Complete'

# Identifier used as the reference key/column when the distance table is read.
tb_species = 'NC_000962.3'

# Number of chunks the genome list is split into; chunk numbers start at 1.
num_cores = 16
core_numbers = [*range(1, num_cores + 1)]

# Executable handed to MuscleCommandline for the protein alignments.
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'

#### Sample of 15 mycobacteria

In [None]:
def create_filename_dict(num_subsets, subset_num, id_list):
    """Return ``(organism name, filename)`` pairs for one chunk of genome files.

    The chunk is whatever ``util.chunk_list(id_list, num_subsets, subset_num)``
    returns (presumably the ``subset_num``-th of ``num_subsets`` slices of
    ``id_list`` -- confirm in Utilities).  For each filename in the chunk the
    first GenBank record found under ``seq_dir`` is parsed and its
    ``organism`` annotation is paired with the filename.
    """
    return [
        (
            next(SeqIO.parse(seq_dir + '/' + genome_file, "genbank")).annotations['organism'],
            genome_file,
        )
        for genome_file in util.chunk_list(id_list, num_subsets, subset_num)
    ]

# Build a lookup from organism name to GenBank filename for every file in seq_dir.
# The file list is handed to create_filename_dict once per chunk number and the
# per-chunk results are flattened afterwards.
species_list = util.list_files(seq_dir)
parallel_output = Parallel(n_jobs=-1)(
    delayed(create_filename_dict)(num_cores, core_number, species_list)
    for core_number in core_numbers
)
temp = [pair for chunk_pairs in parallel_output for pair in chunk_pairs]
# When two files share an organism name, the later pair wins.
filename_dict = {species_name: filename for (species_name, filename) in temp}

In [None]:
# GenBank files that make up the genome sample analysed in the rest of the notebook.
sample_filenames = [
    'GCF_000195955.2_ASM19595v2_genomic.gbff',
    'GCF_024600175.1_ASM2460017v1_genomic.gbff',
    'GCF_010730055.1_ASM1073005v1_genomic.gbff',
    'GCF_020616615.1_ASM2061661v1_genomic.gbff',
    'GCF_010731535.1_ASM1073153v1_genomic.gbff',
    'GCF_022370755.1_ASM2237075v1_genomic.gbff',
    'GCF_016745295.1_ASM1674529v1_genomic.gbff',
    'GCF_000157895.3_ASM15789v2_genomic.gbff',
    'GCF_900603025.1_MHAS_genomic.gbff',
    'GCF_018363015.1_ASM1836301v1_genomic.gbff',
    'GCF_019645855.1_ASM1964585v1_genomic.gbff',
    'GCF_010727945.1_ASM1072794v1_genomic.gbff',
    'GCF_010731895.1_ASM1073189v1_genomic.gbff',
    'GCF_900637205.1_50279_G01_genomic.gbff',
    'GCF_000184435.1_ASM18443v1_genomic.gbff',
]
# Print the organism name(s) recorded in filename_dict for each sampled file,
# in the order the files are listed.
for sample_file in sample_filenames:
    matching_names = [name for name, path in filename_dict.items() if path == sample_file]
    for name in matching_names:
        print(name)

In [None]:
# Parse every sampled genome and collect its annotated protein-coding features.
organism_name_dict = {}  # 'accession.version' -> organism name
protein_dict = {}        # 'accession.version@protein_id' -> translation
sample_info = []         # one row per CDS, see the append below for the column order
for filename in tqdm(sample_filenames):
    # Only the first record of each GenBank file is read.
    genome_record = next(SeqIO.parse(seq_dir + '/' + filename, "genbank"))
    annotations = genome_record.annotations
    organism_name = annotations['organism']
    accession_ver = annotations['accessions'][0] + '.' + str(annotations['sequence_version'])
    organism_name_dict[accession_ver] = organism_name
    for feature in genome_record.features:
        if feature.type != 'CDS':
            continue
        qualifiers = feature.qualifiers
        # A CDS is kept only when all three qualifiers are present.
        if (qualifiers.get("protein_id") is None
                or qualifiers.get("locus_tag") is None
                or qualifiers.get("translation") is None):
            continue

        locus_tag = qualifiers.get("locus_tag")[0]
        protein_id = qualifiers.get("protein_id")[0]
        translation = qualifiers.get("translation")[0]
        location = feature.location
        sample_info.append([
            organism_name,
            accession_ver,
            int(location.start),
            int(location.end),
            int(location.strand),
            locus_tag,
            protein_id,
            translation,
        ])
        protein_dict[accession_ver + '@' + protein_id] = translation

In [None]:
# Write two protein FASTA files from the sample_info rows.  Each entry is
# ['accession.version@protein_id', translation].

# All proteins from every sampled genome.
full_sample_entries = [
    [accession + '@' + protein_id, translation]
    for _organism, accession, *_, protein_id, translation in sample_info
]
util.produce_fasta_file(full_sample_entries, project_dir + '/full_sample_proteins.fasta')

# Only proteins whose organism name contains 'H37Rv'.
h37rv_entries = [
    [accession + '@' + protein_id, translation]
    for organism, accession, *_, protein_id, translation in sample_info
    if 'H37Rv' in organism
]
util.produce_fasta_file(h37rv_entries, project_dir + '/H37Rv_proteins.fasta')

#### Produce FASTA file with CDS candidates

In [None]:
# Build the BLAST database for the all-genome protein FASTA written to project_dir.
# NOTE(review): argument meanings (source dir, FASTA name, database name, database
# folder) are inferred from the values passed -- confirm against Blast_Functions.
blastfn.build_blast_db(project_dir, 'full_sample_proteins.fasta', 'full_sample_proteins', 'F:/Datasets/BLAST/full_sample_proteins')

In [None]:
# Build the BLAST database for the H37Rv-only protein FASTA written to project_dir.
# NOTE(review): argument meanings (source dir, FASTA name, database name, database
# folder) are inferred from the values passed -- confirm against Blast_Functions.
blastfn.build_blast_db(project_dir, 'H37Rv_proteins.fasta', 'H37Rv_proteins', 'F:/Datasets/BLAST/H37Rv_proteins')

In [None]:
# Run BLASTP in the full-sample database folder with an e-value setting of 1e-5.
# Presumably the H37Rv proteins are the queries and 'full_sample_proteins' is the
# database searched -- confirm the parameter order in Blast_Functions.
blastfn.run_blastp('F:/Datasets/BLAST/full_sample_proteins', 'H37Rv_proteins.fasta', 'full_sample_proteins', e_value = 1e-5)

In [None]:
# Run BLASTP in the H37Rv database folder with an e-value setting of 1e-5.
# Presumably the full-sample proteins are the queries and 'H37Rv_proteins' is the
# database searched -- confirm the parameter order in Blast_Functions.
blastfn.run_blastp('F:/Datasets/BLAST/H37Rv_proteins', 'full_sample_proteins.fasta', 'H37Rv_proteins', e_value = 1e-5)

In [None]:
# Load both BLASTP hit tables (top hit only) and reduce them to reciprocal best hits.
# The second argument of each helper call is a pickle path inside project_dir
# (presumably where the processed table is cached -- confirm in Blast_Functions).
H37Rv_v_full_sample_blast_df = blastfn.process_blast_output(
    'F:/Datasets/BLAST/full_sample_proteins/hits.csv',
    project_dir + '/H37Rv_v_full_sample_blast_results.pkl',
    organism_name_dict,
    top_hit_only=True,
)
full_sample_v_H37Rv_blast_df = blastfn.process_blast_output(
    'F:/Datasets/BLAST/H37Rv_proteins/hits.csv',
    project_dir + '/full_sample_v_H37Rv_blast_results.pkl',
    organism_name_dict,
    top_hit_only=True,
)
# The path separator was missing here, which placed 'Project_9rbh_df.pkl' next to
# the project folder instead of inside it like every other output file.
rbh_df = blastfn.keep_reciprocal_best_hits(
    H37Rv_v_full_sample_blast_df,
    full_sample_v_H37Rv_blast_df,
    project_dir + '/rbh_df.pkl',
)
rbh_df.to_csv(project_dir + '/rbh.csv')

In [None]:
# Count how many reciprocal-best-hit rows each query protein has, then keep the
# queries whose count equals the number of sampled genomes.  Both containers keep
# the order in which query_ref values first appear in rbh_df.
ortholog_count_dict = {}
for query_ref in rbh_df['query_ref']:
    ortholog_count_dict[query_ref] = ortholog_count_dict.get(query_ref, 0) + 1

required_hits = len(sample_filenames)
full_orthologs = [
    query_ref
    for query_ref, hit_count in ortholog_count_dict.items()
    if hit_count == required_hits
]

In [None]:
# Draw a random sample of up to 200 full orthologs (no seed is set, so the sample
# differs between runs).  The size is capped at the population size because
# random.sample raises ValueError when asked for more items than are available.
samp = random.sample(full_orthologs, min(200, len(full_orthologs)))

In [None]:
# For every sampled query protein, gather the [target_ref, protein sequence] pairs of
# its reciprocal best hits.  One pass over rbh_df replaces the previous full
# DataFrame scan per sampled protein; the result is unchanged: one list per entry
# of samp (in samp order), each holding its hits in rbh_df row order.
hits_by_query = {query_ref: [] for query_ref in samp}
for query_ref, target_ref in tqdm(zip(rbh_df['query_ref'], rbh_df['target_ref']), total=len(rbh_df)):
    hits = hits_by_query.get(query_ref)
    if hits is not None:
        # Sequences are looked up only for sampled queries, as before.
        hits.append([target_ref, protein_dict[target_ref]])
# Copy each list so that the entries of ortholog_sequences never share one object.
ortholog_sequences = [list(hits_by_query[query_ref]) for query_ref in samp]

In [None]:
# Align every sampled ortholog set with MUSCLE and concatenate the aligned sequences
# per genome (the part of the FASTA name before '@'), then write the concatenated
# alignment in FASTA and PHYLIP format.
temp_fasta = project_dir + '/temp.fasta'
align_fasta = project_dir + '/align.fasta'
concatenated_dict = {}
for seqset in ortholog_sequences:
    util.produce_fasta_file(seqset, temp_fasta)
    # Remove the previous alignment first: if MUSCLE fails, the old file must not be
    # read again and concatenated as though it belonged to this ortholog set.
    if os.path.exists(align_fasta):
        os.remove(align_fasta)
    cline = MuscleCommandline(muscle_exe, input=temp_fasta, out=align_fasta)
    try:
        stdout, stderr = cline()
    except Exception as e:
        # Still best effort (the loop carries on), but the failure is now reported.
        print('MUSCLE raised an error: ' + str(e))
    if not os.path.exists(align_fasta):
        # Skipping the whole set keeps all concatenated sequences the same length.
        print('No alignment was produced; skipping this ortholog set')
        continue
    # read_fasta_to_array is indexed as [names, sequences] here.
    a = util.read_fasta_to_array(align_fasta)
    for name, aligned_sequence in zip(a[0], a[1]):
        species = name.split('@')[0]
        if species in concatenated_dict:
            concatenated_dict[species] += aligned_sequence
        else:
            concatenated_dict[species] = aligned_sequence
outfile = [[k, v] for k, v in concatenated_dict.items()]
util.produce_fasta_file(outfile, project_dir + '/concatenated_alignments.faa')
util.convert_sequence_file_format(project_dir + '/concatenated_alignments.faa', project_dir + '/concatenated_alignments.phy', input_file_format = "fasta", output_file_format = "phylip")

In [None]:
# Load the distance table from the Excel workbook stored in the project folder.
# NOTE(review): the file name suggests a distance matrix exported from MEGA,
# presumably computed from the concatenated alignment -- confirm its provenance.
distances_df = pd.read_excel(project_dir +'/mega_distances.xlsx')

In [None]:
# Map each entry of the distance table to its distance from the reference genome
# (the tb_species column).  The reference itself is fixed at 0.0; rows whose value in
# that column is not greater than zero (including missing values) are left out.
distance_dict = {tb_species: 0.0}
for _, row in distances_df.iterrows():
    if row[tb_species] > 0:
        # The first column holds the row's identifier.  It is read by position with
        # .iloc because integer keys on a label-indexed Series are deprecated.
        distance_dict[row.iloc[0]] = row[tb_species]

In [None]:
# Notebook display: show the distance lookup built from the distance table.
distance_dict

In [None]:
# Notebook display: first query protein whose hit count equals the number of sampled genomes.
full_orthologs[0]

In [None]:
# Scatter plot of length-normalised bit score against distance from the reference
# for the reciprocal best hits of a single full ortholog (position 176 of the list).
results = []
for ortholog in full_orthologs[176:177]:
    ortholog_hits = rbh_df[rbh_df['query_ref'] == ortholog]
    for _, row in ortholog_hits.iterrows():
        normalised_score = row['bit_score'] / row['query_length']
        results.append([distance_dict[row['target_species']], normalised_score, ortholog])
results_df = pd.DataFrame(results, columns = ['distance', 'score', 'ortholog_ref'])
sns.scatterplot(data=results_df, x="distance", y="score")

In [None]:
# Scatter plot of e-value against bit score, restricted to the reciprocal best hits
# whose e-value is above 1e-10.
weak_hits = rbh_df[rbh_df['e_value'] > 1e-10]
results = [[row['bit_score'], row['e_value']] for _, row in weak_hits.iterrows()]
results_df = pd.DataFrame(results, columns = ['score', 'evalue'])
sns.scatterplot(data=results_df, x="score", y="evalue")

In [None]:
# Notebook display: show the reciprocal-best-hit table.
rbh_df