In [43]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
from tqdm.auto import tqdm
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import Blast_Functions as blastfn
from goatools import obo_parser
import random
import copy
from joblib import Parallel, delayed
import os
import wget
import shutil
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
import re
import shutil
from ftplib import FTP
import Bio.UniProt.GOA as GOA

In [3]:
# Notebook-wide configuration. All paths are absolute and machine-specific.
# Run-level switch; it is not read by any other cell in this part of the notebook.
full_run = True
# Project root plus its input (Datasets) and output (Output) sub-folders.
project_dir = 'D:/Project_Data/Project_8'
datasets_dir = project_dir + '/Datasets'
output_dir = project_dir + '/Output'
# Output folder as converted by util.wslname (presumably a WSL-style path — confirm in Utilities).
wsl_output_loc = util.wslname(output_dir)
# Folder holding the genome annotation files; the reference genome below is read from here.
seq_dir = 'D:/Actinobacteria_Ref_Rep_Lev_Complete'
# BLAST database folder and database name.
blast_dir = 'D:/BLAST/actinobacteria_ref_rep_comp'
blast_db_name = 'actinobacteria_ref_rep_comp'
# Worker count and the 1-based list of worker indices [1, ..., num_cores].
num_cores = 16
core_numbers = list(range(1, num_cores+1))
# GenBank flat file of the reference genome, located inside seq_dir.
reference_species_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'
# Listing of seq_dir as returned by util.list_files.
species_list = util.list_files(seq_dir)

In [151]:
 with open(output_dir + '/Blast_Output/reciprocal_best_hits.pkl', 'rb') as f:
        # Load the reciprocal-best-hit table; other cells filter it on its
        # `query_ref` and `target_ref` columns.
        # NOTE(review): this cell starts with a stray leading space, which is an
        # IndentationError in a plain .py export — confirm before converting.
        # SECURITY: pickle.load executes arbitrary code from the file; only load
        # pickles produced by this project.
        rbh_results = pickle.load(f)

In [152]:
with open(output_dir + '/gene_info_dict.pkl', 'rb') as f:
    # Load the per-accession feature lists; hit_info passes this object to
    # find_annotations, which indexes it by accession.
    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # pickles produced by this project.
    gene_info_dict = pickle.load(f) 

In [159]:
# RFAM hit table for the reference genome. The covariation-summary cell reads
# its 'seq from', 'seq to', 'accession' and 'description of target' columns.
rfam_df = pd.read_excel(datasets_dir+'/RFAM_Hits_H37Rv_sorted.xlsx')

In [159]:
def ortholog_in_reference(accession_locus, rbh_df):
    """Return the reference-side locus paired with ``accession_locus``.

    Parameters
    ----------
    accession_locus : str
        Value to look up in the ``target_ref`` column of ``rbh_df``.
    rbh_df : pandas.DataFrame
        Table with ``target_ref`` and ``query_ref`` columns; ``query_ref``
        values are expected to contain an ``'@'`` separator.

    Returns
    -------
    str
        The part after the first ``'@'`` of the first matching row's
        ``query_ref``, or ``'NO_ORTHOLOG'`` when no row matches.
    """
    matches = rbh_df[rbh_df.target_ref == accession_locus]
    if len(matches) < 1:
        return 'NO_ORTHOLOG'
    # Only the first matching row is consulted.
    first_query_ref = matches.iloc[0]['query_ref']
    return first_query_ref.split('@')[1]

In [525]:
def orthologs(reference_locus):
    """List every ``target_ref`` paired with a reference locus.

    The lookup key is ``'NC_000962.3@' + reference_locus`` and is matched
    against the ``query_ref`` column of the module-level ``rbh_results`` table.

    Returns
    -------
    list
        ``target_ref`` values of the matching rows, in table order
        (empty when nothing matches).
    """
    query_key = 'NC_000962.3@' + reference_locus
    matching_rows = rbh_results[rbh_results.query_ref == query_key]
    return [row['target_ref'] for _, row in matching_rows.iterrows()]

In [None]:
# Download (once) and parse the Gene Ontology "basic" OBO file.
go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'
data_folder = 'D:/Project_Data/Project_8/data'

# The data path must be a directory; refuse to continue if a regular file
# already occupies that name.
if os.path.isfile(data_folder):
    raise Exception('Data path (' + data_folder + ') exists as a file. '
                   'Please rename, remove or change the desired location of the data path.')
# Equivalent of `mkdir -p`: no error if the folder already exists. This
# replaces the hand-rolled check against the magic errno value 17.
os.makedirs(data_folder, exist_ok=True)

# Reuse a previously downloaded copy when present; otherwise fetch it.
go_obo = data_folder+'/go-basic.obo'
if not os.path.isfile(go_obo):
    go_obo = wget.download(go_obo_url, go_obo)
# `go` is indexed by GO id elsewhere in the notebook (see find_go_term).
go = obo_parser.GODag(go_obo)

D:/Project_Data/Project_8/data/go-basic.obo: fmt(1.2) rel(2022-07-01) 47,008 Terms


In [46]:
import os
from ftplib import FTP
# Download (once) the GO annotation (GAF) file for the reference proteome from
# the EBI FTP server, then index its entries by DB_Object_ID.
tb_uri = '/pub/databases/GO/goa/proteomes/30.M_tuberculosis_ATCC_25618.goa'
tb_fn = tb_uri.split('/')[-1]

# Local copy lives in data_folder (defined in the GO OBO cell).
tb_gaf = os.path.join(data_folder, tb_fn)
if not os.path.isfile(tb_gaf):
    try:
        # The context manager closes the FTP connection even if the transfer fails.
        with FTP('ftp.ebi.ac.uk') as ebi_ftp:
            ebi_ftp.login()  # Logs in anonymously
            with open(tb_gaf, 'wb') as tb_fp:
                ebi_ftp.retrbinary('RETR {}'.format(tb_uri), tb_fp.write)
    except BaseException:
        # A truncated file would pass the isfile() check on the next run and be
        # parsed as if complete, so remove it before re-raising.
        if os.path.isfile(tb_gaf):
            os.remove(tb_gaf)
        raise

with open(tb_gaf, 'rt') as tb_gaf_fp:
    tb_funcs = {}  # DB_Object_ID -> remaining GAF fields of one entry

    # Iterate over each annotation using the Bio.UniProt.GOA library.
    # NOTE(review): entries sharing a DB_Object_ID overwrite one another, so
    # only the last annotation per id is kept. find_go_term relies on this
    # one-entry-per-id shape, so it is left unchanged here.
    for entry in GOA.gafiterator(tb_gaf_fp):
        uniprot_id = entry.pop('DB_Object_ID')
        tb_funcs[uniprot_id] = entry

In [63]:
def find_go_term(locus_id):
    """Collect GO term names for annotations whose synonyms mention a locus.

    Scans every entry of the module-level ``tb_funcs`` mapping. For each
    element of an entry's ``'Synonym'`` field that contains ``locus_id``, the
    name of ``go[entry['GO_ID']]`` is added to the result (so an entry can
    contribute more than once).

    NOTE(review): the match is a substring test (``locus_id in synonym``), so
    a locus id that is a prefix of another would also match — confirm whether
    exact matching is intended.

    Returns
    -------
    list
        GO term names in ``tb_funcs`` iteration order; empty if nothing matches.
    """
    return [
        go[annotation['GO_ID']].name
        for annotation in tb_funcs.values()
        for synonym in annotation['Synonym']
        if locus_id in synonym
    ]

In [113]:
# Map each Mycobrowser locus to the locus that follows it when features are
# ordered by their 'Start' coordinate.
mycobrowser_df = pd.read_excel(datasets_dir+'/Mycobrowser_Release_4.xlsx')
features = [
    [row['Locus'], row['Start'], row['Product']]
    for _, row in mycobrowser_df.iterrows()
]
features.sort(key=lambda entry: entry[1])
# Pair every feature with its successor; the last feature gets no entry.
downstream_dict = {
    current[0]: following[0]
    for current, following in zip(features, features[1:])
}

In [385]:
# Map each RefSeq locus tag to the feature *type* of the next annotated
# (non-'gene') feature in the reference GenBank record.
genome_record = next(SeqIO.parse(seq_dir + '/' + reference_species_filename, "genbank"))
# Keep (locus_tag, feature type) for every non-'gene' feature carrying a locus tag,
# in record order.
features = [
    (feature.qualifiers.get("locus_tag")[0], feature.type)
    for feature in genome_record.features
    if feature.type != 'gene' and feature.qualifiers.get("locus_tag") is not None
]
# Pair every entry with its successor; the last entry gets no mapping.
refseq_downstream_dict = {
    current[0]: following[1]
    for current, following in zip(features, features[1:])
}
            

In [626]:
# Summarise the R-scape / cmsearch results of every region directory into one CSV.
r_scape_output_loc = project_dir + '/RScape_Run_Thoth'
intergenic_regions = util.list_dirs(r_scape_output_loc)

# RFAM intervals are loop-invariant: normalise them once instead of walking
# rfam_df again for every search hit.
_rfam_intervals = []
for _, _rfam_row in rfam_df.iterrows():
    _rfam_from = int(_rfam_row['seq from'])
    _rfam_to = int(_rfam_row['seq to'])
    _rfam_intervals.append((min(_rfam_from, _rfam_to), max(_rfam_from, _rfam_to),
                            _rfam_row.accession, _rfam_row['description of target']))


def _parse_power_file(path):
    """Return (num_covarying, num_expected, count_near_pair) from a .power file.

    num_covarying / num_expected stay '' when their header line is absent.
    num_expected is kept as text (whatever follows 'covary '), stripped of the
    trailing newline so it does not embed a line break in the CSV.
    """
    num_covarying = ''
    num_expected = ''
    count_near_pair = 0
    with open(path, 'r') as power_file:
        for line in power_file:
            if 'BPAIRS observed' in line:
                num_covarying = int(line.split('covary ')[1])
            if 'BPAIRS expected' in line:
                num_expected = line.split('covary ')[1].strip()
            if '*' in line:
                fields = line.split()
                # Starred lines carry the two paired positions in fields 1 and 2;
                # count pairs whose positions are fewer than 3 apart.
                if abs(int(fields[1]) - int(fields[2])) < 3:
                    count_near_pair += 1
    return num_covarying, num_expected, count_near_pair


def _parse_search_hits(path, rfam_intervals):
    """Return (rfam_overlaps, ref_count, tot_count) from a search-hits table.

    Only rows flagged '!' in column 16 are counted. Rows mentioning
    'NC_000962.3' are additionally counted as reference hits and compared
    against the RFAM intervals (columns 7 and 8 are the hit coordinates).
    """
    rfam_overlaps = []
    ref_count = 0
    tot_count = 0
    with open(path, 'r') as hits_file:
        for line in hits_file:
            # Comment lines, blank lines and short rows have no column 16;
            # the original indexed them unguarded in the reference-hit branch.
            if line.startswith('#'):
                continue
            fields = line.split()
            if len(fields) <= 16 or fields[16] != '!':
                continue
            tot_count += 1
            if 'NC_000962.3' not in line:
                continue
            ref_count += 1
            start = min(int(fields[7]), int(fields[8]))
            stop = max(int(fields[7]), int(fields[8]))
            for rfam_start, rfam_stop, accession, description in rfam_intervals:
                if rfam_stop == rfam_start:
                    continue  # zero-length interval: coverage is undefined
                pct_cover = (min(rfam_stop, stop) - max(rfam_start, start)) / (rfam_stop - rfam_start)
                if pct_cover > 0.1:
                    rfam_overlaps.append((accession, description, pct_cover))
    return rfam_overlaps, ref_count, tot_count


def _count_r2r_near_pairs(path):
    """Count '<>' occurrences on the '#=GC SS_cons ' lines of an R2R Stockholm file."""
    near_pairs = 0
    with open(path, 'r') as sto_file:
        for line in sto_file:
            if '#=GC SS_cons ' in line:
                near_pairs += line.count('<>')
    return near_pairs


temp_rows = []
for region_id in intergenic_regions:
    region_dir = r_scape_output_loc + '/' + region_id
    power_path = region_dir + '/rscape_3.cacofold.power'
    # Resolved per region up front so the "no power file" branch below never
    # reuses values left over from a previous iteration.
    refseq_downstream = refseq_downstream_dict.get(region_id, '')
    if os.path.isfile(power_path):
        num_covarying, num_expected, count_near_pair = _parse_power_file(power_path)
        rfam_overlaps, ref_count, tot_count = _parse_search_hits(region_dir + '/search_hits_3.txt', _rfam_intervals)
        r2r_near_pairs = _count_r2r_near_pairs(region_dir + '/rscape_3.cacofold.R2R.sto')
        temp_rows.append([region_id, downstream_dict[region_id], refseq_downstream, rfam_overlaps, num_covarying, num_expected, ref_count, tot_count, count_near_pair, r2r_near_pairs])
    else:
        # No R-scape output for this region: no overlaps and all counts zero.
        temp_rows.append([region_id, downstream_dict[region_id], refseq_downstream, [], 0, 0, 0, 0, 0, 0])
results_df = pd.DataFrame(temp_rows, columns = ['Downstream_of','Feature_Downstream','Refseq_Downstream','RFAM_Overlaps','num_BP_covarying', 'num_Expected', 'num_reference_hits','tot_hits','num_near_pairs','num_r2r_near_pairs'])
results_df.to_csv(r_scape_output_loc + '/intergenic_regions_covariation_summary.csv')

In [329]:
def find_annotations(accession_ver, start, stop, feature_dict, only_return_max = True, translate_to_ortholog = True):
    """Find the annotated features overlapped by a hit interval.

    Parameters
    ----------
    accession_ver : str
        Key into ``feature_dict``; also used as the prefix
        (``accession_ver + '@' + locus``) when translating to orthologs.
    start, stop : int
        Hit coordinates. If ``start > stop`` they are swapped and the hit is
        reported on strand ``-1``; otherwise strand ``1``.
    feature_dict : dict
        Maps an accession to a sequence of features. Each feature is indexed
        positionally: ``[1]``/``[2]`` are used as start/stop and ``[4]``/``[5]``
        as locus ids for ortholog translation.
    only_return_max : bool
        When true, return only the feature with the highest coverage.
    translate_to_ortholog : bool
        When true (and ``only_return_max``), replace elements 4 and 5 of the
        best feature by ``ortholog_in_reference(...)`` lookups against the
        module-level ``rbh_results``.

    Returns
    -------
    None
        If no feature covers more than 5% of the hit.
    list
        ``[start, stop, strand, feature, max_pct_cover]`` when
        ``only_return_max`` is true. ``feature`` is the ortholog-translated
        6-tuple, or the raw feature when ``translate_to_ortholog`` is false.
        (Previously that combination fell through and returned ``None``.)
    tuple
        ``(start, stop, annotations_found, [max_feature, max_pct_cover])``
        when ``only_return_max`` is false.
    """
    if start > stop:
        start, stop = stop, start
        strand = -1
    else:
        strand = 1
    # A zero-length hit (start == stop) would otherwise divide by zero.
    hit_span = max(stop - start, 1)
    annotations_found = []
    max_pct_cover = -1
    max_feature = []
    for feature in feature_dict[accession_ver]:
        if feature[1] < (stop - 1) and feature[2] >= (start - 1):
            len_feature = feature[2] - feature[1]
            # Fraction of the hit covered by this feature.
            pct_cover = (min(feature[2], stop) - max(feature[1], start)) / hit_span
            # Features of 100000 or more in length are ignored.
            if pct_cover > 0.05 and len_feature < 100000:
                if pct_cover > max_pct_cover:
                    max_feature = feature
                    max_pct_cover = pct_cover
                annotations_found.append([feature, pct_cover])
    if len(max_feature) == 0:
        return None
    if not only_return_max:
        return (start, stop, annotations_found, [max_feature, max_pct_cover])
    if translate_to_ortholog:
        best_feature = (
            max_feature[0], max_feature[1], max_feature[2], max_feature[3],
            ortholog_in_reference(accession_ver + '@' + max_feature[4], rbh_results),
            ortholog_in_reference(accession_ver + '@' + max_feature[5], rbh_results),
        )
    else:
        best_feature = max_feature
    return [start, stop, strand, best_feature, max_pct_cover]

In [653]:
def hit_info(upstream_locus, translate_to_orthlog = True):
    """Annotate every significant search hit recorded for one region.

    Reads ``<project_dir>/RScape_Run_Thoth/<upstream_locus>/search_hits_3.txt``,
    keeps rows whose column 16 is ``'!'``, and looks each hit up with
    ``find_annotations`` against the module-level ``gene_info_dict``.

    Parameters
    ----------
    upstream_locus : str
        Name of the region directory under ``RScape_Run_Thoth``.
    translate_to_orthlog : bool
        Forwarded to ``find_annotations`` as ``translate_to_ortholog``
        (previously ignored; the call always passed ``True``). The parameter
        name keeps its original spelling for backward compatibility.

    Returns
    -------
    list
        One ``[annotation, e_value]`` pair per hit, grouped by target name in
        first-seen order. ``annotation`` is whatever ``find_annotations``
        returns, and ``e_value`` is column 15 as text.
    """
    hit_dict = {}
    with open(project_dir + '/RScape_Run_Thoth/'+upstream_locus+'/search_hits_3.txt', 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            fields = line.split()
            # Blank or truncated rows have no column 16; skip them instead of
            # raising IndexError.
            if len(fields) <= 16 or fields[16] != '!':
                continue
            # Columns: 0 target name, 7/8 hit coordinates, 9 strand, 15 E-value.
            hit_dict.setdefault(fields[0], []).append(
                [int(fields[7]), int(fields[8]), fields[9], fields[15]])
    results = []
    for accession, hits in hit_dict.items():
        for hit in hits:
            annotation = find_annotations(accession, hit[0], hit[1], gene_info_dict,
                                          only_return_max = True,
                                          translate_to_ortholog = translate_to_orthlog)
            results.append([annotation, hit[3]])
    return results

In [None]:
# Collect annotated hits for region 'Rv0487'. This cell has no execution
# count, and a later cell reassigns `temp` from hit_info('Rv0487_IG').
temp = hit_info('Rv0487')


In [659]:
def myco_hit_info(upstream_locus, translate_to_orthlog = True):
    """Print reference-genome hits of one region and the Mycobrowser features they cover.

    Reads ``<project_dir>/RScape_Run_Thoth/<upstream_locus>/summary.txt``,
    keeps rows whose column 16 is ``'!'``, and, for hits on target
    ``'NC_000962.3'`` only, prints each hit followed by one line per
    Mycobrowser feature of which the hit covers more than 10%.

    Parameters
    ----------
    upstream_locus : str
        Name of the region directory under ``RScape_Run_Thoth``.
    translate_to_orthlog : bool
        Unused; kept for signature compatibility.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    hit_dict = {}
    with open(project_dir + '/RScape_Run_Thoth/'+upstream_locus+'/summary.txt', 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            fields = line.split()
            # Blank or truncated rows have no column 16; skip them instead of
            # raising IndexError.
            if len(fields) <= 16 or fields[16] != '!':
                continue
            # Columns: 0 target name, 7/8 hit coordinates, 9 strand, 15 E-value.
            hit_dict.setdefault(fields[0], []).append(
                [int(fields[7]), int(fields[8]), fields[9], fields[15]])
    mycobrowser_df = pd.read_excel(datasets_dir+'/Mycobrowser_Release_4.xlsx')
    features = []
    for i, r in mycobrowser_df.iterrows():
        features.append([r['Locus'],int(r['Start']), int(r['Stop'])])
    for hits in hit_dict.get('NC_000962.3', []):
        print(hits)
        start = min(int(hits[0]), int(hits[1]))
        stop = max(int(hits[0]), int(hits[1]))
        for feature in features:
            feature_length = feature[2] - feature[1]
            if feature_length == 0:
                continue  # zero-length feature: coverage is undefined
            # Fraction of the feature covered by the hit.
            pct_cover = (min(feature[2], stop) - max(feature[1], start)) / feature_length
            if pct_cover > 0.1:
                print(hits, feature, pct_cover)


In [660]:
# Print the reference-genome hits for region 'Rv0487_IG' together with any
# Mycobrowser feature they cover by more than 10% (output below).
myco_hit_info('Rv0487_IG')

[3690952, 3691080, '+', '2.3e-21']
[3690952, 3691080, '+', '2.3e-21'] ['MTB000153', 3690941, 3691059] 0.9067796610169492
[4087610, 4087484, '-', '7.6e-16']
[595448, 595325, '-', '2e-15']
[2401926, 2401804, '-', '2.2e-15']
[3820394, 3820509, '+', '2.4e-15']
[3351095, 3351219, '+', '5.4e-15']
[2703903, 2704029, '+', '1.5e-13']
[917734, 917606, '-', '1.9e-13']
[1907461, 1907583, '+', '2.4e-11']
[577286, 577399, '+', '2.8e-11']
[4120919, 4121045, '+', '4.8e-11']
[1282017, 1281887, '-', '7.1e-11']
[2372437, 2372550, '+', '1.8e-10']
[3594452, 3594341, '-', '2.1e-09']
[2069065, 2068954, '-', '2.3e-09']
[1267262, 1267144, '-', '1.7e-08']
[4053136, 4053330, '+', '3.2e-08']
[4053136, 4053330, '+', '3.2e-08'] ['Rv3611', 4052950, 4053603] 0.29709035222052066
[577402, 577525, '+', '4e-08']
[4110678, 4110821, '+', '4.9e-08']
[1305501, 1305638, '+', '1.9e-07']
[2074437, 2074543, '+', '5.4e-07']
[1895470, 1895596, '+', '3.1e-06']
[3291503, 3291378, '-', '3.8e-06']
[1955692, 1955568, '-', '6.8e-06']


In [654]:
# Tabulate, for every annotated hit of region 'Rv0487_IG', whether it falls in
# an intergenic ('IG') or genic ('G') feature and which genes flank it.
temp = hit_info('Rv0487_IG')
temp_out = []
for annotation, _e_value in temp:
    # Hits without any overlapping feature carry no annotation.
    if annotation is None:
        continue
    strand, hit_feature = annotation[2], annotation[3]
    # Feature names ending in 'IG' are classed as intergenic.
    position = 'IG' if hit_feature[0][-2:] == 'IG' else 'G'
    # Elements 4 and 5 of the feature swap roles for hits not on strand 1.
    if strand == 1:
        upstream_gene, downstream_gene = hit_feature[4], hit_feature[5]
    else:
        upstream_gene, downstream_gene = hit_feature[5], hit_feature[4]
    temp_out.append([
        position,
        upstream_gene,
        downstream_gene,
        upstream_gene + '_' + position,
        downstream_gene + '_' + position,
    ])
hit_df = pd.DataFrame(
    temp_out,
    columns = ['Position','Upstream_Gene','Downstream_Gene', 'Upstream_Pos', 'Downstream_Pos'],
)

In [657]:
# Count hits per 'Upstream_Pos' label; passing the scalar 'no_of_hits' as the
# column argument yields a single count column with that name (see output).
freq_table = pd.crosstab(hit_df['Upstream_Pos'], 'no_of_hits')
# Show only labels with more than 3 hits.
print(freq_table[freq_table['no_of_hits'] > 3])

col_0           no_of_hits
Upstream_Pos              
NO_ORTHOLOG_G           21
NO_ORTHOLOG_IG          50
Rv0429c_IG               6
Rv0432_IG                4
Rv0480c_IG               8
Rv0487_IG                9
Rv0505c_IG              10
Rv0703_IG                5
Rv0818_IG                4
Rv0824c_IG               5
Rv1074c_IG               6
Rv1140_IG                9
Rv1212c_IG               9
Rv1668c_IG               5
Rv1822_IG                7
Rv1829_IG                5
Rv1867_IG                4
Rv2112c_IG               8
Rv2130c_IG              10
Rv2142c_IG               6
Rv2405_IG                4
Rv2477c_IG               6
Rv2702_IG                4
Rv2793c_IG               4
Rv2993c_IG               9
Rv3198A_IG               6
Rv3207c_IG               4
Rv3218_IG               13
Rv3302c_IG               6
Rv3303c_IG               4
Rv3401_IG                8
Rv3560c_IG               5
Rv3647c_IG               4
Rv3668c_IG               4
Rv3680_IG               11
R

In [350]:
# Persist the per-hit position table to the project's output folder.
hit_df.to_csv(output_dir + '/hit.csv')

In [130]:
# Collect the first GO term name found for every CDS of the reference genome.
genome_record = next(SeqIO.parse(seq_dir + '/' + reference_species_filename, "genbank"))
organism_name = genome_record.annotations['organism']
# '<accession>.<version>' built from the record annotations.
accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])
organism_accession = organism_name.replace(' ', '_') + '_' + accession_ver
function_list = []
for feature in genome_record.features:
    if feature.type != 'CDS':
        continue
    locus_tags = feature.qualifiers.get("locus_tag")
    if locus_tags is None:
        continue
    # find_go_term scans every GO annotation, so call it once per CDS rather
    # than once for the length test and again to fetch the value.
    go_names = find_go_term(locus_tags[0])
    if len(go_names) > 0:
        function_list.append(go_names[0])

In [643]:
# Preview the first 30 annotated hits.
for record in temp[:30]:
    print(record)

[[2652546, 2652740, -1, ('G6N39_RS12560_IG', 2652460, 2652754, -1, 'Rv2891', 'Rv2890c'), 1.0], '1.3e-33']
[[2402172, 2402366, 1, ('G6N31_RS11320_IG', 2402156, 2402438, -1, 'Rv2890c', 'Rv2891'), 1.0], '5.3e-33']
[[2356272, 2356470, -1, ('MVAN_RS11070_IG', 2356189, 2356485, -1, 'Rv2891', 'Rv2890c'), 1.0], '6.7e-33']
[[3594689, 3594896, 1, ('MSPYR1_RS17115_IG', 3594676, 3594968, -1, 'Rv2890c', 'Rv2891'), 1.0], '2e-32']
[[2315655, 2315856, -1, ('C1S78_RS11220_IG', 2315579, 2315868, -1, 'Rv2891', 'Rv2890c'), 1.0], '3.1e-32']
[[1428326, 1428528, 1, ('G6N46_RS06955_IG', 1428313, 1428601, -1, 'Rv2890c', 'Rv2891'), 1.0], '6.1e-32']
[[2162208, 2162415, -1, ('EL337_RS10250_IG', 2162123, 2162431, -1, 'Rv2891', 'Rv2890c'), 1.0], '6.8e-32']
[[1492022, 1492221, -1, ('B586_RS07090_IG', 1491380, 1492236, 1, 'Rv2894c', 'Rv2890c'), 1.0], '6.9e-32']
[[2150608, 2150803, -1, ('G6N16_RS10310_IG', 2150534, 2150818, -1, 'Rv2891', 'Rv2890c'), 1.0], '1.1e-31']
[[5235929, 5236138, 1, ('NTM_RS24750_IG', 5235913, 5