In [43]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
from tqdm.auto import tqdm
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import Blast_Functions as blastfn
from goatools import obo_parser
import random
import copy
from joblib import Parallel, delayed
import os
import wget
import shutil
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
import re
import shutil
from ftplib import FTP
import Bio.UniProt.GOA as GOA

In [3]:
full_run = True
project_dir = 'D:/Project_Data/Project_8'
datasets_dir = project_dir + '/Datasets'
output_dir = project_dir + '/Output'
wsl_output_loc = util.wslname(output_dir)
seq_dir = 'D:/Actinobacteria_Ref_Rep_Lev_Complete'
blast_dir = 'D:/BLAST/actinobacteria_ref_rep_comp'
blast_db_name = 'actinobacteria_ref_rep_comp'
num_cores = 16
core_numbers = list(range(1, num_cores+1))
reference_species_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'
species_list = util.list_files(seq_dir)

In [151]:
 with open(output_dir + '/Blast_Output/reciprocal_best_hits.pkl', 'rb') as f:
        rbh_results = pickle.load(f)

In [152]:
AM408590.1@BCG_0031

In [159]:
def ortholog_in_reference(accession_locus, rbh_df):
    if len(rbh_df[rbh_df.target_ref == accession_locus]) >= 1:
        return rbh_df[rbh_df.target_ref == accession_locus].iloc[0]['query_ref'].split('@')[1]
    else:
        return 'NO_ORTHOLOG'

In [160]:
ortholog_in_reference('AM408590.1@BCG_0031', rbh_results)

'Rv0001'

In [None]:
go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'
data_folder = 'D:/Project_Data/Project_8/data'

# Check if we have the ./data directory already
if(not os.path.isfile(data_folder)):
    # Emulate mkdir -p (no error if folder exists)
    try:
        os.mkdir(data_folder)
    except OSError as e:
        if(e.errno != 17):
            raise e
else:
    raise Exception('Data path (' + data_folder + ') exists as a file. '
                   'Please rename, remove or change the desired location of the data path.')

# Check if the file exists already
if(not os.path.isfile(data_folder+'/go-basic.obo')):
    go_obo = wget.download(go_obo_url, data_folder+'/go-basic.obo')
else:
    go_obo = data_folder+'/go-basic.obo'
go = obo_parser.GODag(go_obo)

D:/Project_Data/Project_8/data/go-basic.obo: fmt(1.2) rel(2022-07-01) 47,008 Terms


In [46]:
import os
from ftplib import FTP
tb_uri = '/pub/databases/GO/goa/proteomes/30.M_tuberculosis_ATCC_25618.goa'
tb_fn = tb_uri.split('/')[-1]

# Check if the file exists already
tb_gaf = os.path.join(data_folder, tb_fn)
if(not os.path.isfile(tb_gaf)):
    # Login to FTP server
    ebi_ftp = FTP('ftp.ebi.ac.uk')
    ebi_ftp.login() # Logs in anonymously
    
    # Download
    with open(tb_gaf,'wb') as tb_fp:
        ebi_ftp.retrbinary('RETR {}'.format(tb_uri), tb_fp.write)
        
    # Logout from FTP server
    ebi_ftp.quit()

with open(tb_gaf, 'rt') as tb_gaf_fp:
    tb_funcs = {}  # Initialise the dictionary of functions
    
    # Iterate on each function using Bio.UniProt.GOA library.
    for entry in GOA.gafiterator(tb_gaf_fp):
        uniprot_id = entry.pop('DB_Object_ID')
        tb_funcs[uniprot_id] = entry

In [63]:
def find_go_term(locus_id):
    name_list = []
    for k, v in tb_funcs.items():
        for syn in tb_funcs[k]['Synonym']:
            if locus_id in syn:
                name_list.append(go[tb_funcs[k]['GO_ID']].name)
    return name_list

In [113]:
mycobrowser_df = pd.read_excel(datasets_dir+'/Mycobrowser_Release_4.xlsx')
features = []
for i, r in mycobrowser_df.iterrows():
    features.append([r['Locus'],r['Start'], r['Product']])
features.sort(key=lambda x: x[1])
downstream_dict = {}
for i, feature in enumerate(features):
    if i + 1 < len(features):
        downstream_dict[feature[0]] = (features[i+1][0])

In [68]:
r_scape_output_loc = project_dir + '/RScape_Run_Thoth'
intergenic_regions = util.list_dirs(r_scape_output_loc)
temp_rows = []
for id in intergenic_regions:
    if os.path.isfile(r_scape_output_loc + '/' + id + '/rscape_3.cacofold.power'):
        with open(r_scape_output_loc + '/' + id + '/rscape_3.cacofold.power', 'r') as f:
            num_covarying = ''
            num_expected = ''  
            count_near_pair = 0
            for l in f:
                if 'BPAIRS observed' in l:
                    num_covarying = int(l.split('covary ')[1])
                if 'BPAIRS expected' in l:
                    num_expected = (l.split('covary ')[1])
                if '*' in l:
                    a = l.split()
                    if abs(int(a[1]) - int(a[2])) < 3:
                        count_near_pair += 1    
        temp_rows.append([id, downstream_dict[id], num_covarying, num_expected, count_near_pair])
    else:
        temp_rows.append([id,downstream_dict[id], 0 , 0, 0])
results_df = pd.DataFrame(temp_rows, columns = ['Downstream_of','Feature_Downstream','num_BP_covarying', 'num_Expected', 'num_near_pairs'])
results_df.to_csv(r_scape_output_loc + '/intergenic_regions_covariation_summary.csv')

In [250]:
def find_annotations(accession_ver, start, stop, feature_dict, only_return_max = True, translate_to_ortholog = True):
    if start > stop:
        (start, stop) = (stop, start)
        strand = -1
    else:
        strand = 1
    annotations_found = []
    feature_list = feature_dict[accession_ver]
    max_pct_cover = -1
    max_feature = []
    for feature in feature_list:
        if feature[1] < (stop - 1) and feature[2] >= (start - 1):
            len_feature = feature[2] - feature[1]
            pct_cover = (min(feature[2], stop) - max(feature[1], start)) / (stop - start)
            if pct_cover > 0.05 and len_feature < 100000:
                if pct_cover > max_pct_cover:
                    max_feature = feature
                    max_pct_cover = pct_cover
                annotations_found.append([feature, pct_cover])
    if only_return_max == True:
        if translate_to_ortholog == True:
            ortholog_max_feature = (max_feature[0], max_feature[1], max_feature[2], max_feature[3], ortholog_in_reference(accession_ver + '@' + max_feature[4], rbh_results), ortholog_in_reference(accession_ver + '@' + max_feature[5], rbh_results))
            return ([start, stop, strand, ortholog_max_feature, max_pct_cover])
    else:
        return (start, stop, annotations_found, [max_feature, max_pct_cover])

In [243]:
with open(output_dir + '/gene_info_dict.pkl', 'rb') as f:
    gene_info_dict = pickle.load(f)   

In [245]:
def hit_info(upstream_locus, translate_to_orthlog = True):
    #accession_ver = 'NC_000962.3'
    accession_ver = 'NZ_AP022575.1'
    hit_dict = {}
    with open(project_dir + '/RScape_Run_Thoth/'+upstream_locus+'/search_hits_3.txt', 'r') as f:
                for l in f:
                    if not(l[0] == '#'): 
                        a = l.split()
                        if a[16] == '!':
                            if a[0] in hit_dict:
                                hit_dict[a[0]].append([int(a[7]),int(a[8]),a[9],a[15]])
                            else:
                                hit_dict[a[0]] = [[int(a[7]),int(a[8]),a[9],a[15]]]
    results = []
    for k, v in hit_dict.items():
        results.append([find_annotations(k, v[0][0],v[0][1],gene_info_dict, only_return_max = True, translate_to_ortholog = True), v[0][3]])
    return results

In [251]:
temp = hit_info('Rv0660c')
temp_out = []
for record in temp:
    if record[0][3][0][-2:] == 'IG':
        position = 'IG'
    else:
        position = 'G'
    if record[0][2] == 1:
        upstream_gene = record[0][3][4]
        downstream_gene = record[0][3][5]
    else:
        upstream_gene = record[0][3][5]
        downstream_gene = record[0][3][4]
    temp_out.append([position, upstream_gene, downstream_gene])
hit_df = pd.DataFrame(temp_out, columns = ['Position','Upstream_Gene','Downstream_Gene'])

KeyError: 'NZ_CP033726.1'

In [247]:
hit_df.to_csv(output_dir + '/hit.csv')

In [220]:
temp[1][0]

[943563,
 943743,
 1,
 ('MB901379_RS04110_IG', 943393, 943809, 1, 'NO_ORTHOLOG', 'Rv0651'),
 1.0]

In [130]:
genome_record = next(SeqIO.parse(seq_dir + '/' + reference_species_filename, "genbank"))
organism_name = genome_record.annotations['organism']
accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])
organism_accession = organism_name.replace(' ', '_') + '_' + accession_ver
function_list = []
for feature in genome_record.features:
    a = feature.qualifiers
    if feature.type == 'CDS':
        if a.get("locus_tag")!= None:
            if len(find_go_term(a.get("locus_tag")[0])) > 0:
                function_list.append(find_go_term(a.get("locus_tag")[0])[0])

In [249]:
for k, v in gene_info_dict.items():
    print(k)

AM408590.1
LT708304.1
NC_000962.3
NC_003155.5
NC_004369.1
NC_006361.1
NC_007164.1
NC_007777.1
NC_008148.1
NC_008278.1
NC_008578.1
NC_008726.1
NC_009142.1
NC_009380.1
NC_009664.2
NC_010168.1
NC_010572.1
NC_011886.1
NC_012590.1
NC_012669.1
NC_012704.1
NC_013093.1
NC_013124.1
NC_013131.1
NC_013159.1
NC_013170.1
NC_013172.1
NC_013203.1
NC_013235.1
NC_013441.1
NC_013510.1
NC_013521.1
NC_013530.1
NC_013595.1
NC_013729.1
NC_013739.1
NC_013757.1
NC_013929.1
NC_013947.1
NC_014151.1
NC_014158.1
NC_014168.1
NC_014246.1
NC_014363.1
NC_014391.1
NC_014643.1
NC_014666.1
NC_014814.1
NC_014830.1
NC_015067.1
NC_015389.1
NC_015434.1
NC_015514.1
NC_015564.1
NC_015576.1
NC_015588.1
NC_015635.1
NC_015671.1
NC_015673.1
NC_015848.1
NC_016109.1
NC_016111.1
NC_016887.1
NC_016906.1
NC_016946.1
NC_017093.1
NC_017216.2
NC_018720.1
NC_019673.1
NC_020302.1
NC_020506.1
NC_020520.1
NC_020990.1
NC_021064.1
NC_021085.1
NC_021252.1
NC_021352.1
NC_021663.1
NC_021915.1
NC_021985.1
NC_022116.1
NC_022198.1
NC_022438.1
NC_022