In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
from tqdm.auto import tqdm
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import Blast_Functions as blastfn
from goatools import obo_parser
import random
import copy
from joblib import Parallel, delayed
import os
import wget
import shutil
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
import re
import shutil
from ftplib import FTP
import Bio.UniProt.GOA as GOA

In [4]:
# Run configuration for this notebook: filesystem locations, BLAST database
# name and parallelism settings.
full_run = True

# Project tree: every derived location hangs off project_dir.
project_dir = 'D:/Project_Data/Project_8'
datasets_dir = f'{project_dir}/Datasets'
output_dir = f'{project_dir}/Output'
r_scape_output_loc = f'{project_dir}/RScape_Run_Thoth_2'
# output_dir as translated by util.wslname (presumably a WSL-style path - confirm in Utilities).
wsl_output_loc = util.wslname(output_dir)

# Genome annotation files and the BLAST database built from them.
seq_dir = 'D:/Actinobacteria_Ref_Rep_Lev_Complete'
blast_dir = 'D:/BLAST/actinobacteria_ref_rep_comp'
blast_db_name = 'actinobacteria_ref_rep_comp'

# Parallelism: core_numbers is the 1-based list 1..num_cores.
num_cores = 16
core_numbers = [core for core in range(1, num_cores + 1)]

# GenBank file of the reference genome, and whatever util.list_files finds in seq_dir.
reference_species_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'
species_list = util.list_files(seq_dir)

##### Load precomputed data: reciprocal-best-hit orthologs, gene annotation info and RFAM hits

In [5]:
# Load the reciprocal-best-hit table from the Blast_Output folder. Later cells
# filter it on its `query_ref` and `target_ref` columns.
# NOTE(review): pickle.load can execute arbitrary code contained in the file -
# only load pickles that were produced by this project.
with open(output_dir + '/Blast_Output/reciprocal_best_hits.pkl', 'rb') as f:
    rbh_results = pickle.load(f)

In [23]:
# Load the per-genome feature lists. The summary cell below looks entries up by
# the first column of each summary.txt row and reads positions [1] and [2] of
# every feature as its coordinates.
# NOTE(review): pickle.load can execute arbitrary code contained in the file -
# only load pickles that were produced by this project.
with open(output_dir + '/gene_info_dict.pkl', 'rb') as f:
    gene_info_dict = pickle.load(f)

In [7]:
# RFAM hits spreadsheet. The summary cell below reads its 'seq from', 'seq to',
# 'accession' and 'description of target' columns.
rfam_df = pd.read_excel(datasets_dir+'/RFAM_Hits_H37Rv_sorted.xlsx')

In [8]:
def ortholog_in_reference(accession_locus, rbh_df):
    """Return the reference locus paired with ``accession_locus``.

    Parameters
    ----------
    accession_locus : str
        Key to look for in the ``target_ref`` column of ``rbh_df``.
    rbh_df : pandas.DataFrame
        Table with ``target_ref`` and ``query_ref`` columns; ``query_ref``
        values are expected to contain an ``'@'`` separator.

    Returns
    -------
    str
        The text after the first ``'@'`` of the first matching row's
        ``query_ref`` (up to the next ``'@'``, if any), or the sentinel
        ``'NO_ORTHOLOG'`` when no row matches.
    """
    hits = rbh_df[rbh_df.target_ref == accession_locus]
    if len(hits) < 1:
        return 'NO_ORTHOLOG'
    first_query_ref = hits.iloc[0]['query_ref']
    return first_query_ref.split('@')[1]

In [9]:
def orthologs(reference_locus, rbh_df=None, reference_accession='NC_000962.3'):
    """List the ``target_ref`` values paired with a reference locus.

    Parameters
    ----------
    reference_locus : str
        Locus tag in the reference genome (the part after ``'@'`` in
        ``query_ref``).
    rbh_df : pandas.DataFrame, optional
        Table with ``query_ref`` and ``target_ref`` columns. Defaults to the
        notebook-level ``rbh_results`` so existing one-argument calls keep
        working; pass it explicitly to avoid relying on notebook state.
    reference_accession : str, optional
        Accession prefix used to build the ``query_ref`` key
        (``'<accession>@<locus>'``).

    Returns
    -------
    list
        ``target_ref`` values of every matching row, in table order; empty
        when nothing matches.
    """
    if rbh_df is None:
        # Backward-compatible fallback to the table loaded earlier in the notebook.
        rbh_df = rbh_results
    query_key = reference_accession + '@' + reference_locus
    return rbh_df.loc[rbh_df.query_ref == query_key, 'target_ref'].tolist()

In [10]:
# Read the Mycobrowser annotation table and record, for every locus, the locus
# that comes next once the features are ordered by their 'Start' value.
mycobrowser_df = pd.read_excel(datasets_dir+'/Mycobrowser_Release_4.xlsx')

# One [Locus, Start, Product] entry per spreadsheet row, ordered by Start
# (sorted() is stable, so ties keep spreadsheet order).
features = sorted(
    ([row['Locus'], row['Start'], row['Product']] for _, row in mycobrowser_df.iterrows()),
    key=lambda entry: entry[1],
)

# Pair each entry with its successor; the last entry has none and gets no key.
downstream_dict = {
    current[0]: following[0]
    for current, following in zip(features, features[1:])
}

In [11]:
# Map each locus tag in the reference GenBank record to the feature type of the
# locus-tagged, non-'gene' feature that follows it in the record.
refseq_downstream_dict = {}
genome_record = next(SeqIO.parse(seq_dir + '/' + reference_species_filename, "genbank"))

# (locus_tag, feature type) for every non-'gene' feature that carries a
# locus_tag qualifier, in record order.
features = []
for feature in genome_record.features:
    if feature.type == 'gene':
        continue
    locus_tags = feature.qualifiers.get("locus_tag")
    if locus_tags is not None:
        features.append((locus_tags[0], feature.type))

# Pair each entry with its successor; the final entry has none and gets no key.
for current, following in zip(features, features[1:]):
    refseq_downstream_dict[current[0]] = following[1]
            

In [24]:
# Summarise the R-scape / CaCoFold output of every region directory into one table.
#
# For each sub-directory of r_scape_output_loc this cell reads
#   rscape_3.cacofold.power    -> observed / expected covarying base pairs
#   summary.txt                -> per-genome hits (only rows flagged '!' are used)
#   rscape_3.cacofold.R2R.sto  -> adjacent '<>' pairs on the '#=GC SS_cons ' line(s)
# and writes one row per region to intergenic_regions_covariation_summary_2.csv.
# Regions without a .power file get a row of zeros / empty lists.

REFERENCE_ACCESSION = 'NC_000962.3'   # hits on this sequence are compared against rfam_df
MIN_FEATURE_COVER = 0.05              # a feature must cover more than this share of a hit
MAX_FEATURE_LENGTH = 100000           # features this long or longer are ignored
MIN_RFAM_COVER = 0.1                  # share of an RFAM interval a hit must cover


def _read_power_file(power_path):
    """Return ``(num_covarying, num_expected)`` parsed from a ``.power`` file.

    ``num_covarying`` is the integer after 'covary ' on the 'BPAIRS observed'
    line; ``num_expected`` is the text after 'covary ' on the 'BPAIRS expected'
    line. Either value is '' when its line is absent.
    """
    num_covarying = ''
    num_expected = ''
    with open(power_path, 'r') as power_file:
        for line in power_file:
            if 'BPAIRS observed' in line:
                num_covarying = int(line.split('covary ')[1])
            if 'BPAIRS expected' in line:
                # Kept as text, as before; strip() stops the trailing newline
                # from leaking into the CSV.
                num_expected = line.split('covary ')[1].strip()
    return num_covarying, num_expected


def _best_overlapping_feature(feature_list, start, stop):
    """Return the feature covering the largest share of ``start..stop``, or None.

    Features are read positionally: [1] and [2] are used as start and stop.
    Only features covering more than MIN_FEATURE_COVER of the hit and shorter
    than MAX_FEATURE_LENGTH are considered; ties keep the earliest feature.
    """
    best_feature = None
    best_pct_cover = -1
    for feature in feature_list:
        # The -1 offsets are kept from the original test (presumably a 0-/1-based
        # coordinate adjustment - confirm against the code building gene_info_dict).
        if feature[1] < (stop - 1) and feature[2] >= (start - 1):
            len_feature = feature[2] - feature[1]
            pct_cover = (min(feature[2], stop) - max(feature[1], start)) / (stop - start)
            if (pct_cover > MIN_FEATURE_COVER
                    and len_feature < MAX_FEATURE_LENGTH
                    and pct_cover > best_pct_cover):
                best_feature = feature
                best_pct_cover = pct_cover
    return best_feature


def _classify_feature(species, feature):
    """Return ``(region_type, upstream_orthologs, downstream_orthologs)``.

    A feature whose name ([0]) ends in 'IG' is typed 'IG' and its two
    neighbours ([3] and [4], each read as ``(locus, flag)``) are sorted into
    upstream / downstream lists by whether the flag equals 1 (presumably the
    strand - confirm), then translated with ortholog_in_reference. Any other
    feature is typed 'G' with two empty ortholog lists.
    """
    if feature[0][-2:] != 'IG':
        return 'G', [], []
    previous_gene = feature[3]
    next_gene = feature[4]
    upstream_genes = []
    downstream_genes = []
    if previous_gene[1] == 1:
        upstream_genes.append(previous_gene[0])
    else:
        downstream_genes.append(previous_gene[0])
    if next_gene[1] == 1:
        downstream_genes.append(next_gene[0])
    else:
        upstream_genes.append(next_gene[0])
    upstream_orthologs = [ortholog_in_reference(species + '@' + x, rbh_results) for x in upstream_genes]
    downstream_orthologs = [ortholog_in_reference(species + '@' + x, rbh_results) for x in downstream_genes]
    return 'IG', upstream_orthologs, downstream_orthologs


def _rfam_overlaps_for_hit(rfam_intervals, start, stop):
    """Return ``(accession, description, pct_cover)`` for RFAM intervals the hit overlaps.

    ``pct_cover`` is the overlap divided by the RFAM interval length; only
    values above MIN_RFAM_COVER are kept.
    """
    overlaps = []
    for rfam_start, rfam_stop, accession, description in rfam_intervals:
        pct_cover = (min(rfam_stop, stop) - max(rfam_start, start)) / (rfam_stop - rfam_start)
        if pct_cover > MIN_RFAM_COVER:
            overlaps.append((accession, description, pct_cover))
    return overlaps


def _count_adjacent_pairs(sto_path):
    """Count '<>' occurrences on the '#=GC SS_cons ' line(s) of a Stockholm file.

    The final character of each line (normally the newline) is excluded, as in
    the original index loop.
    """
    near_pairs = 0
    with open(sto_path, 'r') as sto_file:
        for line in sto_file:
            if '#=GC SS_cons ' in line:
                near_pairs += line[:-1].count('<>')
    return near_pairs


# RFAM intervals are the same for every hit, so build them once instead of
# re-iterating rfam_df for each reference hit.
rfam_intervals = [
    (
        min(int(r['seq from']), int(r['seq to'])),
        max(int(r['seq from']), int(r['seq to'])),
        r['accession'],
        r['description of target'],
    )
    for _, r in rfam_df.iterrows()
]

intergenic_regions = util.list_dirs(r_scape_output_loc)
temp_rows = []
for region_id in intergenic_regions:
    region_dir = r_scape_output_loc + '/' + region_id
    # Resolve both lookups for *this* region up front, so neither branch reuses a
    # value left over from the previous iteration. '' marks "no entry".
    feature_downstream = downstream_dict.get(region_id, '')
    refseq_downstream = refseq_downstream_dict.get(region_id, '')

    power_path = region_dir + '/rscape_3.cacofold.power'
    if not os.path.isfile(power_path):
        temp_rows.append([region_id, feature_downstream, refseq_downstream, [], 0, 0, 0, 0, 0, 0, 0, [], [], []])
        continue

    num_covarying, num_expected = _read_power_file(power_path)

    rfam_overlaps = []
    region_type_list = []
    upstream_ortholog_list = []
    downstream_ortholog_list = []
    ref_count = 0
    tot_count = 0
    # summary.txt is read as whitespace-separated columns: [0] sequence name,
    # [7]/[8] hit coordinates, [16] significance flag (layout resembles Infernal
    # --tblout output - confirm). It is assumed to exist alongside the .power file.
    with open(region_dir + '/summary.txt', 'r') as summary_file:
        for line in summary_file:
            if '#' in line:
                continue
            fields = line.split()
            if len(fields) < 17 or fields[16] != '!':
                # Blank / truncated rows and non-significant hits are skipped.
                continue
            species = fields[0]
            seq_from = int(fields[7])
            seq_to = int(fields[8])
            start = min(seq_from, seq_to)
            stop = max(seq_from, seq_to)
            tot_count += 1

            # Classify the hit once, by the feature that covers most of it
            # (previously this ran inside the feature loop, once per overlap).
            best_feature = _best_overlapping_feature(gene_info_dict[species], start, stop)
            if best_feature is not None:
                region_type, upstream_orthologs, downstream_orthologs = _classify_feature(species, best_feature)
                region_type_list.append(region_type)
                upstream_ortholog_list.append(upstream_orthologs)
                downstream_ortholog_list.append(downstream_orthologs)

            # RFAM overlaps for reference hits
            if species == REFERENCE_ACCESSION:
                ref_count += 1
                rfam_overlaps.extend(_rfam_overlaps_for_hit(rfam_intervals, start, stop))

    # Near covarying pairs - suggests not structural, possibly CDS
    r2r_near_pairs = _count_adjacent_pairs(region_dir + '/rscape_3.cacofold.R2R.sto')

    # NOTE(review): the original row referenced `count_near_pair`, which is never
    # assigned in this notebook (NameError on a fresh kernel). It is recorded as
    # missing here rather than inventing a value - restore the real computation
    # if one exists.
    count_near_pair = np.nan

    if len(region_type_list) > 0:
        intergenic_pct = region_type_list.count('IG') / len(region_type_list)
    else:
        intergenic_pct = 0
    temp_rows.append([region_id, feature_downstream, refseq_downstream, rfam_overlaps, num_covarying, num_expected, ref_count, tot_count, intergenic_pct, count_near_pair, r2r_near_pairs, region_type_list, upstream_ortholog_list, downstream_ortholog_list])

results_df = pd.DataFrame(temp_rows, columns = ['Downstream_of','Feature_Downstream','Refseq_Downstream','RFAM_Overlaps','num_BP_covarying', 'num_Expected', 'num_reference_hits','tot_hits','intergenic_region_pct','num_near_pairs','num_r2r_near_pairs','region_types','upstream_orthologs', 'downstream_orthologs'])
results_df.to_csv(r_scape_output_loc + '/intergenic_regions_covariation_summary_2.csv')