#### Directories etc

In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import random
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
import random
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from scipy.stats import chi2, chi2_contingency
from Comparative_Analysis import ORF_Functions as orffn
from Comparative_Analysis import Alignment as alignfn
from random import sample
from Bio.Align.Applications import MuscleCommandline
import subprocess
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
project_dir = 'F:/Project_Data/Intergenic_Region_Comparative_Analysis'
seq_dir_1 = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data'
seq_dir_2 = 'F:/Datasets/NCBI_Refseq_Actinobacteria_All_Levels/data'
sequence_dirs_1 = util.list_dirs(seq_dir_1)
sequence_dirs_2 = util.list_dirs(seq_dir_2)
tb_species = 'NC_000962.3' 
tb_annotation_dirname = 'GCF_000195955.2'
num_cores = 16
core_numbers = list(range(1, num_cores+1))
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'
full_build = False

In [3]:
cortes_dict = {}
utr_category_dict = {}
cortes_df = pd.read_csv('F:/Datasets/Data_From_Publications/Cortes_TSS.csv')
for i, r in cortes_df.iterrows():
    cortes_dict[r['RvNumber']] = r['Leaderless']
    utr_category_dict[r['RvNumber']] = r['UTR Category']

In [17]:
cds_info_dict = {}
for genome_record in SeqIO.parse(seq_dir_1 + '/' + tb_annotation_dirname + '/genomic.gbff', "genbank"):
    for feature in genome_record.features:
        a = feature.qualifiers
        if feature.type == 'CDS' and a.get("translation") != None:
            locus_tag = a.get("locus_tag")[0]
            product = a.get("product")[0]
            cds_info_dict[locus_tag] = (feature.location.start, feature.location.end, feature.location.strand)

#####  Find potential structural regions

In [18]:
region_list = []
seq_ids = util.list_dirs(project_dir)
for id in seq_ids:
    if os.path.exists(project_dir + '/' + str(id) + '/rscape_output_2.cov'):
        alignment = util.read_fasta_to_array(project_dir + '/' + str(id) + '/upstream_region_'+str(id) + '_alignment_2.fasta')
        num_sequences = len(alignment[0])
        num_characters = len(alignment[1][0])
  
        with open(project_dir + '/'+str(id) +  '/rscape_output_2.cov', 'r') as f:  
            pair_items = []
            num_pairs = 0
            e_values = []

            for l in f:
                if (not ('#' in l)):
                    a = l.split()
                    if len(a) > 6 and abs(int(a[2])-int(a[1])) > 5 and abs(int(a[1])-num_characters) > 10 and abs(int(a[2])-num_characters) > 10:    # Exclude covarying pairs less than 3 bp apart)
                        e_values.append(float(a[4]))
                        num_pairs+=1
                        pair_items.append(int(a[2]))
                        pair_items.append(int(a[2]))
        if len(e_values) > 0:
            tot = sum([math.log(x) for x  in e_values])
            num = len(e_values)
            combined_e_value = 1-chi2.cdf(-2*tot, 2*num)
        else:
            combined_e_value = 999
        if combined_e_value < 1e-5:
            region_list.append((id, num_pairs, len(set(pair_items)), combined_e_value))
region_list.sort(key = lambda x: x[0])

[(x, cds_info_dict[x[0]]) for x in region_list if x[2] > 8]


[(('Rv0020c', 50, 18, 0.0), (ExactPosition(23860), ExactPosition(25444), -1)),
 (('Rv0053', 10, 9, 0.0), (ExactPosition(58191), ExactPosition(58482), 1)),
 (('Rv0244c', 30, 9, 0.0), (ExactPosition(293797), ExactPosition(295633), -1)),
 (('Rv0440', 13, 9, 0.0), (ExactPosition(528607), ExactPosition(530230), 1)),
 (('Rv0634B', 43, 19, 0.0), (ExactPosition(731711), ExactPosition(731879), 1)),
 (('Rv0651', 21, 10, 0.0), (ExactPosition(748275), ExactPosition(748812), 1)),
 (('Rv0667', 147, 45, 0.0), (ExactPosition(759806), ExactPosition(763325), 1)),
 (('Rv0884c', 22, 10, 0.0),
  (ExactPosition(981423), ExactPosition(982554), -1)),
 (('Rv1018c', 10, 9, 4.2439496361623696e-11),
  (ExactPosition(1136572), ExactPosition(1138060), -1)),
 (('Rv1133c', 29, 17, 0.0),
  (ExactPosition(1259066), ExactPosition(1261346), -1)),
 (('Rv1449c', 71, 32, 0.0),
  (ExactPosition(1628096), ExactPosition(1630199), -1)),
 (('Rv1626', 19, 11, 0.0),
  (ExactPosition(1828179), ExactPosition(1828797), 1)),
 (('Rv165

In [15]:
pd.DataFrame([x for x in region_list if x[2] > 8]).to_csv(project_dir +'/high_covarying_upstream_regions.csv')