#### Directories etc

In [7]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import random
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
import random
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from scipy.stats import chi2
from Comparative_Analysis import ORF_Functions as orffn
from Comparative_Analysis import Alignment as alignfn
from random import sample
from Bio.Align.Applications import MuscleCommandline
import subprocess
pd.options.mode.chained_assignment = None  # default='warn'

In [8]:
project_dir = 'F:/Project_Data/Intergenic_Region_Comparative_Analysis'
seq_dir_1 = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data'
seq_dir_2 = 'F:/Datasets/NCBI_Refseq_Actinobacteria_All_Levels/data'
sequence_dirs_1 = util.list_dirs(seq_dir_1)
sequence_dirs_2 = util.list_dirs(seq_dir_2)
tb_species = 'NC_000962.3' 
tb_annotation_dirname = 'GCF_000195955.2'
num_cores = 16
core_numbers = list(range(1, num_cores+1))
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'
full_build = False

#### Create file with all CDS for species and create BLAST databases for TB CDS and All CDS (to do reciprocal best hits)

In [19]:
def generate_protein_dataset(num_subsets, subset_num, dir_list, seqdir): 
    sequence_dirs = util.chunk_list(dir_list, num_subsets, subset_num)
    all_cds = []
    all_tb_cds = []
    names = []
    sequences = []
    locations = []
    for dirname in (sequence_dirs):
        if not(os.path.exists(seqdir + '/' + dirname + '/genomic.gbff')):
            continue
        for genome_record in SeqIO.parse(seqdir + '/' + dirname + '/genomic.gbff', "genbank"):
            accession_ver = genome_record.annotations['accessions'][0] + '.' + str(genome_record.annotations['sequence_version'])
            names.append([accession_ver, genome_record.annotations['organism']])
            full_sequence = str(genome_record.seq)
            sequences.append([accession_ver, full_sequence])
            for feature in genome_record.features:
                a = feature.qualifiers
                if feature.type == 'CDS' and a.get("translation") != None:
                    locus_tag = a.get("locus_tag")[0]
                    accession_locus = accession_ver + '@' + locus_tag
                    translation = a.get("translation")[0]
                    (start, stop, strand) = (int(feature.location.start), int(feature.location.end), int(feature.location.strand))
                    locations.append([accession_locus, (start, stop, strand)])
                    all_cds.append([accession_locus, translation])
                    if dirname == tb_annotation_dirname:
                        all_tb_cds.append([accession_locus, translation])
    return (all_cds, all_tb_cds, names, locations, sequences)           

In [20]:
parallel_output = Parallel(n_jobs=-1)(delayed(generate_protein_dataset)(num_cores, core_number, sequence_dirs_1, seq_dir_1) for core_number in core_numbers)
names_dict_1 = {}
locations_dict_1 = {}
sequence_dict_1 = {}
all_cds_1 = []
all_tb_cds_1 = []
for x in parallel_output:
    all_cds_1 += x[0]
    all_tb_cds_1 += x[1]
    for temp in x[2]:
        names_dict_1[temp[0]] = temp[1]
    for temp in x[3]:
        locations_dict_1[temp[0]] = temp[1]
    for temp in x[4]:
        sequence_dict_1[temp[0]] = temp[1]
with open(project_dir + '/names_dict_1.pkl', 'wb') as f:
        pickle.dump(names_dict_1, f) 

In [21]:
parallel_output = Parallel(n_jobs=-1)(delayed(generate_protein_dataset)(num_cores, core_number, sequence_dirs_2, seq_dir_2) for core_number in core_numbers)
names_dict_2 = {}
locations_dict_2 = {}
sequence_dict_2 = {}
all_cds_2 = []
all_tb_cds_2 = []
for x in parallel_output:
    all_cds_2 += x[0]
    all_tb_cds_2 += x[1]
    for temp in x[2]:
        names_dict_2[temp[0]] = temp[1]
    for temp in x[3]:
        locations_dict_2[temp[0]] = temp[1]
    for temp in x[4]:
        sequence_dict_2[temp[0]] = temp[1]
with open(project_dir + '/names_dict_2.pkl', 'wb') as f:
        pickle.dump(names_dict_2, f) 

In [31]:
if full_build == True:
    util.produce_fasta_file(all_cds_1, project_dir + '/all_cds.fasta')
    util.produce_fasta_file(all_tb_cds_1, project_dir + '/all_tb_cds.fasta')
    blastfn.build_blast_db(project_dir, 'all_cds.fasta', 'Mycobacteriaceae_Refseq', 'F:/Datasets/BLAST/Mycobacteriaceae_Refseq')
    blastfn.build_blast_db(project_dir, 'all_tb_cds.fasta', 'all_tb_cds', 'F:/Datasets/BLAST/all_tb_cds')

In [154]:
if full_build == True:
    util.produce_fasta_file(all_cds_2, project_dir + '/all_cds_2.fasta')
    util.produce_fasta_file(all_tb_cds_2, project_dir + '/all_tb_cds_2.fasta')
    blastfn.build_blast_db(project_dir, 'all_cds_2.fasta', 'Actinobacteria_Refseq', 'F:/Datasets/BLAST/Actinobacteria_Refseq')
    blastfn.build_blast_db(project_dir, 'all_tb_cds_2.fasta', 'all_tb_cds_2', 'F:/Datasets/BLAST/all_tb_cds_2')

100%|██████████| 8786289/8786289 [11:16<00:00, 12986.53it/s]
100%|██████████| 3906/3906 [00:00<00:00, 50716.03it/s]


In [9]:
if full_build == True:
    blastfn.run_blastp('F:/Datasets/BLAST/Mycobacteriaceae_Refseq', 'all_tb_cds.fasta', 'Mycobacteriaceae_Refseq', 'all_tb_cds_hits.csv', e_value = 1e-10)
    blastfn.run_blastp('F:/Datasets/BLAST/all_tb_cds', 'all_cds.fasta', 'all_tb_cds', 'reverse_hits.csv', e_value = 1e-10)
    a = blastfn.process_blast_output('F:/Datasets/BLAST/Mycobacteriaceae_Refseq/all_tb_cds_hits.csv', names_dict, top_hit_only = False)
    a = a.loc[a.groupby(['query_ref','target_species_name'])['bit_score'].idxmax()]     # Utility top hit method uses accession ver which can differ if multiple sets exist per species in fragmented annotations
    b = blastfn.process_blast_output('F:/Datasets/BLAST/all_tb_cds/reverse_hits.csv', names_dict, top_hit_only = False)
    b = b.loc[b.groupby(['query_ref','target_species_name'])['bit_score'].idxmax()] 
    rbh =  blastfn.keep_reciprocal_best_hits(a, b)
    rbh.to_csv(project_dir + '/reciprocal_best_hits.csv')
else:
    rbh = pd.read_csv(project_dir + '/reciprocal_best_hits.csv')

In [10]:
#if 1==1:
if full_build == True:
    #blastfn.run_blastp('F:/Datasets/BLAST/Actinobacteria_Refseq', 'all_tb_cds_2.fasta', 'Actinobacteria_Refseq', 'all_tb_cds_hits_2.csv', e_value = 1e-10)
    #blastfn.run_blastp('F:/Datasets/BLAST/all_tb_cds_2', 'all_cds_2.fasta', 'all_tb_cds_2', 'reverse_hits_2.csv', e_value = 1e-10)
    a = blastfn.process_blast_output('F:/Datasets/BLAST/Actinobacteria_Refseq/all_tb_cds_hits_2.csv', names_dict_2, top_hit_only = False)
    a = a.loc[a.groupby(['query_ref','target_species_name'])['bit_score'].idxmax()]     # Utility top hit method uses accession ver which can differ if multiple sets exist per species in fragmented annotations
    b = blastfn.process_blast_output('F:/Datasets/BLAST/all_tb_cds_2/reverse_hits_2.csv', names_dict_2, top_hit_only = False)
    b = b.loc[b.groupby(['query_ref','target_species_name'])['bit_score'].idxmax()] 
    rbh_2 =  blastfn.keep_reciprocal_best_hits(a, b)
    rbh_2.to_csv(project_dir + '/reciprocal_best_hits_2.csv')
else:
    rbh_2 = pd.read_csv(project_dir + '/reciprocal_best_hits_2.csv')

#### Function to generate FASTA file containing intergenic regions in orthologous species and run Muscle / R-scape

In [11]:
def generate_alignment(locus, offset, comparison_set):
    target_locus = tb_species+ '@' + locus
    
    results_dir = project_dir + '/' + locus
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    
    if comparison_set == 1:     #1 = Mycobacteria, 2 = Actinobacteria
        rbh = pd.read_csv(project_dir + '/reciprocal_best_hits.csv')
        locations_dict = locations_dict_1
        sequence_dict = sequence_dict_1
    else:
        rbh = pd.read_csv(project_dir + '/reciprocal_best_hits_2.csv')
        locations_dict = locations_dict_2
        sequence_dict = sequence_dict_2
    
    location = locations_dict[target_locus] 
    rbh = rbh[rbh['query_ref'] == target_locus]
    rbh['target_loc'] = rbh['target_ref'].map(locations_dict)
    rbh['cds_count'] = rbh.groupby('target_species_name')['query_ref'].transform('size')
    rbh['min_pct_id'] = rbh.groupby('target_species_name')['percent_identical_matches'].transform('min')
    
    rbh = rbh[rbh['cds_count'] == 1]
    for i, r in rbh.iterrows():
        (start, stop, strand) = r['target_loc']
        rbh.at[i,'start'] = start
        rbh.at[i,'stop'] =  stop
        rbh.at[i,'strand'] = strand
    rbh = rbh[rbh['min_pct_id'] >= 60]
    
    intergenic_regions = []
    
    for i, r in rbh.iterrows():
        target_species = r['target_species_name']
        target_species_accession = r['target_species']
        coords = r['target_loc']
        if coords[2] == 1:
            intergenic_region = (coords[0] - offset, coords[0] + 3)
            intergenic_sequence = sequence_dict[target_species_accession][intergenic_region[0]: intergenic_region[1]]
        else:
            intergenic_region = (coords[1] - 3, coords[1] + offset)
            intergenic_sequence = util.reverse_complement(sequence_dict[target_species_accession][intergenic_region[0]: intergenic_region[1]])

        intergenic_regions.append([target_species.replace(" ", "_"), intergenic_sequence])


    util.produce_fasta_file(intergenic_regions, results_dir +'/intergenic_regions_'+str(comparison_set)+'.fasta')
    cline = MuscleCommandline(muscle_exe, input= results_dir +'/intergenic_regions_'+str(comparison_set)+'.fasta', out = results_dir + '/upstream_region_'+locus + '_alignment_'+str(comparison_set)+'.fasta')
    exception = 0
    try:
        stdout, stderr = cline()
    except Exception as e:
        exception == 1
    if exception == 0:
        blastfn.convert_fasta_to_stockholm(results_dir, 'upstream_region_'+locus + '_alignment_'+str(comparison_set)+'.fasta', 'upstream_region_'+locus + '_alignment_'+str(comparison_set)+'.sto')
        blastfn.run_rscape(results_dir, 'upstream_region_'+locus + '_alignment_'+str(comparison_set)+'.sto', 'rscape_output_'+str(comparison_set))


#### Subset based on loci of interest, and filter orthologues based on percent identity, synteny

In [13]:
tb_loci = []
for genome_record in SeqIO.parse(seq_dir_1 + '/' + tb_annotation_dirname + '/genomic.gbff', "genbank"):
        for feature in genome_record.features:
                a = feature.qualifiers
                if feature.type == 'CDS':
                    tb_loci.append(a.get("locus_tag")[0])

In [None]:
dont_start = 1
for n, locus in tqdm(enumerate(tb_loci)):
    if locus == 'Rv2312':
        dont_start = 0
    if dont_start == 1:
        continue
    target_locus = tb_species+ '@' + locus 
    location = locations_dict_1[target_locus] 
    if n == 0:
        continue
    if location[2] == 1:
        upstream_cds_stop = locations_dict_1[tb_species + '@' + tb_loci[n-1]][1]
        offset = location[0] - upstream_cds_stop
    else:
        upstream_cds_stop = locations_dict_1[tb_species + '@' + tb_loci[n+1]][0]
        offset = upstream_cds_stop - location[1]

    if offset < 30:
        continue
    else:
        print(locus)
        generate_alignment(locus, offset, 1)
        generate_alignment(locus, offset, 2)

0it [00:00, ?it/s]

Rv2312



100%|██████████| 3/3 [00:00<00:00, 5989.01it/s]

100%|██████████| 3/3 [00:00<00:00, 6000.43it/s]
2301it [00:21, 105.88it/s]

Rv2315c



100%|██████████| 211/211 [00:00<00:00, 422635.22it/s]
2301it [00:36, 105.88it/s]
100%|██████████| 500/500 [00:00<00:00, 250705.56it/s]
2304it [00:50, 36.31it/s] 

Rv2316



100%|██████████| 169/169 [00:00<00:00, 337541.61it/s]

100%|██████████| 118/118 [00:00<?, ?it/s]A
2305it [01:07, 23.92it/s]

Rv2320c



100%|██████████| 125/125 [00:00<00:00, 14708.19it/s]

100%|██████████| 115/115 [00:00<00:00, 12102.50it/s]
2309it [18:26,  1.35s/it]

Rv2323c



100%|██████████| 153/153 [00:00<00:00, 306168.18it/s]

100%|██████████| 113/113 [00:00<00:00, 226881.93it/s]
2310it [18:45,  1.38s/it]

Rv2324



100%|██████████| 152/152 [00:00<00:00, 303298.86it/s]

100%|██████████| 114/114 [00:00<?, ?it/s]A
2311it [19:04,  1.42s/it]

Rv2326c



100%|██████████| 202/202 [00:00<00:00, 404029.28it/s]

100%|██████████| 125/125 [00:00<?, ?it/s]A
2313it [19:21,  1.48s/it]

Rv2327



100%|██████████| 199/199 [00:00<00:00, 398408.83it/s]

100%|██████████| 124/124 [00:00<?, ?it/s]A
2314it [19:38,  1.56s/it]

Rv2328



100%|██████████| 3/3 [00:00<?, ?it/s][A

100%|██████████| 2/2 [00:00<?, ?it/s][A
2315it [20:10,  1.78s/it]

Rv2329c



100%|██████████| 37/37 [00:00<00:00, 73271.60it/s]

100%|██████████| 22/22 [00:00<?, ?it/s][A
2316it [20:58,  2.26s/it]

Rv2330c



100%|██████████| 74/74 [00:00<00:00, 147308.26it/s]

100%|██████████| 43/43 [00:00<00:00, 86625.88it/s]
2317it [21:13,  2.45s/it]

Rv2331



100%|██████████| 2/2 [00:00<00:00, 4000.29it/s]

100%|██████████| 2/2 [00:00<?, ?it/s][A
2318it [21:29,  2.72s/it]

Rv2331A



100%|██████████| 1/1 [00:00<?, ?it/s][A

100%|██████████| 1/1 [00:00<?, ?it/s][A
2319it [21:43,  3.05s/it]

Rv2333c



100%|██████████| 8/8 [00:00<?, ?it/s][A

100%|██████████| 6/6 [00:00<?, ?it/s][A
2321it [24:40,  9.41s/it]

Rv2334



100%|██████████| 110/110 [00:00<00:00, 31349.69it/s]

100%|██████████| 416/416 [00:00<00:00, 27699.68it/s]
2322it [34:23, 38.44s/it]

Rv2336



100%|██████████| 2/2 [00:00<?, ?it/s][A

100%|██████████| 2/2 [00:00<?, ?it/s][A
2324it [35:21, 37.25s/it]

Rv2337c



100%|██████████| 5/5 [00:00<?, ?it/s][A

100%|██████████| 4/4 [00:00<?, ?it/s][A
2325it [35:39, 35.70s/it]

Rv2338c



100%|██████████| 5/5 [00:00<?, ?it/s][A

100%|██████████| 4/4 [00:00<?, ?it/s][A
2326it [38:05, 47.31s/it]

Rv2339



100%|██████████| 2/2 [00:00<00:00, 2000.14it/s]

100%|██████████| 2/2 [00:00<?, ?it/s][A
2327it [41:27, 67.69s/it]

Rv2340c



100%|██████████| 3/3 [00:00<00:00, 6023.41it/s]

100%|██████████| 3/3 [00:00<?, ?it/s][A
2328it [46:40, 106.29s/it]

Rv2341



100%|██████████| 2/2 [00:00<?, ?it/s][A

100%|██████████| 2/2 [00:00<?, ?it/s][A
2329it [50:50, 132.83s/it]

Rv2342



100%|██████████| 170/170 [00:00<00:00, 56652.76it/s]

100%|██████████| 103/103 [00:00<00:00, 67990.76it/s]
2330it [52:02, 120.07s/it]

Rv2344c



100%|██████████| 215/215 [00:00<00:00, 141699.46it/s]

100%|██████████| 275/275 [00:00<00:00, 183609.30it/s]
2332it [52:36, 81.71s/it] 

Rv2345



100%|██████████| 99/99 [00:00<00:00, 197449.40it/s]

100%|██████████| 59/59 [00:00<00:00, 118403.80it/s]
2333it [52:55, 68.52s/it]

Rv2346c



100%|██████████| 5/5 [00:00<?, ?it/s][A

100%|██████████| 2/2 [00:00<00:00, 4006.02it/s]
2334it [53:10, 56.18s/it]

Rv2347c



100%|██████████| 74/74 [00:00<?, ?it/s][A

100%|██████████| 47/47 [00:00<00:00, 93560.65it/s]
2335it [53:38, 49.19s/it]

Rv2348c



100%|██████████| 47/47 [00:00<00:00, 94276.56it/s]

100%|██████████| 27/27 [00:00<?, ?it/s][A
2336it [54:13, 45.53s/it]

Rv2349c



100%|██████████| 18/18 [00:00<00:00, 35899.89it/s]

100%|██████████| 10/10 [00:00<?, ?it/s][A
2337it [54:28, 37.24s/it]

Rv2350c



100%|██████████| 25/25 [00:00<?, ?it/s][A

100%|██████████| 15/15 [00:00<?, ?it/s][A
2338it [54:57, 34.75s/it]

Rv2351c



100%|██████████| 23/23 [00:00<00:00, 22968.81it/s]

100%|██████████| 13/13 [00:00<00:00, 36916.69it/s]
2339it [59:43, 106.52s/it]

Rv2352c



100%|██████████| 11/11 [00:00<?, ?it/s][A

100%|██████████| 7/7 [00:00<?, ?it/s][A
2340it [1:02:00, 115.50s/it]

Rv2353c



100%|██████████| 8/8 [00:00<00:00, 16131.94it/s]

100%|██████████| 3/3 [00:00<00:00, 6003.30it/s]
2341it [1:04:43, 129.23s/it]

Rv2357c



100%|██████████| 214/214 [00:00<00:00, 71264.87it/s]

100%|██████████| 500/500 [00:00<00:00, 76810.31it/s]
2342it [1:07:33, 141.45s/it]

Rv2358



100%|██████████| 213/213 [00:00<00:00, 106431.59it/s]

100%|██████████| 433/433 [00:00<00:00, 86593.89it/s]
2343it [1:10:03, 143.96s/it]

Rv2362c



100%|██████████| 212/212 [00:00<00:00, 213286.75it/s]

100%|██████████| 376/376 [00:00<00:00, 188036.04it/s]
2347it [1:10:36, 58.60s/it] 

Rv2363



100%|██████████| 204/204 [00:00<00:00, 204404.69it/s]

100%|██████████| 210/210 [00:00<00:00, 209915.12it/s]
2348it [1:10:58, 51.83s/it]

Rv2364c



100%|██████████| 212/212 [00:00<00:00, 212319.11it/s]

100%|██████████| 500/500 [00:00<00:00, 166652.26it/s]
2349it [1:11:59, 53.80s/it]

Rv2370c



100%|██████████| 8/8 [00:00<?, ?it/s][A

100%|██████████| 6/6 [00:00<00:00, 11966.63it/s]
2355it [1:12:58, 25.61s/it]

Rv2371



100%|██████████| 2/2 [00:00<?, ?it/s][A

100%|██████████| 2/2 [00:00<00:00, 3968.12it/s]
2356it [1:13:32, 26.74s/it]

Rv2373c



100%|██████████| 212/212 [00:00<00:00, 212116.52it/s]

100%|██████████| 327/327 [00:00<00:00, 163765.66it/s]
2358it [1:14:06, 24.16s/it]

Rv2374c



100%|██████████| 215/215 [00:00<00:00, 86096.56it/s]

100%|██████████| 393/393 [00:00<00:00, 98099.24it/s]
2359it [1:16:08, 40.03s/it]

Rv2375



100%|██████████| 189/189 [00:00<00:00, 126029.17it/s]

100%|██████████| 495/495 [00:00<00:00, 82483.04it/s]
2360it [1:18:02, 54.21s/it]

Rv2376c



100%|██████████| 82/82 [00:00<00:00, 163699.63it/s]

100%|██████████| 47/47 [00:00<00:00, 94096.56it/s]
2361it [1:18:25, 47.54s/it]

Rv2380c



100%|██████████| 164/164 [00:00<00:00, 109289.14it/s]

100%|██████████| 102/102 [00:00<00:00, 204307.07it/s]
2365it [1:18:58, 25.89s/it]

Rv2383c



100%|██████████| 118/118 [00:00<?, ?it/s]A

100%|██████████| 73/73 [00:00<00:00, 144086.68it/s]
2368it [1:19:20, 19.03s/it]

Rv2384



100%|██████████| 193/193 [00:00<00:00, 128635.10it/s]

100%|██████████| 122/122 [00:00<00:00, 244717.88it/s]
2369it [1:19:49, 20.48s/it]

Rv2385



100%|██████████| 86/86 [00:00<00:00, 171684.98it/s]

100%|██████████| 51/51 [00:00<00:00, 102202.34it/s]
2370it [1:20:08, 20.33s/it]

Rv2386c



100%|██████████| 133/133 [00:00<00:00, 33252.41it/s]

100%|██████████| 80/80 [00:00<00:00, 40098.51it/s]
2371it [1:23:57, 62.31s/it]

Rv2386a



100%|██████████| 63/63 [00:00<00:00, 31581.35it/s]

100%|██████████| 36/36 [00:00<00:00, 71663.48it/s]
2372it [1:27:08, 90.95s/it]

Rv2387



100%|██████████| 100/100 [00:00<00:00, 200301.05it/s]

100%|██████████| 58/58 [00:00<00:00, 115293.66it/s]
2373it [1:27:30, 74.29s/it]

Rv2388c



100%|██████████| 209/209 [00:00<00:00, 139122.29it/s]

100%|██████████| 369/369 [00:00<00:00, 123106.76it/s]
2374it [1:28:47, 75.12s/it]

Rv2390c



100%|██████████| 2/2 [00:00<?, ?it/s][A

100%|██████████| 2/2 [00:00<?, ?it/s][A
2376it [1:30:59, 71.26s/it]

Rv2391



100%|██████████| 211/211 [00:00<00:00, 42144.78it/s]

100%|██████████| 471/471 [00:00<00:00, 39164.13it/s]
2377it [1:40:07, 181.82s/it]

Rv2394



100%|██████████| 78/78 [00:00<?, ?it/s][A

100%|██████████| 107/107 [00:00<?, ?it/s]A
2380it [1:40:22, 93.60s/it] 

Rv2395



100%|██████████| 38/38 [00:00<?, ?it/s][A

100%|██████████| 101/101 [00:00<00:00, 100983.24it/s]
2381it [1:40:47, 80.46s/it]

Rv2395A



100%|██████████| 3/3 [00:00<?, ?it/s][A

100%|██████████| 2/2 [00:00<?, ?it/s][A
2382it [1:41:12, 68.50s/it]

Rv2395B



100%|██████████| 2/2 [00:00<?, ?it/s][A

100%|██████████| 2/2 [00:00<00:00, 3998.38it/s]
2383it [1:41:32, 57.21s/it]

Rv2396



100%|██████████| 1/1 [00:00<?, ?it/s][A

100%|██████████| 1/1 [00:00<00:00, 2003.01it/s]
2384it [1:41:47, 46.70s/it]

Rv2401A



100%|██████████| 168/168 [00:00<00:00, 56048.61it/s]

100%|██████████| 97/97 [00:00<00:00, 64609.73it/s]
2390it [1:43:25, 25.87s/it]

Rv2402



100%|██████████| 203/203 [00:00<00:00, 57286.13it/s]

100%|██████████| 307/307 [00:00<00:00, 55805.29it/s]
2391it [1:46:05, 44.66s/it]

Rv2406c



100%|██████████| 194/194 [00:00<00:00, 55443.92it/s]

100%|██████████| 188/188 [00:00<00:00, 62636.36it/s]
2395it [1:47:34, 34.78s/it]

Rv2407



100%|██████████| 174/174 [00:00<00:00, 57994.99it/s]

100%|██████████| 106/106 [00:00<00:00, 105957.16it/s]
2396it [1:49:16, 43.89s/it]

Rv2408



100%|██████████| 1/1 [00:00<?, ?it/s][A

100%|██████████| 1/1 [00:00<?, ?it/s][A
2397it [1:49:42, 40.96s/it]

Rv2411c



100%|██████████| 184/184 [00:00<00:00, 122403.16it/s]

100%|██████████| 398/398 [00:00<00:00, 88544.69it/s]
2400it [1:50:30, 30.66s/it]

Rv2412



100%|██████████| 214/214 [00:00<00:00, 213658.90it/s]

100%|██████████| 500/500 [00:00<00:00, 111125.05it/s]
2401it [1:52:26, 44.72s/it]

Rv2413c



100%|██████████| 212/212 [00:00<00:00, 422419.22it/s]

100%|██████████| 182/182 [00:00<?, ?it/s]A
2402it [1:52:43, 39.46s/it]

Rv2415c



100%|██████████| 104/104 [00:00<00:00, 51997.57it/s]

100%|██████████| 62/62 [00:00<00:00, 61960.17it/s]
2404it [1:54:30, 44.45s/it]

Rv2416c



100%|██████████| 39/39 [00:00<?, ?it/s][A

100%|██████████| 24/24 [00:00<?, ?it/s][A
2405it [1:55:09, 43.30s/it]

Rv2417c



100%|██████████| 173/173 [00:00<00:00, 155744.71it/s]

100%|██████████| 105/105 [00:00<00:00, 210416.59it/s]
2406it [1:55:31, 38.48s/it]

Rv2421c



100%|██████████| 213/213 [00:00<00:00, 53301.52it/s]

100%|██████████| 499/499 [00:00<00:00, 55397.10it/s]
2410it [2:00:33, 59.21s/it]

Rv2422



100%|██████████| 7/7 [00:00<?, ?it/s][A

100%|██████████| 5/5 [00:00<?, ?it/s][A
2411it [2:01:26, 58.20s/it]

Rv2423



100%|██████████| 88/88 [00:00<00:00, 58642.95it/s]

100%|██████████| 50/50 [00:00<?, ?it/s][A
2412it [2:02:30, 59.27s/it]

Rv2424c



100%|██████████| 1/1 [00:00<00:00, 1996.34it/s]

100%|██████████| 2/2 [00:00<?, ?it/s][A
2413it [2:02:47, 50.28s/it]

Rv2426c



100%|██████████| 208/208 [00:00<00:00, 207717.91it/s]

100%|██████████| 500/500 [00:00<00:00, 250406.21it/s]
2415it [2:03:26, 38.53s/it]

Rv2427c



100%|██████████| 214/214 [00:00<00:00, 23779.50it/s]

100%|██████████| 472/472 [00:00<00:00, 23028.20it/s]
2416it [2:37:17, 464.95s/it]

Rv2428



100%|██████████| 153/153 [00:00<00:00, 21803.03it/s]

100%|██████████| 495/495 [00:00<00:00, 23006.55it/s]
2417it [2:55:37, 613.54s/it]

Rv2430c



100%|██████████| 4/4 [00:00<?, ?it/s][A

100%|██████████| 3/3 [00:00<?, ?it/s][A
2419it [2:55:51, 370.51s/it]

Rv2431c



100%|██████████| 4/4 [00:00<?, ?it/s][A

100%|██████████| 3/3 [00:00<?, ?it/s][A
2420it [2:56:19, 294.25s/it]

Rv2435c



100%|██████████| 98/98 [00:00<00:00, 39202.84it/s]

100%|██████████| 57/57 [00:00<00:00, 37930.40it/s]
2424it [3:00:05, 161.16s/it]

Rv2436



100%|██████████| 133/133 [00:00<00:00, 38052.01it/s]

100%|██████████| 79/79 [00:00<00:00, 39517.00it/s]
2425it [3:03:42, 170.45s/it]

Rv2437



100%|██████████| 43/43 [00:00<?, ?it/s][A

100%|██████████| 26/26 [00:00<00:00, 52202.92it/s]
2426it [3:04:19, 144.78s/it]

Rv2440c



100%|██████████| 215/215 [00:00<00:00, 214708.42it/s]

100%|██████████| 497/497 [00:00<00:00, 142145.86it/s]
2430it [3:05:16, 76.44s/it] 

Rv2442c



100%|██████████| 214/214 [00:00<00:00, 47604.40it/s]

100%|██████████| 500/500 [00:00<00:00, 43378.88it/s]
2432it [3:17:34, 155.99s/it]

Rv2443



100%|██████████| 87/87 [00:00<00:00, 43529.10it/s]

100%|██████████| 468/468 [00:00<00:00, 46822.37it/s]
2433it [3:23:16, 186.40s/it]

Rv2444c



100%|██████████| 195/195 [00:00<00:00, 48730.30it/s]

100%|██████████| 473/473 [00:00<00:00, 52983.28it/s]
2434it [3:29:13, 218.52s/it]

Rv2445c



100%|██████████| 215/215 [00:00<00:00, 429416.84it/s]

100%|██████████| 499/499 [00:00<00:00, 199652.55it/s]
2435it [3:29:45, 178.98s/it]

Rv2448c



100%|██████████| 212/212 [00:00<00:00, 141276.21it/s]

100%|██████████| 500/500 [00:00<00:00, 142150.88it/s]
2438it [3:30:43, 102.86s/it]

Rv2449c



100%|██████████| 201/201 [00:00<00:00, 201302.56it/s]

100%|██████████| 126/126 [00:00<00:00, 252018.27it/s]
2439it [3:31:05, 87.91s/it] 

Rv2450c



100%|██████████| 143/143 [00:00<00:00, 289471.75it/s]

100%|██████████| 260/260 [00:00<00:00, 173043.33it/s]
2440it [3:31:31, 75.07s/it]

Rv2451



100%|██████████| 2/2 [00:00<?, ?it/s][A

100%|██████████| 2/2 [00:00<?, ?it/s][A
2441it [3:31:47, 61.50s/it]

Rv2455c



100%|██████████| 204/204 [00:00<00:00, 68015.74it/s]

100%|██████████| 494/494 [00:00<00:00, 65867.25it/s]
2445it [3:36:43, 68.49s/it]

Rv2457c



100%|██████████| 215/215 [00:00<00:00, 61428.84it/s]

100%|██████████| 500/500 [00:00<00:00, 52590.52it/s]
2447it [3:42:56, 102.60s/it]

Rv2458



100%|██████████| 96/96 [00:00<00:00, 95778.59it/s]

100%|██████████| 179/179 [00:00<00:00, 59439.51it/s]
2448it [3:44:10, 97.71s/it] 

Rv2459



100%|██████████| 16/16 [00:00<?, ?it/s][A

100%|██████████| 10/10 [00:00<?, ?it/s][A
2449it [3:44:42, 84.76s/it]

Rv2461c



100%|██████████| 213/213 [00:00<00:00, 141852.45it/s]

100%|██████████| 495/495 [00:00<00:00, 110101.31it/s]
2451it [3:46:41, 75.74s/it]

Rv2462c



100%|██████████| 200/200 [00:00<00:00, 39995.27it/s]

100%|██████████| 163/163 [00:00<00:00, 46619.27it/s]
2452it [3:49:47, 98.24s/it]

Rv2463



100%|██████████| 152/152 [00:00<00:00, 43452.44it/s]

100%|██████████| 93/93 [00:00<00:00, 61915.92it/s]
2453it [3:51:59, 105.87s/it]

Rv2465c



100%|██████████| 212/212 [00:00<00:00, 141501.03it/s]

100%|██████████| 499/499 [00:00<00:00, 142737.35it/s]
2455it [3:53:14, 79.14s/it] 

Rv2466c



100%|██████████| 215/215 [00:00<00:00, 143526.24it/s]

100%|██████████| 449/449 [00:00<00:00, 127981.14it/s]
2456it [3:54:12, 74.30s/it]

Rv2467



100%|██████████| 210/210 [00:00<00:00, 140121.51it/s]

100%|██████████| 349/349 [00:00<00:00, 139543.57it/s]
2457it [3:54:57, 67.54s/it]

Rv2468A



100%|██████████| 115/115 [00:00<00:00, 229797.50it/s]

100%|██████████| 67/67 [00:00<00:00, 134009.71it/s]
2459it [3:55:13, 43.38s/it]

Rv2469c



100%|██████████| 211/211 [00:00<00:00, 140743.98it/s]

100%|██████████| 424/424 [00:00<00:00, 84568.21it/s]
2460it [3:56:56, 56.83s/it]

Rv2470



100%|██████████| 214/214 [00:00<00:00, 142835.94it/s]

100%|██████████| 484/484 [00:00<00:00, 98876.97it/s]
2461it [3:58:26, 64.85s/it]

Rv2472



100%|██████████| 2/2 [00:00<?, ?it/s][A

100%|██████████| 2/2 [00:00<?, ?it/s][A
2463it [3:58:41, 41.39s/it]

Rv2476c



100%|██████████| 208/208 [00:00<00:00, 104007.54it/s]

100%|██████████| 130/130 [00:00<00:00, 129823.70it/s]
2467it [3:59:17, 23.81s/it]

Rv2477c



100%|██████████| 215/215 [00:00<00:00, 143252.64it/s]

100%|██████████| 500/500 [00:00<00:00, 124994.16it/s]
2468it [4:00:41, 33.55s/it]

Rv2478c



100%|██████████| 162/162 [00:00<00:00, 9814.92it/s]

100%|██████████| 101/101 [00:00<00:00, 9619.53it/s]


#### Extract covariation information to identify potentially significant regions

In [6]:
seq_ids = util.list_dirs(project_dir)
out_list = []
for id in seq_ids:
    if os.path.exists(project_dir + '/' + str(id) + '/rscape_output_2.cov'):
        with open(project_dir + '/' + str(id) + '/rscape_output_2.cov', 'r') as f:  
            num_pairs = 0
            e_values = []
            for l in f:
                if (not ('#' in l)):
                    a = l.split()
                    if len(a) > 6 and abs(int(a[2])-int(a[1])) > 3:    # Exclude covarying pairs less than 3 bp apart)
                        e_values.append(float(a[4]))
                        num_pairs +=1
        if len(e_values) > 0:
            tot = sum([math.log(x) for x  in e_values])
            num = len(e_values)
            combined_e_value = 1-chi2.cdf(-2*tot, 2*num)
        else:
            combined_e_value = 999
        if combined_e_value < 1e-5:
            print(id, num_pairs, combined_e_value)

Rv1133c 30 0.0
Rv0009 11 0.0
Rv0020c 52 0.0
Rv0041 26 0.0
Rv0053 11 0.0
Rv0129c 6 1.2992673603662297e-10
Rv0154c 6 4.008793297316515e-12
Rv0189c 9 0.0
Rv0190 2 1.92957504530078e-06
Rv0211 7 9.992007221626409e-15
Rv0237 8 0.0
Rv0244c 31 0.0
Rv0357c 6 0.0
Rv0423c 6 0.0
Rv0425c 1 3.4252199999640354e-06
Rv0440 19 0.0
Rv0462 1 4.405270000007455e-06
Rv0468 6 5.0013363414036505e-06
Rv0491 6 6.106226635438361e-15
Rv0496 2 4.2173180991866843e-07
Rv0512 4 0.0
Rv0524 8 3.0719161658865346e-08
Rv0548c 7 0.0
Rv0566c 22 0.0
Rv0634B 45 0.0
Rv0651 33 0.0
Rv0667 158 0.0
Rv0670 1 1.1438999999446864e-06
Rv0700 20 0.0
Rv0718 5 6.772360450213455e-14
Rv0732 5 2.3487101152852574e-11
Rv0753c 5 3.5762937056205146e-10
Rv0757 3 9.479231843911862e-10
Rv0768 6 1.7344722325063344e-09
Rv0777 7 1.2434497875801753e-14
Rv0810c 2 3.6731804486933584e-08
Rv0848 2 1.0475486745820106e-06
Rv0858c 1 9.062770001122544e-08
Rv0867c 7 1.0627054791711998e-11
Rv0870c 2 3.769170531242594e-10
Rv0873 5 1.960313895654764e-07
Rv0884c 22 