In [10]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
from tqdm.auto import tqdm
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import Blast_Functions as blastfn
from Comparative_Analysis import ORF_Functions as orffn
from goatools import obo_parser
import random
import copy
from joblib import Parallel, delayed
import os
import wget
import shutil
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
import re
import shutil
from ftplib import FTP
import Bio.UniProt.GOA as GOA

In [36]:
full_run = True
project_dir = 'D:/Project_Data/Project_8'
datasets_dir = project_dir + '/Datasets'
output_dir = project_dir + '/Output'
r_scape_output_loc = project_dir + '/RScape_Run_Thoth_3'
wsl_output_loc = util.wslname(output_dir)
seq_dir = 'D:/Actinobacteria_Ref_Rep_Lev_Complete'
blast_dir = 'D:/BLAST/actinobacteria_ref_rep_comp'
blast_db_name = 'actinobacteria_ref_rep_comp'
num_cores = 16
core_numbers = list(range(1, num_cores+1))
reference_species_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'
species_list = util.list_files(seq_dir)
reference_species_name = 'Mycobacterium tuberculosis H37Rv'

##### Load data for orthologs, RFAM, etc

In [12]:
with open(output_dir + '/Blast_Output/reciprocal_best_hits.pkl', 'rb') as f:
    rbh_results = pickle.load(f)

In [13]:
with open(output_dir + '/Blast_Output/all_to_ref_best_hits.pkl', 'rb') as f:
    bh_results = pickle.load(f)

In [14]:
ref_ortholog_dict = {}
for i, r in rbh_results.iterrows():
    ref_ortholog_dict[r['target_ref']] = r['query_ref'] 

In [15]:
ref_bh_dict = {}
for i, r in bh_results.iterrows():
    ref_bh_dict[r['query_ref']] = r['target_ref'] 

In [16]:
with open(output_dir + '/gene_info_dict.pkl', 'rb') as f:
    gene_info_dict = pickle.load(f) 
with open(output_dir + '/names_dict.pkl', 'rb') as f:
    names_dict = pickle.load(f) 

In [17]:
rfam_df = pd.read_excel(datasets_dir+'/RFAM_Hits_H37Rv_sorted.xlsx')

In [18]:
def ortholog_in_reference(accession_locus, ref_ortholog_dict):
    if accession_locus in ref_ortholog_dict:
        return ref_ortholog_dict[accession_locus].split('@')[1]
    else:
        return 'NO_ORTHOLOG'

In [19]:
def orthologs(reference_locus):
    temp = rbh_results[rbh_results.query_ref == 'NC_000962.3@'+ reference_locus]
    orths = []
    for i, r in temp.iterrows():
        orths.append(r['target_ref'])
    return orths

In [20]:
mycobrowser_df = pd.read_excel(datasets_dir+'/Mycobrowser_Release_4.xlsx')
features = []
for i, r in mycobrowser_df.iterrows():
    features.append([r['Locus'],r['Start'], r['Product']])
features.sort(key=lambda x: x[1])
downstream_dict = {}
for i, feature in enumerate(features):
    if i + 1 < len(features):
        downstream_dict[feature[0]] = (features[i+1][0])

In [21]:
refseq_downstream_dict = {}
genome_record = next(SeqIO.parse(seq_dir + '/' + reference_species_filename, "genbank"))
features = []
for feature in genome_record.features:
    if not(feature.type == 'gene'):
        a = feature.qualifiers
        if a.get("locus_tag")!= None:
            features.append((a.get("locus_tag")[0], feature.type))
for i,feature in enumerate(features):
    if i+1 < len(features):
        next_feature = features[i+1][1]
        refseq_downstream_dict[feature[0]] = next_feature
            

In [22]:
genome_record = next(SeqIO.parse(seq_dir + '/' + reference_species_filename, "genbank"))
product_dict = {}
for feature in genome_record.features:
    if not(feature.type == 'gene'):
        a = feature.qualifiers
        if a.get("locus_tag")!= None and a.get("product")!=None:
            product_dict[a.get("locus_tag")[0]] = a.get("product")[0]
           

In [17]:
intergenic_regions = util.list_dirs(r_scape_output_loc)
temp_rows = []
for id in intergenic_regions:
    if os.path.isfile(r_scape_output_loc + '/' + id + '/rscape_3.cacofold.power'):
        with open(r_scape_output_loc + '/' + id + '/rscape_3.cacofold.power', 'r') as f:
            num_covarying = ''
            num_expected = ''  
            for l in f:
                if 'BPAIRS observed' in l:
                    num_covarying = int(l.split('covary ')[1])
                if 'BPAIRS expected' in l:
                    num_expected = (l.split('covary ')[1])
                
            with open(r_scape_output_loc + '/' + id + '/summary.txt', 'r') as f:  
                rfam_overlaps = []
                feature_overlaps = []
                region_type_list = []
                upstream_ortholog_list = []
                downstream_ortholog_list = []
                ref_count = 0
                tot_count = 0
                for l in f:
                    if (not ('#' in l)):
                        a = l.split()
                        species = a[0]
                        seq_from = int(a[7])
                        seq_to = int(a[8])
                        seq_strand = a[9]
                        significant = a[16]
                        start = min(seq_from,seq_to)
                        stop = max(seq_from,seq_to)
                        if (significant == '!'):
                            tot_count+=1
                            feature_list = gene_info_dict[species]  
                            feature_list.sort(key=lambda x: x[1])
                            max_pct_cover = -1
                            max_feature = []
                            for feature in feature_list:
                                if feature[1] > stop:      #No point to continue as subsequent features are all after the stop 
                                    continue
                                if feature[1] < (stop - 1) and feature[2] >= (start - 1):
                                    len_feature = feature[2] - feature[1]
                                    pct_cover = (min(feature[2], stop) - max(feature[1], start)) / (stop - start)
                                    if pct_cover > 0.05 and len_feature < 100000:
                                        if pct_cover > max_pct_cover:
                                            max_feature = feature
                                            max_pct_cover = pct_cover
                                    if len(max_feature) > 0:
                                        if max_feature[0][-2:] == 'IG':
                                            region_type = 'IG'
                                            previous_gene = max_feature[3]
                                            next_gene = max_feature[4]
                                            upstream_genes = []
                                            downstream_genes = []
                                            upstream_orthologs = []
                                            downstream_orthologs = []
                                            if previous_gene[1] == 1:
                                                upstream_genes.append(previous_gene[0])
                                            else:
                                                downstream_genes.append(previous_gene[0])
                                            if next_gene[1] == 1:
                                                downstream_genes.append(next_gene[0])
                                            else:
                                                upstream_genes.append(next_gene[0])
                                            upstream_orthologs = [ortholog_in_reference(species+ '@' + x, ref_ortholog_dict) for x in upstream_genes]
                                            downstream_orthologs = [ortholog_in_reference(species + '@' + x, ref_ortholog_dict) for x in downstream_genes]
                                        
                                        else:
                                            region_type = 'G'
                                        
                                        region_type_list.append(region_type)
                                        if region_type == 'IG':
                                            upstream_ortholog_list.append(upstream_orthologs)
                                            downstream_ortholog_list.append(downstream_orthologs)
                                        
                            # RFAM overlaps for reference hits
                            if species == 'NC_000962.3':
                                ref_count+=1
                                for i, r in rfam_df.iterrows():
                                    rfam_start = min(int(r['seq from']), int(r['seq to']))
                                    rfam_stop = max(int(r['seq from']), int(r['seq to']))
                                    pct_cover = (min(rfam_stop, stop) - max(rfam_start, start)) / (rfam_stop - rfam_start)
                                    if pct_cover > 0.1:
                                        rfam_overlaps.append((r.accession, r['description of target'], pct_cover))
            
            # Near covarying pairs - suggests not structural, possibly CDS
            with open(r_scape_output_loc + '/' + id + '/rscape_3.cacofold.R2R.sto', 'r') as f: 
                r2r_near_pairs = 0
                for l in f:
                    if '#=GC SS_cons ' in l:
                        for i in range(len(l)-2):
                            if l[i:(i+2)] == '<>':
                                r2r_near_pairs +=1
        
        if id in refseq_downstream_dict:
            refseq_downstream = refseq_downstream_dict[id]
        else:
            refseq_downstream = ''
        if len(region_type_list) > 0:
            intergenic_pct = region_type_list.count('IG') / len(region_type_list)
        else:
            intergenic_pct = 0
        temp_rows.append([id, downstream_dict[id],  refseq_downstream , rfam_overlaps, num_covarying, num_expected, ref_count, tot_count, intergenic_pct, r2r_near_pairs, region_type_list, [], []])
    else:
        temp_rows.append([id,downstream_dict[id],  refseq_downstream , rfam_overlaps, 0 , 0, 0, 0, 0, 0,[],[],[]])
results_df = pd.DataFrame(temp_rows, columns = ['Downstream_of','Feature_Downstream','Refseq_Downstream','RFAM_Overlaps','num_BP_covarying', 'num_Expected', 'num_reference_hits','tot_hits','intergenic_region_pct', 'num_r2r_near_pairs','region_types','upstream_orthologs', 'downstream_orthologs'])
results_df.to_csv(r_scape_output_loc + '/intergenic_regions_covariation_summary.csv')

####   Print out all hits for particular region  

In [93]:
genome_record = next(SeqIO.parse(seq_dir + '/' + reference_species_filename, "genbank"))
full_sequence = str(genome_record.seq)
with open(r_scape_output_loc + '/' + 'Rv1668c' + '/summary.txt', 'r') as f:  
                hit_dict = {}
                ref_count = 0
                tot_count = 0
                for l in f:
                    if (not ('#' in l)):
                        a = l.split()
                        accession = a[0]
                        species = names_dict[a[0]]
                        seq_from = int(a[7])
                        seq_to = int(a[8])
                        seq_strand = a[9]
                        significant = a[16]
                        start = min(seq_from,seq_to)
                        stop = max(seq_from,seq_to)
                        if (significant == '!'):
                            tot_count+=1
                            feature_list = gene_info_dict[accession]  
                            feature_list.sort(key=lambda x: x[1])
                            max_pct_cover = -1
                            max_feature = []
                            for feature in feature_list:
                                if feature[1] > stop:      #No point to continue as subsequent features are all after the stop 
                                    continue
                                if feature[1] < (stop - 1) and feature[2] >= (start - 1):
                                    len_feature = feature[2] - feature[1]
                                    pct_cover = (min(feature[2], stop) - max(feature[1], start)) / (stop - start)
                                    if pct_cover > 0.05 and len_feature < 100000:
                                        if pct_cover > max_pct_cover:
                                            max_feature = feature
                                            max_pct_cover = pct_cover
                            if len(max_feature) > 0:
                                if max_feature[0][-2:] == 'IG':
                                    region_type = 'IG'
                                    previous_gene = max_feature[3]
                                    next_gene = max_feature[4]
                                    output = []
                                    if previous_gene[1] == 1:
                                        output.append(('Upstream gene', ortholog_in_reference(accession + '@' + previous_gene[0], ref_ortholog_dict), a[15], max_pct_cover, seq_from, seq_to)) 
                                    else:
                                         output.append(('Downstream gene', ortholog_in_reference(accession + '@' + previous_gene[0], ref_ortholog_dict), a[15], max_pct_cover, seq_from, seq_to))  

                                    if next_gene[1] == 1:
                                        output.append(('Downstream gene', ortholog_in_reference(accession + '@' + next_gene[0], ref_ortholog_dict), a[15], max_pct_cover, seq_from, seq_to)) 
                                    else:
                                        output.append(('Upstream gene', ortholog_in_reference(accession + '@' + next_gene[0], ref_ortholog_dict), a[15], max_pct_cover, seq_from, seq_to))
                                    if species in hit_dict:
                                        #hit_dict[species].append(output)
                                        hit_dict[species] = hit_dict[species] + output
                                    else:
                                        hit_dict[species] = output

upstream_count_dict = {}                                                
downstream_count_dict = {}
upstream_species_dict = {}                                                
downstream_species_dict = {}
for k, v in hit_dict.items():
    #if 'H37' in k:
    if 1==1:
        #print(k)
        #print(' ')
        product_list = []
        for x in v:
            if x[0] == 'Upstream gene':
                if x[1] in upstream_count_dict:
                    upstream_count_dict[x[1]] += 1
                    upstream_species_dict[x[1]].append((k, x[4],x[5])) 
                else:
                    upstream_count_dict[x[1]] = 1
                    upstream_species_dict[x[1]] = [(k, x[4],x[5])]
            else:
                if x[1] in downstream_count_dict:
                    downstream_count_dict[x[1]] += 1
                    downstream_species_dict[x[1]].append((k, x[4],x[5]))
                else:
                    downstream_count_dict[x[1]] = 1
                    downstream_species_dict[x[1]] = [(k, x[4],x[5])]
            if x[1] in product_dict:
                product_list.append((x, product_dict[x[1]]))
            else:
                product_list.append((x, ''))
        
        #print(len([x for x in product_list if x[0][0] == 'Upstream gene']))
        #print(sorted([x[0][1] for x in product_list if x[0][0] == 'Upstream gene']))
        #print(' ')   
print('Upstream counts')
#print (dict(sorted(upstream_count_dict.items(), key=lambda item: item[1], reverse=True)))
for k, v in dict(sorted(upstream_species_dict.items())).items():
    if not(k == 'NO_ORTHOLOG') and reference_species_name in [x[0] for x in v]:
        print(k)
        for x in v:
            if reference_species_name in x[0]:
                if x[1] < x[2]:
                    print(full_sequence[x[1]-1:x[2]])
                else:
                    print(util.reverse_complement(full_sequence[x[2]-1:x[1]]))
        
        print(v)
        print()

print('Downstream counts')
for k, v in dict(sorted(downstream_species_dict.items())).items():
    if not(k == 'NO_ORTHOLOG') and reference_species_name in [x[0] for x in v]:
        print(k)
        print(v)
        print(k)
        for x in v:
            if reference_species_name in x[0]:
                if x[1] < x[2]:
                    print(full_sequence[x[1]-1:x[2]])
                else:
                    print(util.reverse_complement(full_sequence[x[2]-1:x[1]]))
        
        print(v)
        print()
#print (dict(sorted(downstream_count_dict.items(), key=lambda item: item[1], reverse=True)))
                                        
                           

Upstream counts
Rv0126
CCTATGACTCGCGCCGGCGACGATGCACAGCGAAGCGATGAGGAGGAGCGGCGCCTATGACTCGCGCCAGCGACGATGCACAGCGAAGCGATGAGGAGGAGCGGCGCCTATGACTCGGTCGGACACG
[('Mycobacterium tuberculosis variant bovis AF2122/97', 154314, 154440), ('Mycobacterium canettii CIPT 140010059', 157870, 157994), ('Mycobacterium tuberculosis variant bovis BCG str. Pasteur 1173P2', 184020, 184146), ('Mycobacterium tuberculosis H37Rv', 154123, 154249), ('Mycobacterium spongiae', 318064, 318150)]

Rv0481c
AGCCTAGGCCCGGCGACGAGCGCGCCGCACCGGCGCGCGCAGGAGCCGGGCAATCCAGCTTGCGCCCGGCGACGAGCGCGCCGCACCGGCGCGCGCAGGAGCCGGGCAATCCAGCTTGCGCC
[('Mycobacterium canettii CIPT 140010059', 580098, 580219), ('Mycobacterium tuberculosis H37Rv', 569814, 569935), ('Mycobacterium lacus', 3905070, 3904937), ('Mycobacterium shinjukuense', 3232507, 3232385), ('Mycobacterium haemophilum DSM 44634', 582799, 582934), ('Mycobacterium shigaense', 720964, 721099)]

Rv0487
GGTTAGGTCGAGCCCGACGACGATGCAGAGCGCGCAGCGCGATGAGAAGGAGTTGGGCGGTTAGGTCGAGCCCGACGACGATGC

In [71]:
genome_record = next(SeqIO.parse(seq_dir + '/' + reference_species_filename, "genbank"))
features = []
for feature in genome_record.features:
      if not(feature.type in ['CDS','source']):
        if 'note' in feature.qualifiers:
            a = feature.qualifiers['note']
        else:
            a = ''
        features.append((feature.type, a, int(feature.location.start), int(feature.location.end)))
           

In [95]:
with open(r_scape_output_loc + '/' + 'Rv1668c' + '/summary.txt', 'r') as f:  
                for l in f:
                    if (not ('#' in l)):
                        a = l.split()
                        accession = a[0]
                        species = names_dict[a[0]]
                        if species == reference_species_name:
                            seq_from = int(a[7])
                            seq_to = int(a[8])
                            seq_strand = a[9]
                            significant = a[16]
                            start = min(seq_from,seq_to)
                            stop = max(seq_from,seq_to)
                            if (significant == '!'):
                                print(a)
                                for feature in features:
                                    if feature[2] < stop and feature[3] > start:
                                        print(feature)

['NC_000962.3', '-', 'rscape_2', '-', 'cm', '1', '133', '3351086', '3351216', '+', 'no', '1', '0.76', '11.4', '123.4', '2.4e-29', '!', '-']
['NC_000962.3', '-', 'rscape_2', '-', 'cm', '1', '133', '1907449', '1907582', '+', 'no', '1', '0.63', '0.1', '99.7', '2e-22', '!', '-']
('repeat_region', ['56 bp direct repeat 1, AGTCGGGTGACGATGCGGGCCGGTGTGGTCCGAGGAGGAGCCCGACAATTTAAGCT'], 1907459, 1907515)
('repeat_region', ['56 bp direct repeat 2, AGTCGGGTGACGATGCGGGCCGGTGTGGTCCGAGGAGGAGCCCGACAATTTAAGCT'], 1907515, 1907571)
['NC_000962.3', '-', 'rscape_2', '-', 'cm', '1', '133', '3690943', '3691077', '+', 'no', '1', '0.67', '1.8', '97.5', '8.9e-22', '!', '-']
['NC_000962.3', '-', 'rscape_2', '-', 'cm', '12', '115', '3594452', '3594349', '-', 'no', '1', '0.70', '2.3', '89.0', '2.7e-19', '!', '-']
['NC_000962.3', '-', 'rscape_2', '-', 'cm', '1', '133', '4087618', '4087487', '-', 'no', '1', '0.73', '5.6', '88.5', '3.6e-19', '!', '-']
('gene', '', 4087609, 4088188)
['NC_000962.3', '-', 'rscape_2', '-'

In [96]:
hits = []
for k, v in hit_dict.items():
    hits.append((k, len(v)))
print(sorted(hits, key= lambda item: item[1], reverse = True))

[('Mycobacterium canettii CIPT 140010059', 90), ('Mycobacterium cookii', 70), ('Mycobacterium spongiae', 68), ('Mycobacterium tuberculosis variant bovis BCG str. Pasteur 1173P2', 62), ('Mycobacterium paraterrae', 62), ('Mycobacterium tuberculosis variant bovis AF2122/97', 56), ('Mycobacterium tuberculosis H37Rv', 54), ('Mycobacterium haemophilum DSM 44634', 48), ('Mycobacterium shinjukuense', 46), ('Mycobacterium lacus', 40), ('Mycobacterium kansasii ATCC 12478', 40), ('Mycobacterium intracellulare ATCC 13950', 32), ('Mycobacterium paraintracellulare', 30), ('Mycobacterium ostraviense', 30), ('Mycobacterium basiliense', 30), ('Mycobacterium malmoense', 30), ('Mycobacterium heckeshornense', 28), ('Mycobacterium noviomagense', 26), ('Mycobacterium marseillense', 24), ('Mycobacterium mantenii', 24), ('Mycobacterium simiae', 22), ('Mycobacterium paragordonae', 20), ('Mycobacterium vicinigordonae', 18), ('Mycobacterium marinum', 12), ('Mycobacterium seoulense', 12), ('Mycobacterium shigaens

In [92]:
hit_dict[reference_species_name]

[('Downstream gene', 'Rv2993c', '7.5e-26', 1.0, 3351212, 3351094),
 ('Downstream gene', 'Rv2994', '7.5e-26', 1.0, 3351212, 3351094),
 ('Downstream gene', 'Rv3303c', '5.9e-18', 1.0, 3691073, 3690951),
 ('Downstream gene', 'Rv3304', '5.9e-18', 1.0, 3691073, 3690951),
 ('Downstream gene',
  'Rv3646c',
  '3.5e-16',
  0.9833333333333333,
  4087491,
  4087611),
 ('Upstream gene', 'Rv3647c', '3.5e-16', 0.9833333333333333, 4087491, 4087611),
 ('Upstream gene', 'Rv1682', '1.6e-15', 1.0, 1907578, 1907460),
 ('Downstream gene', 'Rv1683', '1.6e-15', 1.0, 1907578, 1907460),
 ('Upstream gene', 'Rv2405', '2.5e-14', 0.8760330578512396, 2704023, 2703902),
 ('Upstream gene', 'Rv2406c', '2.5e-14', 0.8760330578512396, 2704023, 2703902),
 ('Upstream gene', 'Rv3401', '6.4e-14', 0.9217391304347826, 3820508, 3820393),
 ('Upstream gene',
  'NO_ORTHOLOG',
  '6.4e-14',
  0.9217391304347826,
  3820508,
  3820393),
 ('Upstream gene', 'Rv1155', '8.3e-14', 1.0, 1281892, 1282018),
 ('Downstream gene', 'Rv1156', '8.3e

In [186]:
genome_record = next(SeqIO.parse(seq_dir + '/' + reference_species_filename, "genbank"))
full_sequence = str(genome_record.seq)

In [25]:
util.reverse_complement(full_sequence[2401818:2401924])

'TCGGCGACGATGCGCCCCGGGTAACGGGGTGAGGAGGAGCCGGGCAATCAAATCGAGCTCGGCGACGATGCGCCCCGGGTAACGGGGTGAGGAGGAGCCAGGCAAT'

In [187]:
full_sequence[2522172:2522230]

'GCGAGCAGACGCAGAATCGCACGCGCGAGGTCCGCGCCGTGCGATTCTGCGTCTGCTC'

In [23]:
util.reverse_complement(full_sequence[960173:960263])

'GACGATGCAGAGCGCAGCGATGAGGAGGAGCGGCGCCATTGACTACCGCCGGCGACGATGCAGAGCGCAGCGATGAGGAGGAGCGGCGCC'

In [None]:
util.reverse_complement(full_sequence[2314661:2314825]) 

In [None]:
tr = util.Translator()

In [None]:
names_dict

In [None]:
tr.translate_sequence(full_sequence[2314660:2314825],-1,0)

In [3]:
orf_finder = orffn.ORF_Finder()

In [4]:
orf_finder.max_orf(123700, 124000, 100) 

(123860, 123983, -1, 120, 0.05438814266846015)

In [None]:
temp_dict = {}
for i, r in rbh_results.iterrows():
    if r['query_ref'] == 'NC_000962.3@Rv0794c':
        print(r)

In [65]:
temp_dict['NC_000962.3@Rv0794c']

'NZ_LT985188.1@MPLG2_RS04790'

In [63]:
i =0
for k,v in temp_dict.items():
    print(k)
    i+=1
    if i>10:
        break

NC_000962.3@Rv0001
NC_000962.3@Rv0002
NC_000962.3@Rv0003
NC_000962.3@Rv0004
NC_000962.3@Rv0005
NC_000962.3@Rv0006
NC_000962.3@Rv0007
NC_000962.3@Rv0008c
NC_000962.3@Rv0009
NC_000962.3@Rv0010c
NC_000962.3@Rv0011c


In [54]:
genome_record = next(SeqIO.parse(seq_dir + '/' + reference_species_filename, "genbank"))
features = []
for feature in genome_record.features:
    if not(feature.type in ['gene','CDS','source']):
        if 'repeat' in feature.type:
            #print (feature)
            #print(feature.location.strand)
            print (feature.qualifiers['note'])
            if int(feature.location.strand) == 1:
                print (full_sequence[feature.location.start - 1: feature.location.end])
            else:
                print(util.reverse_complement(full_sequence[feature.location.start - 1: feature.location.end]))
            print(" ")

['101 bp Mycobacterial Interspersed Repetitive Unit,Class I. See Supply et al. (1997) Molecular Microbiology 26, 991-1003']
GTGACCCGCGCTGGCGACGATGCAGAGCGCAGAGAAGCGGTGGGGGCGCGCCCCCACAAGTGGGGGGTACCCCCACCCGCTTGCGGGGGAGAGTGGCGCGCG
 
['5 x 9 bp GTGGACCCG repeats']
GGTGGACCCGGTGGACCCGGTGGACCCGGTGGACCCGGTGGACCCG
 
["(MTV030.15), len: 315 nt. Probable REP'-1 pseudogene fragment"]
AAAGTCGCACGTCCGGTTCGAAGGGCGGCCACGGGAAACGGACCCGCAGCAACGCGGGCACCGCACCCATGGTCGACCCAACTGCCACGCACCCGGTGACCGGTGCGAAGTCCACCATATCGACCAGTGGGCAACCGGCGGCTCAACCGATATCGACAAACTCACCTTCACCTGCACACCCAACCACAAGCTAGTCGGGAAAGGCTGGCAGACAAGGAAACGGTCCGACGGCCAAACGGAATGGATCCCGCCACCCCACCTCGACCGCGGTGCCCACACCAACGACTACCACCACCCCGAACGCCTCTTCGACCAC
 
['REP-2, len: 1503 nt. REP251, member of REP13E12 family.']
GTGCGGTACCTTCCCGTATCAACTCGTAGGATCTGGGTAAATCCCTTGTGTCACTTCAGTTTCACGGTTATCAGCGGGGCGCTCTTTGTCAGTGCCCGACGTTATGATTCGAACATGTTAGCGAATAGCCGGGAGGAGCTTGTCGAGGTCTTCGACGCGCTGGATGCCGACCTGGACCGCTTGGACGAGGTGTCCTTTGAGGTGCTGAGCACCCCGGAACGGCTGCGGTCTCTGGAACGTCTGGAA

In [176]:
with open(r_scape_output_loc + '/' + 'Rv1324' + '/summary.txt', 'r') as f:  
                hit_dict = {}
                ref_count = 0
                tot_count = 0
                hits = []
                for l in f:
                    if (not ('#' in l)):
                        a = l.split()
                        accession = a[0]
                        species = names_dict[a[0]]
                        seq_from = int(a[7])
                        seq_to = int(a[8])
                        seq_strand = a[9]
                        significant = a[16]
                        start = min(seq_from,seq_to)
                        stop = max(seq_from,seq_to)
                        if (significant == '!') and accession == 'NC_000962.3':
                            hits.append((seq_from, seq_to,seq_strand, significant))
print(sorted(hits, key=lambda item: item[0]))


[(183291, 183230, '-', '!'), (234448, 234506, '+', '!'), (234504, 234446, '-', '!'), (253604, 253665, '+', '!'), (253663, 253602, '-', '!'), (279535, 279595, '+', '!'), (279594, 279534, '-', '!'), (456202, 456262, '+', '!'), (456261, 456201, '-', '!'), (459447, 459387, '-', '!'), (558819, 558879, '+', '!'), (558877, 558817, '-', '!'), (663390, 663451, '+', '!'), (663449, 663388, '-', '!'), (736233, 736293, '+', '!'), (736292, 736232, '-', '!'), (755266, 755327, '+', '!'), (755325, 755265, '-', '!'), (767325, 767385, '+', '!'), (767384, 767323, '-', '!'), (786020, 786087, '+', '!'), (786085, 786019, '-', '!'), (806173, 806233, '+', '!'), (806231, 806171, '-', '!'), (815637, 815698, '+', '!'), (815696, 815635, '-', '!'), (829707, 829768, '+', '!'), (829766, 829705, '-', '!'), (842303, 842364, '+', '!'), (842362, 842301, '-', '!'), (958458, 958519, '+', '!'), (958517, 958456, '-', '!'), (1000772, 1000832, '+', '!'), (1000831, 1000771, '-', '!'), (1006619, 1006680, '+', '!'), (1006678, 100

In [191]:
with open(r_scape_output_loc + '/' + 'Rv1324' + '/search_3.sto', 'r') as f:  
                for l in f:
                    if 'NC_000962.3' in l and not ('#' in l):
                        print(l)

NC_000962.3/1488085-1488146           GC.G.A.GC.A.GACGC.AG.A.A.U.C..G.C..C.U..A.A.A.C.CC.G.C.A.C.......................G.G.G.U.U.U.A..G.G.C.G.A.U.U.CU.GCGUC.U.G...CU...C.G.C.GC

NC_000962.3/4021453-4021392           GC.G.A.GC.A.GACGC.AG.A.A.U.C..G.C..A.U..G.A.U.U.UG.A.G.C.U.......................C.A.A.A.U.C.A..U.G.C.G.A.U.U.CU.GCGUC.U.G...CU...C.G.C.GC

NC_000962.3/1828800-1828861           GC.G.A.GC.A.GACGC.AA.A.A.U.C..G.C..C.C..A.U.U.U.CG.U.A.C.C.......................C.G.A.A.A.U.G..G.G.C.G.A.U.U.UU.GCGUC.U.G...CU...C.G.C.GG

NC_000962.3/3724613-3724552           GC.G.A.GC.A.GACGC.AA.A.A.U.C..G.C..C.C..A.A.U.U.UC.G.U.G.C.......................C.G.A.A.A.U.G..G.G.C.G.A.U.U.UU.GCGUC.U.G...CU...C.G.C.GC

NC_000962.3/3707572-3707633           GC.G.A.GC.A.GACGC.AA.A.A.U.C..G.C..C.C..G.A.A.A.AC.C.A.G.U.......................G.G.U.U.U.U.G..G.G.C.G.A.U.U.UU.GCGUC.U.G...CU...C.G.C.GC

NC_000962.3/4021394-4021455           GC.G.A.GC.A.GACGC.AG.A.A.U.C..G.C..A.U..G.A.U.U.UG.A.G.C.U..............