#### Directories etc

In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.fft import fft, fftfreq
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import random
from tqdm import tqdm
from Comparative_Analysis import Utilities as util
from collections import defaultdict
import random
from joblib import Parallel, delayed
from Comparative_Analysis import Blast_Functions as blastfn
import time
import os
import copy
from Comparative_Analysis import ORF_Functions as orffn
from random import sample
pd.options.mode.chained_assignment = None  # default='warn'
import ete3
import matplotlib.pyplot as plt
from statistics import mode
from scipy.stats import chi2
import subprocess



In [2]:
# Locations of project outputs and of the input datasets used by later cells.
project_dir = 'F:/Project_Data/Project_11'
seq_dir = 'F:/Datasets/Actinobacteria_Ref_Rep_Lev_Complete'
dictionary_dir = 'F:/Datasets/CRYPTIC_DATA/Cryptic_Dictionaries'
mycobrowser_dir = 'F:/Datasets/Data_From_Publications'

# Reference genome identifier and the GenBank file (inside seq_dir) it is read from.
tb_species = 'NC_000962.3'
tb_genome_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'

# Parallelism settings: core ids are numbered 1..num_cores inclusive.
num_cores = 16
core_numbers = [core_id for core_id in range(1, num_cores + 1)]

In [3]:
# Read the reference GenBank file and keep its sequence as a plain string.
# If the file holds several records, the last one parsed is the one retained.
genome_path = seq_dir + '/' + tb_genome_filename
for record in SeqIO.parse(genome_path, "genbank"):
    full_sequence = str(record.seq)

In [4]:
# Load the COMPASS region table. Column names are supplied here, so the first
# line of the file is read as data rather than as a header.
compass_df = pd.read_csv('F:/Datasets/CRYPTIC_DATA/COMPASS.csv', names = ['species', 'start', 'stop','count'])

# compass_dict maps every position inside a COMPASS region to 'Y'; membership
# in this dict is what later cells use as the "masked" test.
compass_dict = {}
for region_start, region_stop in zip(compass_df['start'], compass_df['stop']):
    # NOTE: arguably the range should begin at start-1, but doing so appears
    # to be inconsistent with the mutation coordinates, so start is used as-is.
    compass_dict.update(dict.fromkeys(range(region_start, region_stop), 'Y'))

In [5]:
# Load the Mycobrowser annotation sheet and keep only the CDS features.
mycobrowser_df = pd.read_excel(mycobrowser_dir+'/Mycobrowser_Release_4.xlsx')
cds_columns = ['Locus','Start','Stop','Strand','Product','Name','Functional_Category','Gene Ontology']
temp_cds = mycobrowser_df[mycobrowser_df['Feature'] == 'CDS'][cds_columns]      #Just CDS

# One tuple per CDS:
#   (locus, product, start, stop, strand, name, functional_category, gene_ontology)
# Start is shifted down by one and Stop is kept unchanged; strand is encoded as
# 1 for '+' and -1 for anything else. The list is ordered by the shifted start.
mycobrowser_cds_boundaries = sorted(
    (
        (
            row['Locus'],
            row['Product'],
            row['Start'] - 1,
            row['Stop'],
            1 if row['Strand'] == '+' else -1,
            row['Name'],
            row['Functional_Category'],
            row['Gene Ontology'],
        )
        for _, row in temp_cds.iterrows()
    ),
    key=lambda entry: entry[2],
)

In [7]:
# For every CDS whose product text contains 'PE', compute the fraction of its
# positions that are present in compass_dict, then write the table to CSV.
temp = []
for cds in mycobrowser_cds_boundaries:
    rv_ref, description, cds_begin, cds_end, cds_strand = cds[:5]
    if 'PE' not in description:
        continue
    span = range(cds_begin, cds_end)
    masked_positions = sum(1 for position in span if position in compass_dict)
    temp.append([rv_ref, description, cds_begin, cds_end, cds_strand, masked_positions / len(span)])

coverage_columns = ['Rv_ref','Description','Start','Stop','Strand','Pct_Masked']
coverage_df = pd.DataFrame(temp, columns = coverage_columns)
coverage_df.to_csv(project_dir +'/compass_PE_coverage.csv')

In [8]:
# Load the pickled sequence of mutation counts produced by an earlier run.
# NOTE: unpickling executes code embedded in the file; only load pickles that
# were generated by this project.
mutation_counts_path = project_dir + '/Thoth_Full_Run/zero_and_non_zero_mutation_counts.pkl'
with open(mutation_counts_path, 'rb') as handle:
    full_sample_zero_and_non_zero_mutation_counts = pickle.load(handle)

In [16]:
# One 0/1 flag per index that is NOT in compass_dict:
# 1 when the count at that index is above zero, 0 otherwise.
unmasked_mutations = [
    1 if count > 0 else 0
    for position, count in enumerate(full_sample_zero_and_non_zero_mutation_counts)
    if position not in compass_dict
]

In [17]:
# Fraction of all entries whose mutation count is above zero.
entries_with_mutation = sum(1 for x in full_sample_zero_and_non_zero_mutation_counts if x > 0)
entries_with_mutation / len(full_sample_zero_and_non_zero_mutation_counts)

0.4514121171511393

In [18]:
# Fraction of the unmasked entries that carry a non-zero flag.
unmasked_with_mutation = sum(1 for x in unmasked_mutations if x > 0)
unmasked_with_mutation / len(unmasked_mutations)

0.48730925587553936

In [None]:
def score_region(mutationcounts, start):
    """Score a region from its per-position mutation counts, triplet by triplet.

    The counts are walked in consecutive groups of three; any trailing one or
    two entries that do not fill a group are ignored. A group contributes only
    when its three counts sum to less than 10 and none of its positions
    (``start`` + offset within ``mutationcounts``) is a key of the global
    ``compass_dict``. Contributing groups are accumulated per offset-in-triplet.

    Parameters:
        mutationcounts: indexable sequence of counts for the region.
        start: coordinate of ``mutationcounts[0]``, used for the
            ``compass_dict`` lookups.

    Returns:
        ``bin_formula(total_at_third_offset, total_over_all_offsets)``.
        ``bin_formula`` is defined outside this block.
    """
    position_totals = [0, 0, 0]
    triplet_count = len(mutationcounts) // 3
    for triplet_index in range(triplet_count):
        base = 3 * triplet_index
        triplet = [mutationcounts[base + m] for m in range(3)]
        touches_mask = any((start + base + m) in compass_dict for m in range(3))
        # Keep the "< 10" comparison as written so non-comparable values
        # (e.g. NaN) are excluded exactly as before.
        if sum(triplet) < 10 and not touches_mask:
            position_totals = [total + count for total, count in zip(position_totals, triplet)]
    return bin_formula(position_totals[2], sum(position_totals))

#### Mycobrowser annotated CDS regions

In [None]:
# Score every annotated CDS and print those whose score exceeds 0.999.
# CDSs for which compass_coverage(start, stop) is above 0.5 are skipped.
for (locus, product, start, stop, strand, name, functional_category, gene_ontology) in (mycobrowser_cds_boundaries):
    if compass_coverage(start, stop) > 0.5:
        continue
    mutation_counts = full_sample_zero_and_non_zero_mutation_counts
    # Slice the CDS out using its forward coordinates FIRST, then flip it for
    # reverse-strand genes so index 0 is the first base in reading direction.
    # (Reversing the whole genome array and then slicing with forward
    # coordinates, as was done before, selects an unrelated region.)
    cds_counts = mutation_counts[start:stop]
    if strand != 1:
        cds_counts = list(reversed(cds_counts))
    # NOTE(review): score_region checks compass_dict at start + offset, i.e. in
    # forward order; for reverse-strand genes the per-triplet mask test is
    # therefore applied to the mirrored triplet within the CDS - confirm
    # whether score_region should take the strand into account.
    score = score_region(cds_counts, start)
    if score > 0.999:
        print(locus, start, stop, product, score)

In [None]:
# Histogram of the per-CDS minimum window scores (cds_min_scores is filled by
# the per-CDS window-scoring cell).
sns.histplot(data=cds_min_scores)

In [None]:
# Score the first 1524 positions. The slice begins at index 0, so 0 is passed
# as the region start to keep the compass_dict lookups aligned with the slice
# (previously a leftover loop variable `start` from another cell was used).
score_region(full_sample_zero_and_non_zero_mutation_counts[0:1524], 0)

In [None]:
def logfn(x):
    """Return the natural log of ``x``, or the sentinel -99 when ``x`` is not above zero."""
    return math.log(x) if x > 0 else -99
# Slide a fixed-length window along the mutation counts one position at a time
# and record (window start, log of the window's score) for each start.
window_length =20
scores = []
window_start_limit = len(full_sample_zero_and_non_zero_mutation_counts) - window_length
for start in tqdm(range(0, window_start_limit)):
    window_counts = full_sample_zero_and_non_zero_mutation_counts[start:(start+window_length)]
    window_score = score_region(window_counts, start)
    scores.append((start, logfn(window_score)))

In [None]:
# Tabulate the window scores; 'offset' is the window start position modulo 3.
temp = [[pos, score, pos % 3] for (pos, score) in scores]
scores_df = pd.DataFrame(temp, columns = ['position', 'score', 'offset'])

In [None]:
# Scatter the window scores for one stretch of positions, coloured by offset.
plot_rows = scores_df.query("position > 2837388 and position < 2837615")
sns.scatterplot(data=plot_rows, x='position', y='score', hue='offset', palette="Accent", s=15)

In [None]:
# For every CDS, score consecutive 30-position windows (stepping by 3 from the
# CDS start) and keep the lowest window score; then plot the distribution.
# CDSs for which compass_coverage(start, stop) is above 0.5 are skipped.
cds_min_scores = []
for (locus, product, start, stop, strand, name, functional_category, gene_ontology) in tqdm(mycobrowser_cds_boundaries):
    if compass_coverage(start, stop) > 0.5:
        continue
    mutation_counts = full_sample_zero_and_non_zero_mutation_counts
    # Kept in its own list so the global `scores` built by the sliding-window
    # cell is not overwritten.
    window_scores = []
    for offset in range(start, stop-30, 3):
        # Slice with forward coordinates first, then flip the window for
        # reverse-strand genes. (Reversing the whole genome array and slicing
        # it with forward coordinates selects an unrelated region.)
        window_counts = mutation_counts[offset:offset+30]
        if strand != 1:
            window_counts = list(reversed(window_counts))
        # NOTE(review): score_region tests compass_dict at offset + index in
        # forward order, so for reverse-strand windows the mask test is applied
        # to the mirrored triplet within the window - confirm whether
        # score_region should take the strand into account. The reverse-strand
        # windows are in frame only when (stop - start) is a multiple of 3.
        window_scores.append(score_region(window_counts, offset))
    # A CDS too short to yield any window has no minimum; skip it instead of
    # letting min() raise ValueError on an empty list.
    if window_scores:
        cds_min_scores.append(min(window_scores))
sns.histplot(cds_min_scores)