In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None  
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
import time
from tqdm.auto import tqdm
import random
import copy
from joblib import Parallel, delayed
import os
import shutil
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
import re
import shutil
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import Blast_Functions as blastfn
from scipy.stats import chi2
from IPython import display

num_cores = 16
core_numbers = list(range(1, num_cores+1))
project_dir = 'F:/Project_Data/Mites_Project_2024'
output_dir = project_dir + '/Output'
seq_dir = 'F:/Datasets/Actinobacteria_Ref_Rep_Lev_Complete'
tb_species = 'NC_000962.3' 
tb_genome_filename = 'GCF_000195955.2_ASM19595v2_genomic.gbff'


In [3]:
def reverse_complement(seq_string):
    complement_dict = {'A':'T','C':'G','G':'C','T':'A'}
    temp = []
    for char in reversed(seq_string):
        if char in complement_dict:
            temp.append(complement_dict[char])
        else:
            return('')
    return ''.join(temp)

In [12]:
regions_considered = []
temp =[]
for record in SeqIO.parse(seq_dir + '/' + tb_genome_filename, "genbank"):
    full_sequence = str(record.seq)
    features = []
    for feature in record.features:
        a = feature.qualifiers
        features.append([' ', int(feature.location.start), int(feature.location.end), int(feature.location.strand)])
    features.sort(key=lambda x: x[1])

    for i, feature in enumerate(features):
        exception = 1
        if feature[1] < feature[2]:  
            if feature[3] == 1:
                if i > 0 and feature[1] > features[i-1][2]:
                    exception = 0
                    utr_coords = (features[i-1][2], feature[1])
                    utr_sequence = full_sequence[features[i-1][2]: feature[1]]
                    utr_length = len(utr_sequence)
            else:
                if i + 1 < len(features) and feature[2] < features[i+1][1]:
                    exception = 0
                    utr_coords = (feature[2], features[i+1][1])
                    utr_sequence =  reverse_complement(full_sequence[feature[2]: features[i+1][1]])
                    utr_length = len(utr_sequence)
        if exception == 1:
            utr_coords = (0,0)
            utr_sequence = ''
            utr_length = 0 
        if utr_length < 7:
            continue
        else:
            if utr_coords in regions_considered:
                continue
            else:
                temp.append([str(utr_coords[0])+'-'+str(utr_coords[1]), utr_sequence])
                regions_considered.append(utr_coords)
util.produce_fasta_file(temp, output_dir + '/' + 'upstream_sections.faa')

100%|██████████| 2429/2429 [00:00<00:00, 77884.28it/s]


In [7]:
min_width = 5
min_sites = 5

In [11]:
subprocess.run('wsl export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.4.1:usr/bin:$PATH ; meme \"/mnt/f/Project_Data/Mites_Project_2024/Output/upstream_sections.faa" -oc \"/mnt/f/Project_Data/Mites_Project_2024/Output/upstream_result" -dna -evt 0.01 -mod zoops -brief 4000 -minw ' + str(min_width) +' -maxw 200 -minsites ' + str(min_sites) , shell=True)
      

CompletedProcess(args='wsl export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.4.1:usr/bin:$PATH ; meme "/mnt/f/Project_Data/Mites_Project_2024/Output/upstream_sections.faa" -oc "/mnt/f/Project_Data/Mites_Project_2024/Output/upstream_result" -dna -evt 0.01 -mod zoops -brief 4000 -minw 5 -maxw 200 -minsites 5', returncode=0)

In [9]:
len(temp)

2429

In [49]:
subprocess.run('wsl export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.4.1:usr/bin:$PATH ; streme --p "/mnt/f/Project_Data/Project_9/Output/upstream_sections.faa" -oc \"/mnt/d/Project_Data/Project_9/Output/upstream_result" -dna -evalue 0.01 -minw ' + str(min_width)  , shell=True)


CompletedProcess(args='wsl export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.4.1:usr/bin:$PATH ; streme --p "/mnt/d/Project_Data/Project_9/Output/upstream_sections.faa" -oc "/mnt/d/Project_Data/Project_9/Output/upstream_result" -dna -evalue 0.01 -minw 5', returncode=0)

In [55]:
temp =[]
for record in SeqIO.parse(seq_dir + '/' + tb_genome_filename, "genbank"):
    full_sequence = str(record.seq)[0:50000]
    temp.append(['NU_00001', full_sequence])
util.produce_fasta_file(temp, output_dir + '/' + 'nu_00001.fasta')

100%|██████████| 1/1 [00:00<00:00, 973.38it/s]
