Skip to content

Commit

Permalink
Corrected version formating, stripped non-ascii characters from input…
Browse files Browse the repository at this point in the history
… files, added error checking for numerical calculatins in host-range
  • Loading branch information
jamespocalypse committed Nov 15, 2019
1 parent 69b833d commit 85fdf75
Show file tree
Hide file tree
Showing 24 changed files with 15,108 additions and 12,508 deletions.
Empty file added build/lib/mob_suite/__init__.py
Empty file.
145 changes: 145 additions & 0 deletions build/lib/mob_suite/blast/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
from datetime import datetime
import logging
import shutil

from subprocess import Popen, PIPE
import os

import pandas as pd
from pandas.io.common import EmptyDataError
import re


BLAST_TABLE_COLS = '''
qseqid
sseqid
qlen
slen
qstart
qend
sstart
send
length
mismatch
pident
qcovhsp
qcovs
sstrand
evalue
bitscore
'''.strip().split('\n')


class BlastRunner:

def __init__(self, fasta_path, tmp_work_dir):
self.fasta_path = fasta_path

def makeblastdb(self,fasta_path,dbtype):
p = Popen(['makeblastdb',
'-in', fasta_path,
'-dbtype',dbtype],
stdout=PIPE,
stderr=PIPE)
p.wait()
stdout = p.stdout.read()
stderr = p.stderr.read()



def run_tblastn(self, query_fasta_path, blast_task, db_path, db_type, min_cov, min_ident, evalue,blast_outfile,num_threads=1):

p = Popen(['tblastn',
'-query', query_fasta_path,
'-num_threads','{}'.format(num_threads),
'-db', '{}'.format(db_path),
'-evalue', '{}'.format(evalue),
'-out', blast_outfile,
'-outfmt', '6 {}'.format(' '.join(BLAST_TABLE_COLS))],
stdout=PIPE,
stderr=PIPE)

p.wait()
stdout = p.stdout.read()
stderr = p.stderr.read()
if stdout is not None and stdout != '':
logging.debug('blastn on db {} and query {} STDOUT: {}'.format(query_fasta_path, db_path, stdout))

if stderr is not None and stderr != '':
logging.debug('blastn on db {} and query {} STDERR: {}'.format(query_fasta_path, db_path, stderr))
if os.path.exists(blast_outfile):
return blast_outfile
else:
ex_msg = 'tblastn on db {} and query {} did not produce expected output file at {}'.format(
query_fasta_path,
db_path,
blast_outfile)
logging.error(ex_msg)
raise Exception(ex_msg)

def run_blast(self, query_fasta_path, blast_task, db_path, db_type, min_cov, min_ident, evalue,blast_outfile,num_threads=1,word_size=11):

p = Popen(['blastn',
'-task', blast_task,
'-query', query_fasta_path,
'-db', '{}'.format(db_path),
'-num_threads','{}'.format(num_threads),
'-evalue', '{}'.format(evalue),
'-dust', 'yes',
'-perc_identity', '{}'.format(min_ident),
'-out', blast_outfile,
'-outfmt', '6 {}'.format(' '.join(BLAST_TABLE_COLS))],
stdout=PIPE,
stderr=PIPE)

p.wait()
stdout = p.stdout.read()
stderr = p.stderr.read()
if stdout is not None and stdout != '':
logging.debug('blastn on db {} and query {} STDOUT: {}'.format(query_fasta_path, db_path, stdout))

if stderr is not None and stderr != '':
logging.debug('blastn on db {} and query {} STDERR: {}'.format(query_fasta_path, db_path, stderr))
if os.path.exists(blast_outfile):
return blast_outfile
else:
ex_msg = 'blastn on db {} and query {} did not produce expected output file at {}'.format(
query_fasta_path,
db_path,
blast_outfile)
logging.error(ex_msg)
raise Exception(ex_msg)


class BlastReader:
df = None


def __init__(self, blast_outfile):
"""Read BLASTN output file into a pandas DataFrame
Sort the DataFrame by BLAST bitscore.
If there are no BLASTN results, then no results can be returned.
Args:
blast_outfile (str): `blastn` output file path
Raises:
EmptyDataError: No data could be parsed from the `blastn` output file
"""
self.blast_outfile = blast_outfile
try:

self.df = pd.read_table(self.blast_outfile, header=None)
self.df.columns = BLAST_TABLE_COLS

logging.debug(self.df.head())
self.is_missing = False

except EmptyDataError as exc:
logging.warning('No BLASTN results to parse from file %s', blast_outfile)
self.is_missing = True
self.df = pd.DataFrame(index=['A'], columns='A')


def df_dict(self):
if not self.is_missing:
return self.df.to_dict()

138 changes: 138 additions & 0 deletions build/lib/mob_suite/blast_best_hits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#!/usr/bin/env python
import logging, os, sys
from argparse import (ArgumentParser, FileType)
from mob_suite.blast import BlastReader


def parse_args():
"Parse the input arguments, use '-h' for help"
parser = ArgumentParser(description='Filter overlapping queries')
parser.add_argument('--infile', type=str, required=True, help='Input file to process')
parser.add_argument('--outdir', type=str, required=True, help='Output directory')
parser.add_argument('--min_overlap', type=str, required=False, help='Minimum bp overlap', default=5)
return parser.parse_args()

def filter_overlaping_records(blast_df, overlap_threshold,contig_id_col,contig_start_col,contig_end_col,bitscore_col):
prev_contig_id = ''
prev_index = -1
prev_contig_start = -1
prev_contig_end = -1
prev_score = -1
filter_indexes = list()
exclude_filter = dict()


for index, row in blast_df.iterrows():
contig_id = row['sseqid']
contig_start = row['sstart']
contig_end = row['send']
score = row['bitscore']

if prev_contig_id == '':
prev_index = index
prev_contig_id = contig_id
prev_contig_start = contig_start
prev_contig_end = contig_end
prev_score = score
continue

if contig_id != prev_contig_id:
prev_index = index
prev_contig_id = contig_id
prev_contig_start = contig_start
prev_contig_end = contig_end
prev_score = score
continue

if (contig_start >= prev_contig_start and contig_start <= prev_contig_end) or (contig_end >= prev_contig_start and contig_end <= prev_contig_end):
overlap = abs(contig_start - prev_contig_end)
if overlap > overlap_threshold:
if prev_score > score:
filter_indexes.append(index)
else:
filter_indexes.append(prev_index)


prev_index = index
prev_contig_id = contig_id
prev_contig_start = contig_start
prev_contig_end = contig_end
prev_score = score

for index in exclude_filter:
filter_indexes.append(index)
indexes = dict()
for i in blast_df.iterrows():
indexes[i[0]] = ''

blast_df.drop(filter_indexes, inplace=True)

return blast_df.reset_index(drop=True)


def fixStart(blast_df):
for index, row in blast_df.iterrows():
sstart = blast_df.at[index, 'sstart']
send = blast_df.at[index, 'send']
# print "{}\t{}".format(sstart,send)
if send < sstart:
temp = sstart
blast_df.at[index, 'sstart'] = send
blast_df.at[index, 'send'] = temp
# print "====>{}\t{}".format(self.blast_df.at[index, 'sstart'], self.blast_df.at[index, 'send'])
qstart = blast_df.at[index, 'qstart']
qend = blast_df.at[index, 'qend']
if qend < qstart:
temp = qstart
blast_df.at[index, 'qstart'] = qend
blast_df.at[index, 'qend'] = temp
return blast_df

def filter_blast(blast_results_file, min_ident, min_cov, evalue, overlap):
if os.path.getsize(blast_results_file) == 0:
return dict()
blast_df = BlastReader(blast_results_file).df
blast_df = blast_df.loc[blast_df['pident'] >= min_ident]
blast_df = blast_df.loc[blast_df['qcovhsp'] >= min_cov]
blast_df = fixStart(blast_df)
blast_df = blast_df.sort_values(['sseqid','sstart', 'send', 'bitscore'], ascending=[True, True, True, False])
blast_df = blast_df.reset_index(drop=True)
size = str(len(blast_df))
prev_size = 0
while size != prev_size:
blast_df = filter_overlaping_records(blast_df, overlap, 'sseqid', 'sstart', 'send', 'bitscore')
prev_size = size
size = str(len(blast_df))

return blast_df




def main():
logging.info('Running plasmid detector v. {}'.format('0.1'))
args = parse_args()
if not args.infile:
logging.info('Error, no blast file specified, please specify one')
sys.exit()
if not args.outdir:
logging.info('Error, no output directory specified, please specify one')
sys.exit()
if not os.path.isdir(args.outdir):
os.mkdir(args.outdir, 0o755)

blast_file = args.infile
base_file_name = os.path.splitext(os.path.basename(blast_file))[0]
out_dir = args.outdir
blast_results_file = os.path.join(out_dir, base_file_name+'_blast_results.txt')
processed_blast_results = filter_blast(blast_file, 95, 95, 0.00001, 5)
if isinstance(processed_blast_results,dict):
results_fh = open(blast_results_file, 'w')
results_fh.write('')
results_fh.close()
else:
processed_blast_results.to_csv(blast_results_file, sep='\t', header=True, line_terminator='\n', index=False)



main()
Empty file.
84 changes: 84 additions & 0 deletions build/lib/mob_suite/classes/mcl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env python

from subprocess import Popen, PIPE
import os

class mcl:


def __init__(self, blast_results_file, working_dir,inflation=1.5):
blast_abc = os.path.join(working_dir, 'seq.abc')
cols = [0,1,14]
self.prep_blast(blast_results_file, blast_abc, cols)
mci_outfile = os.path.join(working_dir, 'seq.mci')
tab_outfile = os.path.join(working_dir, 'seq.tab')
clust_outfile = os.path.join(working_dir, 'seq.clust')
self.mcxload(blast_abc,mci_outfile, tab_outfile)
self.run_mcl(mci_outfile,tab_outfile,clust_outfile,inflation,1)
self.clusters = self.parse_mcl(clust_outfile)

def getclusters(self):
return self.clusters


def prep_blast(self,blast_results_file,outfile,col_numbers):
with open(blast_results_file) as f:
content = f.readlines()
content = [x.strip() for x in content]
f.close()

with open(outfile,'w') as f:
for line in content:
row = line.split("\t")
selection = list()
for col in col_numbers:
selection.append(row[col])
f.write("\t".join(selection) + "\n")
f.close()



def mcxload(self, blast_results_file,mci_outfile, tab_outfile):
p = Popen(['mcxload',
'-abc', blast_results_file,
'--stream-mirror',
'--stream-neg-log10',
'-o', mci_outfile,
'-write-tab', tab_outfile,],
stdout=PIPE,
stderr=PIPE)


p.wait()
stdout = p.stdout.read()
print(stdout)
stderr = p.stderr.read()
print(stderr)

def run_mcl(self,mci_outfile,tab_outfile,cluster_outfile,inflation,num_threads):
p = Popen(['mcl',
mci_outfile,
'-I',str(inflation),
'-use-tab', tab_outfile,
'-o',cluster_outfile],
stdout=PIPE,
stderr=PIPE)

p.wait()
stdout = p.stdout.read()
stderr = p.stderr.read()

def parse_mcl(self,cluster_outfile):
with open(cluster_outfile) as f:
content = f.readlines()
content = [x.strip() for x in content]
f.close()
count = 0
clusters = dict()
for line in content:
members = line.split("\t")
for member in members:
clusters[member] = count
count+=1
return clusters

Loading

0 comments on commit 85fdf75

Please sign in to comment.