-
Notifications
You must be signed in to change notification settings - Fork 32
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Corrected version formating, stripped non-ascii characters from input…
… files, added error checking for numerical calculatins in host-range
- Loading branch information
1 parent
69b833d
commit 85fdf75
Showing
24 changed files
with
15,108 additions
and
12,508 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
from datetime import datetime | ||
import logging | ||
import shutil | ||
|
||
from subprocess import Popen, PIPE | ||
import os | ||
|
||
import pandas as pd | ||
from pandas.io.common import EmptyDataError | ||
import re | ||
|
||
|
||
BLAST_TABLE_COLS = ''' | ||
qseqid | ||
sseqid | ||
qlen | ||
slen | ||
qstart | ||
qend | ||
sstart | ||
send | ||
length | ||
mismatch | ||
pident | ||
qcovhsp | ||
qcovs | ||
sstrand | ||
evalue | ||
bitscore | ||
'''.strip().split('\n') | ||
|
||
|
||
class BlastRunner: | ||
|
||
def __init__(self, fasta_path, tmp_work_dir): | ||
self.fasta_path = fasta_path | ||
|
||
def makeblastdb(self,fasta_path,dbtype): | ||
p = Popen(['makeblastdb', | ||
'-in', fasta_path, | ||
'-dbtype',dbtype], | ||
stdout=PIPE, | ||
stderr=PIPE) | ||
p.wait() | ||
stdout = p.stdout.read() | ||
stderr = p.stderr.read() | ||
|
||
|
||
|
||
def run_tblastn(self, query_fasta_path, blast_task, db_path, db_type, min_cov, min_ident, evalue,blast_outfile,num_threads=1): | ||
|
||
p = Popen(['tblastn', | ||
'-query', query_fasta_path, | ||
'-num_threads','{}'.format(num_threads), | ||
'-db', '{}'.format(db_path), | ||
'-evalue', '{}'.format(evalue), | ||
'-out', blast_outfile, | ||
'-outfmt', '6 {}'.format(' '.join(BLAST_TABLE_COLS))], | ||
stdout=PIPE, | ||
stderr=PIPE) | ||
|
||
p.wait() | ||
stdout = p.stdout.read() | ||
stderr = p.stderr.read() | ||
if stdout is not None and stdout != '': | ||
logging.debug('blastn on db {} and query {} STDOUT: {}'.format(query_fasta_path, db_path, stdout)) | ||
|
||
if stderr is not None and stderr != '': | ||
logging.debug('blastn on db {} and query {} STDERR: {}'.format(query_fasta_path, db_path, stderr)) | ||
if os.path.exists(blast_outfile): | ||
return blast_outfile | ||
else: | ||
ex_msg = 'tblastn on db {} and query {} did not produce expected output file at {}'.format( | ||
query_fasta_path, | ||
db_path, | ||
blast_outfile) | ||
logging.error(ex_msg) | ||
raise Exception(ex_msg) | ||
|
||
def run_blast(self, query_fasta_path, blast_task, db_path, db_type, min_cov, min_ident, evalue,blast_outfile,num_threads=1,word_size=11): | ||
|
||
p = Popen(['blastn', | ||
'-task', blast_task, | ||
'-query', query_fasta_path, | ||
'-db', '{}'.format(db_path), | ||
'-num_threads','{}'.format(num_threads), | ||
'-evalue', '{}'.format(evalue), | ||
'-dust', 'yes', | ||
'-perc_identity', '{}'.format(min_ident), | ||
'-out', blast_outfile, | ||
'-outfmt', '6 {}'.format(' '.join(BLAST_TABLE_COLS))], | ||
stdout=PIPE, | ||
stderr=PIPE) | ||
|
||
p.wait() | ||
stdout = p.stdout.read() | ||
stderr = p.stderr.read() | ||
if stdout is not None and stdout != '': | ||
logging.debug('blastn on db {} and query {} STDOUT: {}'.format(query_fasta_path, db_path, stdout)) | ||
|
||
if stderr is not None and stderr != '': | ||
logging.debug('blastn on db {} and query {} STDERR: {}'.format(query_fasta_path, db_path, stderr)) | ||
if os.path.exists(blast_outfile): | ||
return blast_outfile | ||
else: | ||
ex_msg = 'blastn on db {} and query {} did not produce expected output file at {}'.format( | ||
query_fasta_path, | ||
db_path, | ||
blast_outfile) | ||
logging.error(ex_msg) | ||
raise Exception(ex_msg) | ||
|
||
|
||
class BlastReader: | ||
df = None | ||
|
||
|
||
def __init__(self, blast_outfile): | ||
"""Read BLASTN output file into a pandas DataFrame | ||
Sort the DataFrame by BLAST bitscore. | ||
If there are no BLASTN results, then no results can be returned. | ||
Args: | ||
blast_outfile (str): `blastn` output file path | ||
Raises: | ||
EmptyDataError: No data could be parsed from the `blastn` output file | ||
""" | ||
self.blast_outfile = blast_outfile | ||
try: | ||
|
||
self.df = pd.read_table(self.blast_outfile, header=None) | ||
self.df.columns = BLAST_TABLE_COLS | ||
|
||
logging.debug(self.df.head()) | ||
self.is_missing = False | ||
|
||
except EmptyDataError as exc: | ||
logging.warning('No BLASTN results to parse from file %s', blast_outfile) | ||
self.is_missing = True | ||
self.df = pd.DataFrame(index=['A'], columns='A') | ||
|
||
|
||
def df_dict(self): | ||
if not self.is_missing: | ||
return self.df.to_dict() | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
#!/usr/bin/env python | ||
import logging, os, sys | ||
from argparse import (ArgumentParser, FileType) | ||
from mob_suite.blast import BlastReader | ||
|
||
|
||
def parse_args(): | ||
"Parse the input arguments, use '-h' for help" | ||
parser = ArgumentParser(description='Filter overlapping queries') | ||
parser.add_argument('--infile', type=str, required=True, help='Input file to process') | ||
parser.add_argument('--outdir', type=str, required=True, help='Output directory') | ||
parser.add_argument('--min_overlap', type=str, required=False, help='Minimum bp overlap', default=5) | ||
return parser.parse_args() | ||
|
||
def filter_overlaping_records(blast_df, overlap_threshold,contig_id_col,contig_start_col,contig_end_col,bitscore_col): | ||
prev_contig_id = '' | ||
prev_index = -1 | ||
prev_contig_start = -1 | ||
prev_contig_end = -1 | ||
prev_score = -1 | ||
filter_indexes = list() | ||
exclude_filter = dict() | ||
|
||
|
||
for index, row in blast_df.iterrows(): | ||
contig_id = row['sseqid'] | ||
contig_start = row['sstart'] | ||
contig_end = row['send'] | ||
score = row['bitscore'] | ||
|
||
if prev_contig_id == '': | ||
prev_index = index | ||
prev_contig_id = contig_id | ||
prev_contig_start = contig_start | ||
prev_contig_end = contig_end | ||
prev_score = score | ||
continue | ||
|
||
if contig_id != prev_contig_id: | ||
prev_index = index | ||
prev_contig_id = contig_id | ||
prev_contig_start = contig_start | ||
prev_contig_end = contig_end | ||
prev_score = score | ||
continue | ||
|
||
if (contig_start >= prev_contig_start and contig_start <= prev_contig_end) or (contig_end >= prev_contig_start and contig_end <= prev_contig_end): | ||
overlap = abs(contig_start - prev_contig_end) | ||
if overlap > overlap_threshold: | ||
if prev_score > score: | ||
filter_indexes.append(index) | ||
else: | ||
filter_indexes.append(prev_index) | ||
|
||
|
||
prev_index = index | ||
prev_contig_id = contig_id | ||
prev_contig_start = contig_start | ||
prev_contig_end = contig_end | ||
prev_score = score | ||
|
||
for index in exclude_filter: | ||
filter_indexes.append(index) | ||
indexes = dict() | ||
for i in blast_df.iterrows(): | ||
indexes[i[0]] = '' | ||
|
||
blast_df.drop(filter_indexes, inplace=True) | ||
|
||
return blast_df.reset_index(drop=True) | ||
|
||
|
||
def fixStart(blast_df): | ||
for index, row in blast_df.iterrows(): | ||
sstart = blast_df.at[index, 'sstart'] | ||
send = blast_df.at[index, 'send'] | ||
# print "{}\t{}".format(sstart,send) | ||
if send < sstart: | ||
temp = sstart | ||
blast_df.at[index, 'sstart'] = send | ||
blast_df.at[index, 'send'] = temp | ||
# print "====>{}\t{}".format(self.blast_df.at[index, 'sstart'], self.blast_df.at[index, 'send']) | ||
qstart = blast_df.at[index, 'qstart'] | ||
qend = blast_df.at[index, 'qend'] | ||
if qend < qstart: | ||
temp = qstart | ||
blast_df.at[index, 'qstart'] = qend | ||
blast_df.at[index, 'qend'] = temp | ||
return blast_df | ||
|
||
def filter_blast(blast_results_file, min_ident, min_cov, evalue, overlap): | ||
if os.path.getsize(blast_results_file) == 0: | ||
return dict() | ||
blast_df = BlastReader(blast_results_file).df | ||
blast_df = blast_df.loc[blast_df['pident'] >= min_ident] | ||
blast_df = blast_df.loc[blast_df['qcovhsp'] >= min_cov] | ||
blast_df = fixStart(blast_df) | ||
blast_df = blast_df.sort_values(['sseqid','sstart', 'send', 'bitscore'], ascending=[True, True, True, False]) | ||
blast_df = blast_df.reset_index(drop=True) | ||
size = str(len(blast_df)) | ||
prev_size = 0 | ||
while size != prev_size: | ||
blast_df = filter_overlaping_records(blast_df, overlap, 'sseqid', 'sstart', 'send', 'bitscore') | ||
prev_size = size | ||
size = str(len(blast_df)) | ||
|
||
return blast_df | ||
|
||
|
||
|
||
|
||
def main(): | ||
logging.info('Running plasmid detector v. {}'.format('0.1')) | ||
args = parse_args() | ||
if not args.infile: | ||
logging.info('Error, no blast file specified, please specify one') | ||
sys.exit() | ||
if not args.outdir: | ||
logging.info('Error, no output directory specified, please specify one') | ||
sys.exit() | ||
if not os.path.isdir(args.outdir): | ||
os.mkdir(args.outdir, 0o755) | ||
|
||
blast_file = args.infile | ||
base_file_name = os.path.splitext(os.path.basename(blast_file))[0] | ||
out_dir = args.outdir | ||
blast_results_file = os.path.join(out_dir, base_file_name+'_blast_results.txt') | ||
processed_blast_results = filter_blast(blast_file, 95, 95, 0.00001, 5) | ||
if isinstance(processed_blast_results,dict): | ||
results_fh = open(blast_results_file, 'w') | ||
results_fh.write('') | ||
results_fh.close() | ||
else: | ||
processed_blast_results.to_csv(blast_results_file, sep='\t', header=True, line_terminator='\n', index=False) | ||
|
||
|
||
|
||
main() |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
#!/usr/bin/env python | ||
|
||
from subprocess import Popen, PIPE | ||
import os | ||
|
||
class mcl: | ||
|
||
|
||
def __init__(self, blast_results_file, working_dir,inflation=1.5): | ||
blast_abc = os.path.join(working_dir, 'seq.abc') | ||
cols = [0,1,14] | ||
self.prep_blast(blast_results_file, blast_abc, cols) | ||
mci_outfile = os.path.join(working_dir, 'seq.mci') | ||
tab_outfile = os.path.join(working_dir, 'seq.tab') | ||
clust_outfile = os.path.join(working_dir, 'seq.clust') | ||
self.mcxload(blast_abc,mci_outfile, tab_outfile) | ||
self.run_mcl(mci_outfile,tab_outfile,clust_outfile,inflation,1) | ||
self.clusters = self.parse_mcl(clust_outfile) | ||
|
||
def getclusters(self): | ||
return self.clusters | ||
|
||
|
||
def prep_blast(self,blast_results_file,outfile,col_numbers): | ||
with open(blast_results_file) as f: | ||
content = f.readlines() | ||
content = [x.strip() for x in content] | ||
f.close() | ||
|
||
with open(outfile,'w') as f: | ||
for line in content: | ||
row = line.split("\t") | ||
selection = list() | ||
for col in col_numbers: | ||
selection.append(row[col]) | ||
f.write("\t".join(selection) + "\n") | ||
f.close() | ||
|
||
|
||
|
||
def mcxload(self, blast_results_file,mci_outfile, tab_outfile): | ||
p = Popen(['mcxload', | ||
'-abc', blast_results_file, | ||
'--stream-mirror', | ||
'--stream-neg-log10', | ||
'-o', mci_outfile, | ||
'-write-tab', tab_outfile,], | ||
stdout=PIPE, | ||
stderr=PIPE) | ||
|
||
|
||
p.wait() | ||
stdout = p.stdout.read() | ||
print(stdout) | ||
stderr = p.stderr.read() | ||
print(stderr) | ||
|
||
def run_mcl(self,mci_outfile,tab_outfile,cluster_outfile,inflation,num_threads): | ||
p = Popen(['mcl', | ||
mci_outfile, | ||
'-I',str(inflation), | ||
'-use-tab', tab_outfile, | ||
'-o',cluster_outfile], | ||
stdout=PIPE, | ||
stderr=PIPE) | ||
|
||
p.wait() | ||
stdout = p.stdout.read() | ||
stderr = p.stderr.read() | ||
|
||
def parse_mcl(self,cluster_outfile): | ||
with open(cluster_outfile) as f: | ||
content = f.readlines() | ||
content = [x.strip() for x in content] | ||
f.close() | ||
count = 0 | ||
clusters = dict() | ||
for line in content: | ||
members = line.split("\t") | ||
for member in members: | ||
clusters[member] = count | ||
count+=1 | ||
return clusters | ||
|
Oops, something went wrong.