Corrected version formating, stripped non-ascii characters from input…

… files, added error checking for numerical calculatins in host-range
phac-nml · Nov 15, 2019 · 85fdf75 · 85fdf75
1 parent 69b833d
commit 85fdf75
Show file tree

Hide file tree

Showing 24 changed files with 15,108 additions and 12,508 deletions.
diff --git a/build/lib/mob_suite/__init__.py b/build/lib/mob_suite/__init__.py
diff --git a/build/lib/mob_suite/blast/__init__.py b/build/lib/mob_suite/blast/__init__.py
@@ -0,0 +1,145 @@
+from datetime import datetime
+import logging
+import shutil
+
+from subprocess import Popen, PIPE
+import os
+
+import pandas as pd
+from pandas.io.common import EmptyDataError
+import re
+
+
+BLAST_TABLE_COLS = '''
+qseqid
+sseqid
+qlen
+slen
+qstart
+qend
+sstart
+send
+length
+mismatch
+pident
+qcovhsp
+qcovs
+sstrand
+evalue
+bitscore
+'''.strip().split('\n')
+
+
+class BlastRunner:
+
+    def __init__(self, fasta_path, tmp_work_dir):
+        self.fasta_path = fasta_path
+
+    def makeblastdb(self,fasta_path,dbtype):
+        p = Popen(['makeblastdb',
+                  '-in', fasta_path,
+                  '-dbtype',dbtype],
+                  stdout=PIPE,
+                  stderr=PIPE)
+        p.wait()
+        stdout = p.stdout.read()
+        stderr = p.stderr.read()
+
+
+
+    def run_tblastn(self, query_fasta_path, blast_task, db_path, db_type, min_cov, min_ident, evalue,blast_outfile,num_threads=1):
+
+        p = Popen(['tblastn',
+                   '-query', query_fasta_path,
+                   '-num_threads','{}'.format(num_threads),
+                   '-db', '{}'.format(db_path),
+                   '-evalue', '{}'.format(evalue),
+                   '-out', blast_outfile,
+                   '-outfmt', '6 {}'.format(' '.join(BLAST_TABLE_COLS))],
+                  stdout=PIPE,
+                  stderr=PIPE)
+
+        p.wait()
+        stdout = p.stdout.read()
+        stderr = p.stderr.read()
+        if stdout is not None and stdout != '':
+            logging.debug('blastn on db {} and query {} STDOUT: {}'.format(query_fasta_path, db_path, stdout))
+
+        if stderr is not None and stderr != '':
+            logging.debug('blastn on db {} and query {} STDERR: {}'.format(query_fasta_path, db_path, stderr))
+            if os.path.exists(blast_outfile):
+                return blast_outfile
+            else:
+                ex_msg = 'tblastn on db {} and query {} did not produce expected output file at {}'.format(
+                    query_fasta_path,
+                    db_path,
+                    blast_outfile)
+                logging.error(ex_msg)
+                raise Exception(ex_msg)
+
+    def run_blast(self, query_fasta_path, blast_task, db_path, db_type, min_cov, min_ident, evalue,blast_outfile,num_threads=1,word_size=11):
+
+        p = Popen(['blastn',
+                   '-task', blast_task,
+                   '-query', query_fasta_path,
+                   '-db', '{}'.format(db_path),
+                   '-num_threads','{}'.format(num_threads),
+                   '-evalue', '{}'.format(evalue),
+                   '-dust', 'yes',
+                   '-perc_identity', '{}'.format(min_ident),
+                   '-out', blast_outfile,
+                   '-outfmt', '6 {}'.format(' '.join(BLAST_TABLE_COLS))],
+                  stdout=PIPE,
+                  stderr=PIPE)
+
+        p.wait()
+        stdout = p.stdout.read()
+        stderr = p.stderr.read()
+        if stdout is not None and stdout != '':
+            logging.debug('blastn on db {} and query {} STDOUT: {}'.format(query_fasta_path, db_path, stdout))
+
+        if stderr is not None and stderr != '':
+            logging.debug('blastn on db {} and query {} STDERR: {}'.format(query_fasta_path, db_path, stderr))
+            if os.path.exists(blast_outfile):
+                return blast_outfile
+            else:
+                ex_msg = 'blastn on db {} and query {} did not produce expected output file at {}'.format(
+                    query_fasta_path,
+                    db_path,
+                    blast_outfile)
+                logging.error(ex_msg)
+                raise Exception(ex_msg)
+
+
+class BlastReader:
+    df = None
+
+
+    def __init__(self, blast_outfile):
+        """Read BLASTN output file into a pandas DataFrame
+        Sort the DataFrame by BLAST bitscore.
+        If there are no BLASTN results, then no results can be returned.
+        Args:
+            blast_outfile (str): `blastn` output file path
+        Raises:
+            EmptyDataError: No data could be parsed from the `blastn` output file
+        """
+        self.blast_outfile = blast_outfile
+        try:
+
+            self.df = pd.read_table(self.blast_outfile, header=None)
+            self.df.columns = BLAST_TABLE_COLS
+
+            logging.debug(self.df.head())
+            self.is_missing = False
+
+        except EmptyDataError as exc:
+            logging.warning('No BLASTN results to parse from file %s', blast_outfile)
+            self.is_missing = True
+            self.df = pd.DataFrame(index=['A'], columns='A')
+
+
+    def df_dict(self):
+        if not self.is_missing:
+            return self.df.to_dict()
+
diff --git a/build/lib/mob_suite/blast_best_hits.py b/build/lib/mob_suite/blast_best_hits.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python
+import logging, os, sys
+from argparse import (ArgumentParser, FileType)
+from mob_suite.blast import BlastReader
+
+
+def parse_args():
+    "Parse the input arguments, use '-h' for help"
+    parser = ArgumentParser(description='Filter overlapping queries')
+    parser.add_argument('--infile', type=str, required=True, help='Input file to process')
+    parser.add_argument('--outdir', type=str, required=True, help='Output directory')
+    parser.add_argument('--min_overlap', type=str, required=False, help='Minimum bp overlap', default=5)
+    return parser.parse_args()
+
+def filter_overlaping_records(blast_df, overlap_threshold,contig_id_col,contig_start_col,contig_end_col,bitscore_col):
+    prev_contig_id = ''
+    prev_index = -1
+    prev_contig_start = -1
+    prev_contig_end = -1
+    prev_score = -1
+    filter_indexes = list()
+    exclude_filter = dict()
+
+
+    for index, row in blast_df.iterrows():
+        contig_id = row['sseqid']
+        contig_start = row['sstart']
+        contig_end = row['send']
+        score = row['bitscore']
+
+        if prev_contig_id == '':
+            prev_index = index
+            prev_contig_id = contig_id
+            prev_contig_start = contig_start
+            prev_contig_end = contig_end
+            prev_score = score
+            continue
+
+        if contig_id != prev_contig_id:
+            prev_index = index
+            prev_contig_id = contig_id
+            prev_contig_start = contig_start
+            prev_contig_end = contig_end
+            prev_score = score
+            continue
+
+        if (contig_start >= prev_contig_start and contig_start <= prev_contig_end) or (contig_end >= prev_contig_start and contig_end <= prev_contig_end):
+            overlap = abs(contig_start - prev_contig_end)
+            if overlap > overlap_threshold:
+                if prev_score > score:
+                    filter_indexes.append(index)
+                else:
+                    filter_indexes.append(prev_index)
+
+
+        prev_index = index
+        prev_contig_id = contig_id
+        prev_contig_start = contig_start
+        prev_contig_end = contig_end
+        prev_score = score
+
+    for index in exclude_filter:
+        filter_indexes.append(index)
+    indexes = dict()
+    for i in blast_df.iterrows():
+        indexes[i[0]] = ''
+
+    blast_df.drop(filter_indexes, inplace=True)
+
+    return blast_df.reset_index(drop=True)
+
+
+def fixStart(blast_df):
+    for index, row in blast_df.iterrows():
+        sstart = blast_df.at[index, 'sstart']
+        send = blast_df.at[index, 'send']
+        # print "{}\t{}".format(sstart,send)
+        if send < sstart:
+            temp = sstart
+            blast_df.at[index, 'sstart'] = send
+            blast_df.at[index, 'send'] = temp
+        # print "====>{}\t{}".format(self.blast_df.at[index, 'sstart'], self.blast_df.at[index, 'send'])
+        qstart = blast_df.at[index, 'qstart']
+        qend = blast_df.at[index, 'qend']
+        if qend < qstart:
+            temp = qstart
+            blast_df.at[index, 'qstart'] = qend
+            blast_df.at[index, 'qend'] = temp
+    return blast_df
+
+def filter_blast(blast_results_file, min_ident, min_cov, evalue, overlap):
+    if os.path.getsize(blast_results_file) == 0:
+        return dict()
+    blast_df = BlastReader(blast_results_file).df
+    blast_df = blast_df.loc[blast_df['pident'] >= min_ident]
+    blast_df = blast_df.loc[blast_df['qcovhsp'] >= min_cov]
+    blast_df = fixStart(blast_df)
+    blast_df = blast_df.sort_values(['sseqid','sstart', 'send', 'bitscore'], ascending=[True, True, True, False])
+    blast_df = blast_df.reset_index(drop=True)
+    size = str(len(blast_df))
+    prev_size = 0
+    while size != prev_size:
+        blast_df = filter_overlaping_records(blast_df, overlap, 'sseqid', 'sstart', 'send', 'bitscore')
+        prev_size = size
+        size = str(len(blast_df))
+
+    return blast_df
+
+
+
+
+def main():
+    logging.info('Running plasmid detector v. {}'.format('0.1'))
+    args = parse_args()
+    if not args.infile:
+        logging.info('Error, no blast file specified, please specify one')
+        sys.exit()
+    if not args.outdir:
+        logging.info('Error, no output directory specified, please specify one')
+        sys.exit()
+    if not os.path.isdir(args.outdir):
+        os.mkdir(args.outdir, 0o755)
+
+    blast_file = args.infile
+    base_file_name = os.path.splitext(os.path.basename(blast_file))[0]
+    out_dir = args.outdir
+    blast_results_file = os.path.join(out_dir, base_file_name+'_blast_results.txt')
+    processed_blast_results = filter_blast(blast_file, 95, 95, 0.00001, 5)
+    if isinstance(processed_blast_results,dict):
+        results_fh = open(blast_results_file, 'w')
+        results_fh.write('')
+        results_fh.close()
+    else:
+        processed_blast_results.to_csv(blast_results_file, sep='\t', header=True, line_terminator='\n', index=False)
+
+
+
+main()
diff --git a/build/lib/mob_suite/classes/__init__.py b/build/lib/mob_suite/classes/__init__.py
diff --git a/build/lib/mob_suite/classes/mcl.py b/build/lib/mob_suite/classes/mcl.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+
+from subprocess import Popen, PIPE
+import os
+
+class mcl:
+
+
+    def __init__(self, blast_results_file, working_dir,inflation=1.5):
+        blast_abc = os.path.join(working_dir, 'seq.abc')
+        cols = [0,1,14]
+        self.prep_blast(blast_results_file, blast_abc, cols)
+        mci_outfile = os.path.join(working_dir, 'seq.mci')
+        tab_outfile = os.path.join(working_dir, 'seq.tab')
+        clust_outfile = os.path.join(working_dir, 'seq.clust')
+        self.mcxload(blast_abc,mci_outfile, tab_outfile)
+        self.run_mcl(mci_outfile,tab_outfile,clust_outfile,inflation,1)
+        self.clusters = self.parse_mcl(clust_outfile)
+
+    def getclusters(self):
+        return self.clusters
+
+
+    def prep_blast(self,blast_results_file,outfile,col_numbers):
+        with open(blast_results_file) as f:
+            content = f.readlines()
+            content = [x.strip() for x in content]
+            f.close()
+
+        with open(outfile,'w') as f:
+            for line in content:
+                row = line.split("\t")
+                selection = list()
+                for col in col_numbers:
+                    selection.append(row[col])
+                f.write("\t".join(selection) + "\n")
+            f.close()
+
+
+
+    def  mcxload(self, blast_results_file,mci_outfile, tab_outfile):
+        p = Popen(['mcxload',
+                   '-abc', blast_results_file,
+                   '--stream-mirror',
+                   '--stream-neg-log10',
+                   '-o', mci_outfile,
+                   '-write-tab', tab_outfile,],
+                  stdout=PIPE,
+                  stderr=PIPE)
+
+
+        p.wait()
+        stdout = p.stdout.read()
+        print(stdout)
+        stderr = p.stderr.read()
+        print(stderr)
+
+    def run_mcl(self,mci_outfile,tab_outfile,cluster_outfile,inflation,num_threads):
+        p = Popen(['mcl',
+                   mci_outfile,
+                   '-I',str(inflation),
+                   '-use-tab', tab_outfile,
+                   '-o',cluster_outfile],
+                  stdout=PIPE,
+                  stderr=PIPE)
+
+        p.wait()
+        stdout = p.stdout.read()
+        stderr = p.stderr.read()
+
+    def parse_mcl(self,cluster_outfile):
+        with open(cluster_outfile) as f:
+            content = f.readlines()
+            content = [x.strip() for x in content]
+        f.close()
+        count = 0
+        clusters = dict()
+        for line in content:
+            members = line.split("\t")
+            for member in members:
+                clusters[member] = count
+            count+=1
+        return clusters
+