predict; parse --other_gff contigs compare to assembly, warn user if …

…do not match and exit #528
nextgenusfs · Jan 12, 2021 · 2cbf7fe · 2cbf7fe
1 parent dc0e497
commit 2cbf7fe
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 13 deletions.
diff --git a/funannotate/library.py b/funannotate/library.py
@@ -791,8 +791,7 @@ def runSubprocess8(cmd, dir, logfile, output):
 
 
 def evmGFFvalidate(input, evmpath, logfile):
-    Validator = os.path.join(
-        evmpath, 'EvmUtils', 'gff3_gene_prediction_file_validator.pl')
+    Validator = os.path.join(evmpath, 'EvmUtils', 'gff3_gene_prediction_file_validator.pl')
     cmd = ['perl', Validator, os.path.realpath(input)]
     logfile.debug(' '.join(cmd))
     proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
@@ -801,8 +800,8 @@ def evmGFFvalidate(input, evmpath, logfile):
     if not stderr:
         return True
     else:
-        logfile.debug(stderr)
-        False
+        logfile.error(stderr.rstrip())
+        return False
 
 
 def hashfile(afile, hasher, blocksize=65536):
@@ -1221,6 +1220,7 @@ def setupLogging(LOGNAME):
 
 
 def renameGFF(input, newname, output):
+    contigs = set()
     with open(output, 'w') as outfile:
         with open(input, 'r') as infile:
             for line in infile:
@@ -1232,8 +1232,10 @@ def renameGFF(input, newname, output):
                     cols = line.split('\t')
                     # make sure it has correct columns to be GFF
                     if len(cols) == 9:
-                        outfile.write('%s\t%s\t%s' %
-                                      (cols[0], newname, '\t'.join(cols[2:])))
+                        contigs.add(cols[0])
+                        outfile.write('{}\t{}\t{}'.format(cols[0], newname,
+                                                          '\t'.join(cols[2:])))
+    return contigs
 
 
 def countGFFgenes(input):

diff --git a/funannotate/predict.py b/funannotate/predict.py
@@ -624,6 +624,7 @@ def __init__(self, prog):
     OTHER_GFFs = []
     other_weights = []
     other_files = []
+    other_contigs = set()
     if args.other_gff:
         if any(':' in s for s in args.other_gff):
             for x in args.other_gff:
@@ -640,15 +641,13 @@ def __init__(self, prog):
     if len(other_files) > 0:
         for i, file in enumerate(other_files):
             featurename = 'other_pred'+str(i+1)
-            lib.log.info(
-                'Parsing GFF pass-through: {:} --> setting source to {:}'.format(file, featurename))
-            outputGFF = os.path.join(
-                args.out, 'predict_misc', 'other'+str(i+1)+'_predictions.gff3')
-            lib.renameGFF(os.path.abspath(file), featurename, outputGFF)
+            lib.log.info('Parsing GFF pass-through: {:} --> setting source to {:}'.format(file, featurename))
+            outputGFF = os.path.join(args.out, 'predict_misc', 'other'+str(i+1)+'_predictions.gff3')
+            contig_names = lib.renameGFF(os.path.abspath(file), featurename, outputGFF)
+            other_contigs.update(contig_names)
             # validate output with EVM
             if not lib.evmGFFvalidate(outputGFF, EVM, lib.log):
-                lib.log.error(
-                    "ERROR: %s is not a properly formatted GFF file, please consult EvidenceModeler docs" % args.other_gff)
+                lib.log.error("ERROR: {} is not proper GFF file, please consult EvidenceModeler docs".format(file))
                 sys.exit(1)
             OTHER_GFFs.append(outputGFF)
             if not featurename in StartWeights:
@@ -704,6 +703,18 @@ def __init__(self, prog):
         else:
             lib.log.info('Genome loaded: {:,} scaffolds; {:,} bp; {:.2%} repeats masked'.format(
                 len(ContigSizes), GenomeLength, percentMask))
+
+        # if other_gff passed, check contigs
+        if len(other_contigs) > 0:
+            extra_contigs = []
+            for contig in other_contigs:
+                if contig not in ContigSizes:
+                    extra_contigs.append(contig)
+            if len(extra_contigs) > 0:
+                lib.log.error('ERROR: found {:,} contigs in --other_gff that are not found in the genome assembly.'.format(len(extra_contigs)))
+                lib.log.error('  --other_gff should contain gene models from. Offending contigs: {}\n  {}'.format(args.input, ', '.join(extra_contigs)))
+                sys.exit(1)
+
         # just copy the input fasta to the misc folder and move on.
         shutil.copyfile(args.input, MaskGenome)
     else: