Skip to content

Commit

Permalink
predict; parse --other_gff contigs compare to assembly, warn user if …
Browse files Browse the repository at this point in the history
…do not match and exit #528
  • Loading branch information
Jon Palmer committed Jan 12, 2021
1 parent dc0e497 commit 2cbf7fe
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 13 deletions.
14 changes: 8 additions & 6 deletions funannotate/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -791,8 +791,7 @@ def runSubprocess8(cmd, dir, logfile, output):


def evmGFFvalidate(input, evmpath, logfile):
Validator = os.path.join(
evmpath, 'EvmUtils', 'gff3_gene_prediction_file_validator.pl')
Validator = os.path.join(evmpath, 'EvmUtils', 'gff3_gene_prediction_file_validator.pl')
cmd = ['perl', Validator, os.path.realpath(input)]
logfile.debug(' '.join(cmd))
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
Expand All @@ -801,8 +800,8 @@ def evmGFFvalidate(input, evmpath, logfile):
if not stderr:
return True
else:
logfile.debug(stderr)
False
logfile.error(stderr.rstrip())
return False


def hashfile(afile, hasher, blocksize=65536):
Expand Down Expand Up @@ -1221,6 +1220,7 @@ def setupLogging(LOGNAME):


def renameGFF(input, newname, output):
contigs = set()
with open(output, 'w') as outfile:
with open(input, 'r') as infile:
for line in infile:
Expand All @@ -1232,8 +1232,10 @@ def renameGFF(input, newname, output):
cols = line.split('\t')
# make sure it has correct columns to be GFF
if len(cols) == 9:
outfile.write('%s\t%s\t%s' %
(cols[0], newname, '\t'.join(cols[2:])))
contigs.add(cols[0])
outfile.write('{}\t{}\t{}'.format(cols[0], newname,
'\t'.join(cols[2:])))
return contigs


def countGFFgenes(input):
Expand Down
25 changes: 18 additions & 7 deletions funannotate/predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,7 @@ def __init__(self, prog):
OTHER_GFFs = []
other_weights = []
other_files = []
other_contigs = set()
if args.other_gff:
if any(':' in s for s in args.other_gff):
for x in args.other_gff:
Expand All @@ -640,15 +641,13 @@ def __init__(self, prog):
if len(other_files) > 0:
for i, file in enumerate(other_files):
featurename = 'other_pred'+str(i+1)
lib.log.info(
'Parsing GFF pass-through: {:} --> setting source to {:}'.format(file, featurename))
outputGFF = os.path.join(
args.out, 'predict_misc', 'other'+str(i+1)+'_predictions.gff3')
lib.renameGFF(os.path.abspath(file), featurename, outputGFF)
lib.log.info('Parsing GFF pass-through: {:} --> setting source to {:}'.format(file, featurename))
outputGFF = os.path.join(args.out, 'predict_misc', 'other'+str(i+1)+'_predictions.gff3')
contig_names = lib.renameGFF(os.path.abspath(file), featurename, outputGFF)
other_contigs.update(contig_names)
# validate output with EVM
if not lib.evmGFFvalidate(outputGFF, EVM, lib.log):
lib.log.error(
"ERROR: %s is not a properly formatted GFF file, please consult EvidenceModeler docs" % args.other_gff)
lib.log.error("ERROR: {} is not proper GFF file, please consult EvidenceModeler docs".format(file))
sys.exit(1)
OTHER_GFFs.append(outputGFF)
if not featurename in StartWeights:
Expand Down Expand Up @@ -704,6 +703,18 @@ def __init__(self, prog):
else:
lib.log.info('Genome loaded: {:,} scaffolds; {:,} bp; {:.2%} repeats masked'.format(
len(ContigSizes), GenomeLength, percentMask))

# if other_gff passed, check contigs
if len(other_contigs) > 0:
extra_contigs = []
for contig in other_contigs:
if contig not in ContigSizes:
extra_contigs.append(contig)
if len(extra_contigs) > 0:
lib.log.error('ERROR: found {:,} contigs in --other_gff that are not found in the genome assembly.'.format(len(extra_contigs)))
lib.log.error(' --other_gff should contain gene models from. Offending contigs: {}\n {}'.format(args.input, ', '.join(extra_contigs)))
sys.exit(1)

# just copy the input fasta to the misc folder and move on.
shutil.copyfile(args.input, MaskGenome)
else:
Expand Down

0 comments on commit 2cbf7fe

Please sign in to comment.