Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhanced GFF updating #135

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion ragtag_splitasm.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,17 @@
SOFTWARE.
"""

import os
import re
import sys
import argparse

import pysam

from ragtag_correct import make_gff_interval_tree

from ragtag_utilities.utilities import get_ragtag_version
from ragtag_utilities.utilities import log
from ragtag_utilities.AGPFile import AGPFile


Expand All @@ -39,6 +43,7 @@ def main():
parser.add_argument("asm", metavar="<asm.fa>", default="", type=str, help="assembly fasta file (uncompressed or bgzipped)")
parser.add_argument("-n", metavar="INT", type=int, default=0, help="minimum gap size [0]")
parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.splitasm.agp", help="output AGP file path [./ragtag.splitasm.agp]")
parser.add_argument("--gff", metavar="<features.gff>", type=str, default="", help="don't break sequences within gff intervals [null]")

# Parse the command line arguments
args = parser.parse_args()
Expand All @@ -50,7 +55,11 @@ def main():
asm_fn = args.asm
min_gap_size = args.n
agp_fn = args.o

gff_file = args.gff
if gff_file:
gff_file = os.path.abspath(gff_file)
it = make_gff_interval_tree(gff_file)
log("INFO", "Avoiding breaks within GFF intervals")
# Initialize the AGP file
agp = AGPFile(agp_fn, mode="w")
agp.add_pragma()
Expand All @@ -63,6 +72,19 @@ def main():
seq = fai.fetch(header).upper()
seq_len = fai.get_reference_length(header)
gap_coords = [(i.start(), i.end()) for i in re.finditer(r'N+', seq) if i.end() - i.start() > min_gap_size]
# Remove coordinates overlapping gff features
if gff_file:
#non_gff_breaks = dict()
new_breaks = []
for gap in gap_coords:
if it[header][gap[0]] or it[header][gap[1]]:
log("INFO", "Avoiding breaking %s between %d-%d. This point intersects a feature in the gff file."
% (header, gap[0], gap[1]))
else:
new_breaks.append(gap)
if new_breaks:
#non_gff_breaks[header] = new_breaks
gap_coords = new_breaks

if not gap_coords:
new_header = "seq{0:08}".format(new_header_idx)
Expand Down
76 changes: 55 additions & 21 deletions ragtag_update_gff.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -28,60 +28,92 @@
import sys
import argparse
from collections import defaultdict
import re

from intervaltree import IntervalTree

from ragtag_utilities.utilities import log, get_ragtag_version
from ragtag_utilities.AGPFile import AGPFile


def sub_update(gff_file, agp_file):
def sub_update(gff_file, agp_file, is_split):
# Make a dictionary associating each original sequence with an interval tree of component sequences
trans = defaultdict(IntervalTree)
agp = AGPFile(agp_file, mode="r")
for agp_line in agp.iterate_lines():

# Check that the agp file looks correct for this task
if agp_line.orientation == "-":
raise ValueError("The placement BED file is not formatted correctly. No sequences should be reverse complemented for misassembly correction.")
if not agp_line.comp_type == "W":
if not agp_line.comp_type == "W" and not is_split:
raise ValueError("The placement BED file is not formatted correctly. All lines should be WGS contig (W).")
if agp_line.is_gap:
if agp_line.is_gap and not is_split:
raise ValueError("There should be no gaps in the correction AGP file.")
# gaps don't have the orientation attribute so this check should come last
if not is_split:
if agp_line.orientation == "-":
raise ValueError("The placement BED file is not formatted correctly. No sequences should be reverse complemented for misassembly correction.")

start, end = agp_line.obj_beg - 1, agp_line.obj_end
trans[agp_line.obj][start:end] = agp_line.comp
# Cant adjust gap comps because gaps aren't assembled into object
if agp_line.comp_type == "W":
start, end = agp_line.obj_beg - 1, agp_line.obj_end
trans[agp_line.obj][start:end] = agp_line.comp

# Iterate through the gff intervals and update them according to trans
with open(gff_file, "r") as f:
attr_id = re.compile('(?<=ID=).*')
attr_parent = re.compile('(?<=Parent=).*')
ovlp_ids = []
for line in f:
line = line.rstrip()
if line.startswith("#"):
print(line) # Print this comment line
else:
fields = line.split("\t")
h, s, e = fields[0], int(fields[3]), int(fields[4])
attributes = fields[8]
parent = None
feat_id = None
for attr in attributes.split(";"):
feat_id_matches = attr_id.findall(attr)
parent_matches = attr_parent.findall(attr)
if feat_id_matches:
feat_id = feat_id_matches
if parent_matches:
parent = parent_matches


s -= 1 # Keep everything zero-indexed

if h not in trans:
raise ValueError("Inconsistent input files.")

ovlps = trans[h][s:e]
if len(ovlps) > 1:
raise ValueError(
"%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e)
)
#raise ValueError(
# "%s:%d-%d in the gff file overlaps two sub-sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e)
#)
log("WARNING", "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Skipping %s and any child features. To retain feature, include '--gff' and re-run 'correct' or 'splitasm'" % (h, s, e, feat_id))
if feat_id:
ovlp_ids.append(feat_id)
continue
if len(ovlps) < 1:
raise ValueError("The placement BED file is not formatted correctly.")

# Get the data from the overlapping interval and print the new line
o = list(ovlps)[0]
new_s = s - o.begin
new_e = e - o.begin
fields[0] = o.data
fields[3] = str(new_s + 1) # back to one-based indexing for gff format
fields[4] = str(new_e)
print("\t".join(fields))
# Check if feat needs to be skipped because of ID or parent match,
# complex solution ensuring orphan feats aren't added e.g. parent gene overlaps but not its stop codon
if parent and parent in ovlp_ids:
log("WARNING", "Parent %s already excluded for overlapping two sub sequences. Skipping %s." % (parent, feat_id))
# To access any potential level 3 feats, add parent to ID exclusion list
ovlp_ids.append(feat_id)
continue
else:
# Get the data from the overlapping interval and print the new line
o = list(ovlps)[0]
new_s = s - o.begin
new_e = e - o.begin
fields[0] = o.data
fields[3] = str(new_s + 1) # back to one-based indexing for gff format
fields[4] = str(new_e)
print("\t".join(fields))


def sup_update(gff_file, agp_file):
Expand Down Expand Up @@ -132,10 +164,11 @@ def sup_update(gff_file, agp_file):


def main():
parser = argparse.ArgumentParser(description="Update gff intervals given a RagTag AGP file", usage="ragtag.py updategff [-c] <genes.gff> <ragtag.agp>")
parser = argparse.ArgumentParser(description="Update gff intervals from given a RagTag AGP file", usage="ragtag.py updategff [-c] <genes.gff> <ragtag.agp>")
parser.add_argument("gff", nargs='?', default="", metavar="<genes.gff>", type=str, help="gff file")
parser.add_argument("agp", nargs='?', default="", metavar="<ragtag.*.agp>", type=str, help="agp file")
parser.add_argument("-c", action="store_true", default=False, help="update for misassembly correction (ragtag.correction.agp)")
parser.add_argument("-s", action="store_true", default=False, help="update for assembly splitting from RagTag")

args = parser.parse_args()

Expand All @@ -149,9 +182,10 @@ def main():
gff_file = os.path.abspath(args.gff)
agp_file = os.path.abspath(args.agp)
is_sub = args.c
is_split = args.s

if is_sub:
sub_update(gff_file, agp_file)
if is_sub or is_split:
sub_update(gff_file, agp_file, is_split)
else:
sup_update(gff_file, agp_file)

Expand Down