Skip to content

Commit

Permalink
GenBank: Don't discard topology if given in LOCUS line
Browse files Browse the repository at this point in the history
Rebased from pull request biopython#812 to address biopython#363, with
minor changes as discussed on the pull request.
  • Loading branch information
kblin authored and peterjc committed Jul 27, 2016
1 parent b6f5423 commit dd389ee
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 4 deletions.
6 changes: 6 additions & 0 deletions Bio/GenBank/__init__.py
Expand Up @@ -1241,6 +1241,12 @@ def record_end(self, content):
raise ValueError("Could not determine alphabet for seq_type %s"
% self._seq_type)

# Also save the chromosome topology (circular or linear)
if 'circular' in self._seq_type.lower():
self.data.annotations['topology'] = 'circular'
elif 'linear' in self._seq_type.lower():
self.data.annotations['topology'] = 'linear'

if not sequence and self.__expected_size:
self.data.seq = UnknownSeq(self._expected_size, seq_alphabet)
else:
Expand Down
21 changes: 17 additions & 4 deletions Bio/SeqIO/InsdcIO.py
Expand Up @@ -557,6 +557,18 @@ def _get_data_division(self, record):
assert len(division) == 3
return division

def _get_topology(self, record):
    """Return the topology field for the LOCUS line, padded to 8 columns.

    The value is taken from ``record.annotations['topology']``. It is
    truncated to the width of ``'circular'`` (the longest expected value)
    and left-justified with spaces so the result is always exactly that
    wide.

    If the annotation is missing, or is not a string-like value (for
    example ``None`` or a number), a blank placeholder of the same width
    is returned instead of raising.
    """
    max_topology_len = len('circular')
    blank = ' ' * max_topology_len

    topology = record.annotations.get('topology')
    try:
        # Slice first so over-long values cannot widen the fixed-width field.
        return topology[:max_topology_len].ljust(max_topology_len)
    except (TypeError, AttributeError):
        # None and numbers are not sliceable (TypeError); lists and other
        # sequences slice fine but have no ljust (AttributeError).
        return blank

def _write_the_first_line(self, record):
"""Write the LOCUS line."""

Expand Down Expand Up @@ -610,6 +622,8 @@ def _write_the_first_line(self, record):
# just the generic Alphabet (default for fasta files)
raise ValueError("Need a DNA, RNA or Protein alphabet")

topology = self._get_topology(record)

division = self._get_data_division(record)

name_length = str(len(record)).rjust(28)
Expand All @@ -619,12 +633,11 @@ def _write_the_first_line(self, record):

assert len(units) == 2
assert len(division) == 3
# TODO - date
# TODO - mol_type
line = "LOCUS %s %s %s %s %s\n" \
line = "LOCUS %s %s %s %s %s %s\n" \
% (name_length,
units,
mol_type.ljust(6),
mol_type.ljust(7),
topology,
division,
self._get_date(record))
assert len(line) == 79 + 1, repr(line) # plus one for new line
Expand Down
12 changes: 12 additions & 0 deletions Tests/test_GenBank_unittest.py
Expand Up @@ -161,6 +161,18 @@ def test_structured_comment_parsing(self):
'reference sequence was derived from AP000423.\n'
'COMPLETENESS: full length.')

def test_locus_line_topogoly(self):
    """Check the LOCUS line topology survives a parse/write round trip."""
    source_path = 'GenBank/DS830848.gb'

    # Parsing must expose the topology from the LOCUS line as an annotation.
    parsed = SeqIO.read(source_path, 'genbank')
    self.assertEqual(parsed.annotations['topology'], 'linear')

    # Write the record back out and keep only the regenerated LOCUS line.
    buffer = StringIO()
    SeqIO.write([parsed], buffer, 'genbank')
    written_locus = buffer.getvalue().split('\n')[0]
    self.assertIn('linear', written_locus)

    # The regenerated LOCUS line must match the one in the input file.
    with open(source_path, 'r') as handle:
        expected_locus = handle.readline().strip()
    self.assertEqual(written_locus, expected_locus)

def test_long_names(self):
"""Various GenBank names which push the column based LOCUS line."""
original = SeqIO.read("GenBank/iro.gb", "gb")
Expand Down

0 comments on commit dd389ee

Please sign in to comment.