Skip to content

Commit

Permalink
Add utils helper functions and loop.gfa test
Browse files Browse the repository at this point in the history
Also added comments addressing #75, #67, #71 in test code.

Writing preprocessing script tests should be a lot quicker now,
ideally.
  • Loading branch information
fedarko committed Oct 17, 2018
1 parent 45c869a commit 0c535cb
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 15 deletions.
6 changes: 6 additions & 0 deletions tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,9 @@ This directory contains test code for MetagenomeScope. As you can see, there
aren't a lot of these yet; I'm planning on adding a lot more. The current tests
only cover the preprocessing script so far, but I'm planning on adding some
for the viewer interface as well.

## Test Data Acknowledgements

`loop.gfa` was created by Shaun Jackman.
[Here](https://github.com/sjackman/assembly-graph/blob/fef9fada23ddfb3da04db8221fac1ca8c99bfc66/loop.gfa)
is a link to the version of the file used.
9 changes: 9 additions & 0 deletions tests/input/loop.gfa
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
H VN:Z:1.0
S 1 AAA
S 2 ACG
S 3 CAT
S 4 TTT
L 1 + 1 + 2M
L 2 + 2 - 2M
L 3 - 3 + 2M
L 4 - 4 - 2M
12 changes: 8 additions & 4 deletions tests/test_patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,7 @@ def test_cyclic_chain():
# block, we use contextlib.closing().
# For more information, see https://stackoverflow.com/a/19524679.
with contextlib.closing(connection):
# Check that edges are correct
utils.validate_edge_count(cursor, 4)
utils.validate_std_counts(cursor, 2, 4, 2)
edge_map = utils.get_edge_map(cursor)
assert edge_map["1"] == ["2"] and edge_map["2"] == ["1"]
assert edge_map["-2"] == ["-1"] and edge_map["-1"] == ["-2"]
Expand All @@ -57,8 +56,7 @@ def test_bubble():
connection, cursor = utils.create_and_open_db("bubble_test.gml")

with contextlib.closing(connection):
# Check edge validity
utils.validate_edge_count(cursor, 4)
utils.validate_std_counts(cursor, 4, 4, 1)
edge_map = utils.get_edge_map(cursor)
assert "2" in edge_map["1"] and "3" in edge_map["1"]
assert "4" in edge_map["2"] and "4" in edge_map["3"]
Expand All @@ -72,6 +70,12 @@ def test_longpatterns():
connection, cursor = utils.create_and_open_db("longtest_LastGraph")

with contextlib.closing(connection):
utils.validate_std_counts(cursor, 36, 72, 8)
cluster_type_2_freq = utils.get_cluster_frequencies(cursor)
assert cluster_type_2_freq["Bubble"] == 4
assert cluster_type_2_freq["Frayed Rope"] == 4

def test_self_implying_edges():
connection, cursor = utils.create_and_open_db("loop.gfa")
with contextlib.closing(connection):
utils.validate_std_counts(cursor, 4, 6, 6)
118 changes: 107 additions & 11 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,18 +74,63 @@ def create_and_open_db(graph_filename):
cursor = connection.cursor()
return connection, cursor

def validate_edge_count(cursor, num_edges):
"""Validates the number of edges in the graph.
def is_oriented_graph(cursor):
"""Determines if the graph is oriented or unoriented.
Parameters
----------
cursor : sqlite3.Cursor
A cursor for a .db file produced by the preprocessing script. You can
obtain a cursor by calling create_and_open_db().
num_edges : int
The expected number of edges in the graph. This will be compared with
both 1) the "assembly" table's all_edge_count column and
2) the observed number of rows in the "edges" table.
Returns
-------
bool
True if the graph is oriented (i.e. the contigs already have an
assigned orientation, as with MetaCarvel output), False if the graph is
unoriented (the contigs have no assigned orientation).
Currently this is determined based on the filetype of the input graph,
but eventually that should be independent of the filetype (so the user
can do things like specify a GFA graph is oriented).
See #67 and #71 on GitHub for details regarding work on that.
"""
cursor.execute("SELECT filetype FROM assembly;")
filetype = cursor.fetchone()[0]
if filetype == "GML":
return True
return False

def validate_std_counts(cursor, node_ct, all_edge_ct, cc_ct):
"""Validates the number of standard-mode elements in the graph.
Parameters
----------
cursor : sqlite3.Cursor
A cursor for a .db file produced by the preprocessing script. You can
obtain a cursor by calling create_and_open_db().
node_ct : int
The expected number of nodes (contigs) in the graph. If the input
assembly graph is unoriented, this will be interpreted as just the
number of positive nodes; if the assembly graph is oriented, this will
be interpreted as the total number of nodes.
This will be compared with both
1) the "assembly" table's node_count column and either
2.1) the observed number of rows in the "nodes" table, or
2.2) 0.5 * (the observed number of rows in the "nodes" table)
(if the assembly graph is unoriented).
all_edge_ct : int
The expected number of edges in the graph, INCLUDING negative edges (in
unoriented assembly graphs). This will be compared with both
1) the "assembly" table's all_edge_count column and
2) the observed number of rows in the "edges" table.
For unoriented assembly graphs, note that this count should only
contain self-implying edges once. See the comments below for details.
cc_ct : int
The expected number of weakly connected components in the graph. This
will be compared with
1) the "assembly" table's component_count column and
2) the observed number of rows in the "components" table.
Returns
-------
Expand All @@ -94,13 +139,64 @@ def validate_edge_count(cursor, num_edges):
Raises
------
AssertionError
If either of the two conditions mentioned in the parameter description
for num_edges are not satisifed.
If any of the above comparisons mentioned are not satisfied.
Makes an additional check that edge_count == all_edge_count in oriented
graphs, the failure of which will also cause an AssertionError.
ALSO NOTE that, for graphs containing duplicate edges, the behavior of
this test is undefined. Ideally duplicate edges should be ignored by
the preprocessing script (see #75 on GitHub), but we aren't there yet.
(Although once #75 is resolved, adding in a test case that features a
graph with duplicate edges would be a good idea.)
"""
is_oriented = is_oriented_graph(cursor)

cursor.execute("SELECT node_count FROM assembly")
assert cursor.fetchone()[0] == node_ct
cursor.execute("SELECT COUNT(*) FROM nodes")
if is_oriented:
assert cursor.fetchone()[0] == node_ct
else:
assert (0.5 * cursor.fetchone()[0]) == node_ct

# Explanation of what edge_count and all_edge_count mean:
#
# edge_count describes the number of "positive" edges in the graph. This is
# equivalent to the total number of edges in the graph for oriented graphs,
# and equivalent to half of the number of edges in the graph for unoriented
# graphs (e.g. those contained in LastGraph files).
#
# all_edge_count describes the total number of edges in the graph. For
# oriented graphs this should be equal to edge_count, and for unoriented
# graphs this *should* be equal to 2 * edge_count. However, the latter case
# can differ for unoriented graphs if the graph contains "self-implying
# edges" -- that is, an edge from a given contig to its reverse complement
# contig.
#
# This phenomenon is exhibited in loop.gfa (included as test input). We
# only count each self-implying edge once in all_edge_count: this allows
# us to obtain an accurate figure for the total number of edges in an
# unoriented graph (because 2 * edge_count would be an overestimate for
# graphs containing self-implying edges). Rendering the same edge twice
# wouldn't really be helpful, and doesn't seem to be standard practice (see
# https://github.com/fedarko/MetagenomeScope/issues/105 for discussion of
# this.)
cursor.execute("SELECT all_edge_count FROM assembly")
assert cursor.fetchone()[0] == num_edges
cursor.execute("SELECT COUNT(source_id) FROM edges")
assert cursor.fetchone()[0] == num_edges
assert cursor.fetchone()[0] == all_edge_ct
cursor.execute("SELECT COUNT(*) FROM edges")
assert cursor.fetchone()[0] == all_edge_ct

# In oriented graphs, all_edge_count should equal edge_count.
# In unoriented graphs, the two counts aren't necessarily relatable due to
# the possibility of self-implying edges, as discussed above.
if is_oriented:
cursor.execute("SELECT edge_count FROM assembly")
assert cursor.fetchone()[0] == all_edge_ct

cursor.execute("SELECT component_count FROM assembly")
assert cursor.fetchone()[0] == cc_ct
cursor.execute("SELECT COUNT(*) FROM components")
assert cursor.fetchone()[0] == cc_ct

def get_edge_map(cursor):
"""Returns a dict mapping source node IDs to a list of sink node IDs."""
Expand Down

0 comments on commit 0c535cb

Please sign in to comment.