Merge pull request #1293: fix: disable non-a-number filtering in clade file parsing
nextstrain · Aug 29, 2023 · 3ce0373 · 3ce0373
2 parents 435bd29 + bf14933
commit 3ce0373
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 6 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -8,12 +8,14 @@
 
 ### Bug fixes
 
+* clades: Fix outputs for genes named `NA` (previously the value was replaced by `nan`). [#1293][] (@rneher)
 * distance: Improve documentation by describing how gaps get treated as indels and how users can ignore specific characters in distance calculations. [#1285][] (@huddlej)
 * Fix help output compatibility with non-Unicode streams. [#1290][] (@victorlin)
 
 [#1284]: https://github.com/nextstrain/augur/pull/1284
 [#1285]: https://github.com/nextstrain/augur/pull/1285
 [#1290]: https://github.com/nextstrain/augur/pull/1290
+[#1293]: https://github.com/nextstrain/augur/pull/1293
 
 ## 22.3.0 (14 August 2023)
 

diff --git a/augur/clades.py b/augur/clades.py
@@ -1,7 +1,7 @@
 """
 Assign clades to nodes in a tree based on amino-acid or nucleotide signatures.
 
-Nodes which are members of a clade are stored via 
+Nodes which are members of a clade are stored via
 <OUTPUT_NODE_DATA> → nodes → <node_name> → clade_membership
 and if this file is used in `augur export v2` these will automatically become a coloring.
 
@@ -62,7 +62,8 @@ def read_in_clade_definitions(clade_file):
     df = pd.read_csv(
         clade_file,
         sep='\t' if clade_file.endswith('.tsv') else ',',
-        comment='#'
+        comment='#',
+        na_filter=False,
     )
 
     clade_inheritance_rows = df[df['gene'] == 'clade']
@@ -83,9 +84,13 @@ def read_in_clade_definitions(clade_file):
     # Use integer 0 as root so as not to conflict with any string clade names
     # String '0' can still be used this way
     root = 0
+
+    # Skip rows that are missing a clade name.
+    defined_clades = (clade for clade in df.clade.unique() if clade != '')
+
     # For every clade, add edge from root as default
     # This way all clades can be reached by traversal
-    for clade in df.clade.unique():
+    for clade in defined_clades:
         G.add_edge(root, clade)
 
     # Build inheritance graph
@@ -181,7 +186,7 @@ def ensure_no_multiple_mutations(all_muts):
             aa_positions = [int(mut[1:-1])-1 for mut in node['aa_muts'][gene]]
             if len(set(aa_positions))!=len(aa_positions):
                 multiples.append(f"Node {name} ({gene})")
-    
+
     if multiples:
         raise AugurError(f"Multiple mutations at the same position on a single branch were found: {', '.join(multiples)}")
 
@@ -310,7 +315,7 @@ def get_reference_sequence_from_root_node(all_muts, root_name):
         except KeyError:
             missing.append(gene)
 
-    if missing:            
+    if missing:
         print(f"WARNING in augur.clades: sequences at the root node have not been specified for {{{', '.join(missing)}}}, \
 even though mutations were observed. Clades which are annotated using bases/codons present at the root \
 of the tree may not be correctly inferred.")
@@ -358,7 +363,6 @@ def run(args):
         ref = get_reference_sequence_from_root_node(all_muts, tree.root.name)
 
     clade_designations = read_in_clade_definitions(args.clades)
-
     membership, labels = assign_clades(clade_designations, all_muts, tree, ref)
     warn_if_clades_not_found(membership, clade_designations)