Merge pull request #1366 from nextstrain/james/replace-augur-read-vcf

[vcf] replace augur's read_vcf with TreeTime's
nextstrain · Jan 25, 2024 · 2a08326 · 2a08326
2 parents d83d671 + 70ee412
commit 2a08326
Show file tree

Hide file tree

Showing 4 changed files with 12 additions and 25 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -5,7 +5,7 @@
 ### Features
 
 * `augur.io.read_metadata`: A new optional `dtype` argument allows custom data types for all columns. Automatic type inference still happens by default, so this is not a breaking change. [#1252][] (@victorlin)
-
+* `augur.io.vcf.read_vcf` has been removed and its usage replaced with TreeTime's function of the same name, which has improved validation of the VCF file.
 
 ### Bug Fixes
 

diff --git a/augur/index.py b/augur/index.py
@@ -7,7 +7,9 @@
 
 from .io.file import open_file
 from .io.sequences import read_sequences
-from .io.vcf import is_vcf, read_vcf
+from .io.vcf import is_vcf
+from treetime.vcf_utils import read_vcf
+
 
 
 DELIMITER = '\t'
@@ -40,7 +42,8 @@ def index_vcf(vcf_path, index_path):
         number of strains indexed
 
     """
-    strains, _ = read_vcf(vcf_path)
+    strains = list(read_vcf(vcf_path)['sequences'].keys())
+
     num_of_seqs = 0
 
     with open_file(index_path, 'wt') as out_file:

diff --git a/augur/io/vcf.py b/augur/io/vcf.py
@@ -24,22 +24,6 @@ def is_vcf(filename):
     return bool(filename) and any(filename.lower().endswith(x) for x in ('.vcf', '.vcf.gz'))
 
 
-def read_vcf(filename):
-    if filename.lower().endswith(".gz"):
-        import gzip
-        file = gzip.open(filename, mode="rt", encoding='utf-8')
-    else:
-        file = open(filename, encoding='utf-8')
-
-    chrom_line = next(line for line in file if line.startswith("#C"))
-    file.close()
-    headers = chrom_line.strip().split("\t")
-    sequences = headers[headers.index("FORMAT") + 1:]
-
-    # because we need 'seqs to remove' for VCF
-    return sequences, sequences.copy()
-
-
 def write_vcf(input_filename, output_filename, dropped_samps):
     if _filename_gz(input_filename):
         input_arg = "--gzvcf"

diff --git a/tests/io/test_vcf.py b/tests/io/test_vcf.py
@@ -1,5 +1,6 @@
 import pytest
 import augur.io.vcf
+from treetime.vcf_utils import read_vcf
 
 
 @pytest.fixture
@@ -8,21 +9,20 @@ def mock_run_shell_command(mocker):
 
 
 class TestVCF:
+    # `read_vcf` lived in an augur module when these tests were originally
+    # written; augur now uses TreeTime's function of the same name. The tests
+    # are kept to guard against unforeseen changes in that function's behavior.
     def test_read_vcf_compressed(self):
-        seq_keep, all_seq = augur.io.vcf.read_vcf(
-            "tests/data/tb_lee_2015.vcf.gz"
-        )
+        seq_keep = list(read_vcf("tests/data/tb_lee_2015.vcf.gz")['sequences'].keys())
 
         assert len(seq_keep) == 150
         assert seq_keep[149] == "G22733"
-        assert seq_keep == all_seq
 
     def test_read_vcf_uncompressed(self):
-        seq_keep, all_seq = augur.io.vcf.read_vcf("tests/data/tb_lee_2015.vcf")
+        seq_keep = list(read_vcf("tests/data/tb_lee_2015.vcf")['sequences'].keys())
 
         assert len(seq_keep) == 150
         assert seq_keep[149] == "G22733"
-        assert seq_keep == all_seq
 
     def test_write_vcf_compressed_input(self, mock_run_shell_command):
         augur.io.vcf.write_vcf(