Skip to content

Commit

Permalink
Merge pull request #1366 from nextstrain/james/replace-augur-read-vcf
Browse files Browse the repository at this point in the history
[vcf] replace augur's read_vcf with TreeTime's
  • Loading branch information
jameshadfield committed Jan 25, 2024
2 parents d83d671 + 70ee412 commit 2a08326
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 25 deletions.
2 changes: 1 addition & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
### Features

* `augur.io.read_metadata`: A new optional `dtype` argument allows custom data types for all columns. Automatic type inference still happens by default, so this is not a breaking change. [#1252][] (@victorlin)

* `augur.io.vcf.read_vcf` has been removed, and its usages have been replaced with TreeTime's function of the same name (`treetime.vcf_utils.read_vcf`), which provides improved validation of the VCF file.

### Bug Fixes

Expand Down
7 changes: 5 additions & 2 deletions augur/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@

from .io.file import open_file
from .io.sequences import read_sequences
from .io.vcf import is_vcf, read_vcf
from .io.vcf import is_vcf
from treetime.vcf_utils import read_vcf



DELIMITER = '\t'
Expand Down Expand Up @@ -40,7 +42,8 @@ def index_vcf(vcf_path, index_path):
number of strains indexed
"""
strains, _ = read_vcf(vcf_path)
strains = list(read_vcf(vcf_path)['sequences'].keys())

num_of_seqs = 0

with open_file(index_path, 'wt') as out_file:
Expand Down
16 changes: 0 additions & 16 deletions augur/io/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,6 @@ def is_vcf(filename):
return bool(filename) and any(filename.lower().endswith(x) for x in ('.vcf', '.vcf.gz'))


def read_vcf(filename):
if filename.lower().endswith(".gz"):
import gzip
file = gzip.open(filename, mode="rt", encoding='utf-8')
else:
file = open(filename, encoding='utf-8')

chrom_line = next(line for line in file if line.startswith("#C"))
file.close()
headers = chrom_line.strip().split("\t")
sequences = headers[headers.index("FORMAT") + 1:]

# because we need 'seqs to remove' for VCF
return sequences, sequences.copy()


def write_vcf(input_filename, output_filename, dropped_samps):
if _filename_gz(input_filename):
input_arg = "--gzvcf"
Expand Down
12 changes: 6 additions & 6 deletions tests/io/test_vcf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
import augur.io.vcf
from treetime.vcf_utils import read_vcf


@pytest.fixture
Expand All @@ -8,21 +9,20 @@ def mock_run_shell_command(mocker):


class TestVCF:
# These tests were originally written for a `read_vcf` function that lived in
# an augur module. Augur now uses TreeTime's function of the same name instead;
# the tests remain here to protect against any unforeseen changes.
def test_read_vcf_compressed(self):
seq_keep, all_seq = augur.io.vcf.read_vcf(
"tests/data/tb_lee_2015.vcf.gz"
)
seq_keep = list(read_vcf("tests/data/tb_lee_2015.vcf.gz")['sequences'].keys())

assert len(seq_keep) == 150
assert seq_keep[149] == "G22733"
assert seq_keep == all_seq

def test_read_vcf_uncompressed(self):
seq_keep, all_seq = augur.io.vcf.read_vcf("tests/data/tb_lee_2015.vcf")
seq_keep = list(read_vcf("tests/data/tb_lee_2015.vcf")['sequences'].keys())

assert len(seq_keep) == 150
assert seq_keep[149] == "G22733"
assert seq_keep == all_seq

def test_write_vcf_compressed_input(self, mock_run_shell_command):
augur.io.vcf.write_vcf(
Expand Down

0 comments on commit 2a08326

Please sign in to comment.