diff --git a/pyproject.toml b/pyproject.toml index 25d95dd..5cfec78 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,8 @@ tests = [ "pytest-randomly>=3", "pytest-xdist>=3", "pytest>=7", + "pytest-vulture>=2", + "vulture>=2", ] typing = [ "mypy>=1", diff --git a/scripts/gen_benchmark_plot.py b/scripts/gen_benchmark_plot.py index 1709a89..61b9c42 100644 --- a/scripts/gen_benchmark_plot.py +++ b/scripts/gen_benchmark_plot.py @@ -191,10 +191,7 @@ def render_plot() -> str: if df.shape[0] == 0: return "" - name2func = { - name: globals().get(f"{name}_func", nothing) - for name in df.get_column("benchmark").unique().to_list() - } + name2func = {name: globals().get(f"{name}_func", nothing) for name in df.get_column("benchmark").unique().to_list()} bench2plot = {} for name, data in df.group_by("benchmark"): diff --git a/src/variantplaner/__init__.py b/src/variantplaner/__init__.py index 95c56c1..4e2b40c 100644 --- a/src/variantplaner/__init__.py +++ b/src/variantplaner/__init__.py @@ -7,7 +7,8 @@ from __future__ import annotations -from variantplaner import exception, extract, generate, io, normalization, struct +from variantplaner import extract, generate, io, normalization, struct +from variantplaner.objects import Vcf, VcfParsingBehavior __all__: list[str] = [ "exception", diff --git a/src/variantplaner/cli/__init__.py b/src/variantplaner/cli/__init__.py index e74c7a5..df68cb3 100644 --- a/src/variantplaner/cli/__init__.py +++ b/src/variantplaner/cli/__init__.py @@ -21,7 +21,7 @@ class MultipleValueOption(click.Option): def __init__(self, *args: list[typing.Any], **kwargs: dict[typing.Any, typing.Any]): """Intialise click option parser.""" - super(MultipleValueOption, self).__init__(*args,**kwargs) # type: ignore[arg-type] # noqa: UP008 false positive and complexe type + super(MultipleValueOption, self).__init__(*args, **kwargs) # type: ignore[arg-type] # noqa: UP008 false positive and complexe type self._previous_parser_process = None self._eat_all_parser = None @@ -109,8 +109,8 @@ def main(ctx: click.Context, *, threads: int = 1, verbose: int = 0, debug_info: # module import required after main definition -from variantplaner.cli import metadata # noqa: E402 F401 I001 these import should be here -from variantplaner.cli import parquet2vcf # noqa: E402 F401 these import should be here -from variantplaner.cli import struct # noqa: E402 F401 these import should be here -from variantplaner.cli import transmission # noqa: E402 F401 these import should be here -from variantplaner.cli import vcf2parquet # noqa: E402 F401 these import should be here +from variantplaner.cli import metadata # noqa: E402 F401 I001 these import should be here +from variantplaner.cli import parquet2vcf # noqa: E402 F401 these import should be here +from variantplaner.cli import struct # noqa: E402 F401 these import should be here +from variantplaner.cli import transmission # noqa: E402 F401 these import should be here +from variantplaner.cli import vcf2parquet # noqa: E402 F401 these import should be here diff --git a/src/variantplaner/cli/parquet2vcf.rs b/src/variantplaner/cli/parquet2vcf.rs deleted file mode 100644 index 8b13789..0000000 --- a/src/variantplaner/cli/parquet2vcf.rs +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/variantplaner/cli/vcf2parquet.py b/src/variantplaner/cli/vcf2parquet.py index 56f1879..07a2067 100644 --- a/src/variantplaner/cli/vcf2parquet.py +++ b/src/variantplaner/cli/vcf2parquet.py @@ -9,12 +9,13 @@ # 3rd party import import click +import polars # project import -from variantplaner 
import cli, exception, extract, io
+from variantplaner import Vcf, VcfParsingBehavior, cli, exception, extract, io
 
 
-@cli.main.group("vcf2parquet", chain=True) # type: ignore[has-type]
+@cli.main.group("vcf2parquet", chain=True)  # type: ignore[has-type]
 @click.pass_context
 @click.option(
     "-i",
@@ -54,26 +55,27 @@ def vcf2parquet(
 
     logger.debug(f"parameter: {input_path=} {chrom2length_path=} {append=}")
 
-    if not chrom2length_path:
-        logger.error("--chrom2length-path argument is required")
-
-    try:
-        logger.debug("Start extract header")
-        headers = io.vcf.extract_header(input_path)
-        logger.debug("End extract header")
-    except exception.NotAVCFError:
-        logger.error("Input file seems no be a vcf")  # noqa: TRY400 we are in cli exception isn't readable
-        sys.exit(11)
+    lf = Vcf()
 
     # Read vcf and manage structural variant
     logger.debug("Start read vcf")
-    lf = io.vcf.into_lazyframe(input_path, chrom2length_path, extension=io.vcf.IntoLazyFrameExtension.MANAGE_SV)
+    try:
+        lf.from_path(input_path, chrom2length_path, behavior=VcfParsingBehavior.MANAGE_SV)
+    except exception.NotVcfHeaderError:
+        logger.error(f"Path {input_path} doesn't seem to contain a vcf header.")  # noqa: TRY400 we are in cli exception isn't readable
+        sys.exit(11)
+    except exception.NotAVCFError:
+        logger.error(f"Path {input_path} doesn't seem to contain a vcf.")  # noqa: TRY400 we are in cli exception isn't readable
+        sys.exit(12)
+    except exception.NoContigsLengthInformationError:
+        logger.error("Vcf doesn't contain contig length information, you can use the --chrom2length-path argument.")  # noqa: TRY400 we are in cli exception isn't readable
+        sys.exit(13)
     logger.debug("End read vcf")
 
     ctx.obj["vcf_path"] = input_path
     ctx.obj["lazyframe"] = lf
     ctx.obj["append"] = append
-    ctx.obj["headers"] = headers
+    ctx.obj["headers"] = lf.header
 
 
 @vcf2parquet.command("variants")
@@ -98,7 +100,12 @@ def variants(
     logger.debug(f"parameter: {output_path=}")
 
     logger.info(f"Start write variants in {output_path}")
-    extract.variants(lf).sink_parquet(output_path, maintain_order=False)
+    variants = lf.variants()
+
+    try:
+        variants.sink_parquet(output_path, maintain_order=False)
+    except polars.exceptions.InvalidOperationError:
+        variants.collect(streaming=True).write_parquet(output_path)
     logger.info(f"End write variants in {output_path}")
 
 
@@ -129,19 +136,22 @@ def genotypes(
     lf = ctx.obj["lazyframe"]
     append = ctx.obj["append"]  # noqa: F841 not used now
-    headers = ctx.obj["headers"]
+    headers_obj = ctx.obj["headers"]
     input_path = ctx.obj["vcf_path"]
 
     logger.debug(f"parameter: {output_path=} {format_string=}")
 
     try:
-        genotypes_lf = extract.genotypes(lf, io.vcf.format2expr(headers, input_path), format_string)
+        genotypes_data = lf.genotypes(format_string)
     except exception.NoGenotypeError:
         logger.error("It's seems vcf not contains genotypes information.")  # noqa: TRY400 we are in cli exception isn't readable
         sys.exit(12)
 
     logger.info(f"Start write genotypes in {output_path}")
-    genotypes_lf.sink_parquet(output_path, maintain_order=False)
+    try:
+        genotypes_data.lf.sink_parquet(output_path, maintain_order=False)
+    except polars.exceptions.InvalidOperationError:
+        genotypes_data.lf.collect(streaming=True).write_parquet(output_path)
     logger.info(f"End write genotypes in {output_path}")
 
 
@@ -178,22 +188,25 @@ def annotations_subcommand(
     lf = ctx.obj["lazyframe"]
     append = ctx.obj["append"]  # noqa: F841 not used now
-    headers = ctx.obj["headers"]
+    headers_obj = ctx.obj["headers"]
     input_path = ctx.obj["vcf_path"]
 
     logger.debug(f"parameter: {output_path=}")
 
     logger.info("Start extract annotations")
-    annotations_lf = lf.with_columns(io.vcf.info2expr(headers, input_path, info))
-    annotations_lf = annotations_lf.drop(["chr", "pos", "ref", "alt", "filter", "qual", "info"])
+    annotations_data = lf.lf.with_columns(headers_obj.info_parser(info))
+    annotations_data = annotations_data.drop(["chr", "pos", "ref", "alt", "filter", "qual", "info"])
 
     if rename_id:
         logger.info(f"Rename vcf variant id in {rename_id}")
-        annotations_lf = annotations_lf.rename({"vid": rename_id})
+        annotations_data = annotations_data.rename({"vid": rename_id})
     logger.info("End extract annotations")
 
     logger.info(f"Start write annotations in {output_path}")
-    annotations_lf.sink_parquet(output_path, maintain_order=False)
+    try:
+        annotations_data.sink_parquet(output_path, maintain_order=False)
+    except polars.exceptions.InvalidOperationError:
+        annotations_data.collect(streaming=True).write_parquet(output_path)
     logger.info(f"End write annotations in {output_path}")
 
 
@@ -213,12 +226,12 @@ def headers(
     """Write vcf headers."""
     logger = logging.getLogger("vcf2parquet.headers")
 
-    headers = ctx.obj["headers"]
+    headers_obj = ctx.obj["headers"]
 
     logger.debug(f"parameter: {output_path=}")
 
     logger.info(f"Start write headers in {output_path}")
     with open(output_path, "w") as fh_out:
-        for line in headers:
+        for line in headers_obj:
             print(line, file=fh_out)
     logger.info(f"End write headers in {output_path}")
diff --git a/src/variantplaner/exception.py b/src/variantplaner/exception.py
index 43950eb..7725ddd 100644
--- a/src/variantplaner/exception.py
+++ b/src/variantplaner/exception.py
@@ -13,6 +13,30 @@
 # project import
 
 
+class NoContigsLengthInformationError(Exception):
+    """Exception raised if no contig length information is found in the vcf or in the companion file."""
+
+    def __init__(self):
+        """Initialize no contigs length information error."""
+        super().__init__("Contig length information is required in the vcf header or in a companion file.")
+
+
+class NotAVariantCsvError(Exception):
+    """Exception raised if a csv file that should contain variant info doesn't match the minimal column requirements."""
+
+    def __init__(self, path: pathlib.Path):
+        """Initialize not a variant csv error."""
+        super().__init__(f"{path} doesn't seem to be a variant csv.")
+
+
+class NotVcfHeaderError(Exception):
+    """Exception raised if the header isn't compatible with vcf."""
+
+    def __init__(self):
+        """Initialize not a vcf header error."""
+        super().__init__("Not a vcf header")
+
+
 class NotAVCFError(Exception):
     """Exception raise if file read seems not be a vcf, generally not contains a line starts with '#CHROM'."""
 
diff --git a/src/variantplaner/normalization.py b/src/variantplaner/normalization.py
index bc41cd1..16888b1 100644
--- a/src/variantplaner/normalization.py
+++ b/src/variantplaner/normalization.py
@@ -33,6 +33,7 @@ def add_variant_id(lf: polars.LazyFrame, chrom2length: polars.LazyFrame) -> pola
     Returns:
         [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) with chr column normalized
     """
+    real_pos_max = chrom2length.select([polars.col("length").sum()]).collect().get_column("length").max()
 
     if "SVTYPE" in lf.columns and "SVLEN" in lf.columns:
@@ -56,7 +57,7 @@ def add_variant_id(lf: polars.LazyFrame, chrom2length: polars.LazyFrame) -> pola
             ),
         )
 
-    lf = lf.join(chrom2length, on="chr", how="left")
+    lf = lf.join(chrom2length, right_on="contig", left_on="chr", how="left")
 
     lf = lf.with_columns(real_pos=polars.col("pos") + polars.col("offset"))
     lf = lf.with_columns(
diff --git a/src/variantplaner/objects/__init__.py
b/src/variantplaner/objects/__init__.py
new file mode 100644
index 0000000..40394f8
--- /dev/null
+++ b/src/variantplaner/objects/__init__.py
@@ -0,0 +1,8 @@
+"""Module to store variantplaner objects."""
+
+# std import
+from __future__ import annotations
+
+# 3rd party import
+# project import
+from variantplaner.objects.vcf import Vcf, VcfParsingBehavior
diff --git a/src/variantplaner/objects/annotations.py b/src/variantplaner/objects/annotations.py
new file mode 100644
index 0000000..3422ec8
--- /dev/null
+++ b/src/variantplaner/objects/annotations.py
@@ -0,0 +1,24 @@
+"""Declare Annotations object."""
+
+# std import
+from __future__ import annotations
+
+# 3rd party import
+import polars
+
+# project import
+
+
+class Annotations(polars.LazyFrame):
+    """Object to manage lazyframe as Annotations."""
+
+    def __init__(self):
+        """Initialize an Annotations object."""
+        self.lf = polars.LazyFrame(schema=Annotations.minimal_schema())
+
+    @classmethod
+    def minimal_schema(cls) -> dict[str, type]:
+        """Get minimal schema of annotations polars.LazyFrame."""
+        return {
+            "id": polars.UInt64,
+        }
diff --git a/src/variantplaner/objects/contigs_length.py b/src/variantplaner/objects/contigs_length.py
new file mode 100644
index 0000000..b6bb7d3
--- /dev/null
+++ b/src/variantplaner/objects/contigs_length.py
@@ -0,0 +1,84 @@
+"""Declare ContigsLength object."""
+
+# std import
+from __future__ import annotations
+
+import re
+import typing
+
+# 3rd party import
+import polars
+
+# project import
+from variantplaner.objects.csv import Csv
+
+if typing.TYPE_CHECKING:
+    import pathlib
+    import sys
+
+    from variantplaner.objects.csv import ScanCsv
+    from variantplaner.objects.vcf_header import VcfHeader
+
+    if sys.version_info >= (3, 11):
+        from typing import Unpack
+    else:
+        from typing_extensions import Unpack
+
+
+class ContigsLength:
+    """Store contigs -> length information."""
+
+    def __init__(self):
+        """Initialise a contigs length object."""
+        self.lf = polars.LazyFrame(
+            schema={
+                "contig": polars.String,
+                "length": polars.UInt64,
+                "offset": polars.UInt64,
+            }
+        )
+
+    def from_vcf_header(self, header: VcfHeader) -> int:
+        """Fill the object from a VcfHeader.
+
+        Args:
+            header: VcfHeader
+
+        Returns: Number of contig lines seen
+        """
+        contigs_id = re.compile(r"ID=(?P<id>[^,]+)")
+        contigs_len = re.compile(r"length=(?P<length>[^,>]+)")
+
+        count = 0
+        contigs2len = {"contig": list(), "length": list()}
+        for contig_line in header.contigs:
+            if (len_match := contigs_len.search(contig_line)) and (id_match := contigs_id.search(contig_line)):
+                contigs2len["contig"].append(id_match.groupdict()["id"])
+                contigs2len["length"].append(int(len_match.groupdict()["length"]))
+                count += 1
+
+        self.lf = polars.LazyFrame(contigs2len, schema={"contig": polars.String, "length": polars.UInt64})
+
+        self.__compute_offset()
+
+        return count
+
+    def from_path(self, path: pathlib.Path, /, **scan_csv_args: Unpack[ScanCsv]) -> int:
+        """Fill the object from the file pointed to by path.
+
+        Args:
+            path: path of the input file
+
+        Returns: Number of contig lines seen
+        """
+        csv = Csv()
+        csv.from_path(path, **scan_csv_args)
+        self.lf = csv.lf
+
+        self.__compute_offset()
+
+        return self.lf.collect().shape[0]
+
+    def __compute_offset(self):
+        self.lf = self.lf.with_columns(offset=polars.col("length").cum_sum() - polars.col("length"))
+        self.lf = self.lf.cast({"offset": polars.UInt64})
diff --git a/src/variantplaner/objects/csv.py b/src/variantplaner/objects/csv.py
new file mode 100644
index 0000000..9ac6166
--- /dev/null
+++ b/src/variantplaner/objects/csv.py
@@ -0,0 +1,82 @@
+"""Declare CSV object."""
+
+# std import
+from __future__ import annotations
+
+import dataclasses
+import typing
+
+# 3rd party import
+import polars
+
+# project import
+from variantplaner.exception import NotAVariantCsvError
+
+if typing.TYPE_CHECKING:  # pragma: no cover
+    import pathlib
+    import sys
+    from collections.abc import Sequence
+
+    if sys.version_info >= (3, 11):
+        from typing import Unpack
+    else:
+        from typing_extensions import Unpack
+
+    class ScanCsv(typing.TypedDict, total=False):
+        """A struct to check the types of parameters given to [polars.scan_csv][]."""
+
+        has_header: bool
+        separator: str
+        comment_prefix: str | None
+        quote_char: str | None
+        skip_rows: int
+        dtypes: polars.type_aliases.SchemaDict | Sequence[polars.type_aliases.PolarsDataType] | None
+        null_values: str | Sequence[str] | dict[str, str] | None
+        missing_utf8_is_empty_string: bool
+        ignore_errors: bool
+        cache: bool
+        with_column_names: typing.Callable[[list[str]], list[str]] | None
+        infer_schema_length: int | None
+        n_rows: int | None
+        encoding: polars.type_aliases.CsvEncoding
+        low_memory: bool
+        rechunk: bool
+        skip_rows_after_header: int
+        row_index_name: str | None
+        row_index_offset: int
+        try_parse_dates: bool
+        eol_char: str
+        new_columns: Sequence[str] | None
+
+
+@dataclasses.dataclass
+class ColRename:
+    """A struct to store rename parameters."""
+
+    chr: str = "chr"
+    ref: str = "ref"
+    alt: str = "alt"
+    other: dict[str, str] = dataclasses.field(default_factory=dict)
+
+
+class Csv(polars.LazyFrame):
+    """Object to manage lazyframe as Csv."""
+
+    def __init__(self):
+        """Initialize a Csv object."""
+        self.lf = polars.LazyFrame()
+
+    def from_path(self, path: pathlib.Path, /, **scan_csv_args: Unpack[ScanCsv]) -> None:
+        """Populate Csv object with csv file content."""
+        self.lf = polars.scan_csv(path, **scan_csv_args)
+
+    def variants_from_path(self, path: pathlib.Path, col_rename: ColRename, /, **scan_csv_args: Unpack[ScanCsv]) -> None:
+        """Populate Csv object with variant csv file content."""
+        self.from_path(path, **scan_csv_args)
+
+        self.lf = self.lf.rename(dataclasses.asdict(col_rename))
+
+        if any(elt not in self.lf.columns for elt in ["chr", "pos", "ref", "alt"]):
+            raise NotAVariantCsvError(path)
+
+        self.lf = self.lf.cast({"pos": polars.UInt64})
diff --git a/src/variantplaner/objects/genotypes.py b/src/variantplaner/objects/genotypes.py
new file mode 100644
index 0000000..0d7f565
--- /dev/null
+++ b/src/variantplaner/objects/genotypes.py
@@ -0,0 +1,25 @@
+"""Declare Genotypes object."""
+
+# std import
+from __future__ import annotations
+
+# 3rd party import
+import polars
+
+# project import
+
+
+class Genotypes(polars.LazyFrame):
+    """Object to manage lazyframe as Genotypes."""
+
+    def __init__(self):
+        """Initialize a Genotypes object."""
+        self.lf = polars.LazyFrame(schema=Genotypes.minimal_schema())
+
+    @classmethod
+    def minimal_schema(cls) -> dict[str, type]:
+        """Get minimal schema of genotypes polars.LazyFrame."""
+        return {
+            "id": polars.UInt64,
+            "samples": polars.String,
+        }
diff --git a/src/variantplaner/objects/origins.py b/src/variantplaner/objects/origins.py
new file mode 100644
index 0000000..76a4921
--- /dev/null
+++ b/src/variantplaner/objects/origins.py
@@ -0,0 +1,28 @@
+"""Declare Origins object."""
+
+# std import
+from __future__ import annotations
+
+# 3rd party import
+import polars
+
+# project import
+
+
+class Origins(polars.LazyFrame):
+    """Object to manage lazyframe as Origins."""
+
+    def __init__(self):
+        """Initialize an Origins object."""
+        self.lf = polars.LazyFrame(schema=Origins.minimal_schema())
+
+    @classmethod
+    def minimal_schema(cls) -> dict[str, type]:
+        """Get minimal schema of origins polars.LazyFrame."""
+        return {
+            "id": polars.UInt64,
+            "index_gt": polars.UInt8,
+            "mother_gt": polars.UInt8,
+            "father_gt": polars.UInt8,
+            "origin": polars.String,
+        }
diff --git a/src/variantplaner/objects/pedigree.py b/src/variantplaner/objects/pedigree.py
new file mode 100644
index 0000000..7bdcd6e
--- /dev/null
+++ b/src/variantplaner/objects/pedigree.py
@@ -0,0 +1,29 @@
+"""Declare Pedigree object."""
+
+# std import
+from __future__ import annotations
+
+# 3rd party import
+import polars
+
+# project import
+
+
+class Pedigree(polars.LazyFrame):
+    """Object to manage lazyframe as Pedigree."""
+
+    def __init__(self):
+        """Initialize a Pedigree object."""
+        self.lf = polars.LazyFrame(schema=Pedigree.minimal_schema())
+
+    @classmethod
+    def minimal_schema(cls) -> dict[str, type]:
+        """Get schema of pedigree polars.LazyFrame."""
+        return {
+            "family_id": polars.String,
+            "personal_id": polars.String,
+            "father_id": polars.String,
+            "mother_id": polars.String,
+            "sex": polars.String,
+            "affected": polars.Boolean,
+        }
diff --git a/src/variantplaner/objects/variants.py b/src/variantplaner/objects/variants.py
new file mode 100644
index 0000000..5c5fdcb
--- /dev/null
+++ b/src/variantplaner/objects/variants.py
@@ -0,0 +1,28 @@
+"""Declare Variants object."""
+
+# std import
+from __future__ import annotations
+
+# 3rd party import
+import polars
+
+# project import
+
+
+class Variants(polars.LazyFrame):
+    """Object to manage lazyframe as Variants."""
+
+    def __init__(self):
+        """Initialize a Variants object."""
+        self.lf = polars.LazyFrame(schema=Variants.minimal_schema())
+
+    @classmethod
+    def minimal_schema(cls) -> dict[str, type]:
+        """Get schema of variants polars.LazyFrame."""
+        return {
+            "id": polars.UInt64,
+            "chr": polars.String,
+            "pos": polars.UInt64,
+            "ref": polars.String,
+            "alt": polars.String,
+        }
diff --git a/src/variantplaner/objects/vcf.py b/src/variantplaner/objects/vcf.py
new file mode 100644
index 0000000..dfc7db1
--- /dev/null
+++ b/src/variantplaner/objects/vcf.py
@@ -0,0 +1,149 @@
+"""Declare Vcf object."""
+
+# std import
+from __future__ import annotations
+
+import enum
+
+# 3rd party import
+import typing
+
+import polars
+
+# project import
+from variantplaner import normalization
+from variantplaner.exception import NoContigsLengthInformationError, NoGenotypeError, NotAVCFError, NotVcfHeaderError
+from variantplaner.objects.annotations import Annotations
+from variantplaner.objects.contigs_length import ContigsLength
+from variantplaner.objects.genotypes import Genotypes
+from variantplaner.objects.variants import Variants
+from variantplaner.objects.vcf_header import VcfHeader
+
+if typing.TYPE_CHECKING:
+    import pathlib
+
+
+class VcfParsingBehavior(enum.Enum):
+    """Enumeration used to control the behavior of Vcf.from_path."""
+
+    NOTHING = 0
+    """from_path doesn't apply any specific behavior"""
+
+    MANAGE_SV = 1
+    """from_path tries to avoid structural variant id collisions, SVTYPE/SVLEN info values must be present."""
+
+
+class Vcf:
+    """Object to manage lazyframe as Vcf."""
+
+    def __init__(self):
+        """Initialize a Vcf object."""
+        self.lf = polars.LazyFrame(schema=Variants.minimal_schema())
+
+        self.header = VcfHeader()
+
+    def from_path(
+        self, path: pathlib.Path, chr2len_path: pathlib.Path | None, behavior: VcfParsingBehavior = VcfParsingBehavior.NOTHING
+    ) -> None:
+        """Populate Vcf object with vcf file."""
+        with open(path) as fh:
+            try:
+                self.header.from_lines(fh)
+            except NotVcfHeaderError as e:
+                raise NotAVCFError(path) from e
+
+        chr2len = ContigsLength()
+        if chr2len.from_vcf_header(self.header) == 0 and (chr2len_path is None or chr2len.from_path(chr2len_path) == 0):
+            raise NoContigsLengthInformationError
+
+        self.lf = polars.scan_csv(
+            path,
+            separator="\t",
+            comment_prefix="#",
+            has_header=False,
+        )
+
+        self.lf = self.lf.rename(dict(zip(self.lf.columns, self.header.column_name(self.lf.width))))
+        self.lf = self.lf.cast(Vcf.schema())
+
+        if behavior == VcfParsingBehavior.MANAGE_SV:
+            self.lf = self.lf.with_columns(self.header.info_parser({"SVTYPE", "SVLEN"}))
+
+        self.lf = normalization.add_variant_id(self.lf, chr2len.lf)
+
+        if behavior == VcfParsingBehavior.MANAGE_SV:
+            self.lf = self.lf.drop("SVTYPE", "SVLEN")
+
+
+    def variants(self) -> Variants:
+        """Get variants of vcf."""
+        return self.lf.select(Variants.minimal_schema())
+
+
+    def genotypes(self, format_str: str = "GT:AD:DP:GQ") -> Genotypes:
+        """Get genotypes of vcf."""
+        if "format" not in self.lf.columns:
+            raise NoGenotypeError
+
+        lf = self.lf.select([*self.lf.columns[self.lf.columns.index("format") :]])
+
+        # Remove variants whose format doesn't match the requested format string
+        lf = lf.filter(polars.col("format").str.starts_with(format_str)).select(*lf.columns[1:])
+
+        # Find the index of each genotype value
+        col_index = {
+            key: index
+            for (index, key) in enumerate(
+                format_str.split(":"),
+            )
+        }
+
+        # Pivot value
+        genotypes = Genotypes()
+        genotypes.lf = lf.melt(id_vars=["id"]).with_columns(
+            [
+                polars.col("id"),
+                polars.col("variable").alias("sample"),
+                polars.col("value").str.split(":"),
+            ],
+        )
+
+        # Split genotype column in sub value
+        col2expr = self.header.format_parser()
+
+        genotypes.lf = genotypes.lf.with_columns(
+            [
+                polars.col("value").list.get(index).pipe(function=col2expr[col], col_name=col)
+                for col, index in col_index.items()
+            ],
+        )
+
+        # Select interesting columns
+        genotypes.lf = genotypes.lf.select(["id", "sample", *[col.lower() for col in col_index]])
+
+        if "gt" in genotypes.lf.columns:
+            genotypes.lf = genotypes.lf.filter(polars.col("gt") != 0)
+
+        return genotypes
+
+
+    def annotations(self, select_info: set[str] | None = None) -> Annotations:
+        """Get annotations of vcf."""
+        lf = self.lf.with_columns(self.header.info_parser(select_info))
+
+        return lf.drop("chr", "pos", "ref", "alt", "format", "info")
+
+    @classmethod
+    def schema(cls) -> dict[str, type]:
+        """Get schema of Vcf polars.LazyFrame."""
+
+        return {
+            "chr": polars.String,
+            "pos": polars.UInt64,
+            "vid": polars.String,
+            "ref": polars.String,
+            "alt": polars.String,
+            "qual": polars.String,
+            "filter": polars.String,
+            "info": polars.String,
+        }
diff --git a/src/variantplaner/objects/vcf_header.py b/src/variantplaner/objects/vcf_header.py
new file mode 100644
index 0000000..c368538
--- /dev/null
+++ b/src/variantplaner/objects/vcf_header.py
@@ -0,0 +1,249 @@
+"""Declare VcfHeader object."""
object.""" + +# std import +from __future__ import annotations + +import functools + +# 3rd party import +import re +import typing + +import polars + +from variantplaner.exception import NotVcfHeaderError + +# project import + +MINIMAL_COL_NUMBER: int = 8 +SAMPLE_COL_BEGIN: int = 9 + + +class VcfHeader: + """Object that parse and store vcf information.""" + + def __init__(self): + """Initialise VcfHeader.""" + self._header = [] + + def from_lines(self, lines: typing.Iterator[str]) -> None: + """Extract all header information of vcf lines. + + Line between start of file and first line start with '#CHROM' or not start with '#' + + Args: + lines: Iterator of line + + Returns: None + + Raises: + NotAVcfHeader: If a line not starts with '#' + NotAVcfHeader: If no line start by '#CHROM' + """ + for full_line in lines: + line = full_line.strip() + + if not line.startswith("#"): + raise NotVcfHeaderError + + if line.startswith("#CHROM"): + self._header.append(line) + return + + self._header.append(line) + + raise NotVcfHeaderError + + def info_parser(self, select_info: set[str] | None = None) -> list[polars.Expr]: + """Generate a list of [polars.Expr](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/index.html) to extract variants information. + + Args: + header: Line of vcf header + input_path: Path to vcf file. + select_info: List of target info field + + Returns: + List of [polars.Expr](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/index.html) to parse info columns. + + Raises: + NotVcfHeaderError: If all line not start by '#CHR' + """ + info_re = re.compile( + r"ID=(?P([A-Za-z_][0-9A-Za-z_.]*|1000G)),Number=(?P[ARG0-9\.]+),Type=(?PInteger|Float|String|Character)", + ) + + expressions: list[polars.Expr] = [] + + for line in self._header: + if line.startswith("#CHROM"): + return expressions + + if not line.startswith("##INFO"): + continue + + if (search := info_re.search(line)) and (not select_info or search["id"] in select_info): + regex = rf"{search['id']}=([^;]+);?" + + local_expr = polars.col("info").str.extract(regex, 1) + + if search["number"] == "1": + if search["type"] == "Integer": + local_expr = local_expr.cast(polars.Int64) + elif search["type"] == "Float": + local_expr = local_expr.cast(polars.Float64) + elif search["type"] in {"String", "Character"}: + pass # Not do anything on string or character + else: + pass # Not reachable + + else: + local_expr = local_expr.str.split(",") + if search["type"] == "Integer": + local_expr = local_expr.cast(polars.List(polars.Int64)) + elif search["type"] == "Float": + local_expr = local_expr.cast(polars.List(polars.Float64)) + elif search["type"] in {"String", "Character"}: + pass # Not do anything on string or character + else: + pass # Not reachable + + expressions.append(local_expr.alias(search["id"])) + + raise NotVcfHeaderError + + def format_parser( + self, + select_format: set[str] | None = None, + ) -> dict[str, typing.Callable[[polars.Expr, str], polars.Expr]]: + """Generate a list of [polars.Expr](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/index.html) to extract genotypes information. + + **Warning**: Float values can't be converted for the moment they are stored as String to keep information + + Args: + header: Line of vcf header. + input_path: Path to vcf file. + select_format: List of target format field. 
+ + Returns: + A dict to link format id to pipeable function with Polars.Expr + + Raises: + NotVcfHeaderError: If all line not start by '#CHR' + """ + format_re = re.compile( + "ID=(?P[A-Za-z_][0-9A-Za-z_.]*),Number=(?P[ARG0-9\\.]+),Type=(?PInteger|Float|String|Character)", + ) + + expressions: dict[str, typing.Callable[[polars.Expr, str], polars.Expr]] = {} + + for line in self._header: + if line.startswith("#CHROM"): + return expressions + + if not line.startswith("##FORMAT"): + continue + + if (search := format_re.search(line)) and (not select_format or search["id"] in select_format): + name = search["id"] + number = search["number"] + format_type = search["type"] + + if name == "GT": + expressions["GT"] = VcfHeader.__format_gt + continue + + if number == "1": + if format_type == "Integer": + expressions[name] = VcfHeader.__format_one_int + elif format_type == "Float": # noqa: SIM114 Float isn't already support but in future + expressions[name] = VcfHeader.__format_one_str + elif format_type in {"String", "Character"}: + expressions[name] = VcfHeader.__format_one_str + else: + pass # Not reachable + + elif format_type == "Integer": + expressions[name] = VcfHeader.__format_list_int + elif format_type == "Float": # noqa: SIM114 Float isn't already support but in future + expressions[name] = VcfHeader.__format_list_str + elif format_type in {"String", "Character"}: + expressions[name] = VcfHeader.__format_list_str + else: + pass # Not reachable + + raise NotVcfHeaderError + + @functools.cached_property + def samples_index(self) -> dict[str, int] | None: + """Read vcf header to generate an association map between sample name and index. + + Args: + header: Header string. + + Returns: + Map that associate a sample name to is sample index. + + Raises: + NotVcfHeaderError: If all line not start by '#CHR' + """ + for line in reversed(self._header): + if line.startswith("#CHR"): + split_line = line.strip().split("\t") + if len(split_line) <= MINIMAL_COL_NUMBER: + return None + + return {sample: i for (i, sample) in enumerate(split_line[SAMPLE_COL_BEGIN:])} + + raise NotVcfHeaderError + + @functools.cached_property + def contigs(self) -> typing.Iterator[str]: + """Get an iterator of line contains chromosomes information. + + Returns: String iterator + """ + for full_line in self._header: + if full_line.startswith("##contig"): + yield full_line.strip() + + def column_name(self, number_of_column: int = MINIMAL_COL_NUMBER) -> typing.Iterator[str]: + """Get an iterator of correct column name. 
+ + Returns: String iterator + """ + base_col_name = ["chr", "pos", "vid", "ref", "alt", "qual", "filter", "info"] + + yield from base_col_name + + if number_of_column > MINIMAL_COL_NUMBER and (samples := self.samples_index): + yield "format" + yield from (sample for (sample, _) in samples.items()) + + @staticmethod + def __format_gt(expr: polars.Expr, /, col_name: str) -> polars.Expr: + """Manage gt field.""" + return expr.str.count_matches("1").cast(polars.UInt8).alias(col_name.lower()) + + @staticmethod + def __format_one_int(expr: polars.Expr, /, col_name: str) -> polars.Expr: + """Manage integer field.""" + return expr.str.to_integer(base=10, strict=False).cast(polars.UInt32).alias(col_name.lower()) + + @staticmethod + def __format_one_str(expr: polars.Expr, /, col_name: str) -> polars.Expr: + """Manage string field.""" + return expr.alias(col_name.lower()) + + @staticmethod + def __format_list_int(expr: polars.Expr, /, col_name: str) -> polars.Expr: + """Manage list of integer field.""" + return ( + expr.str.split(",") + .list.eval(polars.element().str.to_integer(base=10, strict=False).cast(polars.UInt32)) + .alias(col_name.lower()) + ) + + @staticmethod + def __format_list_str(expr: polars.Expr, /, col_name: str) -> polars.Expr: + """Manag list string field.""" + return expr.str.split(",").alias(col_name.lower()) diff --git a/tests/data/all_info.vcf b/tests/data/all_info.vcf index 58b48c6..8288310 100644 --- a/tests/data/all_info.vcf +++ b/tests/data/all_info.vcf @@ -2,6 +2,31 @@ ##fileDate=2023-04-10 ##source=ClinVar ##reference=GRCh38 +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= ##ID= ##INFO= ##INFO= diff --git a/tests/data/grch38.92.csv b/tests/data/grch38.92.csv index 07cb9e1..6b2f3a4 100644 --- a/tests/data/grch38.92.csv +++ b/tests/data/grch38.92.csv @@ -1,4 +1,4 @@ -chr,length +contig,length 1,248956422 10,133797422 11,135086622 diff --git a/tests/data/no_genotypes.annotations.parquet b/tests/data/no_genotypes.annotations.parquet index c680791..d9ad0d0 100644 Binary files a/tests/data/no_genotypes.annotations.parquet and b/tests/data/no_genotypes.annotations.parquet differ diff --git a/tests/data/sv.genotypes.parquet b/tests/data/sv.genotypes.parquet index 9d9948e..3546f6e 100644 Binary files a/tests/data/sv.genotypes.parquet and b/tests/data/sv.genotypes.parquet differ diff --git a/tests/data/sv.variants.parquet b/tests/data/sv.variants.parquet index 5065e3f..4bbb087 100644 Binary files a/tests/data/sv.variants.parquet and b/tests/data/sv.variants.parquet differ diff --git a/tests/test_cli.py b/tests/test_cli.py index 1764595..c54237d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -208,7 +208,7 @@ def test_vcf2parquet_not_vcf(tmp_path: pathlib.Path) -> None: ], ) - assert result.exit_code == 11 + assert result.exit_code == 12 def test_vcf2parquet_no_genotype(tmp_path: pathlib.Path) -> None: @@ -486,7 +486,7 @@ def test_annotations_vcf_not_vcf(tmp_path: pathlib.Path) -> None: ], ) - assert result.exit_code == 11 + assert result.exit_code == 12 def test_annotations_vcf_select(tmp_path: pathlib.Path) -> None: diff --git a/tests/test_extract.py b/tests/test_extract.py deleted file mode 100644 index c76b2ee..0000000 --- a/tests/test_extract.py +++ /dev/null @@ -1,126 +0,0 @@ -"""Tests for the `extract` module.""" - -# std import -from 
__future__ import annotations - -import pathlib - -# 3rd party import -import polars -import polars.testing -import pytest - -# project import -from variantplaner import exception, extract, io - -DATA_DIR = pathlib.Path(__file__).parent / "data" - - -def test_extract_variants() -> None: - """Check extract variants.""" - truth = polars.scan_parquet(DATA_DIR / "no_info.variants.parquet") - - df = io.vcf.into_lazyframe(DATA_DIR / "no_info.vcf", DATA_DIR / "grch38.92.csv") - - lf = extract.variants(df.lazy()) - - polars.testing.assert_frame_equal(truth, lf, check_row_order=False, check_column_order=False) - - -def test_extract_genotypes() -> None: - """Check extract genotypes.""" - truth = polars.scan_parquet(DATA_DIR / "no_info.genotypes.parquet") - - df = io.vcf.into_lazyframe(DATA_DIR / "no_info.vcf", DATA_DIR / "grch38.92.csv") - - vcf_header = io.vcf.extract_header(DATA_DIR / "no_info.vcf") - lf = extract.genotypes(df, io.vcf.format2expr(vcf_header, DATA_DIR / "no_info.vcf")) - - try: - polars.testing.assert_frame_equal(truth, lf) - except OverflowError: # TODO remove this - assert truth.columns == lf.columns - assert truth.dtypes == lf.dtypes - assert truth.width == lf.width - - -def test_extract_genotypes_no_gt() -> None: - """Check extract genotypes without gt value.""" - input_path = DATA_DIR / "no_info_no_gt.vcf" - - lf = io.vcf.into_lazyframe(input_path, DATA_DIR / "grch38.92.csv") - - vcf_header = io.vcf.extract_header(input_path) - - format2expr = io.vcf.format2expr(vcf_header, input_path) - - lf = extract.genotypes(lf, format2expr, format_str="AD:DP:GQ") - - truth = polars.scan_parquet(DATA_DIR / "no_info_no_gt.genotypes.parquet") - - polars.testing.assert_frame_equal(truth, lf) - - -def test_extract_genotypes_without_genotypes() -> None: - """Check extract genotypes failled if vcf not containts genotypes.""" - df = io.vcf.into_lazyframe(DATA_DIR / "no_genotypes.vcf", DATA_DIR / "grch38.92.csv") - vcf_header = io.vcf.extract_header(DATA_DIR / "no_info.vcf") - - with pytest.raises(exception.NoGenotypeError): - extract.genotypes(df.lazy(), io.vcf.format2expr(vcf_header, DATA_DIR / "no_info.vcf")) - - -def test_extract_merge_variant_genotypes() -> None: - """Check merge_variants_genotypes.""" - vcf = io.vcf.into_lazyframe(DATA_DIR / "no_info.vcf", DATA_DIR / "grch38.92.csv") - vcf_header = io.vcf.extract_header(DATA_DIR / "no_info.vcf") - sample_name = io.vcf.sample_index(vcf_header, DATA_DIR / "no_info.vcf") - if sample_name is None: - raise AssertionError # pragma: no cover Not reachable code - - format2expr = io.vcf.format2expr(vcf_header, DATA_DIR / "no_info.vcf") - - variants = extract.variants(vcf) - genotypes = extract.genotypes(vcf, format2expr, format_str="GT:AD:DP:GQ") - - merge = extract.merge_variants_genotypes(variants, genotypes, list(sample_name.keys())) - - assert merge.columns == [ - "id", - "chr", - "pos", - "ref", - "alt", - "sample_1_gt", - "sample_1_ad", - "sample_1_dp", - "sample_1_gq", - "sample_2_gt", - "sample_2_ad", - "sample_2_dp", - "sample_2_gq", - "sample_3_gt", - "sample_3_ad", - "sample_3_dp", - "sample_3_gq", - ] - - assert merge.dtypes == [ - polars.UInt64, - polars.Utf8, - polars.UInt64, - polars.Utf8, - polars.Utf8, - polars.UInt8, - polars.List(polars.UInt32), - polars.UInt32, - polars.UInt32, - polars.UInt8, - polars.List(polars.UInt32), - polars.UInt32, - polars.UInt32, - polars.UInt8, - polars.List(polars.UInt32), - polars.UInt32, - polars.UInt32, - ] diff --git a/tests/test_io_csv.py b/tests/test_io_csv.py deleted file mode 100644 index 
e2893cc..0000000 --- a/tests/test_io_csv.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Tests for the `io.csv` module.""" - -# std import -from __future__ import annotations - -import pathlib - -# 3rd party import -import polars -import polars.testing - -# project import -from variantplaner import extract, io - -DATA_DIR = pathlib.Path(__file__).parent / "data" - - -def test_into_lazyframe() -> None: - """Check io.csv.into_lazyframe.""" - vcf = extract.variants(io.vcf.into_lazyframe(DATA_DIR / "no_info.vcf", DATA_DIR / "grch38.92.csv")) - - csv = io.csv.into_lazyframe( - DATA_DIR / "no_info.csv", - DATA_DIR / "grch38.92.csv", - "chr", - "pos", - "ref", - "alt", - ["id"], - separator=",", - dtypes={"id": polars.UInt64, "pos": polars.UInt64}, - ) - - polars.testing.assert_frame_equal(vcf, csv, check_column_order=False) diff --git a/tests/test_io_vcf.py b/tests/test_io_vcf.py deleted file mode 100644 index 6eb4cf9..0000000 --- a/tests/test_io_vcf.py +++ /dev/null @@ -1,470 +0,0 @@ -"""Tests for the `io.vcf` module.""" - -# std import -from __future__ import annotations - -import filecmp -import typing - -if typing.TYPE_CHECKING: # pragma: no cover - import sys - - if sys.version_info >= (3, 11): - from typing import ParamSpec - else: - from typing_extensions import ParamSpec - - T = typing.TypeVar("T") - P = ParamSpec("P") - -# 3rd party import -import pathlib - -import polars -import polars.testing -import pytest - -# project import -from variantplaner import exception, extract, io - -DATA_DIR = pathlib.Path(__file__).parent / "data" - - -def test_extract_header() -> None: - """Check extract_header.""" - header = io.vcf.extract_header(DATA_DIR / "no_info.vcf") - - assert header == [ - "##fileformat=VCFv4.2", - '##FILTER=', - '##FILTER=', - '##FORMAT=', - '##FORMAT=', - '##FORMAT=', - '##FORMAT=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - '##FILTER=', - "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample_1\tsample_2\tsample_3", - ] - - -def test_extract_header_exception() -> None: - """Check extract_header.""" - with pytest.raises(exception.NotAVCFError): - io.vcf.extract_header(DATA_DIR / "no_info.tsv") - - with pytest.raises(exception.NotAVCFError): - io.vcf.extract_header(DATA_DIR / "only_header.vcf") - - -def test_info2expr_no_info_vcf() -> None: - """Check info2expr.""" - header = io.vcf.extract_header(DATA_DIR / "no_info.vcf") - expressions = io.vcf.info2expr(header, DATA_DIR / "no_info.vcf") - - assert expressions == [] - - -def test_info2expr() -> None: - """Check info2expr.""" - header = io.vcf.extract_header(DATA_DIR / "no_genotypes.vcf") - expressions = io.vcf.info2expr(header, DATA_DIR / "no_genotypes.vcf") - - assert len(expressions) == 21 - - -def test_info2expr_exception() -> None: - """Check info2expr.""" - header = io.vcf.extract_header(DATA_DIR / "no_genotypes.vcf") - - with pytest.raises(exception.NotAVCFError): - io.vcf.info2expr(header[:-1], DATA_DIR / "no_genotypes.vcf") - - -def test_format2expr_no_format_vcf() -> None: - """Check format2expr.""" - header = io.vcf.extract_header(DATA_DIR / "no_genotypes.vcf") - assert ( - io.vcf.format2expr( - header, - DATA_DIR / "no_genotypes.vcf", - ) - == {} - ) - - -def test_format2expr() -> None: - """Check format2expr.""" - header = io.vcf.extract_header(DATA_DIR / 
"no_info.vcf") - - assert set(io.vcf.format2expr(header, DATA_DIR / "no_info.vcf").keys()) == { - "AD", - "DP", - "GQ", - "GT", - } - - -def test_format2expr_exception() -> None: - """Check format2expr.""" - header = io.vcf.extract_header(DATA_DIR / "no_info.vcf") - - with pytest.raises(exception.NotAVCFError): - io.vcf.format2expr(header[:-1], DATA_DIR / "no_info.vcf") - - -def test_sample_index() -> None: - """Check sample index.""" - truth = {"sample_1": 0, "sample_2": 1, "sample_3": 2} - - header = io.vcf.extract_header(DATA_DIR / "no_info.vcf") - header.reverse() - sample2idx = io.vcf.sample_index(header, DATA_DIR / "no_info.vcf") - - assert sample2idx - assert len(truth) == len(sample2idx) - assert all(v == sample2idx[k] for k, v in truth.items()) - - -def test_sample_index_no_genotypes() -> None: - """Check sample index.""" - header = io.vcf.extract_header(DATA_DIR / "no_genotypes.vcf") - assert io.vcf.sample_index(header, DATA_DIR / "no_genotypes.vcf") is None - - -def test_sample_index_exception() -> None: - """Check sample index exception.""" - with pytest.raises(exception.NotAVCFError): - io.vcf.extract_header(DATA_DIR / "no_info.tsv") - - with pytest.raises(exception.NotAVCFError): - io.vcf.sample_index([], DATA_DIR / "no_info.tsv") - - with pytest.raises(exception.NotAVCFError): - io.vcf.extract_header(DATA_DIR / "only_header.vcf") - - -def test__column_name() -> None: - """Check __column_name.""" - header = io.vcf.extract_header(DATA_DIR / "no_info.vcf") - assert io.vcf.__column_name(header, DATA_DIR / "no_info.vcf") == [ - "chr", - "pos", - "vid", - "ref", - "alt", - "qual", - "filter", - "info", - "format", - "sample_1", - "sample_2", - "sample_3", - ] - - with pytest.raises(exception.NotAVCFError): - io.vcf.__column_name(header[:-1], DATA_DIR / "no_info.vcf") - - -def test_into_lazyframe() -> None: - """Check into lazyframe.""" - lf = io.vcf.into_lazyframe(DATA_DIR / "no_info.vcf", DATA_DIR / "grch38.92.csv") - - truth = polars.scan_parquet(DATA_DIR / "no_info.parquet") - - polars.testing.assert_frame_equal(truth, lf) - - lf = io.vcf.into_lazyframe(DATA_DIR / "no_genotypes.vcf", DATA_DIR / "grch38.92.csv") - - truth = polars.scan_parquet(DATA_DIR / "no_genotypes.parquet") - - polars.testing.assert_frame_equal(truth, lf) - - -def test_into_lazyframe_sv() -> None: - """Check into lazyframe work on structural variant.""" - input_path = DATA_DIR / "sv.vcf" - - lf = io.vcf.into_lazyframe( - input_path, - DATA_DIR / "grch38.92.csv", - extension=io.vcf.IntoLazyFrameExtension.MANAGE_SV, - ) - - polars.testing.assert_frame_equal(lf, polars.scan_parquet(DATA_DIR / "sv.parquet"), check_row_order=False) - - -def test_into_lazyframe_exception() -> None: - """Check into_lazyframe exception.""" - with pytest.raises(exception.NotAVCFError): - io.vcf.into_lazyframe(DATA_DIR / "no_info.tsv", DATA_DIR / "grch38.92.csv") - - with pytest.raises(exception.NotAVCFError): - io.vcf.into_lazyframe(DATA_DIR / "only_header.vcf", DATA_DIR / "grch38.92.csv") - - -def test_build_rename_column() -> None: - """Check build_rename_column.""" - assert io.vcf.build_rename_column("chr", "pos", "id", "ref", "alt") == { - "#CHROM": "chr", - "POS": "pos", - "ID": "id", - "REF": "ref", - "ALT": "alt", - "QUAL": ".", - "FILTER": ".", - "INFO": [], - "FORMAT": "", - "sample": {}, - } - - assert io.vcf.build_rename_column( - "chr", - "pos", - "id", - "ref", - "alt", - "quality", - "FILTER", - [("GENE", "gene_name")], - "GT:AD:DP:GQ", - ) == { - "#CHROM": "chr", - "POS": "pos", - "ID": "id", - "REF": "ref", - "ALT": 
"alt", - "QUAL": "quality", - "FILTER": "FILTER", - "INFO": [("GENE", "gene_name")], - "FORMAT": "GT:AD:DP:GQ", - "sample": {}, - } - - assert io.vcf.build_rename_column( - "chr", - "pos", - "id", - "ref", - "alt", - "quality", - "FILTER", - [("GENE", "gene_name")], - "GT:AD:DP:GQ", - { - "sample": { - "gt": "sample_gt", - "ad": "sample_ad", - "dp": "sample_dp", - "gq": "sample_gq", - }, - }, - ) == { - "#CHROM": "chr", - "POS": "pos", - "ID": "id", - "REF": "ref", - "ALT": "alt", - "QUAL": "quality", - "FILTER": "FILTER", - "INFO": [("GENE", "gene_name")], - "FORMAT": "GT:AD:DP:GQ", - "sample": { - "sample": { - "gt": "sample_gt", - "ad": "sample_ad", - "dp": "sample_dp", - "gq": "sample_gq", - } - }, - } - - -def test_from_lazyframe(tmp_path: pathlib.Path) -> None: - """Check from_lazyframe.""" - tmp_file = tmp_path / "output.vcf" - - lf = polars.scan_parquet(DATA_DIR / "no_info.parquet") - - io.vcf.from_lazyframe(lf, tmp_file) - - assert filecmp.cmp(tmp_file, DATA_DIR / "no_info.parquet2vcf.vcf") - - io.vcf.from_lazyframe( - lf, - tmp_file, - io.vcf.build_rename_column("chr", "pos", "id", "ref", "alt", "vid", "vid"), - ) - - assert filecmp.cmp(tmp_file, DATA_DIR / "no_info.parquet2vcf.vcf") - - -def test_from_lazyframe_qual_filter_info(tmp_path: pathlib.Path) -> None: - """Check from_lazyframe qual, filter, info.""" - output_path = tmp_path / "output.vcf" - input_path = DATA_DIR / "no_genotypes.vcf" - - vcf_header = io.vcf.extract_header(input_path) - info_parser = io.vcf.info2expr(vcf_header, input_path) - lf = io.vcf.into_lazyframe(DATA_DIR / "no_genotypes.vcf", DATA_DIR / "grch38.92.csv").with_columns(info_parser) - - io.vcf.from_lazyframe( - lf, - output_path, - io.vcf.build_rename_column( - "chr", - "pos", - "id", - "ref", - "alt", - "qual", - "filter", - [(col_name, col_name) for col_name in lf.columns if col_name.isupper()], - ), - ) - - assert filecmp.cmp(output_path, DATA_DIR / "no_genotypes.vcf2parquet2vcf.vcf") - - -def test_from_lazyframe_qual_filter_format(tmp_path: pathlib.Path) -> None: - """Check from_lazyframe qual, filter, info.""" - output_path = tmp_path / "output.vcf" - input_path = DATA_DIR / "no_info.vcf" - - format_string = "GT:AD:DP:GQ" - vcf = io.vcf.into_lazyframe(input_path, DATA_DIR / "grch38.92.csv") - vcf_header = io.vcf.extract_header(input_path) - sample_name = io.vcf.sample_index(vcf_header, input_path) - if sample_name is None: - raise AssertionError # pragma: no cover Not reachable code - - format2expr = io.vcf.format2expr(vcf_header, input_path) - - variants = extract.variants(vcf) - genotypes = extract.genotypes(vcf, format2expr, format_string) - - merge = extract.merge_variants_genotypes(variants, genotypes, list(sample_name.keys())) - sample2vcf_col2polars_col: dict[str, dict[str, str]] = {} - for sample in sample_name: - sample2vcf_col2polars_col[sample] = {} - for format_col in format_string.split(":"): - sample2vcf_col2polars_col[sample][format_col] = f"{sample}_{format_col.lower()}" - - io.vcf.from_lazyframe( - merge, - output_path, - io.vcf.build_rename_column( - "chr", - "pos", - "id", - "ref", - "alt", - None, - None, - [], - "GT:AD:DP:GQ", - sample2vcf_col2polars_col, - ), - ) - - assert filecmp.cmp(output_path, DATA_DIR / "no_info.vcf2parquet2vcf.vcf") - - -def test_add_info_column() -> None: - """Check add_info_column.""" - vcf_header = io.vcf.extract_header(DATA_DIR / "no_genotypes.vcf") - info_parser = io.vcf.info2expr(vcf_header, DATA_DIR / "no_genotypes.vcf") - lf = io.vcf.into_lazyframe(DATA_DIR / "no_genotypes.vcf", DATA_DIR / 
"grch38.92.csv").with_columns(info_parser) - - info = io.vcf.add_info_column(lf, [(col_name, col_name) for col_name in lf.columns if col_name.isupper()]) - - truth = [ - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=2193183;CLNDN=Inborn_genetic_diseases;CLNDNINCL=.;CLNDISDB=MeSH:D030342,MedGen:C0950123;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.69134A>G;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=OR4F5:79501;MC=SO:0001583|missense_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=2238986;CLNDN=Inborn_genetic_diseases;CLNDNINCL=.;CLNDISDB=MeSH:D030342,MedGen:C0950123;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.69581C>G;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=OR4F5:79501;MC=SO:0001583|missense_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=2386655;CLNDN=Inborn_genetic_diseases;CLNDNINCL=.;CLNDISDB=MeSH:D030342,MedGen:C0950123;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.69682G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=OR4F5:79501;MC=SO:0001583|missense_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=2278803;CLNDN=Inborn_genetic_diseases;CLNDNINCL=.;CLNDISDB=MeSH:D030342,MedGen:C0950123;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.69769T>C;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=OR4F5:79501;MC=SO:0001583|missense_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=2333177;CLNDN=Inborn_genetic_diseases;CLNDNINCL=.;CLNDISDB=MeSH:D030342,MedGen:C0950123;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.69995G>C;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=OR4F5:79501;MC=SO:0001583|missense_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=1983057;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.925946C>G;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=1003021;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.925952G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1;RS=1640863258", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=1632777;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.925956C>T;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001819|synonymous_variant;ORIGIN=1;RS=.", - 
"AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=2129477;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.925961A>T;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=1600580;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.925969C>T;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=1396033;CLNDN=Inborn_genetic_diseases|not_provided;CLNDNINCL=.;CLNDISDB=MeSH:D030342,MedGen:C0950123|MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.925976T>C;CLNREVSTAT=criteria_provided,_multiple_submitters,_no_conflicts;CLNSIG=Uncertain_significance;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=1986319;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.925980C>T;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001819|synonymous_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=1570515;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.925986C>T;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001819|synonymous_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=1502313;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.926003C>T;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=1545352;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.926010G>T;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001819|synonymous_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=1473095;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.926014G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001575|splice_donor_variant;ORIGIN=1;RS=.", - 
"AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=2034738;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.926018G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001627|intron_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=1550067;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.926025G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001627|intron_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=2155344;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.926026G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001627|intron_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=1641615;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.926027C>T;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001627|intron_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=1629212;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.926029C>T;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001627|intron_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=1631640;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.930136T>C;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001627|intron_variant;ORIGIN=1;RS=.", - "AF_ESP=.;AF_EXAC=.;AF_TGP=.;ALLELEID=1563949;CLNDN=not_provided;CLNDNINCL=.;CLNDISDB=MedGen:CN517202;CLNDISDBINCL=.;CLNHGVS=NC_000001.11:g.930139CCT[1];CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNSIGCONF=.;CLNSIGINCL=.;CLNVC=Microsatellite;CLNVCSO=SO:0000289;CLNVI=.;DBVARID=.;GENEINFO=SAMD11:148398;MC=SO:0001627|intron_variant;ORIGIN=1;RS=.", - ] - - assert info.select([polars.col("INFO")]).collect()["INFO"].to_list() == truth - - -def test_generate_header() -> None: - """Check generate header.""" - vcf_header = io.vcf.extract_header(DATA_DIR / "no_genotypes.vcf") - info_parser = io.vcf.info2expr(vcf_header, DATA_DIR / "no_genotypes.vcf") - lf = io.vcf.into_lazyframe(DATA_DIR / "no_genotypes.vcf", DATA_DIR / "grch38.92.csv").with_columns(info_parser) - - assert ( - io.vcf.__generate_header(lf) - == """##fileformat=VCFv4.3 -##source=VariantPlanner -""" - ) - - info = [(col_name, col_name) for col_name in lf.columns if col_name.isupper()] - - header = io.vcf.__generate_header(lf, info) - truth = """##fileformat=VCFv4.3 -##source=VariantPlanner -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= 
-##INFO= -##INFO= -##INFO= -""" - - assert header == truth diff --git a/tests/test_normalization.py b/tests/test_normalization.py index 0357a71..5a73ebd 100644 --- a/tests/test_normalization.py +++ b/tests/test_normalization.py @@ -25,7 +25,7 @@ def __generate_chr2len() -> polars.DataFrame: """Generate a chr2len dataframe.""" chr2len = polars.DataFrame( data={ - "chr": ["1", "2", "3", "22", "X"], + "contig": ["1", "2", "3", "22", "X"], "length": [10_000_000, 50_000, 120_000_500, 99_239_816, 10_000], }, schema_overrides={"length": polars.UInt64},
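For reference, a minimal sketch of how the new object API introduced by this diff is meant to be driven. File paths here are hypothetical examples, and the fallback from sink_parquet to collect/write_parquet mirrors what cli/vcf2parquet.py does above; this is not part of the diff itself.

import pathlib

import polars

from variantplaner import Vcf, VcfParsingBehavior

vcf = Vcf()
# Parse the header and records; contig lengths come from the vcf header or a companion csv (contig,length).
vcf.from_path(pathlib.Path("sample.vcf"), pathlib.Path("grch38.92.csv"), behavior=VcfParsingBehavior.MANAGE_SV)

variants = vcf.variants()  # lazy frame restricted to the minimal variant schema (id, chr, pos, ref, alt)
try:
    variants.sink_parquet("variants.parquet", maintain_order=False)
except polars.exceptions.InvalidOperationError:
    # Some plans can't be streamed to a sink; collect then write instead.
    variants.collect(streaming=True).write_parquet("variants.parquet")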