Skip to content

Commit

Permalink
[WIP] Create first class
Browse files Browse the repository at this point in the history
  • Loading branch information
natir committed Feb 26, 2024
1 parent c51c932 commit 62ebbb7
Show file tree
Hide file tree
Showing 17 changed files with 790 additions and 26 deletions.
5 changes: 1 addition & 4 deletions scripts/gen_benchmark_plot.py
Expand Up @@ -191,10 +191,7 @@ def render_plot() -> str:
if df.shape[0] == 0:
return ""

name2func = {
name: globals().get(f"{name}_func", nothing)
for name in df.get_column("benchmark").unique().to_list()
}
name2func = {name: globals().get(f"{name}_func", nothing) for name in df.get_column("benchmark").unique().to_list()}

bench2plot = {}
for name, data in df.group_by("benchmark"):
Expand Down
3 changes: 2 additions & 1 deletion src/variantplaner/__init__.py
Expand Up @@ -7,7 +7,8 @@

from __future__ import annotations

from variantplaner import exception, extract, generate, io, normalization, struct
from variantplaner import extract, generate, io, normalization, struct
from variantplaner.objects import Vcf, VcfParsingBehavior

__all__: list[str] = [
"exception",
Expand Down
12 changes: 6 additions & 6 deletions src/variantplaner/cli/__init__.py
Expand Up @@ -21,7 +21,7 @@ class MultipleValueOption(click.Option):

def __init__(self, *args: list[typing.Any], **kwargs: dict[typing.Any, typing.Any]):
"""Intialise click option parser."""
super(MultipleValueOption, self).__init__(*args,**kwargs) # type: ignore[arg-type] # noqa: UP008 false positive and complexe type
super(MultipleValueOption, self).__init__(*args, **kwargs) # type: ignore[arg-type] # noqa: UP008 false positive and complexe type
self._previous_parser_process = None
self._eat_all_parser = None

Expand Down Expand Up @@ -109,8 +109,8 @@ def main(ctx: click.Context, *, threads: int = 1, verbose: int = 0, debug_info:


# module import required after main definition
from variantplaner.cli import metadata # noqa: E402 F401 I001 these import should be here
from variantplaner.cli import parquet2vcf # noqa: E402 F401 these import should be here
from variantplaner.cli import struct # noqa: E402 F401 these import should be here
from variantplaner.cli import transmission # noqa: E402 F401 these import should be here
from variantplaner.cli import vcf2parquet # noqa: E402 F401 these import should be here
from variantplaner.cli import metadata # noqa: E402 F401 I001 these import should be here
from variantplaner.cli import parquet2vcf # noqa: E402 F401 these import should be here
from variantplaner.cli import struct # noqa: E402 F401 these import should be here
from variantplaner.cli import transmission # noqa: E402 F401 these import should be here
from variantplaner.cli import vcf2parquet # noqa: E402 F401 these import should be here
33 changes: 19 additions & 14 deletions src/variantplaner/cli/vcf2parquet.py
Expand Up @@ -9,12 +9,13 @@

# 3rd party import
import click
import polars

# project import
from variantplaner import cli, exception, extract, io
from variantplaner import Vcf, VcfParsingBehavior, cli, exception, extract, io


@cli.main.group("vcf2parquet", chain=True) # type: ignore[has-type]
@cli.main.group("vcf2parquet", chain=True) # type: ignore[has-type]
@click.pass_context
@click.option(
"-i",
Expand Down Expand Up @@ -54,20 +55,18 @@ def vcf2parquet(

logger.debug(f"parameter: {input_path=} {chrom2length_path=} {append=}")

if not chrom2length_path:
logger.error("--chrom2length-path argument is required")

try:
logger.debug("Start extract header")
headers = io.vcf.extract_header(input_path)
logger.debug("End extract header")
except exception.NotAVCFError:
logger.error("Input file seems no be a vcf") # noqa: TRY400 we are in cli exception isn't readable
sys.exit(11)
lf = Vcf()

# Read vcf and manage structural variant
logger.debug("Start read vcf")
lf = io.vcf.into_lazyframe(input_path, chrom2length_path, extension=io.vcf.IntoLazyFrameExtension.MANAGE_SV)
try:
lf.from_path(input_path, chrom2length_path, behavior=VcfParsingBehavior)
except exception.NotVcfHeaderError:
logging.error(f"Path {input_path} seems not contains Vcf.") # noqa: TRY400 we are in cli exception isn't readable
sys.exit(11)
except exception.NoContigsLengthInformationError:
logging.error("Vcf didn't contains contigs length information you could use chrom2length-path argument.") # noqa: TRY400 we are in cli exception isn't readable
sys.exit(12)
logger.debug("End read vcf")

ctx.obj["vcf_path"] = input_path
Expand Down Expand Up @@ -98,7 +97,13 @@ def variants(
logger.debug(f"parameter: {output_path=}")

logger.info(f"Start write variants in {output_path}")
extract.variants(lf).sink_parquet(output_path, maintain_order=False)
variants = lf.variants()
print(f"variants: {variants.fetch(10)}")

try:
variants.sink_parquet(output_path, maintain_order=False)
except polars.exceptions.InvalidOperationError:
variants.collect(streaming=True).write_parquet(output_path)
logger.info(f"End write variants in {output_path}")


Expand Down
24 changes: 24 additions & 0 deletions src/variantplaner/exception.py
Expand Up @@ -13,6 +13,30 @@
# project import


class NoContigsLengthInformationError(Exception):
    """Exception raised when no contigs length information is available.

    Contigs length must be provided either by the vcf header or by a companion file.
    """

    def __init__(self) -> None:
        """Initialize no contigs length information error."""
        # Message fixed: "of in compagnion" was a typo for "or in companion"
        super().__init__("Contigs length information is required in vcf header or in companion file.")


class NotAVariantCsvError(Exception):
    """Exception raised when a csv expected to hold variants lacks the minimal required columns."""

    def __init__(self, path: pathlib.Path):
        """Initialize the error with the path of the offending csv file."""
        message = f"{path} seems not be a csv variant."
        super().__init__(message)


class NotVcfHeaderError(Exception):
    """Exception raised when a header is not compatible with the vcf format."""

    def __init__(self):
        """Initialize the error with its fixed message."""
        message = "Not a vcf header"
        super().__init__(message)


class NotAVCFError(Exception):
"""Exception raise if file read seems not be a vcf, generally not contains a line starts with '#CHROM'."""

Expand Down
8 changes: 7 additions & 1 deletion src/variantplaner/normalization.py
Expand Up @@ -33,6 +33,8 @@ def add_variant_id(lf: polars.LazyFrame, chrom2length: polars.LazyFrame) -> pola
Returns:
[polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) with chr column normalized
"""
print(f"normalization {lf.fetch(10)}, {chrom2length.fetch(10)}")

real_pos_max = chrom2length.select([polars.col("length").sum()]).collect().get_column("length").max()

if "SVTYPE" in lf.columns and "SVLEN" in lf.columns:
Expand All @@ -56,8 +58,12 @@ def add_variant_id(lf: polars.LazyFrame, chrom2length: polars.LazyFrame) -> pola
),
)

lf = lf.join(chrom2length, on="chr", how="left")

print(f"BEFORE SV {chrom2length.fetch(10)}")
lf = lf.join(chrom2length, right_on="contig", left_on="chr", how="left")
print(f"LF join {lf.fetch(10)}")
lf = lf.with_columns(real_pos=polars.col("pos") + polars.col("offset"))
print(f"LF real_pos {lf.fetch(10)}")

lf = lf.with_columns(
id=polars.col("real_pos").variant_id.compute( # type: ignore # noqa: PGH003
Expand Down
8 changes: 8 additions & 0 deletions src/variantplaner/objects/__init__.py
@@ -0,0 +1,8 @@
"""Module to store variantplaner object."""

# std import
from __future__ import annotations

# 3rd party import
# project import
from variantplaner.objects.vcf import Vcf, VcfParsingBehavior
24 changes: 24 additions & 0 deletions src/variantplaner/objects/annotations.py
@@ -0,0 +1,24 @@
"""Declare Annotations object."""

# std import
from __future__ import annotations

# 3rd party import
import polars

# project import


class Annotations(polars.LazyFrame):
    """Object to manage lazyframe as Annotations."""

    def __init__(self):
        """Initialize an empty Annotations object carrying the minimal schema."""
        # Delegate to polars.LazyFrame.__init__; calling the `super(...)` proxy
        # itself (as `super(polars.LazyFrame, self)(...)`) raises TypeError.
        super().__init__(schema=Annotations.minimal_schema())

    @classmethod
    def minimal_schema(cls) -> dict[str, type]:
        """Get minimal schema of annotations polars.LazyFrame.

        Returns:
            Mapping of required column name to its polars data type.
        """
        return {
            "id": polars.UInt64,
        }
78 changes: 78 additions & 0 deletions src/variantplaner/objects/contigs_length.py
@@ -0,0 +1,78 @@
"""Declare ContigsLength object."""

# std import
from __future__ import annotations

import re
import typing

# 3rd party import
import polars

# project import
from variantplaner.objects.csv import Csv

if typing.TYPE_CHECKING:
import pathlib
import sys

from variantplaner.objects.csv import ScanCsv
from variantplaner.objects.vcf_header import VcfHeader

if sys.version_info >= (3, 11):
from typing import Unpack
else:
from typing_extensions import Unpack


class ContigsLength:
    """Store contigs -> length information.

    Data lives in `self.lf`, a polars.LazyFrame with columns `contig`,
    `length` and `offset`; `offset` is the summed length of the rows that
    come before the current one.
    """

    # Patterns applied to each contig header line. A value stops at ',' or at
    # the closing '>' so the last field of the line does not swallow the '>'.
    _CONTIG_ID = re.compile(r"ID=(?P<id>[^,>]+)")
    _CONTIG_LENGTH = re.compile(r"length=(?P<length>[^,>]+)")

    def __init__(self):
        """Initialise an empty contigs length."""
        self.lf = polars.LazyFrame(
            schema={
                "contig": polars.String,
                "length": polars.UInt64,
                "offset": polars.UInt64,
            },
        )

    @staticmethod
    def _with_offset(lf: polars.LazyFrame) -> polars.LazyFrame:
        """Add the `offset` column: cumulative `length` minus the row's own `length`."""
        return lf.with_columns(offset=polars.col("length").cum_sum() - polars.col("length"))

    def from_vcf_header(self, header: VcfHeader) -> int:
        """Fill object with the contigs declared in a VcfHeader.

        Contig lines that do not provide both an `ID=` and a `length=` field
        are ignored.

        Args:
            header: VcfHeader whose `contigs` lines are parsed.

        Returns:
            Number of contig lines used.

        Raises:
            ValueError: if a `length=` value cannot be converted to int.
        """
        contigs: list[str] = []
        lengths: list[int] = []
        for contig_line in header.contigs:
            id_match = self._CONTIG_ID.search(contig_line)
            len_match = self._CONTIG_LENGTH.search(contig_line)
            if id_match and len_match:
                contigs.append(id_match["id"])
                lengths.append(int(len_match["length"]))

        self.lf = self._with_offset(
            polars.LazyFrame(
                {"contig": contigs, "length": lengths},
                schema={"contig": polars.String, "length": polars.UInt64},
            ),
        )

        return len(contigs)

    def from_path(self, path: pathlib.Path, /, **scan_csv_args: Unpack[ScanCsv]) -> int:
        """Fill object with the content of the csv file pointed to by path.

        The file must provide a `length` column, used to compute `offset`.
        NOTE(review): a `contig` column is presumably expected too, to match
        the schema set in `__init__` — confirm against callers.

        Args:
            path: path of input file.
            **scan_csv_args: keyword arguments forwarded to Csv.from_path.

        Returns:
            Number of rows read from the file.
        """
        csv = Csv()
        csv.from_path(path, **scan_csv_args)
        # Count rows on the csv frame itself: this class has no LazyFrame
        # parent, so `super().collect()` cannot work here.
        count = csv.collect().shape[0]
        self.lf = self._with_offset(csv)

        return count
82 changes: 82 additions & 0 deletions src/variantplaner/objects/csv.py
@@ -0,0 +1,82 @@
"""Declare CSV object."""

# std import
from __future__ import annotations

import dataclasses
import typing

# 3rd party import
import polars

# project import
from variantplaner.exception import NotAVariantCsvError

if typing.TYPE_CHECKING: # pragma: no cover
import pathlib
import sys
from collections.abc import Sequence

if sys.version_info >= (3, 11):
from typing import Unpack
else:
from typing_extensions import Unpack

class ScanCsv(typing.TypedDict, total=False):
    """Typed dictionary used to type-check the keyword arguments given to [polars.scan_csv][].

    `total=False` makes every key optional, so callers pass only the options they need.
    """

    has_header: bool
    separator: str
    comment_prefix: str | None
    quote_char: str | None
    skip_rows: int
    dtypes: polars.type_aliases.SchemaDict | Sequence[polars.type_aliases.PolarsDataType] | None
    null_values: str | Sequence[str] | dict[str, str] | None
    missing_utf8_is_empty_string: bool
    ignore_errors: bool
    cache: bool
    with_column_names: typing.Callable[[list[str]], list[str]] | None
    infer_schema_length: int | None
    n_rows: int | None
    encoding: polars.type_aliases.CsvEncoding
    low_memory: bool
    rechunk: bool
    skip_rows_after_header: int
    row_index_name: str | None
    row_index_offset: int
    try_parse_dates: bool
    eol_char: str
    new_columns: Sequence[str] | None


@dataclasses.dataclass
class ColRename:
    """A struct to store rename parameter.

    NOTE(review): presumably each field holds the name, in the csv file, of the
    column standing for the field's own name — confirm against callers. There
    is no `pos` field although `pos` is a column Csv.variants_from_path requires.
    """

    chr: str = "chr"
    ref: str = "ref"
    alt: str = "alt"
    # Extra renames; default_factory gives each instance its own dict
    other: dict[str, str] = dataclasses.field(default_factory=dict)


class Csv(polars.LazyFrame):
    """Object to manage lazyframe as Csv."""

    def __init__(self):
        """Initialize an empty Csv object."""
        super().__init__()

    def _adopt(self, frame: polars.LazyFrame) -> None:
        """Make this object carry the lazy plan of `frame`."""
        # Rebinding `self` inside a method only changes a local name and leaves
        # the instance untouched, so the plan is copied onto the instance.
        # NOTE(review): `_ldf` is the private attribute polars.LazyFrame keeps
        # its plan in — confirm it is valid for the pinned polars version.
        self._ldf = frame._ldf  # noqa: SLF001

    def from_path(self, path: pathlib.Path, /, **scan_csv_args: Unpack[ScanCsv]) -> None:
        """Populate Csv object with csv file content.

        Args:
            path: path of the csv file.
            **scan_csv_args: keyword arguments forwarded to polars.scan_csv.
        """
        self._adopt(polars.scan_csv(path, **scan_csv_args))

    def variants_from_path(self, path: pathlib.Path, col_rename: ColRename, /, **scan_csv_args: Unpack[ScanCsv]) -> None:
        """Populate Csv object with a csv file holding variants.

        Columns are renamed following `col_rename`, then `pos` is cast to UInt64.

        Args:
            path: path of the csv file.
            col_rename: names, in the csv file, of the chr, ref and alt columns.
            **scan_csv_args: keyword arguments forwarded to polars.scan_csv.

        Raises:
            NotAVariantCsvError: if one of chr, pos, ref, alt is missing after renaming.
        """
        self.from_path(path, **scan_csv_args)

        # The check below needs columns named chr/ref/alt after renaming, so
        # the mapping goes from the csv name to the canonical name. Identity
        # pairs are left out so a missing column reaches the explicit check.
        pairs = ((col_rename.chr, "chr"), (col_rename.ref, "ref"), (col_rename.alt, "alt"))
        mapping = {source: target for source, target in pairs if source != target}
        # NOTE(review): `other` is assumed to map existing name -> new name, as
        # polars rename expects — confirm against callers.
        mapping.update(col_rename.other)
        if mapping:
            self._adopt(self.rename(mapping))

        columns = self.columns
        if any(name not in columns for name in ("chr", "pos", "ref", "alt")):
            raise NotAVariantCsvError(path)

        self._adopt(self.cast({"pos": polars.UInt64}))
25 changes: 25 additions & 0 deletions src/variantplaner/objects/genotypes.py
@@ -0,0 +1,25 @@
"""Declare Genotypes object."""

# std import
from __future__ import annotations

# 3rd party import
import polars

# project import


class Genotypes(polars.LazyFrame):
    """Object to manage lazyframe as Genotypes."""

    def __init__(self):
        """Initialize an empty Genotypes object carrying the minimal schema."""
        # Delegate to polars.LazyFrame.__init__; calling the `super(...)` proxy
        # itself (as `super(polars.LazyFrame, self)(...)`) raises TypeError.
        super().__init__(schema=Genotypes.minimal_schema())

    @classmethod
    def minimal_schema(cls) -> dict[str, type]:
        """Get minimal schema of genotypes polars.LazyFrame.

        Returns:
            Mapping of required column name to its polars data type.
        """
        return {
            "id": polars.UInt64,
            "samples": polars.String,
        }

0 comments on commit 62ebbb7

Please sign in to comment.