diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 2d91a474..d43d99b3 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -28,9 +28,15 @@ jobs: - name: Run unit tests shell: bash -l {0} run: | - micromamba run -n neat pytest -q tests + micromamba run -n neat pytest --cov=neat --cov-report=xml --cov-report=term tests - name: Run CLI integration test shell: bash -l {0} run: | micromamba run -n neat pytest -q tests/test_cli + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4 + with: + file: ./coverage.xml + fail_ci_if_error: false diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..9b5b0c11 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,43 @@ + +# GitNexus — Code Intelligence + +This project is indexed by GitNexus as **NEAT** (2553 symbols, 5138 relationships, 57 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely. + +> If any GitNexus tool warns the index is stale, run `npx gitnexus analyze` in terminal first. + +## Always Do + +- **MUST run impact analysis before editing any symbol.** Before modifying a function, class, or method, run `gitnexus_impact({target: "symbolName", direction: "upstream"})` and report the blast radius (direct callers, affected processes, risk level) to the user. +- **MUST run `gitnexus_detect_changes()` before committing** to verify your changes only affect expected symbols and execution flows. +- **MUST warn the user** if impact analysis returns HIGH or CRITICAL risk before proceeding with edits. +- When exploring unfamiliar code, use `gitnexus_query({query: "concept"})` to find execution flows instead of grepping. It returns process-grouped results ranked by relevance. +- When you need full context on a specific symbol — callers, callees, which execution flows it participates in — use `gitnexus_context({name: "symbolName"})`. + +## Never Do + +- NEVER edit a function, class, or method without first running `gitnexus_impact` on it. +- NEVER ignore HIGH or CRITICAL risk warnings from impact analysis. +- NEVER rename symbols with find-and-replace — use `gitnexus_rename` which understands the call graph. +- NEVER commit changes without running `gitnexus_detect_changes()` to check affected scope. + +## Resources + +| Resource | Use for | +|----------|---------| +| `gitnexus://repo/NEAT/context` | Codebase overview, check index freshness | +| `gitnexus://repo/NEAT/clusters` | All functional areas | +| `gitnexus://repo/NEAT/processes` | All execution flows | +| `gitnexus://repo/NEAT/process/{name}` | Step-by-step execution trace | + +## CLI + +| Task | Read this skill file | +|------|---------------------| +| Understand architecture / "How does X work?" | `.claude/skills/gitnexus/gitnexus-exploring/SKILL.md` | +| Blast radius / "What breaks if I change X?" | `.claude/skills/gitnexus/gitnexus-impact-analysis/SKILL.md` | +| Trace bugs / "Why is X failing?" | `.claude/skills/gitnexus/gitnexus-debugging/SKILL.md` | +| Rename / extract / split / refactor | `.claude/skills/gitnexus/gitnexus-refactoring/SKILL.md` | +| Tools, resources, schema reference | `.claude/skills/gitnexus/gitnexus-guide/SKILL.md` | +| Index, status, clean, wiki CLI commands | `.claude/skills/gitnexus/gitnexus-cli/SKILL.md` | + + \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..9b5b0c11 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,43 @@ + +# GitNexus — Code Intelligence + +This project is indexed by GitNexus as **NEAT** (2553 symbols, 5138 relationships, 57 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely. + +> If any GitNexus tool warns the index is stale, run `npx gitnexus analyze` in terminal first. + +## Always Do + +- **MUST run impact analysis before editing any symbol.** Before modifying a function, class, or method, run `gitnexus_impact({target: "symbolName", direction: "upstream"})` and report the blast radius (direct callers, affected processes, risk level) to the user. +- **MUST run `gitnexus_detect_changes()` before committing** to verify your changes only affect expected symbols and execution flows. +- **MUST warn the user** if impact analysis returns HIGH or CRITICAL risk before proceeding with edits. +- When exploring unfamiliar code, use `gitnexus_query({query: "concept"})` to find execution flows instead of grepping. It returns process-grouped results ranked by relevance. +- When you need full context on a specific symbol — callers, callees, which execution flows it participates in — use `gitnexus_context({name: "symbolName"})`. + +## Never Do + +- NEVER edit a function, class, or method without first running `gitnexus_impact` on it. +- NEVER ignore HIGH or CRITICAL risk warnings from impact analysis. +- NEVER rename symbols with find-and-replace — use `gitnexus_rename` which understands the call graph. +- NEVER commit changes without running `gitnexus_detect_changes()` to check affected scope. + +## Resources + +| Resource | Use for | +|----------|---------| +| `gitnexus://repo/NEAT/context` | Codebase overview, check index freshness | +| `gitnexus://repo/NEAT/clusters` | All functional areas | +| `gitnexus://repo/NEAT/processes` | All execution flows | +| `gitnexus://repo/NEAT/process/{name}` | Step-by-step execution trace | + +## CLI + +| Task | Read this skill file | +|------|---------------------| +| Understand architecture / "How does X work?" | `.claude/skills/gitnexus/gitnexus-exploring/SKILL.md` | +| Blast radius / "What breaks if I change X?" | `.claude/skills/gitnexus/gitnexus-impact-analysis/SKILL.md` | +| Trace bugs / "Why is X failing?" | `.claude/skills/gitnexus/gitnexus-debugging/SKILL.md` | +| Rename / extract / split / refactor | `.claude/skills/gitnexus/gitnexus-refactoring/SKILL.md` | +| Tools, resources, schema reference | `.claude/skills/gitnexus/gitnexus-guide/SKILL.md` | +| Index, status, clean, wiki CLI commands | `.claude/skills/gitnexus/gitnexus-cli/SKILL.md` | + + \ No newline at end of file diff --git a/README.md b/README.md index 51d31a6e..07851672 100755 --- a/README.md +++ b/README.md @@ -46,6 +46,7 @@ To cite this work, please use both of the following: * [`neat gen-mut-model`](#neat-gen-mut-model) * [`neat model-seq-err`](#neat-model-seq-err) * [`neat model-qual-score`](#neat-model-qual-score) + * [`neat model-gc-bias`](#neat-model-gc-bias) * [`neat vcf_compare`](#neat-vcf_compare) * [Tests](#tests) * [Guide to run locally](#guide-to-run-locally) @@ -191,6 +192,7 @@ More parameters are below: | `error_model` | Full path to an error model or quality score model generated by NEAT. Leave empty to use default model (default model based on human, sequenced by Illumina). | | `mutation_model` | Full path to a mutation model generated by NEAT. Leave empty to use a default model (default model based on human data sequenced by Illumina). | | `fragment_model` | Full path to fragment length model generated by NEAT. Leave empty to use default model (default model based on human data sequenced by Illumina). | +| `gc_model` | Full path to GC-bias model generated by NEAT. Leave empty for no GC bias. | | `threads` | The number of threads for NEAT to use. Increasing the number will speed up read generation. | | `avg_seq_error` | Average sequencing error rate for the sequencing machine. Use to increase or decrease the rate of errors in the reads. Float between 0 and 0.3. Default is set by the error model. | | `rescale_qualities` | Rescale the quality scores to reflect the `avg_seq_error` rate above. Set `True` to activate if you notice issues with the sequencing error rates in your dataset. | @@ -243,6 +245,7 @@ Features: - Can accurately simulate large, single-end reads with high indel error rates (PacBio-like) given a model - Specify simple fragment length model with mean and standard deviation or an empirically learned fragment distribution - Simulates quality scores using either the default model or empirically learned quality scores using `neat gen_mut_model` +- Introduces GC-bias using an empirically learned model from `neat model-gc-bias` - Introduces sequencing substitution errors using either the default model or empirically learned in `utilities` - Output a VCF file with the 'golden' set of true positive variants. These can be compared to bioinformatics workflow output (includes coverage and allele balance information) - Output a BAM file with the 'golden' set of aligned reads. These indicate where each read originated and how it should be aligned with the reference @@ -512,6 +515,19 @@ Similarly, use `-i2` to produce a model for paired-ended data. `-q` denotes the Finally, `-o` is the output directory for the model file and `-p` is the prefix for the output model, such that the file will be written as `.p.gz` inside the output folder. +### `neat model-gc-bias` + +Computes GC-bias model from a BAM file and reference genome. It calculates the relative weight of fragments based on their GC content. + +```bash +neat model-gc-bias \ + -i input.bam \ + -r reference.fa \ + -o /path/to/prefix +``` + +and creates `gc_model.pickle.gz` model in the working directory. + ### `neat vcf_compare` Tool for comparing VCF files (Not yet implemented in NEAT 4.4). diff --git a/config_template/simple_template.yml b/config_template/simple_template.yml index 2520a09f..05969722 100644 --- a/config_template/simple_template.yml +++ b/config_template/simple_template.yml @@ -11,6 +11,7 @@ produce_vcf: . produce_fastq: . error_model: . +gc_model: . mutation_model: . fragment_model: . diff --git a/config_template/template_neat_config.yml b/config_template/template_neat_config.yml index a0db4968..6a2ea6eb 100644 --- a/config_template/template_neat_config.yml +++ b/config_template/template_neat_config.yml @@ -25,6 +25,11 @@ coverage: . # type = string | required: no | default: /neat/models/defaults/default_error_model.pickle.gz error_model: . +# Absolute path to file containing GC-bias model. +# GC-bias models can be produced by neat model-gc-bias (learned from BAM alignments and reference) +# type = string | required: no | default: None (no GC bias) +gc_model: . + # Average sequencing error rate for the sequencing machine # type = float | required = no | must be between 0.0 and 0.3 avg_seq_error: . diff --git a/neat/cli/commands/model_gc_bias.py b/neat/cli/commands/model_gc_bias.py new file mode 100644 index 00000000..4e5332bc --- /dev/null +++ b/neat/cli/commands/model_gc_bias.py @@ -0,0 +1,62 @@ +""" +Command line interface for NEAT's compute GC bias function +""" + +import argparse + +from ...model_gc_bias import compute_gc_bias_runner +from .base import BaseCommand +from .options import output_group + +class Command(BaseCommand): + """ + Class that generates a model of GC bias, derived from real data + """ + name = "model-gc-bias" + description = "Generate GC bias model from a BAM file and a reference FASTA." + + def add_arguments(self, parser: argparse.ArgumentParser): + """ + Add the command's arguments to its parser + """ + parser.add_argument('-i', + type=str, + metavar="input_bam", + dest="input_bam", + required=True, + help="BAM input file.") + + parser.add_argument('-r', + type=str, + metavar="reference", + dest="reference", + required=True, + help="Reference FASTA file.") + + parser.add_argument('-w', + type=int, + metavar="window_size", + dest="window_size", + required=False, + default=100, + help="Window size for GC bias calculation. Default is 100.") + + parser.add_argument('--overwrite', + action='store_true', + required=False, + default=False, + help="Set this flag to overwrite existing output.") + output_group.add_to_parser(parser) + + def execute(self, arguments: argparse.Namespace): + """ + Execute the command + """ + compute_gc_bias_runner( + arguments.input_bam, + arguments.reference, + arguments.output_dir, + arguments.prefix, + arguments.window_size, + arguments.overwrite + ) diff --git a/neat/common/io.py b/neat/common/io.py index a7fbca92..afabee79 100644 --- a/neat/common/io.py +++ b/neat/common/io.py @@ -63,7 +63,13 @@ def open_input(path: str | Path) -> Iterator[TextIO]: # - https://github.com/python/mypy/issues/12053 open_: Callable[..., TextIO] if is_compressed(path): - handle = bgzf.BgzfReader(path, 'r') + try: + handle = bgzf.BgzfReader(path, 'r') + # Trigger reading a block to verify it IS actually a BGZF file + handle.read(0) + except ValueError: + # Fall back to regular gzip if it's not BGZF + handle = gzip.open(path, 'rt', encoding='utf-8') else: handle = open(path, 'rt', encoding='utf-8') try: diff --git a/neat/model_gc_bias/__init__.py b/neat/model_gc_bias/__init__.py new file mode 100644 index 00000000..42a243ce --- /dev/null +++ b/neat/model_gc_bias/__init__.py @@ -0,0 +1 @@ +from .runner import compute_gc_bias_runner diff --git a/neat/model_gc_bias/runner.py b/neat/model_gc_bias/runner.py new file mode 100644 index 00000000..075b4864 --- /dev/null +++ b/neat/model_gc_bias/runner.py @@ -0,0 +1,61 @@ +""" +Creates a model of GC bias in a dataset +""" + +import gzip +import pickle +import numpy as np +import logging +import sys +import pysam +from pathlib import Path +from Bio import SeqIO + +from .utils import calculate_gc_content, get_gc_bias_weights +from ..models import GCBiasModel +from ..common import validate_output_path + +__all__ = [ + "compute_gc_bias_runner" +] + +_LOG = logging.getLogger(__name__) + +def compute_gc_bias_runner( + bam_file: str | Path, + reference_file: str | Path, + output_dir: str | Path, + output_prefix: str, + window_size: int = 100, + overwrite: bool = False): + """ + Main function for computing GC bias model. + """ + _LOG.info("Generating GC bias model") + + output_file = Path(output_dir) / (output_prefix + ".pickle.gz") + validate_output_path(output_file, True, overwrite) + + _LOG.debug(f"BAM: {bam_file}") + _LOG.debug(f"Reference: {reference_file}") + _LOG.debug(f"Window size: {window_size}") + _LOG.debug(f"Output: {output_file}") + + _LOG.info("Calculating GC bias") + + # Simple estimation: + # 1. Calculate GC content for many windows across the reference. + # 2. Count number of reads mapping to those windows. + # 3. Normalize to get weights. + + weights = get_gc_bias_weights(bam_file, reference_file, window_size) + + model = GCBiasModel(weights, window_size) + + _LOG.info(f"Saving model: {output_file}") + with gzip.open(output_file, 'wb') as outfile: + # Saving in NEAT 2.1 compatible format [GC_SCALE_COUNT, GC_SCALE_VAL] + gc_scale_count = list(range(1, 101)) + [window_size] + pickle.dump([gc_scale_count, model.weights.tolist()], outfile) + + _LOG.info("Modeling complete.") diff --git a/neat/model_gc_bias/utils.py b/neat/model_gc_bias/utils.py new file mode 100644 index 00000000..774bc5cb --- /dev/null +++ b/neat/model_gc_bias/utils.py @@ -0,0 +1,86 @@ +""" +Utilities for modeling GC bias +""" + +import numpy as np +import pysam +from Bio import SeqIO +from pathlib import Path + +def calculate_gc_content(sequence: str) -> float: + """ + Calculates GC content of a sequence. + """ + if not sequence: + return 0.0 + sequence = sequence.upper() + gc_count = sequence.count('G') + sequence.count('C') + total_count = sum(1 for base in sequence if base in 'ATGC') + if total_count == 0: + return 0.0 + return gc_count / total_count + +def get_gc_bias_weights(bam_file: str | Path, reference_file: str | Path, window_size: int = 100) -> list[float]: + """ + Estimates GC bias weights from a BAM file and a reference. + """ + # 1. Sample windows across the genome + # 2. For each window, calculate GC content and count reads + # 3. Aggregate counts by GC percentage + # 4. Normalize by the number of windows at each GC percentage + + gc_bins = [0] * 101 + gc_counts = [0.0] * 101 + + with pysam.AlignmentFile(str(bam_file), "rb") as bam: + ref_index = SeqIO.index(str(reference_file), "fasta") + + for contig in bam.references: + if contig not in ref_index: + continue + + contig_seq = ref_index[contig].seq + contig_len = len(contig_seq) + + # Sample windows (not all windows to be faster) + step = max(window_size, contig_len // 1000) + for start in range(0, contig_len - window_size, step): + end = start + window_size + window_seq = contig_seq[start:end] + gc_fraction = calculate_gc_content(str(window_seq)) + gc_percent = int(round(gc_fraction * 100)) + + # Count reads in this window + # Note: count() returns number of alignments overlapping the region + read_count = bam.count(contig, start, end) + + gc_bins[gc_percent] += 1 + gc_counts[gc_percent] += read_count + + # Normalize + weights = [0.0] * 101 + for i in range(101): + if gc_bins[i] > 0: + weights[i] = gc_counts[i] / gc_bins[i] + else: + weights[i] = 0.0 + + # Rescale so max weight is 1.0 + max_w = max(weights) if any(weights) else 1.0 + if max_w > 0: + weights = [w / max_w for w in weights] + else: + weights = [1.0] * 101 + + # Smoothing might be good here, but keeping it simple for now + # Fill in zeros with neighbors or global average if needed + avg_w = sum(weights) / sum(1 for w in weights if w > 0) if any(w > 0 for w in weights) else 1.0 + for i in range(101): + if weights[i] == 0: + weights[i] = avg_w + + # Rescale again + max_w = max(weights) + weights = [w / max_w for w in weights] + + return weights diff --git a/neat/model_sequencing_error/utils.py b/neat/model_sequencing_error/utils.py index 4c3425f4..1abffd03 100644 --- a/neat/model_sequencing_error/utils.py +++ b/neat/model_sequencing_error/utils.py @@ -107,6 +107,8 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse _LOG.debug(f'Assuming read length of {read_length}') temp_q_count = np.zeros((read_length, len(quality_scores)), dtype=int) + # Create a mapping from quality score to index in temp_q_count + qual_to_idx = {q: i for i, q in enumerate(quality_scores)} qual_score_counter = {x: 0 for x in quality_scores} if max_reads == np.inf: _LOG.info("Reading all records...") @@ -147,8 +149,12 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse for j in range(read_length): # The qualities of each read_position_scores - temp_q_count[j][qualities_to_check[j]] += 1 - qual_score_counter[qualities_to_check[j]] += 1 + q_score = qualities_to_check[j] + if q_score in qual_to_idx: + temp_q_count[j][qual_to_idx[q_score]] += 1 + qual_score_counter[q_score] += 1 + else: + _LOG.debug(f"Quality score {q_score} not in expected range") records_read += 1 diff --git a/neat/models/__init__.py b/neat/models/__init__.py index 7fb80f1b..c8c7d1e7 100644 --- a/neat/models/__init__.py +++ b/neat/models/__init__.py @@ -1,4 +1,5 @@ from .error_models import * from .mutation_model import * from .fragment_model import * +from .gc_bias_model import * from .markov_quality_model import * \ No newline at end of file diff --git a/neat/models/gc_bias_model.py b/neat/models/gc_bias_model.py new file mode 100644 index 00000000..4c7e0992 --- /dev/null +++ b/neat/models/gc_bias_model.py @@ -0,0 +1,105 @@ +""" +GC-bias model for NEAT. +""" + +import logging +import pickle +import numpy as np +from typing import Union +from pathlib import Path + +_LOG = logging.getLogger(__name__) + +class GCBiasModel: + """ + Per-GC-content weight table used to bias fragment retention during read simulation. + Weights are stored as 101 values indexed by integer GC percentage (0-100%). + """ + + def __init__(self, weights: Union[list, np.ndarray], window_size: int): + if len(weights) != 101: + raise ValueError("GC bias weights must have exactly 101 elements.") + if window_size <= 0: + raise ValueError("GC bias window size must be positive.") + + self.weights = np.array(weights, dtype=float) + if np.any(self.weights < 0): + raise ValueError("GC bias weights must be non-negative.") + if np.all(self.weights == 0): + raise ValueError("GC bias model must contain at least one positive weight.") + + self.window_size = window_size + self.is_uniform = np.all(self.weights == self.weights[0]) + self.max_weight = np.max(self.weights) + + @classmethod + def from_file(cls, path: Union[str, Path]) -> 'GCBiasModel': + """ + Loads GC bias model from a pickle file. + The pickle file is expected to contain [GC_SCALE_COUNT, GC_SCALE_VAL] + where GC_SCALE_COUNT[-1] is the window size and GC_SCALE_VAL is the weights. + """ + import gzip + try: + with gzip.open(path, 'rb') as f: + data = pickle.load(f) + except (OSError, EOFError, pickle.UnpicklingError): + # Fallback for non-gzipped files + with open(path, 'rb') as f: + data = pickle.load(f) + + if isinstance(data, list) and len(data) == 2: + gc_scale_count, weights = data + window_size = gc_scale_count[-1] + return cls(weights, window_size) + else: + raise ValueError(f"Unexpected data format in GC bias model file: {path}") + + def save(self, path: Union[str, Path]): + """ + Saves GC bias model to a pickle file in the format compatible with NEAT 2.1. + """ + import gzip + gc_scale_count = list(range(1, 101)) + [self.window_size] + data = [gc_scale_count, self.weights.tolist()] + with gzip.open(path, 'wb') as f: + pickle.dump(data, f) + + def get_weight(self, gc_fraction: float) -> float: + """ + Returns the weight for a given GC fraction. + """ + if self.is_uniform: + return self.weights[0] + + index = int(round(gc_fraction * 100)) + index = max(0, min(100, index)) + return self.weights[index] + + def get_weight_for_sequence(self, sequence: str) -> float: + """ + Calculates GC fraction of the sequence and returns the corresponding weight. + """ + if self.is_uniform: + return self.weights[0] + + called_bases = 0 + gc_count = 0 + for base in sequence.upper(): + if base in 'GC': + gc_count += 1 + called_bases += 1 + elif base in 'AT': + called_bases += 1 + + if called_bases == 0: + return 1.0 # Neutral weight for all-N sequences + + gc_fraction = gc_count / called_bases + return self.get_weight(gc_fraction) + +def get_uniform_gc_model(window_size: int = 100) -> GCBiasModel: + """ + Returns a uniform GC bias model (no bias). + """ + return GCBiasModel([1.0] * 101, window_size) diff --git a/neat/read_simulator/single_runner.py b/neat/read_simulator/single_runner.py index 53f3dab1..742864bc 100644 --- a/neat/read_simulator/single_runner.py +++ b/neat/read_simulator/single_runner.py @@ -16,7 +16,8 @@ generate_variants, generate_reads, Options, recalibrate_mutation_regions from ..variants import ContigVariants -from ..models import MutationModel, SequencingErrorModel, FragmentLengthModel, TraditionalQualityModel +from ..models import MutationModel, SequencingErrorModel, FragmentLengthModel, TraditionalQualityModel, \ + GCBiasModel, get_uniform_gc_model __all__ = ["read_simulator_single"] @@ -71,7 +72,8 @@ def read_simulator_single( mut_model, seq_error_model, qual_score_model, - fraglen_model + fraglen_model, + gc_model ) = initialize_all_models(local_options) """ @@ -117,6 +119,7 @@ def read_simulator_single( errors_per_read, qual_score_model, fraglen_model, + gc_model, local_variants, target_regions, discard_regions, @@ -251,9 +254,14 @@ def initialize_all_models(options: Options): fraglen_model = FragmentLengthModel(fragment_mean, fragment_st_dev) # _LOG.debug("Fragment length model loaded") + if options.gc_model: + gc_model = GCBiasModel.from_file(options.gc_model) + else: + gc_model = get_uniform_gc_model() return \ mut_model, \ error_model, \ quality_score_model, \ - fraglen_model + fraglen_model, \ + gc_model diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index ae9cb299..f359bd34 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -4,12 +4,13 @@ from math import ceil from pathlib import Path +import numpy as np from Bio.SeqRecord import SeqRecord from bisect import bisect_left, bisect_right from .output_file_writer import OutputFileWriter from ...common import open_output -from ...models import SequencingErrorModel, FragmentLengthModel, TraditionalQualityModel +from ...models import SequencingErrorModel, FragmentLengthModel, TraditionalQualityModel, GCBiasModel from .options import Options from ...variants import ContigVariants from .read import Read @@ -24,85 +25,217 @@ def cover_dataset( - span_length: int, + reference: SeqRecord, options: Options, fragment_model: FragmentLengthModel | None, + gc_model: GCBiasModel | None, ) -> list: """ Covers a dataset to the desired depth in the paired ended case. This is the main algorithm for creating the reads to the proper coverage depth. It uses an abstract representation of the reads, by end points. - :param span_length: The total length the cover needs to span + :param reference: The reference sequence for this block :param options: The options for the run :param fragment_model: The fragment model used for to generate random fragment lengths + :param gc_model: The GC bias model used for fragment selection """ final_reads = [] + span_length = len(reference) # sanity check - if span_length/fragment_model.fragment_mean < 5: + if span_length / fragment_model.fragment_mean < 5: _LOG.warning("The fragment mean is relatively large compared to the chromosome size. You may need to increase " "standard deviation, or decrease fragment mean, if NEAT cannot complete successfully.") + # precompute how many reads we want # The numerator is the total number of base pair calls needed. # Divide that by read length gives the number of reads needed - number_reads_per_layer = ceil(span_length / fragment_model.fragment_mean) if options.paired_ended: # TODO use gc bias to skew this number. Calculate at the runner level. number_reads = ceil(span_length * options.coverage / (2 * options.read_len)) else: number_reads = ceil(span_length * options.coverage / options.read_len) - # step 1: Divide the span up into segments drawn from the fragment pool. Assign reads based on that. - # step 2: repeat above until number of reads exceeds number_reads - # step 3: shuffle pool, then draw number_reads (or number_reads/2 for paired ended) reads to be our reads - read_count = 0 - loop_count = 0 - - while read_count <= number_reads: - start = 0 - loop_count += 1 - # We use fragments to model the DNA - fragment_pool = fragment_model.generate_fragments(number_reads_per_layer, options.rng) - - temp_fragments = [] - # Mapping random fragments onto genome - i = 0 - while start < span_length: - # We take the first element and put it back on the end to create an endless pool of fragments to draw from - fragment = fragment_pool[i] - i = (i + 1) % len(fragment_pool) - end = min(start + fragment, span_length) - - # Ensure the read is long enough to form a read, else we will not use it. - if (end > options.read_len) and (end - start > options.read_len + 10): - temp_fragments.append((start, end)) - start = end - - # Generating reads from fragments - for fragment in temp_fragments: - read_start = fragment[0] + if gc_model and not gc_model.is_uniform: + # CDF-based sampling for GC bias + window_size = gc_model.window_size + if span_length <= window_size: + # Fallback to uniform if region is too short + return _uniform_sampling(span_length, number_reads, options, fragment_model) + + # Build prefix sum of weights + # We need weights for every possible start position that can produce a read. + # For single-ended, start must be in [0, span_length - read_len] + max_start = span_length - options.read_len + if max_start < 0: + return [] + + n_positions = max_start + 1 + weights = np.zeros(n_positions, dtype=float) + + # Initial window GC count + seq_str = str(reference.seq).upper() + effective_window = min(window_size, span_length) + + # The window for start position i is [i, i + effective_window] + # BUT we must ensure i + effective_window does not exceed span_length. + # Sliding window logic: + # i = 0: window [0, effective_window] + # last i = n_positions - 1: window [max_start, max_start + effective_window] + # max_start + effective_window = span_length - read_len + min(window_size, span_length) + # If window_size > read_len, then max_start + window_size > span_length! + + # We need to cap the window end at span_length. + + gc_count = seq_str[:effective_window].count('G') + seq_str[:effective_window].count('C') + n_count = seq_str[:effective_window].count('N') + + def get_weight(gc, n, win_len): + called = win_len - n + if called <= 0: + return 1.0 + return gc_model.get_weight(gc / called) + + weights[0] = get_weight(gc_count, n_count, effective_window) + + # Sliding window + for i in range(1, n_positions): + # Start position i means window [i, min(i + window_size, span_length)] + # Previous window was [i-1, min(i-1 + window_size, span_length)] + + # Outgoing base is always i-1 + outgoing = seq_str[i-1] + if outgoing in 'GC': + gc_count -= 1 + elif outgoing == 'N': + n_count -= 1 + + # Incoming base depends on whether the window is still expanding or sliding + # current_window_end = min(i + window_size, span_length) + # prev_window_end = min(i - 1 + window_size, span_length) + + current_window_len = effective_window + if i + window_size <= span_length: + # Still sliding full window + incoming = seq_str[i + window_size - 1] + if incoming in 'GC': + gc_count += 1 + elif incoming == 'N': + n_count += 1 + else: + # Window is shrinking at the end of the span + current_window_len = span_length - i + # No new base incoming, outgoing already removed + + weights[i] = get_weight(gc_count, n_count, current_window_len) + + prefix_sum = np.cumsum(weights) + total_weight = prefix_sum[-1] + + if total_weight == 0: + _LOG.debug("All positions in region have zero GC bias weight; no fragments generated.") + return [] + + mean_weight = total_weight / n_positions + + # Coverage adjustment: match rejection-sampling behavior where probability + # of picking a position was weight/max_weight. + number_reads = ceil(number_reads * mean_weight / gc_model.max_weight) + + if number_reads == 0: + return [] + + # Sampling + read_count = 0 + # Increased attempts for edge cases + max_attempts = number_reads * 100 + attempts = 0 + + while read_count < number_reads and attempts < max_attempts: + attempts += 1 + v = options.rng.random() * total_weight + start = np.searchsorted(prefix_sum, v) + + # Generate fragment length + fragment_len = fragment_model.generate_fragments(1, options.rng)[0] + + # If the fragment is too close to the end, wrap around or just clip? + # Original code seems to clip. Let's stick to simple. + if start + options.read_len > span_length: + continue + + end = min(start + fragment_len, span_length) + + # Ensure the read is long enough to form a read + if end - start < options.read_len: + continue + + read_start = start read_end = read_start + options.read_len read1 = (read_start, read_end) if options.paired_ended: - # This will be valid because of the check above - read2 = (fragment[1] - options.read_len, fragment[1]) + # For paired ended, we need enough room for both reads + if end - start < options.read_len + 10: + continue + read2 = (end - options.read_len, end) else: read2 = (0, 0) - # The structure for these reads will be (left_start, left_end, right_start, right_end) - # where start and end are ints with end > start. Reads can overlap, so right_start < left_end - # is possible, but the reads cannot extend past each other, so right_start < left_start and - # left_end > right_end are not possible. - - # sanity check that we haven't created an unrealistic read: + read = read1 + read2 final_reads.append(read) read_count += 1 + + if read_count < number_reads: + _LOG.warning(f"GC-weighted fragment generation placed {read_count}/{number_reads} fragments after {attempts} attempts.") + + else: + # Uniform sampling + final_reads = _uniform_sampling(span_length, number_reads, options, fragment_model) - _LOG.debug(f"Coverage required {loop_count} loops") # Now we shuffle them to add some randomness options.rng.shuffle(final_reads) - # And only return the number we needed - return final_reads[:number_reads] + return final_reads + + +def _uniform_sampling(span_length, number_reads, options, fragment_model): + final_reads = [] + read_count = 0 + # Increase max attempts to handle small spans where many random choices might be invalid + max_attempts = number_reads * 100 + attempts = 0 + while read_count < number_reads and attempts < max_attempts: + attempts += 1 + # Use random sampling that respects span boundaries better + # For single-ended, we need start in [0, span_length - read_len] + if span_length <= options.read_len: + # Region too small for a full read + break + + start = options.rng.integers(0, span_length - options.read_len + 1) + # Generate fragment length + fragment_len = fragment_model.generate_fragments(1, options.rng)[0] + + end = min(start + fragment_len, span_length) + + # Ensure the read is long enough to form a read + if end - start < options.read_len: + continue + + read_start = start + read_end = read_start + options.read_len + read1 = (read_start, read_end) + if options.paired_ended: + # For paired ended, we need enough room for both reads + if end - start < options.read_len + 10: + continue + read2 = (end - options.read_len, end) + else: + read2 = (0, 0) + + read = read1 + read2 + final_reads.append(read) + read_count += 1 + return final_reads def find_applicable_mutations(my_read: Read, all_variants: ContigVariants) -> dict: @@ -151,6 +284,7 @@ def generate_reads( errors_per_read: int, qual_model: TraditionalQualityModel, fraglen_model: FragmentLengthModel, + gc_model: GCBiasModel, contig_variants: ContigVariants, targeted_regions: list, discarded_regions: list, @@ -194,9 +328,10 @@ def generate_reads( # _LOG.debug("Covering dataset.") t = time.time() reads = cover_dataset( - len(reference), + reference, options, fraglen_model, + gc_model, ) # _LOG.debug(f"Dataset coverage took: {(time.time() - t)/60:.2f} m") diff --git a/neat/read_simulator/utils/options.py b/neat/read_simulator/utils/options.py index 6aa2ee2b..4e6c3007 100644 --- a/neat/read_simulator/utils/options.py +++ b/neat/read_simulator/utils/options.py @@ -86,6 +86,7 @@ def __init__(self, cleanup_splits: bool = True, splits_dir: Path | None = None, reuse_splits: bool = False, + gc_model: Path | None = None, **kwargs: Any ): """ @@ -173,6 +174,7 @@ def __init__(self, self.cleanup_splits: bool = cleanup_splits self.splits_dir: Path | None = splits_dir self.reuse_splits: bool = reuse_splits + self.gc_model: Path | None = Path(gc_model) if gc_model else None # Actual output files self.fq1: Path | None = fq1 @@ -241,7 +243,8 @@ def from_cli(output_dir: Path, 'parallel_block_size': (int, 500000, None, None), 'threads': (int, 1, 1, 1000), 'cleanup_splits': (bool, True, None, None), - 'reuse_splits': (bool, False, None, None) + 'reuse_splits': (bool, False, None, None), + 'gc_model': (Path, None, 'exists', None) } input_args = {} diff --git a/poetry.lock b/poetry.lock index 30cd88c1..bb3286b8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,12 +1,12 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. [[package]] name = "biopython" version = "1.85" description = "Freely available tools for computational molecular biology." -category = "main" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "biopython-1.85-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a6308053a61f3bdbb11504ece4cf24e264c6f1d6fad278f7e59e6b84b0d9a7b4"}, {file = "biopython-1.85-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:434dd23e972b0c89e128f2ebbd16b38075d609184f4f1fd16368035f923019c2"}, @@ -49,9 +49,10 @@ numpy = "*" name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." -category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["dev"] +markers = "sys_platform == \"win32\"" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, @@ -61,9 +62,9 @@ files = [ name = "contourpy" version = "1.3.2" description = "Python library for calculating contours of 2D quadrilateral grids" -category = "main" optional = false python-versions = ">=3.10" +groups = ["main"] files = [ {file = "contourpy-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba38e3f9f330af820c4b27ceb4b9c7feee5fe0493ea53a8720f4792667465934"}, {file = "contourpy-1.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc41ba0714aa2968d1f8674ec97504a8f7e334f48eeacebcaa6256213acb0989"}, @@ -134,13 +135,135 @@ mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.15.0)", " test = ["Pillow", "contourpy[test-no-images]", "matplotlib"] test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"] +[[package]] +name = "coverage" +version = "7.14.0" +description = "Code coverage measurement for Python" +optional = false +python-versions = ">=3.10" +groups = ["dev"] +files = [ + {file = "coverage-7.14.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:84c32d90bf4537f0e7b4dec9aaa9a938fb8205136b9d2ecf4d7629d5262dc075"}, + {file = "coverage-7.14.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7c843572c605ab51cfdb5c6b5f2586e2a8467c0d28eca4bdef4ec70c5fecbd82"}, + {file = "coverage-7.14.0-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:0c451757d3fa2603354fdc789b5e58a0e327a117c370a40e3476ba4eabab228c"}, + {file = "coverage-7.14.0-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:3fd43f0616e765ab78d069cf8358def7363957a45cee446d65c502dcfeea7893"}, + {file = "coverage-7.14.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:731e535b1498b27d13594a0527a79b0510867b0ad891532be41cb883f2128e20"}, + {file = "coverage-7.14.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c7492f2d493b976941c7ca050f273cbda2f43c381124f7586a3e3c16d1804fec"}, + {file = "coverage-7.14.0-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:dc38367eaa2abb1b766ac333142bce7655335a73537f5c8b75aaa89c2b987757"}, + {file = "coverage-7.14.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0a951308cde22cf77f953955a754d04dccb57fe3bb8e345d685778ed9fc1632a"}, + {file = "coverage-7.14.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:fab3877e4ebb06bd9d4d4d00ee53309ee5478e66873c66a382272e3ee33eb7ea"}, + {file = "coverage-7.14.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:b812eb847b19876ebf33fb6c4f11819af05ab6050b0bfa1bc53412ae81779adb"}, + {file = "coverage-7.14.0-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:d9c8ef6ed820c433de075657d72dda1f89a2984955e58b8a75feb3f184250218"}, + {file = "coverage-7.14.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d128b1bba9361fbaaf6a19e179e6cfd6a9103ce0c0555876f72780acc93efd85"}, + {file = "coverage-7.14.0-cp310-cp310-win32.whl", hash = "sha256:65f267ca1370726ec2c1aa38bbe4df9a71a740f22878d2d4bf59d71a4cd8d323"}, + {file = "coverage-7.14.0-cp310-cp310-win_amd64.whl", hash = "sha256:b34ece8065914f938ed7f2c5872bb865336977a52919149846eac3744327267a"}, + {file = "coverage-7.14.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6a78e2a9d9c5e3b8d4ab9b9d28c985ea66fced0a7d7c2aec1f216e03a2011480"}, + {file = "coverage-7.14.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a1816c505187592dcd1c5a5f226601a549f70365fbd00930ac88b0c225b76bb4"}, + {file = "coverage-7.14.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d8e1762f0e9cbc26ec315471e7b47855218e833cd5a032d706fbf43845d878c7"}, + {file = "coverage-7.14.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9336e23e8bb3a3925398261385e2a1533957d3e760e91070dcb0e98bfa514eed"}, + {file = "coverage-7.14.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9cd1169b2230f9cbe9c638ba38022ed7a2b1e641cc07f7cea0365e4be2a74980"}, + {file = "coverage-7.14.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d1bb3543b58fea74d2cd1abc4054cc927e4724687cb4560cd2ed88d2c7d820c0"}, + {file = "coverage-7.14.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a93bac2cb577ef60074999ed56d8a1535894398e2ed920d4185c3ec0c8864742"}, + {file = "coverage-7.14.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5904abf7e18cddc463219b17552229650c6b79e061d31a1059283051169cf7d5"}, + {file = "coverage-7.14.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:741f57cddc9004a8c81b084660215f33a6b597dbe62c31386b983ee26310e327"}, + {file = "coverage-7.14.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:664123feb0929d7affc135717dbd70d61d98688a08ab1e5ba464739620c6252d"}, + {file = "coverage-7.14.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:c83d2399a51bbec8429266905d33616f04bc5726b1138c35844d5fcd896b2e20"}, + {file = "coverage-7.14.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bcb2e855b87321259a037429288ae85216d191c74de3e79bf57cd2bc0761992c"}, + {file = "coverage-7.14.0-cp311-cp311-win32.whl", hash = "sha256:731dc15b385ac52289743d476245b61e1a2927e803bef655b52bc3b2a75a21f3"}, + {file = "coverage-7.14.0-cp311-cp311-win_amd64.whl", hash = "sha256:bfb0ed8ec5d25e93face268115d7964db9df8b9aae8edcde9ec6b16c726a7cc1"}, + {file = "coverage-7.14.0-cp311-cp311-win_arm64.whl", hash = "sha256:7ebb1c6df9f78046a1b1e0a89674cd4bf73b7c648914eebcf976a57fd99a5627"}, + {file = "coverage-7.14.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7ffd19fc8aed057fd686a17a4935eef5f9859d69208f96310e893e64b9b6ccf5"}, + {file = "coverage-7.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:829994cfe1aeb773ca27bf246d4badc1e764893e3bfb98fff820fcecd1ca4662"}, + {file = "coverage-7.14.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:b4f07cf7edcb7ec39431a5074d7ea83b29a9f71fcfc494f0f40af4e65180420f"}, + {file = "coverage-7.14.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:ca3d9cf2c32b521bd9518385608787fa86f38daf993695307531822c3430ed67"}, + {file = "coverage-7.14.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92af52828e7f29d827346b0294e5a0853fa206db77db0395b282918d41e28db9"}, + {file = "coverage-7.14.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7b2bb6c9d7e769360d0f20a0f219603fd64f0c8f97de17ab25853261602be0fb"}, + {file = "coverage-7.14.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1c9ed6ef99f88fb8c14aa8e2bf8eb0fe55fa2edfea68f8675d78741df1a5ac0e"}, + {file = "coverage-7.14.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8231ade007f37959fbf58acc677f26b922c02eda6f0428ea307da0fd39681bf3"}, + {file = "coverage-7.14.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:d8b013632cc1ce1d09dbe4f32667b4d320ec2f54fc326ebeffcd0b0bcc2bb6c4"}, + {file = "coverage-7.14.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1733198802d71ec4c524f322e2867ee05c62e9e75df86bdca545407a221827d1"}, + {file = "coverage-7.14.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:72a305291fa8ee01332f1aaf38b348ca34097f6aa0b0ef627eef2837e57bbba5"}, + {file = "coverage-7.14.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fcaba850dd317c65423a9d63d88f9573c53b00354d6dd95724576cc98a131595"}, + {file = "coverage-7.14.0-cp312-cp312-win32.whl", hash = "sha256:5ac83957a80d0701310e96d8bec68cdcf4f90a7674b7d13f15a344315b41ab27"}, + {file = "coverage-7.14.0-cp312-cp312-win_amd64.whl", hash = "sha256:70390b0da32cb90b501953716302906e8bcce087cb283e70d8c97729f22e92b2"}, + {file = "coverage-7.14.0-cp312-cp312-win_arm64.whl", hash = "sha256:91b993743d959b8be85b4abf9d5478216a69329c321efe5be0433c1a841d691d"}, + {file = "coverage-7.14.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f2bbb8254370eb4c628ff3d6fa8a7f74ddc40565394d4f7ab791d1fe568e37ef"}, + {file = "coverage-7.14.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:23b81107f46d3f21d0cbce30664fcec0f5d9f585638a67081750f99738f6bf66"}, + {file = "coverage-7.14.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:22a7e06a5f11a757cdfe79018e9095f9f69ae283c5cd8123774c788deec8717b"}, + {file = "coverage-7.14.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9d1aa57a1dc8e05bdc42e81c5d671d849577aeedf279f4c449d6d286f9ed88ca"}, + {file = "coverage-7.14.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:90c1a51bcfddf645b3bb7ec333d9e94393a8e94f55642380fa8a9a5a9e636cb7"}, + {file = "coverage-7.14.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a841fae2fadcae4f438d43b6ccc4aac2ad609f47cdb6cfdce60cbb3fe5ca7bc2"}, + {file = "coverage-7.14.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c79d2319cabef1fe8e86df73371126931550804738f78ad7d31e3aad85a67367"}, + {file = "coverage-7.14.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1b23b0c6f0b1db6ad769b7050c8b641c0bf215ded26c1816955b17b7f26edfa9"}, + {file = "coverage-7.14.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:55d3089079ce181a4566b1065ab28d2575eb76d8ac8f81f4fcda2bf037fee087"}, + {file = "coverage-7.14.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:49c005cba1e2f9677fb2845dcdf9a2e72a52a17d63e8231aaaae35d9f50215ef"}, + {file = "coverage-7.14.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:9117377b823daa28aa8635fbb08cda1cd6be3d7143257345459559aeef852d52"}, + {file = "coverage-7.14.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7b79d646cf46d5cf9a9f40281d4441df5849e445726e369006d2b117710b33fe"}, + {file = "coverage-7.14.0-cp313-cp313-win32.whl", hash = "sha256:fb609b3658479e33f9516d46f1a89dbb9b6c261366e3a11844a96ec487533dae"}, + {file = "coverage-7.14.0-cp313-cp313-win_amd64.whl", hash = "sha256:0773d8329cf32b6fd222e4b52622c61fe8d503eb966cfc8d3c3c10c96266d50e"}, + {file = "coverage-7.14.0-cp313-cp313-win_arm64.whl", hash = "sha256:b4e26a0f1b696faf283bffe5b8569e44e336c582439df5d53281ab89ee0cba96"}, + {file = "coverage-7.14.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:953f521ca9445300397e65fda3dca58b2dbd68fee983777420b57ac3c77e9f90"}, + {file = "coverage-7.14.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:98af83fd65ae24b1fdd03aaead967a9f523bcd2f1aab2d4f3ffda65bb568a6f1"}, + {file = "coverage-7.14.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:668b92e6958c4db7cf92e81caac328dfbbdbb215db2850ad28f0cbe1eea0bfbd"}, + {file = "coverage-7.14.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9fbd898551762dea00d3fef2b1c4f99afd2c6a3ff952ea07d60a9bd5ed4f34bc"}, + {file = "coverage-7.14.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:68af363c07ecd8d4b7d4043d85cb376d7d227eceb54e5323ee45da73dbd3e426"}, + {file = "coverage-7.14.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6e57054a583da8ac55edf24117ea4c9133032cfc4cf72aa2d48c1e5d4b52f899"}, + {file = "coverage-7.14.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cc3499459bbcdd51a65b64c35ab7ed2764eaf3cba826e0df3f1d7fe2e102b70b"}, + {file = "coverage-7.14.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:45899ec2138a4346ed34d601dedf5076fb74edf2d1dd9dc76a78e82397edee90"}, + {file = "coverage-7.14.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:8767486808c436f05b23ab98eb963fb29185e32a9357a166971685cb3459900f"}, + {file = "coverage-7.14.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:a3b5ddfd6aa7ddad53ee3edb231e88a2151507a43229b7d71b953916deca127d"}, + {file = "coverage-7.14.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:63df0fe568e698e1045792399f8ab6da3a6c2dce3182813fb92afa2641087b47"}, + {file = "coverage-7.14.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:827d6397dbd95144939b18f89edf31f63e1f99633e8d5f32f22ba8bdda567477"}, + {file = "coverage-7.14.0-cp313-cp313t-win32.whl", hash = "sha256:7bf43e000d24012599b879791cff41589af90674722421ef11b11a5431920bab"}, + {file = "coverage-7.14.0-cp313-cp313t-win_amd64.whl", hash = "sha256:3f5549365af25d770e06b1f8f5682d9a5637d06eb494db91c6fa75d3950cc917"}, + {file = "coverage-7.14.0-cp313-cp313t-win_arm64.whl", hash = "sha256:6d160217ec6fe890f16ad3a9531761589443749e448f91986c972714fad361c8"}, + {file = "coverage-7.14.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9aed9fa983514ca032790f3fe0d1c0e42ca7e16b42432af1706b50a9a46bef5d"}, + {file = "coverage-7.14.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ba3b8390db29296dbbf49e91b6fe08f990743a90c8f447ba4c2ffc29670dfa63"}, + {file = "coverage-7.14.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3a5d8e876dfa2f102e970b183863d6dedd023d3c0eeca1fe7a9787bc5f28b212"}, + {file = "coverage-7.14.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5ebb8f4614a3787d567e610bbfdf96a4798dd69a1afb1bd8ad228d4111fe6ff3"}, + {file = "coverage-7.14.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b9bf47223dd8db3d4c4b2e443b02bace480d428f0822c3f991600448a176c97"}, + {file = "coverage-7.14.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3485a836550b303d006d57cc06e3d5afaabc642c77050b7c985a97b13e3776b8"}, + {file = "coverage-7.14.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3e7e88110bae996d199d1693ca8ec3fd52441d426401ae963437598667b4c5eb"}, + {file = "coverage-7.14.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:15228a6800ce7bdf1b74800595e56db7138cecb338fdbf044806e10dcf182dfe"}, + {file = "coverage-7.14.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:9d26ac7f5398bafc5b57421ad994e8a4749e8a7a0e62d05ec7d53014d5963bfa"}, + {file = "coverage-7.14.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:2fb73254ff43c911c967a899e1359bc5049b4b115d6e8fbdde4937d0a2246cd5"}, + {file = "coverage-7.14.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:454a380af72c6adada298ed270d38c7a391288198dbfb8467f786f588751a90c"}, + {file = "coverage-7.14.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:65c86fb646d2bd2972e96bd1a8b45817ed907cee68655d6295fe7ec031d04cca"}, + {file = "coverage-7.14.0-cp314-cp314-win32.whl", hash = "sha256:6a6516b02a6101398e19a3f44820f69bab2590697f7def4331f668b14adaf828"}, + {file = "coverage-7.14.0-cp314-cp314-win_amd64.whl", hash = "sha256:45e0f79d8351fa76e256716df91eab12890d32678b9590df7ae1042e4bd4cf5d"}, + {file = "coverage-7.14.0-cp314-cp314-win_arm64.whl", hash = "sha256:4b899594a8b2d81e5cc064a0d7f9cac2081fed91049456cae7676787e41549c9"}, + {file = "coverage-7.14.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:f580f8c80acd94ac72e863efe2cab791d8c38d153e0b463b92dfa000d5c84cd1"}, + {file = "coverage-7.14.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a2bd259c442cd43c49b30fbafc51776eb19ea396faf159d26a83e6a0a5f13b0c"}, + {file = "coverage-7.14.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:a706b908dfa85538863504c624b237a3cc34232bf403c057414ebfdb3b4d9f84"}, + {file = "coverage-7.14.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:7333cd944ee4393b9b3d3c1b598c936d4fc8d70573a4c7dacfec5590dd50e436"}, + {file = "coverage-7.14.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f162bc9a15b82d947b02651b0c7e1609d6f7a8735ca330cfadec8481dd97d5a"}, + {file = "coverage-7.14.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:362cb78e01a5dc82009d88004cf60f2e6b6d6fcbfdec05b05af73b0abf40118f"}, + {file = "coverage-7.14.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:acebd068fca5512c3a6fde9c045f901613478781a73f0e82b307b214daef23fb"}, + {file = "coverage-7.14.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:29fe3da551dface75deb2ccbf87b6b66e2e7ef38f6d89050b428be94afff3490"}, + {file = "coverage-7.14.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:b4cc4fce8672fffcb09b0eafc167b396b3ba53c4a7230f54b7aaffbf6c835fa9"}, + {file = "coverage-7.14.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:5d4a51aad8ba8bdcd2b8bd8f03d4aca19693fa2327a3470e4718a25b03481020"}, + {file = "coverage-7.14.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:9f323af3e1e4f68b60b7b247e37b8515563a61375518fa59de1af48ba28a3db6"}, + {file = "coverage-7.14.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:1a0abc7342ea9711c469dd8b821c6c311e6bc6aac1442e5fbd6b27fae0a8f3db"}, + {file = "coverage-7.14.0-cp314-cp314t-win32.whl", hash = "sha256:a9f864ef57b7172e2db87a096642dd51e179e085ab6b2c371c29e885f65c8fb2"}, + {file = "coverage-7.14.0-cp314-cp314t-win_amd64.whl", hash = "sha256:29943e552fdc08e082eb51400fb2f58e118a83b5542bd06531214e084399b644"}, + {file = "coverage-7.14.0-cp314-cp314t-win_arm64.whl", hash = "sha256:742a73ea621953b012f2c4c2219b512180dd84489acf5b1596b0aafc55b9100b"}, + {file = "coverage-7.14.0-py3-none-any.whl", hash = "sha256:8de5b61163aee3d05c8a2beab6f47913df7981dad1baf82c414d99158c286ab1"}, + {file = "coverage-7.14.0.tar.gz", hash = "sha256:057a6af2f160a85384cde4ab36f0d2777bae1057bae255f95413cdd382aa5c74"}, +] + +[package.dependencies] +tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} + +[package.extras] +toml = ["tomli ; python_full_version <= \"3.11.0a6\""] + [[package]] name = "cycler" version = "0.12.1" description = "Composable style cycles" -category = "main" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30"}, {file = "cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c"}, @@ -154,9 +277,10 @@ tests = ["pytest", "pytest-cov", "pytest-xdist"] name = "exceptiongroup" version = "1.3.1" description = "Backport of PEP 654 (exception groups)" -category = "dev" optional = false python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version == \"3.10\"" files = [ {file = "exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598"}, {file = "exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219"}, @@ -172,9 +296,9 @@ test = ["pytest (>=6)"] name = "fonttools" version = "4.61.1" description = "Tools to manipulate font files" -category = "main" optional = false python-versions = ">=3.10" +groups = ["main"] files = [ {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c7db70d57e5e1089a274cbb2b1fd635c9a24de809a231b154965d415d6c6d24"}, {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5fe9fd43882620017add5eabb781ebfbc6998ee49b35bd7f8f79af1f9f99a958"}, @@ -229,25 +353,25 @@ files = [ ] [package.extras] -all = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "lxml (>=4.0)", "lz4 (>=1.7.4.2)", "matplotlib", "munkres", "pycairo", "scipy", "skia-pathops (>=0.5.0)", "sympy", "uharfbuzz (>=0.45.0)", "unicodedata2 (>=17.0.0)", "xattr", "zopfli (>=0.1.4)"] +all = ["brotli (>=1.0.1) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\"", "lxml (>=4.0)", "lz4 (>=1.7.4.2)", "matplotlib", "munkres ; platform_python_implementation == \"PyPy\"", "pycairo", "scipy ; platform_python_implementation != \"PyPy\"", "skia-pathops (>=0.5.0)", "sympy", "uharfbuzz (>=0.45.0)", "unicodedata2 (>=17.0.0) ; python_version <= \"3.14\"", "xattr ; sys_platform == \"darwin\"", "zopfli (>=0.1.4)"] graphite = ["lz4 (>=1.7.4.2)"] -interpolatable = ["munkres", "pycairo", "scipy"] +interpolatable = ["munkres ; platform_python_implementation == \"PyPy\"", "pycairo", "scipy ; platform_python_implementation != \"PyPy\""] lxml = ["lxml (>=4.0)"] pathops = ["skia-pathops (>=0.5.0)"] plot = ["matplotlib"] repacker = ["uharfbuzz (>=0.45.0)"] symfont = ["sympy"] -type1 = ["xattr"] -unicode = ["unicodedata2 (>=17.0.0)"] -woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] +type1 = ["xattr ; sys_platform == \"darwin\""] +unicode = ["unicodedata2 (>=17.0.0) ; python_version <= \"3.14\""] +woff = ["brotli (>=1.0.1) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\"", "zopfli (>=0.1.4)"] [[package]] name = "frozendict" version = "2.4.7" description = "A simple immutable dictionary" -category = "main" optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "frozendict-2.4.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bd37c087a538944652363cfd77fb7abe8100cc1f48afea0b88b38bf0f469c3d2"}, {file = "frozendict-2.4.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2b96f224a5431889f04b2bc99c0e9abe285679464273ead83d7d7f2a15907d35"}, @@ -346,9 +470,9 @@ files = [ name = "iniconfig" version = "2.3.0" description = "brain-dead simple config-ini parsing" -category = "dev" optional = false python-versions = ">=3.10" +groups = ["dev"] files = [ {file = "iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12"}, {file = "iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730"}, @@ -358,9 +482,9 @@ files = [ name = "kiwisolver" version = "1.4.9" description = "A fast implementation of the Cassowary constraint solver" -category = "main" optional = false python-versions = ">=3.10" +groups = ["main"] files = [ {file = "kiwisolver-1.4.9-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b4b4d74bda2b8ebf4da5bd42af11d02d04428b2c32846e4c2c93219df8a7987b"}, {file = "kiwisolver-1.4.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fb3b8132019ea572f4611d770991000d7f58127560c4889729248eb5852a102f"}, @@ -469,9 +593,9 @@ files = [ name = "matplotlib" version = "3.10.8" description = "Python plotting package" -category = "main" optional = false python-versions = ">=3.10" +groups = ["main"] files = [ {file = "matplotlib-3.10.8-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:00270d217d6b20d14b584c521f810d60c5c78406dc289859776550df837dcda7"}, {file = "matplotlib-3.10.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:37b3c1cc42aa184b3f738cfa18c1c1d72fd496d85467a6cf7b807936d39aa656"}, @@ -548,9 +672,9 @@ dev = ["meson-python (>=0.13.1,<0.17.0)", "pybind11 (>=2.13.2,!=2.13.3)", "setup name = "numpy" version = "2.2.6" description = "Fundamental package for array computing in Python" -category = "main" optional = false python-versions = ">=3.10" +groups = ["main"] files = [ {file = "numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b412caa66f72040e6d268491a59f2c43bf03eb6c96dd8f0307829feb7fa2b6fb"}, {file = "numpy-2.2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e41fd67c52b86603a91c1a505ebaef50b3314de0213461c7a6e99c9a3beff90"}, @@ -613,9 +737,9 @@ files = [ name = "packaging" version = "26.0" description = "Core utilities for Python packages" -category = "main" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529"}, {file = "packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4"}, @@ -625,9 +749,9 @@ files = [ name = "pillow" version = "12.1.1" description = "Python Imaging Library (fork)" -category = "main" optional = false python-versions = ">=3.10" +groups = ["main"] files = [ {file = "pillow-12.1.1-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1f1625b72740fdda5d77b4def688eb8fd6490975d06b909fd19f13f391e077e0"}, {file = "pillow-12.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:178aa072084bd88ec759052feca8e56cbb14a60b39322b99a049e58090479713"}, @@ -734,9 +858,9 @@ xmp = ["defusedxml"] name = "pip" version = "25.3" description = "The PyPA recommended tool for installing Python packages." -category = "main" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "pip-25.3-py3-none-any.whl", hash = "sha256:9655943313a94722b7774661c21049070f6bbb0a1516bf02f7c8d5d9201514cd"}, {file = "pip-25.3.tar.gz", hash = "sha256:8d0538dbbd7babbd207f261ed969c65de439f6bc9e5dbd3b3b9a77f25d95f343"}, @@ -746,9 +870,9 @@ files = [ name = "pkginfo" version = "1.12.1.2" description = "Query metadata from sdists / bdists / installed packages." -category = "main" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "pkginfo-1.12.1.2-py3-none-any.whl", hash = "sha256:c783ac885519cab2c34927ccfa6bf64b5a704d7c69afaea583dd9b7afe969343"}, {file = "pkginfo-1.12.1.2.tar.gz", hash = "sha256:5cd957824ac36f140260964eba3c6be6442a8359b8c48f4adf90210f33a04b7b"}, @@ -761,9 +885,9 @@ testing = ["pytest", "pytest-cov", "wheel"] name = "pluggy" version = "1.6.0" description = "plugin and hook calling mechanisms for python" -category = "dev" optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, @@ -777,9 +901,9 @@ testing = ["coverage", "pytest", "pytest-benchmark"] name = "pygments" version = "2.19.2" description = "Pygments is a syntax highlighting package written in Python." -category = "dev" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"}, {file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"}, @@ -792,9 +916,9 @@ windows-terminal = ["colorama (>=0.4.6)"] name = "pyparsing" version = "3.3.2" description = "pyparsing - Classes and methods to define and execute parsing grammars" -category = "main" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "pyparsing-3.3.2-py3-none-any.whl", hash = "sha256:850ba148bd908d7e2411587e247a1e4f0327839c40e2e5e6d05a007ecc69911d"}, {file = "pyparsing-3.3.2.tar.gz", hash = "sha256:c777f4d763f140633dcb6d8a3eda953bf7a214dc4eff598413c070bcdc117cbc"}, @@ -807,9 +931,9 @@ diagrams = ["jinja2", "railroad-diagrams"] name = "pysam" version = "0.23.3" description = "Package for reading, manipulating, and writing genomic data" -category = "main" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "pysam-0.23.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a0b99d875f293fad0bd9c9c923e8910c03af62d291ebb7d20e69ceaf39e383d4"}, {file = "pysam-0.23.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:725a32970cf4ce322f4ab2a52b755163297027a0349f0d151537fe16bdf525e5"}, @@ -854,9 +978,9 @@ files = [ name = "pytest" version = "8.4.2" description = "pytest: simple powerful testing with Python" -category = "dev" optional = false python-versions = ">=3.9" +groups = ["dev"] files = [ {file = "pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79"}, {file = "pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01"}, @@ -874,13 +998,33 @@ tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-cov" +version = "6.3.0" +description = "Pytest plugin for measuring coverage." +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "pytest_cov-6.3.0-py3-none-any.whl", hash = "sha256:440db28156d2468cafc0415b4f8e50856a0d11faefa38f30906048fe490f1749"}, + {file = "pytest_cov-6.3.0.tar.gz", hash = "sha256:35c580e7800f87ce892e687461166e1ac2bcb8fb9e13aea79032518d6e503ff2"}, +] + +[package.dependencies] +coverage = {version = ">=7.5", extras = ["toml"]} +pluggy = ">=1.2" +pytest = ">=6.2.5" + +[package.extras] +testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"] + [[package]] name = "python-dateutil" version = "2.9.0.post0" description = "Extensions to the standard Python datetime module" -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main"] files = [ {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, @@ -893,9 +1037,9 @@ six = ">=1.5" name = "pyyaml" version = "6.0.3" description = "YAML parser and emitter for Python" -category = "main" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "PyYAML-6.0.3-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f"}, {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4"}, @@ -976,9 +1120,9 @@ files = [ name = "scipy" version = "1.15.3" description = "Fundamental algorithms for scientific computing in Python" -category = "main" optional = false python-versions = ">=3.10" +groups = ["main"] files = [ {file = "scipy-1.15.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a345928c86d535060c9c2b25e71e87c39ab2f22fc96e9636bd74d1dbf9de448c"}, {file = "scipy-1.15.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:ad3432cb0f9ed87477a8d97f03b763fd1d57709f1bbde3c9369b1dff5503b253"}, @@ -1034,15 +1178,15 @@ numpy = ">=1.23.5,<2.5" [package.extras] dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodestyle", "pydevtool", "rich-click", "ruff (>=0.0.292)", "types-psutil", "typing_extensions"] doc = ["intersphinx_registry", "jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.19.1)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<8.0.0)", "sphinx-copybutton", "sphinx-design (>=0.4.0)"] -test = ["Cython", "array-api-strict (>=2.0,<2.1.1)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] +test = ["Cython", "array-api-strict (>=2.0,<2.1.1)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja ; sys_platform != \"emscripten\"", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] [[package]] name = "six" version = "1.17.0" description = "Python 2 and 3 compatibility utilities" -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main"] files = [ {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, @@ -1052,9 +1196,10 @@ files = [ name = "tomli" version = "2.4.0" description = "A lil' TOML parser" -category = "dev" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_full_version <= \"3.11.0a6\"" files = [ {file = "tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867"}, {file = "tomli-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5572e41282d5268eb09a697c89a7bee84fae66511f87533a6f88bd2f7b652da9"}, @@ -1109,15 +1254,16 @@ files = [ name = "typing-extensions" version = "4.15.0" description = "Backported and Experimental Type Hints for Python 3.9+" -category = "dev" optional = false python-versions = ">=3.9" +groups = ["dev"] +markers = "python_version == \"3.10\"" files = [ {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, ] [metadata] -lock-version = "2.0" +lock-version = "2.1" python-versions = "^3.10" -content-hash = "03e55829ef01dd7c5cf9e50c158bcb78b92e53d47313f158b726160a2dfbdaca" +content-hash = "0b7574444f7409ad1b559e3b7c7a3428a0855136a15a20b3004c4a3f40233d27" diff --git a/pyproject.toml b/pyproject.toml index 4f5a2e6c..0fbe6c23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ optional = true [tool.poetry.group.dev.dependencies] pytest = "^8.0" +pytest-cov = "^6.0" [tool.poetry.scripts] neat = 'neat.cli.cli:run' diff --git a/tests/test_models/test_gc_bias_model.py b/tests/test_models/test_gc_bias_model.py new file mode 100644 index 00000000..5c88d969 --- /dev/null +++ b/tests/test_models/test_gc_bias_model.py @@ -0,0 +1,65 @@ +import numpy as np +import pytest +from pathlib import Path +import pickle +import os +from neat.models.gc_bias_model import GCBiasModel, get_uniform_gc_model + +def test_gc_bias_model_init(): + weights = [1.0] * 101 + model = GCBiasModel(weights, window_size=100) + assert model.window_size == 100 + assert len(model.weights) == 101 + assert model.is_uniform + assert model.max_weight == 1.0 + +def test_gc_bias_model_invalid_init(): + with pytest.raises(ValueError): + GCBiasModel([1.0] * 100, 100) + with pytest.raises(ValueError): + GCBiasModel([1.0] * 101, 0) + with pytest.raises(ValueError): + GCBiasModel([-1.0] * 101, 100) + +def test_gc_bias_model_get_weight(): + weights = [0.0] * 101 + weights[50] = 1.0 + model = GCBiasModel(weights, window_size=100) + + assert model.get_weight(0.5) == 1.0 + assert model.get_weight(0.496) == 1.0 # rounds to 0.50 + assert model.get_weight(0.0) == 0.0 + assert model.get_weight(1.0) == 0.0 + +def test_gc_bias_model_get_weight_for_sequence(): + weights = [0.0] * 101 + weights[50] = 1.0 + model = GCBiasModel(weights, window_size=100) + + # 50% GC + assert model.get_weight_for_sequence("GCAT") == 1.0 + # 100% GC + assert model.get_weight_for_sequence("GGCC") == weights[100] + # 0% GC + assert model.get_weight_for_sequence("AATT") == weights[0] + + # Empty/N + assert model.get_weight_for_sequence("NNNN") == 1.0 + +def test_gc_bias_model_save_load(tmp_path): + weights = np.random.random(101).tolist() + window_size = 150 + model = GCBiasModel(weights, window_size) + + path = tmp_path / "gc_model.pickle" + model.save(path) + + loaded_model = GCBiasModel.from_file(path) + assert loaded_model.window_size == window_size + np.testing.assert_allclose(loaded_model.weights, weights) + +def test_uniform_gc_model(): + model = get_uniform_gc_model(200) + assert model.window_size == 200 + assert model.is_uniform + assert model.get_weight(0.3) == 1.0 diff --git a/tests/test_read_simulator/test_gc_bias_integration.py b/tests/test_read_simulator/test_gc_bias_integration.py new file mode 100644 index 00000000..7b3a01b7 --- /dev/null +++ b/tests/test_read_simulator/test_gc_bias_integration.py @@ -0,0 +1,46 @@ +import numpy as np +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from neat.models.gc_bias_model import GCBiasModel +from neat.models.fragment_model import FragmentLengthModel +from neat.read_simulator.utils import Options +from neat.read_simulator.utils.generate_reads import cover_dataset + +def test_gc_biased_sampling(): + # Create a reference with two halves: one 100% AT, one 100% GC + span_length = 10000 + half = span_length // 2 + seq = "A" * half + "G" * half + reference = SeqRecord(Seq(seq), id="chr1") + + # Create a GC bias model that favors GC-rich regions (100% weight for 100% GC, 0% for 0% GC) + weights = [0.0] * 101 + weights[100] = 1.0 + weights[0] = 0.01 # Allow a tiny bit of AT to avoid infinite loops if possible + gc_model = GCBiasModel(weights, window_size=100) + + options = Options(rng_seed=42) + options.read_len = 50 + options.paired_ended = False + options.coverage = 10 + options.overwrite_output = True + + fragment_model = FragmentLengthModel(150, 20) + + # Generate reads + reads = cover_dataset(reference, options, fragment_model, gc_model) + + # Count how many reads are in the AT half vs GC half + at_half_count = 0 + gc_half_count = 0 + + for start, end, _, _ in reads: + if start < half: + at_half_count += 1 + else: + gc_half_count += 1 + + print(f"AT half reads: {at_half_count}, GC half reads: {gc_half_count}") + + # We expect significantly more reads in the GC half + assert gc_half_count > at_half_count * 4 diff --git a/tests/test_read_simulator/test_gc_bias_pipeline.py b/tests/test_read_simulator/test_gc_bias_pipeline.py new file mode 100644 index 00000000..5517adee --- /dev/null +++ b/tests/test_read_simulator/test_gc_bias_pipeline.py @@ -0,0 +1,170 @@ +import gzip +import pickle +import textwrap +import pytest +import pysam +import numpy as np +from pathlib import Path +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + +from neat.read_simulator.runner import read_simulator_runner +from neat.model_gc_bias.runner import compute_gc_bias_runner + +def _write_ref(path: Path, seq: str) -> Path: + """Write a FASTA reference.""" + path.write_text(f">chr1\n{seq}\n", encoding="utf-8") + return path + +def _write_config(path: Path, ref_path: Path, **overrides) -> Path: + defaults = { + "reference": str(ref_path), + "produce_fastq": "true", + "produce_vcf": "false", + "produce_bam": "false", + "read_len": 50, + "coverage": 2, + "rng_seed": 42, + "overwrite_output": "true", + "cleanup_splits": "true", + } + defaults.update(overrides) + lines = "\n".join(f"{k}: {v}" for k, v in defaults.items()) + path.write_text(lines + "\n", encoding="utf-8") + return path + +def create_fake_bam(bam_path, ref_path, read_count_at=10, read_count_gc=1000): + """Creates a fake BAM file with more reads in GC-rich region.""" + # Reference is 10000bp: 5000bp A, 5000bp G + header = {'HD': {'VN': '1.0'}, + 'SQ': [{'LN': 10000, 'SN': 'chr1'}]} + + with pysam.AlignmentFile(bam_path, "wb", header=header) as out_bam: + # Reads in AT region (0-5000) + for i in range(read_count_at): + a = pysam.AlignedSegment() + a.query_name = f"read_at_{i}" + a.query_sequence = "A" * 50 + a.reference_id = 0 + a.reference_start = 1000 + i + a.cigar = ((0, 50),) + a.qual = "B" * 50 + out_bam.write(a) + + # Reads in GC region (5000-10000) + for i in range(read_count_gc): + a = pysam.AlignedSegment() + a.query_name = f"read_gc_{i}" + a.query_sequence = "G" * 50 + a.reference_id = 0 + a.reference_start = 6000 + i + a.cigar = ((0, 50),) + a.qual = "B" * 50 + out_bam.write(a) + + pysam.index(str(bam_path)) + +def test_gc_bias_full_pipeline(tmp_path): + """ + Test the full GC bias pipeline: + 1. Create a reference and a BAM with GC bias. + 2. Run model-gc-bias to create a model. + 3. Run read-simulator using that model. + 4. Verify the output reads reflect the bias. + """ + # 1. Setup data + ref_seq = "A" * 5000 + "G" * 5000 + ref_path = _write_ref(tmp_path / "ref.fa", ref_seq) + bam_path = tmp_path / "input.bam" + # Create BAM where GC region has much more reads + create_fake_bam(bam_path, ref_path, read_count_at=10, read_count_gc=1000) + + # 2. Run model-gc-bias + model_prefix = "gc_model" + compute_gc_bias_runner( + bam_file=bam_path, + reference_file=ref_path, + output_dir=tmp_path, + output_prefix=model_prefix, + window_size=100, + overwrite=True + ) + + model_path = tmp_path / (model_prefix + ".pickle.gz") + assert model_path.exists() + + # 3. Run read-simulator with the generated model + out_dir = tmp_path / "sim_out" + cfg_path = _write_config( + tmp_path / "sim.yml", + ref_path, + gc_model=str(model_path), + coverage=1, + read_len=50 + ) + + read_simulator_runner(str(cfg_path), str(out_dir), "sim") + + # 4. Verify bias in simulated reads + fq_files = list(out_dir.glob("*.fastq.gz")) + assert len(fq_files) >= 1 + + at_reads = 0 + gc_reads = 0 + + # NEAT_generated_chr1_0_0000000101_0000000151 + # The read name contains the coordinates. + with gzip.open(fq_files[0], "rt") as f: + for line in f: + if line.startswith("@NEAT_generated"): + parts = line.split("_") + start = int(parts[4]) + if start < 5000: + at_reads += 1 + else: + gc_reads += 1 + + print(f"Simulated AT reads: {at_reads}, GC reads: {gc_reads}") + # Since our model was trained on 10x more GC reads, we expect more GC reads here too. + # It won't be exactly 10x because of how cover_dataset works and smoothing in the model, + # but it should be significantly more. + assert gc_reads > at_reads + +def test_gc_bias_multi_contig(tmp_path): + """Test GC bias simulation with multiple contigs.""" + ref_path = tmp_path / "multi.fa" + ref_path.write_text(">chr1\n" + "A"*1000 + "\n>chr2\n" + "G"*1000 + "\n") + + # Model that favors G + weights = [0.1] * 101 + weights[100] = 1.0 + from neat.models.gc_bias_model import GCBiasModel + model = GCBiasModel(weights, window_size=100) + model_path = tmp_path / "skewed_model.pickle.gz" + model.save(model_path) + + out_dir = tmp_path / "multi_out" + cfg_path = _write_config( + tmp_path / "multi.yml", + ref_path, + gc_model=str(model_path), + coverage=10, + read_len=50 + ) + + read_simulator_runner(str(cfg_path), str(out_dir), "sim_multi") + + fq_files = list(out_dir.glob("*.fastq.gz")) + chr1_reads = 0 + chr2_reads = 0 + + with gzip.open(fq_files[0], "rt") as f: + for line in f: + if line.startswith("@NEAT_generated_chr1"): + chr1_reads += 1 + elif line.startswith("@NEAT_generated_chr2"): + chr2_reads += 1 + + # chr2 is 100% G, chr1 is 100% A. With our model, chr2 should have significantly more reads. + assert chr2_reads > chr1_reads diff --git a/tests/test_read_simulator/test_generate_reads.py b/tests/test_read_simulator/test_generate_reads.py index 184a1b6e..adaa3239 100644 --- a/tests/test_read_simulator/test_generate_reads.py +++ b/tests/test_read_simulator/test_generate_reads.py @@ -4,7 +4,7 @@ from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord -from neat.models import FragmentLengthModel, SequencingErrorModel, TraditionalQualityModel +from neat.models import FragmentLengthModel, SequencingErrorModel, TraditionalQualityModel, GCBiasModel, get_uniform_gc_model from neat.read_simulator.utils import Options from neat.read_simulator.utils.generate_reads import cover_dataset, overlaps, find_applicable_mutations, generate_reads from neat.read_simulator.utils.read import Read @@ -44,6 +44,7 @@ def _compute_avg_coverage(reads, span_length, paired): def test_cover_dataset(): """Test that a cover is successfully generated for different coverage values""" span_length = 5000 + reference = SeqRecord(Seq("A" * span_length), id="chr1") options = Options(rng_seed=0) options.read_len = 100 options.paired_ended = False @@ -53,7 +54,7 @@ def test_cover_dataset(): coverage_values = [1, 2, 5] for coverage in coverage_values: options.coverage = coverage - reads = cover_dataset(span_length, options, fragment_model) + reads = cover_dataset(reference, options, fragment_model, None) coverage_check = [] for i in range(span_length): # Single-ended test, only need read1 @@ -72,6 +73,7 @@ def test_cover_dataset(): def test_paired_cover_dataset(): """Test that a cover is successfully generated for different coverage values""" span_length = 10000 + reference = SeqRecord(Seq("A" * span_length), id="chr1") options = Options(rng_seed=0) options.read_len = 100 options.paired_ended = True @@ -86,7 +88,7 @@ def test_paired_cover_dataset(): for coverage in coverage_values: options.coverage = coverage - reads = cover_dataset(span_length, options, fragment_model) + reads = cover_dataset(reference, options, fragment_model, None) expected_pairs = coverage * (span_length / fragment_model.fragment_mean) expected_reads = 2.0 * expected_pairs @@ -123,7 +125,8 @@ def test_various_read_lengths(): for read_len in range(10, 251, 10): options.read_len = read_len - reads = cover_dataset(span_length, options, fragment_model) + reference = SeqRecord(Seq("A" * span_length), id="chr1") + reads = cover_dataset(reference, options, fragment_model, None) assert isinstance(reads, list) @@ -145,9 +148,10 @@ def test_coverage_ploidy_combinations(): for ploidy in ploidy_values: options.ploidy = ploidy prev_n_reads = None + reference = SeqRecord(Seq("A" * span_length), id="chr1") for coverage in coverage_values: options.coverage = coverage - reads = cover_dataset(span_length, options, fragment_model) + reads = cover_dataset(reference, options, fragment_model, None) assert isinstance(reads, list) for read in reads: @@ -162,6 +166,7 @@ def test_coverage_ploidy_combinations(): def test_single_ended_mode(): """Test cover_dataset in single-ended mode for various configurations""" span_length = 100 + reference = SeqRecord(Seq("A" * span_length), id="chr1") options = Options(rng_seed=0) options.read_len = 50 options.paired_ended = False @@ -171,7 +176,7 @@ def test_single_ended_mode(): options.overwrite_output = True fragment_model = FragmentLengthModel(40, 10) - reads = cover_dataset(span_length, options, fragment_model) + reads = cover_dataset(reference, options, fragment_model, None) coverage_check = [] for i in range(span_length): # Single-ended test, only need read1 @@ -204,7 +209,8 @@ def test_single_ended_coverage_accuracy(fragment_mean, target_coverage): options.coverage = target_coverage options.overwrite_output = True - reads = cover_dataset(span_length, options, fragment_model) + reference = SeqRecord(Seq("A" * span_length), id="chr1") + reads = cover_dataset(reference, options, fragment_model, None) avg = _compute_avg_coverage(reads, span_length, paired=False) assert abs(avg - target_coverage) / target_coverage < 0.10, ( @@ -235,7 +241,8 @@ def test_paired_ended_coverage_accuracy(fragment_mean, target_coverage): options.coverage = target_coverage options.overwrite_output = True - reads = cover_dataset(span_length, options, fragment_model) + reference = SeqRecord(Seq("A" * span_length), id="chr1") + reads = cover_dataset(reference, options, fragment_model, None) avg = _compute_avg_coverage(reads, span_length, paired=True) assert abs(avg - target_coverage) / target_coverage < 0.10, ( @@ -401,7 +408,7 @@ def test_generate_reads_single_ended_returns_read_none_pairs(): opts = _make_options(paired=False) cv = ContigVariants() - results = generate_reads(0, ref, err, 3, qual, frag, cv, + results = generate_reads(0, ref, err, 3, qual, frag, get_uniform_gc_model(), cv, _all_span_targeted(), _nothing_discarded(), opts, None, "chr1", 0, 0) @@ -418,7 +425,7 @@ def test_generate_reads_paired_ended_returns_read_read_pairs(): opts = _make_options(paired=True) cv = ContigVariants() - results = generate_reads(0, ref, err, 3, qual, frag, cv, + results = generate_reads(0, ref, err, 3, qual, frag, get_uniform_gc_model(), cv, _all_span_targeted(), _nothing_discarded(), opts, None, "chr1", 0, 0) @@ -434,7 +441,7 @@ def test_generate_reads_read_length_matches_options(): opts = _make_options(paired=False) cv = ContigVariants() - results = generate_reads(0, ref, err, 3, qual, frag, cv, + results = generate_reads(0, ref, err, 3, qual, frag, get_uniform_gc_model(), cv, _all_span_targeted(), _nothing_discarded(), opts, None, "chr1", 0, 0) @@ -454,7 +461,7 @@ def test_generate_reads_targeted_region_flag_false_filters_all(): cv = ContigVariants() no_target = [(0, _SPAN, False)] - results = generate_reads(0, ref, err, 3, qual, frag, cv, + results = generate_reads(0, ref, err, 3, qual, frag, get_uniform_gc_model(), cv, no_target, _nothing_discarded(), opts, None, "chr1", 0, 0) @@ -469,7 +476,7 @@ def test_generate_reads_discard_region_removes_all(): cv = ContigVariants() discard_all = [(0, _SPAN, True)] - results = generate_reads(0, ref, err, 3, qual, frag, cv, + results = generate_reads(0, ref, err, 3, qual, frag, get_uniform_gc_model(), cv, _all_span_targeted(), discard_all, opts, None, "chr1", 0, 0) @@ -483,7 +490,7 @@ def test_generate_reads_discard_flag_false_keeps_reads(): opts = _make_options(paired=False) cv = ContigVariants() - results = generate_reads(0, ref, err, 3, qual, frag, cv, + results = generate_reads(0, ref, err, 3, qual, frag, get_uniform_gc_model(), cv, _all_span_targeted(), _nothing_discarded(), opts, None, "chr1", 0, 0) @@ -509,7 +516,7 @@ def test_generate_reads_variants_populated_on_reads(): ) cv.add_variant(snv) - results = generate_reads(0, ref, err, 3, qual, frag, cv, + results = generate_reads(0, ref, err, 3, qual, frag, get_uniform_gc_model(), cv, _all_span_targeted(), _nothing_discarded(), opts, None, "chr1", 0, 0) @@ -535,7 +542,7 @@ def test_generate_reads_paired_discard_region_removes_all(): cv = ContigVariants() discard_all = [(0, _SPAN, True)] - results = generate_reads(0, ref, err, 3, qual, frag, cv, + results = generate_reads(0, ref, err, 3, qual, frag, get_uniform_gc_model(), cv, _all_span_targeted(), discard_all, opts, None, "chr1", 0, 0) @@ -549,7 +556,7 @@ def test_generate_reads_paired_no_discard_produces_read_pairs(): opts = _make_options(paired=True) cv = ContigVariants() - results = generate_reads(0, ref, err, 3, qual, frag, cv, + results = generate_reads(0, ref, err, 3, qual, frag, get_uniform_gc_model(), cv, _all_span_targeted(), _nothing_discarded(), opts, None, "chr1", 0, 0) diff --git a/tests/test_read_simulator/test_models_pipeline.py b/tests/test_read_simulator/test_models_pipeline.py new file mode 100644 index 00000000..6706b9f9 --- /dev/null +++ b/tests/test_read_simulator/test_models_pipeline.py @@ -0,0 +1,115 @@ +import gzip +import pickle +import pytest +from pathlib import Path +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +import numpy as np + +from neat.read_simulator.runner import read_simulator_runner +from neat.model_fragment_lengths.runner import compute_fraglen_runner +from neat.model_sequencing_error.runner import model_seq_err_runner + +def _write_ref(path: Path, seq: str = "ACGT" * 2500) -> Path: + """Write a FASTA reference (10kb default).""" + path.write_text(f">chr1\n{seq}\n", encoding="utf-8") + return path + +def _write_config(path: Path, ref_path: Path, **overrides) -> Path: + defaults = { + "reference": str(ref_path), + "produce_fastq": "true", + "produce_vcf": "false", + "produce_bam": "true", + "read_len": 50, + "coverage": 2, + "rng_seed": 42, + "overwrite_output": "true", + "cleanup_splits": "true", + } + defaults.update(overrides) + lines = "\n".join(f"{k}: {v}" for k, v in defaults.items()) + path.write_text(lines + "\n", encoding="utf-8") + return path + +def test_fraglen_model_pipeline(tmp_path): + """Test training a fragment length model and using it in simulation.""" + ref_path = _write_ref(tmp_path / "ref.fa") + + # 1. Run simulation to get a BAM + out_dir1 = tmp_path / "sim1" + cfg_path1 = _write_config(tmp_path / "sim1.yml", ref_path, paired_ended="true", fragment_mean=300, fragment_st_dev=30) + read_simulator_runner(str(cfg_path1), str(out_dir1), "sim1") + + # NEAT might append .bam to what we thought was the filename if not careful, + # but Options.from_cli usually uses the prefix. + # Looking at runner.py: output_files = list(out_dir.glob("sim1*")) + print(f"Files in {out_dir1}: {list(out_dir1.glob('*'))}") + bam_path = out_dir1 / "sim1_golden.bam" + assert bam_path.exists() + + # 2. Train fraglen model from BAM + model_prefix = "fraglen_model" + compute_fraglen_runner( + file=bam_path, + filter_minreads=1, + output_dir=tmp_path, + output_prefix=model_prefix, + overwrite=True + ) + + model_path = tmp_path / (model_prefix + ".pickle.gz") + assert model_path.exists() + + # Verify model content + with gzip.open(model_path, 'rb') as f: + model = pickle.load(f) + assert hasattr(model, 'fragment_mean') + # Should be around 300 + assert 250 < model.fragment_mean < 350 + + # 3. Run simulation with this model + out_dir2 = tmp_path / "sim2" + cfg_path2 = _write_config(tmp_path / "sim2.yml", ref_path, fragment_model=str(model_path), paired_ended="true") + read_simulator_runner(str(cfg_path2), str(out_dir2), "sim2") + assert (out_dir2 / "sim2_r1.fastq.gz").exists() + +def test_seq_err_model_pipeline(tmp_path): + """Test training a sequencing error model and using it in simulation.""" + ref_path = _write_ref(tmp_path / "ref.fa") + + # 1. Run simulation to get FASTQs + out_dir1 = tmp_path / "sim1" + cfg_path1 = _write_config(tmp_path / "sim1.yml", ref_path, coverage=1) + read_simulator_runner(str(cfg_path1), str(out_dir1), "sim1") + + # Check what files were actually produced + print(f"Files in {out_dir1}: {list(out_dir1.glob('*'))}") + # Based on previous failure, maybe it's sim1.fastq.gz if single ended? + fq_path = out_dir1 / "sim1.fastq.gz" + if not fq_path.exists(): + fq_path = out_dir1 / "sim1_r1.fastq.gz" + + assert fq_path.exists() + + # 2. Train sequencing error model from FASTQ + model_prefix = "seq_err_model" + model_seq_err_runner( + files=[str(fq_path)], + offset=33, + qual_scores=42, + max_reads=100, + overwrite=True, + output_dir=str(tmp_path), + output_prefix=model_prefix + ) + + model_path = tmp_path / (model_prefix + ".p.gz") + assert model_path.exists() + + # 3. Run simulation with this model + out_dir2 = tmp_path / "sim2" + cfg_path2 = _write_config(tmp_path / "sim2.yml", ref_path, error_model=str(model_path)) + read_simulator_runner(str(cfg_path2), str(out_dir2), "sim2") + assert (out_dir2 / "sim2.fastq.gz").exists() diff --git a/tests/test_read_simulator/test_single_runner.py b/tests/test_read_simulator/test_single_runner.py index 8b220566..93a52139 100644 --- a/tests/test_read_simulator/test_single_runner.py +++ b/tests/test_read_simulator/test_single_runner.py @@ -21,6 +21,7 @@ MutationModel, SequencingErrorModel, TraditionalQualityModel, + GCBiasModel, ) from neat.read_simulator.single_runner import ( initialize_all_models, @@ -94,10 +95,10 @@ def _make_contig_variants(positions_alts: list, genotype_ploidy: int = 2) -> Con class TestInitializeAllModels: - def test_returns_four_objects(self, tmp_path): + def test_returns_five_objects(self, tmp_path): opts = _make_opts(tmp_path) result = initialize_all_models(opts) - assert len(result) == 4 + assert len(result) == 5 def test_default_mut_model_type(self, tmp_path): opts = _make_opts(tmp_path) @@ -106,19 +107,24 @@ def test_default_mut_model_type(self, tmp_path): def test_default_seq_error_model_type(self, tmp_path): opts = _make_opts(tmp_path) - _, seq_error_model, _, _ = initialize_all_models(opts) + _, seq_error_model, _, _, _ = initialize_all_models(opts) assert isinstance(seq_error_model, SequencingErrorModel) def test_default_qual_score_model_type(self, tmp_path): opts = _make_opts(tmp_path) - _, _, qual_score_model, _ = initialize_all_models(opts) + _, _, qual_score_model, _, _ = initialize_all_models(opts) assert isinstance(qual_score_model, TraditionalQualityModel) def test_default_fraglen_model_type(self, tmp_path): opts = _make_opts(tmp_path) - _, _, _, fraglen_model = initialize_all_models(opts) + _, _, _, fraglen_model, _ = initialize_all_models(opts) assert isinstance(fraglen_model, FragmentLengthModel) + def test_default_gc_model_type(self, tmp_path): + opts = _make_opts(tmp_path) + _, _, _, _, gc_model = initialize_all_models(opts) + assert isinstance(gc_model, GCBiasModel) + def test_mut_model_rng_is_set(self, tmp_path): opts = _make_opts(tmp_path) mut_model, *_ = initialize_all_models(opts) @@ -141,21 +147,21 @@ def test_fragment_mean_creates_fraglen_model_with_mean(self, tmp_path): opts = _make_opts(tmp_path) opts.fragment_mean = 200.0 opts.fragment_st_dev = 40.0 - _, _, _, fraglen_model = initialize_all_models(opts) + _, _, _, fraglen_model, _ = initialize_all_models(opts) assert fraglen_model.fragment_mean == pytest.approx(200.0) def test_no_fragment_mean_uses_read_len_times_two(self, tmp_path): opts = _make_opts(tmp_path) opts.fragment_mean = None opts.read_len = 75 - _, _, _, fraglen_model = initialize_all_models(opts) + _, _, _, fraglen_model, _ = initialize_all_models(opts) assert fraglen_model.fragment_mean == pytest.approx(75 * 2.0) def test_fraglen_model_std_dev_set_from_fragment_mean(self, tmp_path): opts = _make_opts(tmp_path) opts.fragment_mean = None opts.read_len = 100 - _, _, _, fraglen_model = initialize_all_models(opts) + _, _, _, fraglen_model, _ = initialize_all_models(opts) expected_std = 100 * 2.0 * 0.2 assert fraglen_model.fragment_st_dev == pytest.approx(expected_std)