In [229]:
%%writefile ../config/patterns.yaml
# Filename fragments consumed by FastqFileNameChecker.categorize_fastq_files.
# Each entry is inserted verbatim into a regular expression, so regex syntax is allowed.
patterns:
  # Fragments marking an R1 file. A fragment only counts when it is directly
  # followed by ".", "_" or "-". R1 fragments are tried before R2 fragments.
  r1:
    - "_1"
    - "_r1"
    - "_R1"
    - "_r"
    - "_F"
  # Fragments marking an R2 file; same "followed by . _ or -" rule as r1.
  # NOTE(review): uppercase "_F"/"_R" are filed under r1/r2, while lowercase
  # "_r"/"_f" are filed the opposite way round - confirm this is intended.
  r2:
    - "_2"
    - "_r2"
    - "_R2"
    - "_f"
    - "_R"
  # Fragments that force a file into the "ignored" group. They are checked
  # before r1/r2 and need no trailing delimiter.
  ignore:
    - "^i_"
    - "^I_"
    - "_i\\d+"
    - "_I\\d+"


Overwriting ../config/patterns.yaml


In [173]:
! ln -s ../config .

In [7]:
# %load  ../frmatcher/fastq_file_name_checker.py
import re
from typing import Dict, List, Optional

import pkg_resources
import yaml
from loguru import logger


class FastqFileNameChecker:
    """
    Sort FASTQ file names into R1, R2 and ignored groups using regex fragments.

    The fragments come from a YAML file whose top-level ``patterns`` mapping holds the
    ``r1``, ``r2`` and ``ignore`` lists. They can also be replaced after construction by
    assigning a dictionary of the same shape to ``self.patterns``.
    """

    def __init__(
        self,
        filenames: List[str],
        config_path: Optional[str] = None,
        length_check: bool = False,
        verbose: bool = False,
    ):
        """
        Initialize the FastqFileNameChecker with a list of filenames.

        Args:
            filenames (List[str]): List of filenames to categorize.
            config_path (Optional[str]): Path to the YAML configuration file. When None, the
                ``config/patterns.yaml`` resource shipped next to this module is loaded.
            length_check (bool): Whether to check if all filenames have the same length. Default is False.
            verbose (bool): Whether to enable detailed logging. Default is False.

        Raises:
            ValueError: If filenames have different lengths and length_check is True, or if
                the configuration file has no top-level ``patterns`` section.
        """
        self.filenames = filenames
        self.verbose = verbose

        # Replace whatever handlers are registered on the shared loguru logger with a single
        # print-based one: DEBUG and above when verbose, ERROR and above otherwise.
        logger.remove()
        logger.add(
            lambda msg: print(msg, end=""),
            level="DEBUG" if self.verbose else "ERROR",
            colorize=True,
        )

        # Load patterns from the package if no config_path is provided.
        # NOTE(review): pkg_resources is deprecated upstream; consider importlib.resources.
        if config_path is None:
            config_path = pkg_resources.resource_filename(
                __name__, "config/patterns.yaml"
            )
        self.patterns = self.load_patterns(config_path)

        if length_check:
            self._check_filename_lengths()

    def load_patterns(self, config_path: str) -> Dict[str, List[str]]:
        """
        Load patterns from a YAML configuration file.

        Args:
            config_path (str): Path to the YAML configuration file.

        Returns:
            Dict[str, List[str]]: The content of the file's top-level ``patterns`` section.

        Raises:
            ValueError: If the file is empty or has no top-level ``patterns`` section.
        """
        # Explicit encoding so the result does not depend on the platform default.
        with open(config_path, "r", encoding="utf-8") as file:
            config = yaml.safe_load(file)

        # safe_load returns None for an empty file; fail with a readable message instead of
        # an opaque TypeError/KeyError further down.
        if not isinstance(config, dict) or "patterns" not in config:
            message = f"No top-level 'patterns' section found in {config_path}."
            logger.error(message)
            raise ValueError(message)

        logger.debug(f"Loaded patterns from {config_path}")
        return config["patterns"]

    def _check_filename_lengths(self) -> None:
        """
        Checks if all filenames have the same length.

        An empty filename list passes the check.

        Raises:
            ValueError: If filenames do not have the same length.
        """
        if len({len(name) for name in self.filenames}) > 1:
            message = "Filenames do not all have the same length. Please ensure all filenames are consistent."
            logger.error(message)
            raise ValueError(message)
        logger.info("All filenames have the same length.")

    @staticmethod
    def _compile_patterns(
        fragments: Optional[List[str]], require_delimiter: bool
    ) -> List["re.Pattern"]:
        """Compile each fragment, optionally requiring a trailing '.', '_' or '-'."""
        # Raw strings keep the backslashes as regex escapes rather than (invalid) string escapes.
        delimiter = r"(\.|\_|\-)" if require_delimiter else ""
        return [
            re.compile(rf".*({fragment}){delimiter}.*") for fragment in fragments or []
        ]

    def categorize_fastq_files(self) -> Dict[str, List[str]]:
        """
        Categorizes FASTQ files into R1, R2, or ignored based on filename patterns.

        Ignore patterns are checked first, then R1, then R2; a filename that matches nothing
        is also placed in 'ignored'. A category that is missing from ``self.patterns`` (or
        set to None) is treated as having no patterns.

        Returns:
            Dict[str, List[str]]: A dictionary with keys 'R1', 'R2', and 'ignored', each
            containing an alphabetically sorted list of filenames.

        Raises:
            ValueError: If no patterns are available, or if the number of R1 files differs
                from the number of R2 files.
        """
        patterns = getattr(self, "patterns", None)
        if patterns is None:
            raise ValueError(
                "No patterns loaded. Either provide a config_path or manually inject patterns."
            )

        # R1/R2 fragments must be followed by a delimiter; ignore fragments match anywhere.
        r1_patterns = self._compile_patterns(patterns.get("r1"), True)
        r2_patterns = self._compile_patterns(patterns.get("r2"), True)
        ignore_patterns = self._compile_patterns(patterns.get("ignore"), False)

        categorized_files = {"R1": [], "R2": [], "ignored": []}

        for filename in self.filenames:
            if any(pattern.search(filename) for pattern in ignore_patterns):
                categorized_files["ignored"].append(filename)
                logger.debug(f"Ignored file: {filename}")
            elif any(pattern.search(filename) for pattern in r1_patterns):
                categorized_files["R1"].append(filename)
                logger.debug(f"Categorized as R1: {filename}")
            elif any(pattern.search(filename) for pattern in r2_patterns):
                categorized_files["R2"].append(filename)
                logger.debug(f"Categorized as R2: {filename}")
            else:
                # If it doesn't match any of the patterns, categorize as ignored
                categorized_files["ignored"].append(filename)
                logger.debug(
                    f"File did not match any R1 or R2 patterns. Index file? {filename}"
                )

        # Sort the filenames alphabetically in each category
        for names in categorized_files.values():
            names.sort()

        # Paired-end data needs one R2 file per R1 file.
        len_r1 = len(categorized_files["R1"])
        len_r2 = len(categorized_files["R2"])
        if len_r1 != len_r2:
            message = f"Unbalanced categories: R1={len_r1}, R2={len_r2}"
            logger.error(message)
            raise ValueError(message)

        return categorized_files

  import pkg_resources


In [8]:
# Sample usage
filenames = [
    "sample_R_L001.fastq.gz",  # "_R" + "_" -> categorized as R2 (see the recorded output)
    "sample_F_L001.fastq.gz",  # "_F" + "_" -> categorized as R1 (see the recorded output)
    "sample_R_L011.fastq.gz",  # "_R" + "_" -> categorized as R2
    "sample_F_L011.fastq.gz",  # "_F" + "_" -> categorized as R1
    # "sample_2_L011.fastq.gz",  # R2 with lane info
    "i_sample_1_L001.fastq.gz",  # Ignored due to prefix "i_"
    "I_sample_2_L001.fastq.gz",  # Ignored due to prefix "I_"
    #     "sample_r1_i1.fastq.gz",   # Ignored due to multiplex index
    #     "sample_A_L001.fastq.gz",  # R1 with lane info
    #     "sample_B_L001.fastq.gz",  # R2 with lane info
]

# The checker raises ValueError when the R1 and R2 counts differ (and on a failed
# length check), so the whole demo is wrapped and the message is logged instead.
try:
    # NOTE(review): the %%writefile cell at the top targets ../config/patterns.yaml,
    # not the path used here - confirm the two files are kept in sync.
    checker = FastqFileNameChecker(
        filenames,
        length_check=False,
        verbose=True,
        config_path="../frmatcher/config/patterns.yaml",
    )
    categorized_files = checker.categorize_fastq_files()

    # Output the categorized files
    for category, files in categorized_files.items():
        logger.info(f"{category}:")
        for file in files:
            logger.info(f"  - {file}")
except ValueError as e:
    logger.error(str(e))

[32m2024-08-21 16:16:11.437[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mload_patterns[0m:[36m62[0m - [34m[1mLoaded patterns from ../frmatcher/config/patterns.yaml[0m
[32m2024-08-21 16:16:11.438[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcategorize_fastq_files[0m:[36m118[0m - [34m[1mCategorized as R2: sample_R_L001.fastq.gz[0m
[32m2024-08-21 16:16:11.455[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcategorize_fastq_files[0m:[36m115[0m - [34m[1mCategorized as R1: sample_F_L001.fastq.gz[0m
[32m2024-08-21 16:16:11.455[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcategorize_fastq_files[0m:[36m118[0m - [34m[1mCategorized as R2: sample_R_L011.fastq.gz[0m
[32m2024-08-21 16:16:11.455[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcategorize_fastq_files[0m:[36m115[0m - [34m[1mCategorized as R1: sample_F_L011.fastq.gz[0m
[32m2024-08-21 16:16:11.455[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mcategorize_fastq_fil

In [9]:
categorized_files

{'R1': ['sample_F_L001.fastq.gz', 'sample_F_L011.fastq.gz'],
 'R2': ['sample_R_L001.fastq.gz', 'sample_R_L011.fastq.gz'],
 'ignored': ['I_sample_2_L001.fastq.gz', 'i_sample_1_L001.fastq.gz']}

In [262]:
%%writefile ../tests/test_fastq_file_name_checker.py
import unittest
import yaml
# Adjust the import according to your module structure
from frmatcher.fastq_file_name_checker import FastqFileNameChecker

class TestFastqFileNameChecker(unittest.TestCase):
    """Verifies that file names end up in the R1, R2 or ignored group expected for them."""

    def test_categorization(self):
        """Inject a small pattern set and compare every group with its expected members."""
        # The pattern set is parsed from inline YAML so the test does not depend on the
        # content of the packaged configuration file.
        config = yaml.safe_load(r"""
        patterns:
          r1:
            - "_1"
            - "_R1"
          r2:
            - "_2"
            - "_R2"
          ignore:
            - "^i_"
            - "^I_"
            - "_i\\d+"
            - "_I\\d+"
        """)

        r1_names = ["sample_1_L001.fastq.gz", "sample_R1_L001.fastq.gz"]
        r2_names = ["sample_2_L001.fastq.gz", "sample_R2_L001.fastq.gz"]
        ignored_names = [
            "sample_i1_L001.fastq.gz",   # index-read suffix
            "sample_I2_L001.fastq.gz",   # index-read suffix
            "i_sample_1_L001.fastq.gz",  # "i_" prefix
            "I_sample_2_L001.fastq.gz",  # "I_" prefix
            "sample_A_L001.fastq.gz",    # matches no pattern at all
        ]

        # Build the checker first, then swap in the test patterns.
        checker = FastqFileNameChecker(
            r1_names + r2_names + ignored_names, config_path=None
        )
        checker.patterns = config["patterns"]
        result = checker.categorize_fastq_files()

        # One sub-test per group so a failure in one does not hide the others.
        expectations = (
            ("R1 Categorization", "R1", r1_names),
            ("R2 Categorization", "R2", r2_names),
            ("Ignored Categorization", "ignored", ignored_names),
        )
        for label, group, names in expectations:
            with self.subTest(label):
                self.assertEqual(set(result[group]), set(names))


if __name__ == "__main__":
    unittest.main()


Overwriting ../tests/test_fastq_file_name_checker.py


In [263]:
%%writefile ../.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
venv/
ENV/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Environments
.env
.envrc
venv/
.venv/

# Poetry-specific files
poetry.lock
# If you use pyproject.toml for other purposes, uncomment the line below
# !pyproject.toml

# VS Code settings
.vscode/

# MacOS files
.DS_Store

# PyCharm settings
.idea/

# Local config files
*.local
*.log


Overwriting ../.gitignore


In [224]:
%%writefile ../LICENSE
MIT License

Copyright (c) 2024 odinokov

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Writing ../LICENSE


In [10]:
from frmatcher.fastq_file_name_checker import FastqFileNameChecker

# Two lanes, each with one R1/R2 pair.
filenames = [
    "sample_1_L001.fastq.gz",
    "sample_2_L001.fastq.gz",
    "sample_1_L002.fastq.gz",
    "sample_2_L002.fastq.gz",
]

# Build the checker with its default configuration, then override the patterns by hand.
checker = FastqFileNameChecker(
    filenames,
    config_path=None,
    length_check=False,
    verbose=False,
)
checker.patterns = dict(
    r1=["_1", "_R1"],
    r2=["_2", "_R2"],
    ignore=["^i_", "^I_", "_i\\d+", "_I\\d+"],
)

categorized_files = checker.categorize_fastq_files()
print(categorized_files)
# Expected output:
# {'R1': ['sample_1_L001.fastq.gz', 'sample_1_L002.fastq.gz'], 'R2': ['sample_2_L001.fastq.gz', 'sample_2_L002.fastq.gz'], 'ignored': []}

{'R1': ['sample_1_L001.fastq.gz', 'sample_1_L002.fastq.gz'], 'R2': ['sample_2_L001.fastq.gz', 'sample_2_L002.fastq.gz'], 'ignored': []}


In [11]:
%%writefile ../README.md
# FRMatcher

**FRMatcher** sorts a list of FASTQ file names into `R1` (forward reads) and `R2` (reverse reads) groups using customizable filename patterns. Names that match an ignore pattern, or no pattern at all, are reported as `ignored`.

## Installation

Clone the repository:
   ```bash
   git clone https://github.com/odinokov/frmatcher.git
   cd frmatcher
   ```

Activate the virtual environment:
   ```bash
   poetry shell
   ```

Build the package:
   ```bash
   poetry build
   ```

Install the package locally:
   ```bash
   poetry install
   ```

## Usage

```python
from frmatcher.fastq_file_name_checker import FastqFileNameChecker

filenames = [
    "sample_1_L001.fastq.gz",
    "sample_2_L001.fastq.gz",
    "sample_1_L002.fastq.gz",
    "sample_2_L002.fastq.gz",
]

checker = FastqFileNameChecker(filenames,
                              length_check=False,
                              verbose=False)

# checker = FastqFileNameChecker(filenames,
#                                length_check=True,
#                                verbose=True,
#                                config_path=None)
# checker.patterns = {
#     'r1': ["_1", "_R1"],
#     'r2': ["_2", "_R2"],
#     'ignore': ["^i_", "^I_", "_i\\d+", "_I\\d+"]
# }

categorized_files = checker.categorize_fastq_files()

print(categorized_files)

# {'R1': ['sample_1_L001.fastq.gz', 'sample_1_L002.fastq.gz'], 
# 'R2': ['sample_2_L001.fastq.gz', 'sample_2_L002.fastq.gz'], 
# 'ignored': []}

```

## License

MIT License. See the [LICENSE](LICENSE) file for details.


Overwriting ../README.md
