diff --git a/CHANGELOG.md b/CHANGELOG.md index 411bd1f..226eae9 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,9 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [unreleased] +## [v1.11.2] +### Added +- Parsing model information from fastq headers output by Guppy and MinKNOW. ### Changed - Additional explanatory information in VCF INFO fields concerning depth calculations. diff --git a/medaka/__init__.py b/medaka/__init__.py index 2d84c15..5fc7cae 100755 --- a/medaka/__init__.py +++ b/medaka/__init__.py @@ -5,7 +5,7 @@ import subprocess import sys -__version__ = "1.11.1" +__version__ = "1.11.2" try: import pyabpoa as abpoa diff --git a/medaka/models.py b/medaka/models.py index ddac598..8d1e530 100644 --- a/medaka/models.py +++ b/medaka/models.py @@ -173,16 +173,24 @@ def _model_from_fastq(fname): models = set() with pysam.FastxFile(fname, 'r') as fastq: for rec in itertools.islice(fastq, 100): - # model is embedded in RG:Z: tag of comment as - # <run_id>_<model>_<barcode>, but model has _ - # characters in also so search for known models try: + # dorado SAM converted to FASTQ with e.g. samtools fastq + # model is embedded in RG:Z: tag of comment as + # <run_id>_<model>_<barcode>, but model has _ + # characters in also so search for known models read_group = rec.comment.split("RG:Z:")[1].split()[0] for model in known_models: if model in read_group: models.add(model) except Exception: - pass + # minknow/guppy + # basecall_model_version_id=<model> + try: + model = rec.comment.split( + "basecall_model_version_id=")[1].split()[0] + models.add(model) + except Exception: + pass if len(models) > 1: # filter out any models without an `@`.
These are likely FPs of # the search above (there are unversioned models whose name diff --git a/medaka/test/data/bc_model_scrape_minknow.fastq.gz b/medaka/test/data/bc_model_scrape_minknow.fastq.gz new file mode 100644 index 0000000..95329ec Binary files /dev/null and b/medaka/test/data/bc_model_scrape_minknow.fastq.gz differ diff --git a/medaka/test/test_model.py b/medaka/test/test_model.py index 601eb1c..d768b3a 100755 --- a/medaka/test/test_model.py +++ b/medaka/test/test_model.py @@ -71,6 +71,7 @@ class TestScrapBasecaller(unittest.TestCase): root_dir = os.path.abspath(os.path.dirname(__file__)) bam = os.path.join(root_dir, 'data/bc_model_scrape.bam') fastq = os.path.join(root_dir, 'data/bc_model_scrape.fastq.gz') + fastq_minknow = os.path.join(root_dir, 'data/bc_model_scrape_minknow.fastq.gz') def test_000_from_bam_consensus(self): model = models.model_from_basecaller(self.bam, variant=False) @@ -88,6 +89,13 @@ def test_011_from_fastq_variant(self): model = models.model_from_basecaller(self.fastq, variant=True) self.assertEqual(model, "r1041_e82_400bps_hac_variant_v4.2.0") + def test_020_from_fastq_minknow(self): + model = models.model_from_basecaller(self.fastq_minknow, variant=False) + self.assertEqual(model, "r1041_e82_400bps_sup_v4.2.0") + + def test_021_from_fastq_minknow_variant(self): + model = models.model_from_basecaller(self.fastq_minknow, variant=True) + self.assertEqual(model, "r1041_e82_400bps_sup_variant_v4.2.0") class TestBuildModel(unittest.TestCase):