From 942dc3c035b16d48fdf3122e5d04be392a4fbe42 Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Fri, 4 Sep 2020 14:50:42 -0500 Subject: [PATCH 1/5] ars type of `lgx` and `lg` should return 2-fields when not in G group Fixes #40 --- pyard/pyard.py | 20 +++++++++++++++----- setup.py | 2 +- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/pyard/pyard.py b/pyard/pyard.py index e8099be..a5be8ae 100644 --- a/pyard/pyard.py +++ b/pyard/pyard.py @@ -404,16 +404,26 @@ def redux(self, allele: str, ars_type: str) -> str: return self.dup_g[allele] else: return self.G[allele] - elif ars_type == "lg" and allele in self._lg: - return self.lg[allele] - elif ars_type == "lgx" and allele in self._lgx: - return self.lgx[allele] + elif ars_type == "lg": + if allele in self._lg: + return self.lg[allele] + else: + # for 'lg' when allele is not in G group, + # return allele with only first 2 field + return ':'.join(allele.split(':')[0:2]) + 'g' + elif ars_type == "lgx": + if allele in self._lgx: + return self.lgx[allele] + else: + # for 'lgx' when allele is not in G group, + # return allele with only first 2 field + return ':'.join(allele.split(':')[0:2]) else: if self.remove_invalid: if allele in self.valid: return allele else: - return + return '' else: return allele diff --git a/setup.py b/setup.py index bcc2640..6002b54 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ setup( name='py-ard', - version='0.0.18', + version='0.0.19', description="ARD reduction for HLA with python", long_description=readme + '\n\n' + history, author="CIBMTR", From ff34c71fe237c56cbffd647dac3657fcdd42435d Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Fri, 4 Sep 2020 16:00:42 -0500 Subject: [PATCH 2/5] Alleles with P and G groups are treated as valid alleles. 
--- pyard/pyard.py | 10 ++++++++-- setup.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pyard/pyard.py b/pyard/pyard.py index a5be8ae..af221e4 100644 --- a/pyard/pyard.py +++ b/pyard/pyard.py @@ -393,12 +393,15 @@ def redux(self, allele: str, ars_type: str) -> str: """ # PERFORMANCE: precompiled regex - # dealing with leading HLA- - + # dealing with leading 'HLA-' if self.HLA_regex.search(allele): hla, allele_name = allele.split("-") return "-".join(["HLA", self.redux(allele_name, ars_type)]) + # Alleles ending with P or G are valid + if allele.endswith(('P', 'G')): + allele = allele[:-1] + if ars_type == "G" and allele in self._G: if allele in self.dup_g: return self.dup_g[allele] @@ -498,6 +501,9 @@ def isvalid(self, allele: str) -> bool: if not ismac(allele): # PERFORMANCE: use hash instead of allele in "list" # return allele in self.valid + # Alleles ending with P or G are valid + if allele.endswith(('P', 'G')): + allele = allele[:-1] return self.valid_dict.get(allele, False) return True diff --git a/setup.py b/setup.py index 6002b54..9b93bf7 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ setup( name='py-ard', - version='0.0.19', + version='0.0.20', description="ARD reduction for HLA with python", long_description=readme + '\n\n' + history, author="CIBMTR", From 6e75defb386386924bed2fa2987d1b87a06667ea Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Tue, 8 Sep 2020 10:17:33 -0500 Subject: [PATCH 3/5] Fix Tests - Check allele validation without the `HLA-` prefix - cleanup tests Fixes #25 and #26 --- pyard/pyard.py | 5 +++ tests/test_pyard.py | 94 ++++++++++++++++++--------------------------- 2 files changed, 42 insertions(+), 57 deletions(-) diff --git a/pyard/pyard.py b/pyard/pyard.py index af221e4..be23f42 100644 --- a/pyard/pyard.py +++ b/pyard/pyard.py @@ -503,7 +503,12 @@ def isvalid(self, allele: str) -> bool: # return allele in self.valid # Alleles ending with P or G are valid if allele.endswith(('P', 'G')): + # remove 
the last character allele = allele[:-1] + # validate allele without the 'HLA-' prefix + if self.HLA_regex.search(allele): + # remove 'HLA-' prefix + allele = allele[4:] return self.valid_dict.get(allele, False) return True diff --git a/tests/test_pyard.py b/tests/test_pyard.py index b29fc48..5b665fa 100644 --- a/tests/test_pyard.py +++ b/tests/test_pyard.py @@ -1,8 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- - # -# pyars pyARS. +# py-ard pyARD. # Copyright (c) 2018 Be The Match operated by National Marrow Donor Program. All Rights Reserved. # # This library is free software; you can redistribute it and/or modify it @@ -24,75 +23,56 @@ # """ -test_pyars +test_pyard ---------------------------------- -Tests for `pyars` module. +Tests for `py-ard` module. """ -import os -import sys import json +import os import unittest from pyard import ARD -class TestPyard(unittest.TestCase): +class TestPyArd(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + cls.db_version = '3290' + cls.ard = ARD(cls.db_version, data_dir='/tmp/3290') def setUp(self): - self.ard = ARD(verbose=True) - self.data_dir = os.path.dirname(__file__) self.assertIsInstance(self.ard, ARD) - expected_json = self.data_dir + "/expected.json" - with open(expected_json) as json_data: - self.expected = json.load(json_data) - pass - - def test_000_nomac(self): - self.ardnomac = ARD(download_mac=False) - self.assertIsInstance(self.ardnomac, ARD) - self.assertFalse(self.ardnomac.download_mac) - self.assertTrue(len(self.ardnomac.mac.keys()) == 0) - self.assertTrue(self.ardnomac.redux("A*01:01:01", 'G') == "A*01:01:01G") - self.assertTrue(self.ardnomac.redux("A*01:01:01", 'lg') == "A*01:01g") - self.assertTrue(self.ardnomac.redux("A*01:01:01", 'lgx') == "A*01:01") - self.assertTrue(self.ardnomac.redux("HLA-A*01:01:01", 'G') == "HLA-A*01:01:01G") - self.assertTrue(self.ardnomac.redux("HLA-A*01:01:01", 'lg') == "HLA-A*01:01g") - self.assertTrue(self.ardnomac.redux("HLA-A*01:01:01", 'lgx') == 
"HLA-A*01:01") - pass - - def test_001_dbversions(self): - for db in ['3310', '3300', '3290', '3280']: - self.arddb = ARD(dbversion=db, download_mac=False) - self.assertIsInstance(self.arddb, ARD) - self.assertFalse(self.arddb.download_mac) - self.assertTrue(self.arddb.dbversion == db) - self.assertTrue(self.arddb.redux("A*01:01:01", 'G') == "A*01:01:01G") - self.assertTrue(self.arddb.redux("A*01:01:01", 'lg') == "A*01:01g") - self.assertTrue(self.arddb.redux("A*01:01:01", 'lgx') == "A*01:01") - pass - def test_002_remove_invalid(self): - self.assertTrue(self.ard.redux("A*01:01:01", 'G') == "A*01:01:01G") - pass - - def test_003_mac(self): - self.assertTrue(self.ard.redux_gl("A*01:AB", 'G') == "A*01:01:01G/A*01:02") - self.assertTrue(self.ard.redux_gl("HLA-A*01:AB", 'G') == "HLA-A*01:01:01G/HLA-A*01:02") - pass - - def test_004_redux_gl(self): - for ex in self.expected['redux_gl']: + def test_no_mac(self): + self.ard_no_mac = ARD(self.db_version, data_dir='/tmp/3290', download_mac=False) + self.assertIsInstance(self.ard_no_mac, ARD) + self.assertEqual(len(self.ard_no_mac.mac.keys()), 0) + self.assertEqual(self.ard_no_mac.redux("A*01:01:01", 'G'), "A*01:01:01G") + self.assertEqual(self.ard_no_mac.redux("A*01:01:01", 'lg'), "A*01:01g") + self.assertEqual(self.ard_no_mac.redux("A*01:01:01", 'lgx'), "A*01:01") + self.assertEqual(self.ard_no_mac.redux("HLA-A*01:01:01", 'G'), "HLA-A*01:01:01G") + self.assertEqual(self.ard_no_mac.redux("HLA-A*01:01:01", 'lg'), "HLA-A*01:01g") + self.assertEqual(self.ard_no_mac.redux("HLA-A*01:01:01", 'lgx'), "HLA-A*01:01") + + def test_remove_invalid(self): + self.assertEqual(self.ard.redux("A*01:01:01", 'G'), "A*01:01:01G") + + def test_mac(self): + self.assertEqual(self.ard.redux_gl("A*01:AB", 'G'), "A*01:01:01G/A*01:02") + self.assertEqual(self.ard.redux_gl("HLA-A*01:AB", 'G'), "HLA-A*01:01:01G/HLA-A*01:02") + + def test_redux_gl(self): + data_dir = os.path.dirname(__file__) + expected_json = data_dir + "/expected.json" + with 
open(expected_json) as json_data: + expected = json.load(json_data) + for ex in expected['redux_gl']: glstring = ex['glstring'] ard_type = ex['ard_type'] expected_gl = ex['expected_gl'] - self.assertTrue(self.ard.redux_gl(glstring, ard_type) == expected_gl) - pass - - def test_005_mac_G(self): - self.assertTrue(self.ard.redux("A*01:01:01", 'G') == "A*01:01:01G") - pass - - - + self.assertEqual(self.ard.redux_gl(glstring, ard_type), expected_gl) + def test_mac_G(self): + self.assertEqual(self.ard.redux("A*01:01:01", 'G'), "A*01:01:01G") From 937469c47824d0a810d4c25f0f82165beda40727 Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Tue, 8 Sep 2020 12:31:31 -0500 Subject: [PATCH 4/5] Smart Sort Fix and cleanup - Fix 4th field comparison bug - Remove unused function `smart_sort_alleles` - cleanup - upped version to `0.0.21` --- pyard/smart_sort.py | 73 ++++++++++---------------------- setup.py | 2 +- tests/test_smart_sort.py | 89 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+), 53 deletions(-) create mode 100644 tests/test_smart_sort.py diff --git a/pyard/smart_sort.py b/pyard/smart_sort.py index 04f70db..0316826 100644 --- a/pyard/smart_sort.py +++ b/pyard/smart_sort.py @@ -3,12 +3,13 @@ expr_regex = re.compile('[NQLSGg]') + @functools.lru_cache(maxsize=None) def smart_sort_comparator(a1, a2): """ Natural sort 2 given alleles. - Python sorts strings lexographically but HLA alleles need + Python sorts strings lexicographically but HLA alleles need to be sorted by numerical values in each field of the HLA nomenclature. 
:param a1: first allele @@ -19,85 +20,53 @@ def smart_sort_comparator(a1, a2): if a1 == a2: return 0 - # remove any non-numerics a1 = re.sub(expr_regex, '', a1) a2 = re.sub(expr_regex, '', a2) + + # Check to see if they are still the same alleles + if a1 == a2: + return 0 + # Extract and Compare first fields first - a1_f1 = int(a1[a1.find('*')+1:a1.find(':')]) - a2_f1 = int(a2[a2.find('*')+1:a2.find(':')]) + a1_f1 = int(a1[a1.find('*') + 1:a1.find(':')]) + a2_f1 = int(a2[a2.find('*') + 1:a2.find(':')]) if a1_f1 < a2_f1: return -1 if a1_f1 > a2_f1: return 1 - # If the first fields are equal, try the 2nd fields + a1_fields = a1.split(':') + a2_fields = a2.split(':') - a1_f2 = int(a1.split(':')[1]) - a2_f2 = int(a2.split(':')[1]) + # If the first fields are equal, try the 2nd fields + a1_f2 = int(a1_fields[1]) + a2_f2 = int(a2_fields[1]) if a1_f2 < a2_f2: return -1 if a1_f2 > a2_f2: return 1 - # If the two fields are equal, try the 3rd fields - - a1_f3 = int(a1.split(':')[2]) - a2_f3 = int(a2.split(':')[2]) + # If the second fields are equal, try the 3rd fields + a1_f3 = int(a1_fields[2]) + a2_f3 = int(a2_fields[2]) if a1_f3 < a2_f3: return -1 if a1_f3 > a2_f3: return 1 - # If the two fields are equal, try the 4th fields - - a1_f4 = int(a1.split(':')[3]) - a2_f3 = int(a2.split(':')[3]) + # If the third fields are equal, try the 4th fields + a1_f4 = int(a1_fields[3]) + a2_f4 = int(a2_fields[3]) if a1_f4 < a2_f4: return -1 if a1_f4 > a2_f4: return 1 - - - # All fields are equal + # All fields are considered equal after 4th field return 0 -def smart_sort_alleles(a1, a2): - """ - Natural sort 2 given alleles. - - Python sorts strings lexographically but HLA alleles need - to be sorted by numerical values in each field of the HLA nomenclature. 
- - :param a1: first allele - :param a2: second allele - """ - # Check to see if they are the same alleles - if a1 == a2: - return [a1, a2] - - # Extract and Compare first fields first - a1_f1 = int(a1[a1.find('*')+1:a1.find(':')]) - a2_f1 = int(a2[a2.find('*')+1:a2.find(':')]) - - if a1_f1 < a2_f1: - return [a1, a2] - if a1_f1 > a2_f1: - return [a2, a1] - - # If the first fields are equal, try the 2nd fields - a1_f2 = int(a1[a1.find(':')+1:]) - a2_f2 = int(a2[a2.find(':')+1:]) - - if a1_f2 < a2_f2: - return [a1, a2] - if a1_f2 > a2_f2: - return [a2, a1] - - # All fields are equal - return [a1, a2] diff --git a/setup.py b/setup.py index 9b93bf7..0671864 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ setup( name='py-ard', - version='0.0.20', + version='0.0.21', description="ARD reduction for HLA with python", long_description=readme + '\n\n' + history, author="CIBMTR", diff --git a/tests/test_smart_sort.py b/tests/test_smart_sort.py new file mode 100644 index 0000000..ab9063c --- /dev/null +++ b/tests/test_smart_sort.py @@ -0,0 +1,89 @@ +import unittest + +from pyard.smart_sort import smart_sort_comparator + + +class TestSmartSort(unittest.TestCase): + + def setUp(self) -> None: + super().setUp() + + def test_same_comparator(self): + allele = "HLA-A*01:01" + self.assertEqual(smart_sort_comparator(allele, allele), 0) + + def test_equal_comparator(self): + allele1 = "HLA-A*01:01" + allele2 = "HLA-A*01:01" + self.assertEqual(smart_sort_comparator(allele1, allele2), 0) + + def test_equal_comparator_G(self): + # Should compare without G + allele1 = "HLA-A*01:01G" + allele2 = "HLA-A*01:01" + self.assertEqual(smart_sort_comparator(allele1, allele2), 0) + + def test_equal_comparator_NG(self): + # Should compare without N and G + allele1 = "HLA-A*01:01G" + allele2 = "HLA-A*01:01N" + self.assertEqual(smart_sort_comparator(allele1, allele2), 0) + + def test_first_field_comparator_le(self): + allele1 = "HLA-A*01:01" + allele2 = "HLA-A*02:01" + 
self.assertEqual(smart_sort_comparator(allele1, allele2), -1) + + def test_first_field_comparator_ge(self): + allele1 = "HLA-A*02:01" + allele2 = "HLA-A*01:01" + self.assertEqual(smart_sort_comparator(allele1, allele2), 1) + + def test_second_field_comparator_le(self): + allele1 = "HLA-A*01:01" + allele2 = "HLA-A*01:02" + self.assertEqual(smart_sort_comparator(allele1, allele2), -1) + + def test_second_field_comparator_le_smart(self): + allele1 = "HLA-A*01:29" + allele2 = "HLA-A*01:100" + self.assertEqual(smart_sort_comparator(allele1, allele2), -1) + + def test_second_field_comparator_ge(self): + allele1 = "HLA-A*01:02" + allele2 = "HLA-A*01:01" + self.assertEqual(smart_sort_comparator(allele1, allele2), 1) + + def test_third_field_comparator_le(self): + allele1 = "HLA-A*01:01:01" + allele2 = "HLA-A*01:01:20" + self.assertEqual(smart_sort_comparator(allele1, allele2), -1) + + def test_third_field_comparator_le_smart(self): + allele1 = "HLA-A*01:01:29" + allele2 = "HLA-A*01:01:100" + self.assertEqual(smart_sort_comparator(allele1, allele2), -1) + + def test_third_field_comparator_ge(self): + allele1 = "HLA-A*01:01:02" + allele2 = "HLA-A*01:01:01" + self.assertEqual(smart_sort_comparator(allele1, allele2), 1) + + def test_fourth_field_comparator_le(self): + allele1 = "HLA-A*01:01:01:01" + allele2 = "HLA-A*01:01:01:20" + self.assertEqual(smart_sort_comparator(allele1, allele2), -1) + + def test_fourth_field_comparator_le_smart(self): + allele1 = "HLA-A*01:01:01:39" + allele2 = "HLA-A*01:01:01:200" + self.assertEqual(smart_sort_comparator(allele1, allele2), -1) + + def test_fourth_field_comparator_ge(self): + allele1 = "HLA-A*01:01:01:30" + allele2 = "HLA-A*01:01:01:09" + self.assertEqual(smart_sort_comparator(allele1, allele2), 1) + + +if __name__ == '__main__': + unittest.main() From 279f679f4d95b554dec2fe1d3e2efc259d81ed0b Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Tue, 8 Sep 2020 15:01:44 -0500 Subject: [PATCH 5/5] Update version to 0.0.21 ---
pyard/__init__.py | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyard/__init__.py b/pyard/__init__.py index 8b8afcb..cb7e264 100644 --- a/pyard/__init__.py +++ b/pyard/__init__.py @@ -25,4 +25,4 @@ from .pyard import ARD __author__ = """NMDP Bioinformatics""" -__version__ = '0.0.13' +__version__ = '0.0.21' diff --git a/setup.cfg b/setup.cfg index a919b18..be5a281 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.0.13 +current_version = 0.0.21 commit = True tag = True