From 7dafdfd6901e3cb41acde493a636f784a73a33e8 Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Mon, 30 Nov 2020 14:50:40 -0600 Subject: [PATCH 1/5] Support Version 2 Nomenclature - Add initial support for importing V2 into db - Lookup V2 nomenclature --- pyard/data_repository.py | 44 +++++++++++++++++++++++++++++++++ pyard/db.py | 43 ++++++++++++++++++++++++++++++-- pyard/pyard.py | 40 +++++++++++++++++++++++++----- tests/features/version2.feature | 16 ++++++++++++ tests/steps/redux_allele.py | 5 ++++ 5 files changed, 140 insertions(+), 8 deletions(-) create mode 100644 tests/features/version2.feature diff --git a/pyard/data_repository.py b/pyard/data_repository.py index 7c5cef0..c475b00 100644 --- a/pyard/data_repository.py +++ b/pyard/data_repository.py @@ -1,3 +1,25 @@ +# -*- coding: utf-8 -*- +# +# py-ard +# Copyright (c) 2020 Be The Match operated by National Marrow Donor Program. All Rights Reserved. +# +# This library is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; either version 3 of the License, or (at +# your option) any later version. +# +# This library is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this library; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +# +# > http://www.fsf.org/licensing/licenses/lgpl.html +# > http://www.opensource.org/licenses/lgpl-license.php +# import functools import sqlite3 @@ -299,3 +321,25 @@ def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version): # Save the serology mapping to db db.save_dict(db_connection, table_name='serology_mapping', dictionary=sero_mapping, columns=('serology', 'allele_list')) + + +def generate_v2_to_v3_mapping(db_connection: sqlite3.Connection, imgt_version): + if not db.table_exists(db_connection, 'v2_mapping'): + # TODO: Create mapping table using both the allele list history and + # deleted alleles as reference. + # Temporary Example + v2_to_v3_example = { + "A*0104": "A*01:04N", + "A*0105N": "A*01:04N", + "A*0111": "A*01:11N", + "A*01123": "A*01:123N", + "A*0115": "A*01:15N", + "A*0116": "A*01:16N", + "A*01160": "A*01:160N", + "A*01162": "A*01:162N", + "A*01178": "A*01:178N", + "A*01179": "A*01:179N", + "DRB5*02ZB": "DRB5*02:UTV", + } + db.save_dict(db_connection, table_name='v2_mapping', + dictionary=v2_to_v3_example, columns=('v2', 'v3')) diff --git a/pyard/db.py b/pyard/db.py index 13d754a..488699c 100644 --- a/pyard/db.py +++ b/pyard/db.py @@ -1,3 +1,25 @@ +# -*- coding: utf-8 -*- +# +# py-ard +# Copyright (c) 2020 Be The Match operated by National Marrow Donor Program. All Rights Reserved. +# +# This library is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; either version 3 of the License, or (at +# your option) any later version. +# +# This library is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this library; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +# +# > http://www.fsf.org/licensing/licenses/lgpl.html +# > http://www.opensource.org/licenses/lgpl-license.php +# import pathlib import sqlite3 from typing import Tuple, Dict, Set, List @@ -92,7 +114,7 @@ def serology_to_alleles(connection: sqlite3.Connection, serology: str) -> List[s :return: List of alleles """ serology_query = "SELECT allele_list from serology_mapping where serology = ?" - cursor = connection.execute(serology_query, (serology, )) + cursor = connection.execute(serology_query, (serology,)) result = cursor.fetchone() cursor.close() if result: @@ -102,6 +124,23 @@ def serology_to_alleles(connection: sqlite3.Connection, serology: str) -> List[s return alleles +def v2_to_v3_allele(connection: sqlite3.Connection, v2_allele: str) -> str: + """ + Look up V3 version of the allele in the database. + + :param connection: db connection of type sqlite.Connection + :param v2_allele: V2 allele + :return: V3 allele + """ + v2_query = "SELECT v3 from v2_mapping where v2 = ?" + cursor = connection.execute(v2_query, (v2_allele,)) + result = cursor.fetchone() + cursor.close() + if result: + return result[0] + return '' + + def is_valid_mac_code(connection: sqlite3.Connection, code: str) -> bool: """ Check db if the MAC code exists. @@ -215,4 +254,4 @@ def load_dict(connection: sqlite3.Connection, table_name: str, columns: Tuple[st cursor.execute(select_all_query) table_as_dict = {k: v for k, v in cursor.fetchall()} cursor.close() - return table_as_dict \ No newline at end of file + return table_as_dict diff --git a/pyard/pyard.py b/pyard/pyard.py index 5ee76e2..3b62741 100644 --- a/pyard/pyard.py +++ b/pyard/pyard.py @@ -27,8 +27,8 @@ from . import db from .data_repository import generate_ars_mapping, generate_mac_codes, generate_alleles_and_xx_codes, \ - generate_serology_mapping -from .db import is_valid_mac_code, mac_code_to_alleles + generate_serology_mapping, generate_v2_to_v3_mapping +from .db import is_valid_mac_code, mac_code_to_alleles, v2_to_v3_allele from .smart_sort import smart_sort_comparator HLA_regex = re.compile("^HLA-") @@ -66,6 +66,8 @@ def __init__(self, imgt_version: str = 'Latest', self.dup_g, self._G, self._lg, self._lgx = generate_ars_mapping(self.db_connection, imgt_version) # Load Serology mappings generate_serology_mapping(self.db_connection, imgt_version) + # Load V2 to V3 mappings + generate_v2_to_v3_mapping(self.db_connection, imgt_version) # Close the current read-write db connection self.db_connection.close() @@ -172,6 +174,11 @@ def redux_gl(self, glstring: str, redux_type: str) -> str: return "/".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("/")]), key=functools.cmp_to_key(smart_sort_comparator))) + # Handle V2 to V3 mapping + if self.is_v2(glstring): + glstring = self._map_v2_to_v3(glstring) + return self.redux_gl(glstring, redux_type) + # Handle Serology if self.is_serology(glstring): alleles = self._get_alleles_from_serology(glstring) @@ -232,6 +239,17 @@ def is_mac(gl: str) -> bool: """ return re.search(r":\D+", gl) is not None + @staticmethod + def is_v2(allele: str) -> bool: + """ + Version 2 of the nomenclature is a single field. + It does not have any ':' field separator. + Eg: A*0104 + :param allele: Possible allele + :return: Is the allele in V2 nomenclature + """ + return '*' in allele and not ':' in allele + def _is_valid_allele(self, allele): """ Test if allele is valid in the current imgt database @@ -255,7 +273,7 @@ def _get_alleles(self, code, locus_antigen) -> Iterable[str]: # else it's a group expansion is_allelic_expansion = any([':' in allele for allele in alleles]) if is_allelic_expansion: - locus = locus_antigen.split('*')[0] # Just keep the locus name + locus = locus_antigen.split('*')[0] # Just keep the locus name alleles = [f'{locus}*{a}' for a in alleles] else: alleles = [f'{locus_antigen}:{a}' for a in alleles] @@ -272,6 +290,14 @@ def _get_alleles_from_serology(self, serology) -> Iterable[str]: else: return alleles + def _map_v2_to_v3(self, v2_allele): + """ + Get V3 version of V2 versioned allele + :param v2_allele: V2 versioned allele + :return: V3 versioned allele + """ + return v2_to_v3_allele(self.db_connection, v2_allele) + def isvalid(self, allele: str) -> bool: """ Determines validity of an allele @@ -283,7 +309,9 @@ def isvalid(self, allele: str) -> bool: """ if allele == '': return False - if not self.is_mac(allele) and not self.is_serology(allele): + if not self.is_mac(allele) and \ + not self.is_serology(allele) and \ + not self.is_v2(allele): # Alleles ending with P or G are valid_alleles if allele.endswith(('P', 'G')): # remove the last character @@ -330,7 +358,7 @@ def mac_toG(self, allele: str) -> str: """ locus_antigen, code = allele.split(":") if HLA_regex.search(allele): - locus_antigen = locus_antigen.split("-")[1] # Remove HLA- prefix + locus_antigen = locus_antigen.split("-")[1] # Remove HLA- prefix if is_valid_mac_code(self.db_connection, code): alleles = self._get_alleles(code, locus_antigen) group = [self.toG(a) for a in alleles] @@ -370,7 +398,7 @@ def expand_mac(self, mac_code: str): locus_antigen, code = mac_code.split(":") if is_valid_mac_code(self.db_connection, code): if HLA_regex.search(mac_code): - locus_antigen = locus_antigen.split("-")[1] # Remove HLA- prefix + locus_antigen = locus_antigen.split("-")[1] # Remove HLA- prefix return ['HLA-' + a for a in self._get_alleles(code, locus_antigen)] else: return list(self._get_alleles(code, locus_antigen)) diff --git a/tests/features/version2.feature b/tests/features/version2.feature new file mode 100644 index 0000000..195d452 --- /dev/null +++ b/tests/features/version2.feature @@ -0,0 +1,16 @@ +Feature: Version 2 Nomenclature + + py-ard is able to reduce version 2 HLA nomenclature. + + Scenario Outline: + + Given the version 2 typing is + When reducing on the level (ambiguous) + Then the reduced allele is found to be + + + Examples: Valid A serology typings + | Version2 | Level | Redux Allele | + | A*0105N | G | A*01:01:01G | + | A*0111 | G | A*01:11N | + | DRB5*02ZB | G | DRB5*01:02:01G/DRB5*01:03/DRB5*02:02:01G/DRB5*02:03/DRB5*02:04 | diff --git a/tests/steps/redux_allele.py b/tests/steps/redux_allele.py index 7715272..0d43506 100644 --- a/tests/steps/redux_allele.py +++ b/tests/steps/redux_allele.py @@ -27,3 +27,8 @@ def step_impl(context, redux_allele): @given("the serology typing is {serology}") def step_impl(context, serology): context.allele = serology + + +@given("the version 2 typing is {v2_allele}") +def step_impl(context, v2_allele): + context.allele = v2_allele From 589cfc52ba5188b0c269e35bf2cdb0491398f8c2 Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Tue, 1 Dec 2020 09:20:36 -0600 Subject: [PATCH 2/5] Ignore sorting of GL Strings for smart sorts --- pyard/smart_sort.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyard/smart_sort.py b/pyard/smart_sort.py index 0c244c2..9a9a8fb 100644 --- a/pyard/smart_sort.py +++ b/pyard/smart_sort.py @@ -25,6 +25,7 @@ import re expr_regex = re.compile('[NQLSGg]') +glstring_chars = re.compile('[/|+^~]') @functools.lru_cache(maxsize=1000) @@ -43,6 +44,10 @@ def smart_sort_comparator(a1, a2): if a1 == a2: return 0 + # Ignore GL String matches + if re.search(glstring_chars, a1) or re.search(glstring_chars, a2): + return 0 + # remove any non-numerics a1 = re.sub(expr_regex, '', a1) a2 = re.sub(expr_regex, '', a2) @@ -92,4 +97,3 @@ def smart_sort_comparator(a1, a2): # All fields are considered equal after 4th field return 0 - From 48fc3264a26c0330fdf7b6fdbdfc966f26420da8 Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Tue, 1 Dec 2020 09:38:23 -0600 Subject: [PATCH 3/5] Command Line Tool `py-ard-import` to import a new database Command Line Tool `py-ard` to reduce a GL String --- README.rst | 28 ++++++++++++ scripts/pyard | 69 +++++++++++++++++++++++++++++ scripts/pyard-import | 102 +++++++++++++++++++++++++++++++++++++++++++ setup.py | 12 +++-- 4 files changed, 207 insertions(+), 4 deletions(-) create mode 100755 scripts/pyard create mode 100755 scripts/pyard-import diff --git a/README.rst b/README.rst index 2158f5c..0558caf 100644 --- a/README.rst +++ b/README.rst @@ -88,3 +88,31 @@ Example # 'HLA-A*24:19g/HLA-A*24:22g^HLA-A*26:01g/HLA-A*26:10g/HLA-A*26:15g/HLA-A*26:92g/HLA-A*66:01g/HLA-A*66:03g' +Command Line Tools +------------------ + +.. code-block:: bash + + # Import the latest IMGT database + $ pyard-import + Created Latest py-ard database + + # Import particular version of IMGT database + $ pyard-import --import-db-version 3.29.0 + Created py-ard version 3290 database + + # Import particular version of IMGT database and + # replace the v2 to v3 mapping table + $ pyard-import --import-db-version 3.29.0 --v2-to-v3-mapping map2to3.csv + Created py-ard version 3290 database + Updated v2_mapping table with 'map2to3.csv' mapping file. + + # Reduce a gl string from command line + $ pyard --gl 'A*01:AB' -r lgx + A*01:01/A*01:02 + + $ pyard --gl 'DRB1*08:XX' -r G + DRB1*08:01:01G/DRB1*08:02:01G/DRB1*08:03:02G/DRB1*08:04:01G/DRB1*08:05/ ... + + $ pyard -v 3290 --gl 'A1' -r lgx + A*01:01/A*01:02/A*01:03/A*01:06/A*01:07/A*01:08/A*01:09/A*01:10/A*01:12/ ... \ No newline at end of file diff --git a/scripts/pyard b/scripts/pyard new file mode 100755 index 0000000..1589fe0 --- /dev/null +++ b/scripts/pyard @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# py-ard +# Copyright (c) 2020 Be The Match operated by National Marrow Donor Program. All Rights Reserved. +# +# This library is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; either version 3 of the License, or (at +# your option) any later version. +# +# This library is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this library; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +# +# > http://www.fsf.org/licensing/licenses/lgpl.html +# > http://www.opensource.org/licenses/lgpl-license.php +# +import argparse + +import pyard + + +def get_imgt_version(imgt_version): + if imgt_version: + version = imgt_version.replace('.', '') + if version.isdigit(): + return version + raise RuntimeError(f"{imgt_version} is not a valid IMGT database version number") + return None + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + usage="""[-v ] [gl-string redux_type]""", + description="""py-ard tool to redux GL String""" + ) + parser.add_argument( + "-v", + "--imgt-version", + dest="imgt_version" + ) + parser.add_argument( + "--gl", + required=True, + dest="gl_string" + ) + parser.add_argument( + "-r", + choices=['G', 'lg', 'lgx'], + required=True, + dest="redux_type" + ) + + args = parser.parse_args() + + imgt_version = get_imgt_version(args.imgt_version) + if imgt_version: + ard = pyard.ARD(imgt_version) + else: + ard = pyard.ARD() + + print(ard.redux_gl(args.gl_string, args.redux_type)) + del ard diff --git a/scripts/pyard-import b/scripts/pyard-import new file mode 100755 index 0000000..b2b08b7 --- /dev/null +++ b/scripts/pyard-import @@ -0,0 +1,102 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# py-ard +# Copyright (c) 2020 Be The Match operated by National Marrow Donor Program. All Rights Reserved. +# +# This library is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; either version 3 of the License, or (at +# your option) any later version. +# +# This library is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this library; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +# +# > http://www.fsf.org/licensing/licenses/lgpl.html +# > http://www.opensource.org/licenses/lgpl-license.php +# +import argparse +import pathlib + +import pyard +from pyard import db +import pandas as pd + + +def get_imgt_version(imgt_version): + if imgt_version: + version = imgt_version.replace('.', '') + if version.isdigit(): + return version + raise RuntimeError(f"{imgt_version} is not a valid IMGT database version number") + return None + + +def get_data_dir(data_dir): + if data_dir: + path = pathlib.Path(data_dir) + if not path.exists() or not path.is_dir(): + raise RuntimeError(f"{data_dir} is not a valid directory") + return data_dir + + +def get_v2_v3_mapping(v2_v3_mapping): + if v2_v3_mapping: + path = pathlib.Path(v2_v3_mapping) + if not path.exists() or not path.is_file(): + raise RuntimeError(f"{data_dir} is not a valid file") + df = pd.read_csv(path, names=['v2', 'v3']) + return df.set_index('v2')['v3'].to_dict() + return None + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + usage="""[--import-db-version ]\ + [--data-dir ]\ + [--v2-to-v3-mapping ]""", + description="""py-ard tool to generate reference SQLite database.\ + Allows updating db with custom mappings.""" + ) + parser.add_argument( + "--import-db-version", + dest="imgt_version" + ) + parser.add_argument( + "--data-dir", + dest="data_dir" + ) + parser.add_argument( + "--v2-to-v3-mapping", + dest="v2_v3_mapping" + ) + args = parser.parse_args() + + imgt_version = get_imgt_version(args.imgt_version) + # print(imgt_version) + + data_dir = get_data_dir(args.data_dir) + # print(data_dir) + + v2_to_v3_dict = get_v2_v3_mapping(args.v2_v3_mapping) + # print(len(v2_to_v3_dict)) + + if imgt_version: + ard = pyard.ARD(imgt_version=imgt_version, data_dir=data_dir) + print(f"Created py-ard version {imgt_version} database") + else: + ard = pyard.ARD(data_dir=data_dir) + print(f"Created Latest py-ard database") + del ard + + if v2_to_v3_dict: + db_connection = db.create_db_connection(data_dir, imgt_version, ro=False) + db.save_dict(db_connection, table_name='v2_mapping', + dictionary=v2_to_v3_dict, columns=('v2', 'v3')) + print(f"Updated v2_mapping table with '{args.v2_v3_mapping}' mapping file.") diff --git a/setup.py b/setup.py index 7986b5e..1b90a2c 100644 --- a/setup.py +++ b/setup.py @@ -35,15 +35,15 @@ 'pandas>=1.1.4' ] - test_requirements = [ - # TODO: put package test requirements here + 'behave==1.2.6', + 'PyHamcrest==2.0.2' ] setup( name='py-ard', version='0.5.1', - description="ARD reduction for HLA with python", + description="ARD reduction for HLA with Python", long_description=readme + '\n\n' + history, author="CIBMTR", author_email='cibmtr-pypi@nmdp.org', @@ -51,7 +51,11 @@ packages=[ 'pyard', ], - package_dir={'pyard': 'pyard'}, + provides=['pyard'], + scripts=[ + 'scripts/pyard', + 'scripts/pyard-import', + ], install_requires=requirements, license="LGPL 3.0", zip_safe=False, From 63ef9cfa12fdf818f9b30a0a059ef69543804da3 Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Tue, 1 Dec 2020 10:14:48 -0600 Subject: [PATCH 4/5] - Add GL String examples - Compare GL String in sort --- pyard/smart_sort.py | 7 +++++-- tests/features/mac.feature | 10 ++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/pyard/smart_sort.py b/pyard/smart_sort.py index 9a9a8fb..2ae39d3 100644 --- a/pyard/smart_sort.py +++ b/pyard/smart_sort.py @@ -44,9 +44,12 @@ def smart_sort_comparator(a1, a2): if a1 == a2: return 0 - # Ignore GL String matches + # GL String matches if re.search(glstring_chars, a1) or re.search(glstring_chars, a2): - return 0 + if a1 > a2: + return 1 + else: + return -1 # remove any non-numerics a1 = re.sub(expr_regex, '', a1) diff --git a/tests/features/mac.feature b/tests/features/mac.feature index 1ddec7e..7c65954 100644 --- a/tests/features/mac.feature +++ b/tests/features/mac.feature @@ -16,7 +16,9 @@ Feature: MAC (Multiple Allele Code) Examples: MACs with allelic expansions - | Allele | Level | Redux Allele | - | B*08:ASXJP | G | B*08:01:01G | - | B*08:ASXJP | lgx | B*08:01 | - | C*07:HTGM | lgx | C*07:01/C*07:150Q | + | Allele | Level | Redux Allele | + | B*08:ASXJP | G | B*08:01:01G | + | B*08:ASXJP | lgx | B*08:01 | + | C*07:HTGM | lgx | C*07:01/C*07:150Q | + | A*01:AC+A*01:AB | G | A*01:01:01G/A*01:02+A*01:01:01G/A*01:03:01G | + | A*01:01+A*01:AB | G | A*01:01:01G+A*01:01:01G/A*01:02 | \ No newline at end of file From 2cc7f25d2b0bd4868f75e466a6343c74d7b41994 Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Tue, 1 Dec 2020 10:17:27 -0600 Subject: [PATCH 5/5] =?UTF-8?q?Bump=20version:=200.5.1=20=E2=86=92=200.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyard/__init__.py b/pyard/__init__.py index 326fc28..5f85ea0 100644 --- a/pyard/__init__.py +++ b/pyard/__init__.py @@ -24,4 +24,4 @@ from .pyard import ARD __author__ = """NMDP Bioinformatics""" -__version__ = '0.5.1' +__version__ = '0.6.0' diff --git a/setup.cfg b/setup.cfg index 1ba24de..5fc3028 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.1 +current_version = 0.6.0 commit = True tag = True diff --git a/setup.py b/setup.py index 1b90a2c..355a737 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ setup( name='py-ard', - version='0.5.1', + version='0.6.0', description="ARD reduction for HLA with Python", long_description=readme + '\n\n' + history, author="CIBMTR",