From 87a0a7e318d3373af85348df59e554a80c3aba34 Mon Sep 17 00:00:00 2001 From: Martin Maiers Date: Fri, 6 Mar 2020 22:44:23 -0600 Subject: [PATCH 1/8] handle broad XX codes --- pyard/pyard.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/pyard/pyard.py b/pyard/pyard.py index 3690525..6e2e368 100644 --- a/pyard/pyard.py +++ b/pyard/pyard.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- # -# pyars pyARS. -# Copyright (c) 2018 Be The Match operated by National Marrow Donor Program. All Rights Reserved. +# pyard +# Copyright (c) 2020 Be The Match operated by National Marrow Donor Program. All Rights Reserved. # # This library is free software; you can redistribute it and/or modify it # under the terms of the GNU Lesser General Public License as published @@ -120,6 +120,7 @@ def __init__(self, dbversion: str='Latest', allele_file = data_dir + '/AlleleList.' + str(dbversion) + ".txt" mac_file = data_dir + "/mac.txt" mac_pickle = data_dir + "/mac.pickle" + broad_file = data_dir + "/dna_relshp.csv" allele_url = "https://raw.githubusercontent.com/ANHIG/IMGTHLA/" \ + dbversion + "/Allelelist.txt" @@ -184,10 +185,27 @@ def __init__(self, dbversion: str='Latest', dfxx = pd.DataFrame(pd.Series(allele_df['2d'].unique().tolist()), columns=['Allele']) dfxx['1d'] = dfxx['Allele'].apply(lambda x: x.split(":")[0]) + + # xxcodes maps a first field name to its expansion self.xxcodes = dfxx.groupby(['1d'])\ .apply(lambda x: list(x['Allele']))\ .to_dict() + # defined broad XX codes + dfbroad = pd.read_csv(broad_file, skiprows=1, dtype=str, + names=["Locus", "Broad", "Fam"], sep=",").dropna() + + dictbroad = dfbroad.groupby(['Locus','Broad']).apply(lambda x: list(x['Fam'])).to_dict() + + for (locus,broad) in dictbroad.keys(): + locusbroad="*".join([locus,broad]) + for split in dictbroad[(locus,broad)]: + locussplit="*".join([locus,split]) + if locusbroad in self.xxcodes.keys(): + self.xxcodes[locusbroad].extend(self.xxcodes[locussplit]) + else: + 
self.xxcodes[locusbroad] = self.xxcodes[locussplit] + allele_df['3d'] = allele_df['Allele'].apply(lambda a: ":".join(a.split(":")[0:3]) + list(a)[-1] if list(a)[-1] @@ -195,6 +213,7 @@ def __init__(self, dbversion: str='Latest', len(a.split(":")) > 3 else ":".join(a.split(":")[0:3])) + # all alleles are valid and also shortening to 3 and 2 fields self.valid = list(set(allele_df['Allele'].tolist() + allele_df['2d'].tolist() + allele_df['3d'].tolist())) @@ -410,6 +429,8 @@ def redux_gl(self, glstring: str, redux_type: str) -> str: loc_allele = glstring.split(":") loc_name, code = loc_allele[0], loc_allele[1] + + # handle XX codes if(ismac(glstring) and glstring.split(":")[1] == "XX"): loc, n = loc_name.split("*") return self.redux_gl("/".join(sorted(self.xxcodes[loc_name], key=functools.cmp_to_key(loci_sort))), redux_type) From ac956099072070c0d6545f88da6af0aa9603a376 Mon Sep 17 00:00:00 2001 From: Martin Maiers Date: Fri, 6 Mar 2020 22:57:48 -0600 Subject: [PATCH 2/8] relshp file --- pyard/dna_relshp.csv | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 pyard/dna_relshp.csv diff --git a/pyard/dna_relshp.csv b/pyard/dna_relshp.csv new file mode 100644 index 0000000..a1e026b --- /dev/null +++ b/pyard/dna_relshp.csv @@ -0,0 +1,34 @@ +loc,broad_fam,fam +A,09,23 +A,09,24 +A,10,25 +A,10,26 +A,10,34 +A,10,66 +A,19,29 +A,19,30 +A,19,31 +A,19,32 +A,19,33 +A,19,74 +A,28,68 +A,28,69 +B,05,51 +B,05,52 +B,12,44 +B,12,45 +B,16,38 +B,16,39 +B,17,57 +B,17,58 +B,21,49 +B,21,50 +B,22,54 +B,22,55 +B,22,56 +DQB1,01,05 +DQB1,01,06 +DRB1,02,15 +DRB1,02,16 +DRB1,06,13 +DRB1,06,14 From 802b90995ddc71b95a389878a5ee13398d747725 Mon Sep 17 00:00:00 2001 From: Martin Maiers Date: Fri, 6 Mar 2020 23:10:29 -0600 Subject: [PATCH 3/8] packaging --- MANIFEST.in | 4 ++-- setup.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 63895e7..902c778 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,13 +1,13 @@ 
include AUTHORS.rst - include CONTRIBUTING.rst include HISTORY.rst include LICENSE include README.rst +include pyard/*.csv recursive-include tests * recursive-exclude * __pycache__ recursive-exclude * *.py[co] -recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif +recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif *.csv diff --git a/setup.py b/setup.py index d1df2c4..5cbfdc6 100644 --- a/setup.py +++ b/setup.py @@ -65,5 +65,6 @@ 'Programming Language :: Python :: 3.7', ], test_suite='tests', - tests_require=test_requirements + tests_require=test_requirements, + include_package_data=True ) From 8ed549f7d949569cdea4e03ffe621f29da4a4ad8 Mon Sep 17 00:00:00 2001 From: Martin Maiers Date: Sat, 7 Mar 2020 23:00:55 -0600 Subject: [PATCH 4/8] performance --- pyard/pyard.py | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/pyard/pyard.py b/pyard/pyard.py index 6e2e368..652d34c 100644 --- a/pyard/pyard.py +++ b/pyard/pyard.py @@ -26,6 +26,7 @@ import pickle import urllib.request import pandas as pd +import functools from .util import pandas_explode from .util import all_macs from operator import is_not @@ -37,9 +38,9 @@ ismac = lambda x: True if re.search(":\D+", x) else False -logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - datefmt='%m/%d/%Y %I:%M:%S %p', - level=logging.INFO) +# a module shouldn't decide the logging config; that's up to the calling program + +#logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) import string @@ -108,6 +109,8 @@ def __init__(self, dbversion: str='Latest', self._download_mac = download_mac self._remove_invalid = remove_invalid + self.rHLA = re.compile("HLA-") + # TODO: add check for valid ARD type # TODO: add check for valid db version @@ -217,6 +220,10 @@ def __init__(self, dbversion: str='Latest', self.valid = 
list(set(allele_df['Allele'].tolist() + allele_df['2d'].tolist() + allele_df['3d'].tolist())) + # use a dict + self.validd={} + for i in self.valid: + self.validd[i]=True # Loading ARS file into pandas # TODO: Make skip dynamic in case the files are not consistent @@ -292,6 +299,17 @@ def __init__(self, dbversion: str='Latest', df[['A', 'lgx']]], ignore_index=True).set_index('A').to_dict()['lgx'] + # use a dict + self._Gd = {} + for i in self._G: + self._Gd[i]=True + self._lgd = {} + for i in self._lg: + self._lgd[i]=True + self._lgxd = {} + for i in self._lgx: + self._lgxd[i]=True + @property def dbversion(self) -> str: """ @@ -363,6 +381,7 @@ def lgx(self): """ return self._lgx + @functools.lru_cache(maxsize=1000) def redux(self, allele: str, ars_type: str) -> str: """ Does ARS reduction with allele and ARS type @@ -375,18 +394,22 @@ def redux(self, allele: str, ars_type: str) -> str: :rtype: str """ - if re.search("HLA-", allele): + # PERFORMANCE: precompiled regex + # dealing with leading HLA- + + if self.rHLA.search(allele): hla, allele_name = allele.split("-") return "-".join(["HLA", self.redux(allele_name, ars_type)]) - if ars_type == "G" and allele in self.G: + # PERFORMANCE: use hash instead of allele in "list" + if ars_type == "G" and self._Gd.get(allele): if allele in self.dup_g: return self.dup_g[allele] else: return self.G[allele] - elif ars_type == "lg" and allele in self.lg: + elif ars_type == "lg" and self._lgd.get(allele): return self.lg[allele] - elif ars_type == "lgx" and allele in self.lgx: + elif ars_type == "lgx" and self._lgx.get(allele): return self.lgx[allele] else: if self.remove_invalid: @@ -464,7 +487,9 @@ def isvalid(self, allele: str) -> str: :rtype: boolean """ if not ismac(allele): - return allele in self.valid + # PERFORMANCE: use hash instead of allele in "list" + # return allele in self.valid + return self.validd.get(allele) return True def isvalid_gl(self, glstring: str) -> str: From 6d72f57ffd5fce37efadbb4dde2ea1844a057bff Mon 
Sep 17 00:00:00 2001 From: Martin Maiers Date: Mon, 9 Mar 2020 17:04:45 -0500 Subject: [PATCH 5/8] performance code clinic --- pyard/smart_sort.py | 80 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 pyard/smart_sort.py diff --git a/pyard/smart_sort.py b/pyard/smart_sort.py new file mode 100644 index 0000000..09dbe92 --- /dev/null +++ b/pyard/smart_sort.py @@ -0,0 +1,80 @@ +import functools +import re + +expr_regex = re.compile('[NQLS]') + +@functools.lru_cache(maxsize=None) +def smart_sort_comparator(a1, a2): + """ + Natural sort 2 given alleles. + + Python sorts strings lexicographically but HLA alleles need + to be sorted by numerical values in each field of the HLA nomenclature. + + :param a1: first allele + :param a2: second allele + """ + + # Check to see if they are the same alleles + if a1 == a2: + return 0 + + + # remove any non-numerics + a1 = re.sub(expr_regex, '', a1) + a2 = re.sub(expr_regex, '', a2) + # Extract and Compare first fields first + a1_f1 = int(a1[a1.find('*')+1:a1.find(':')]) + a2_f1 = int(a2[a2.find('*')+1:a2.find(':')]) + + if a1_f1 < a2_f1: + return -1 + if a1_f1 > a2_f1: + return 1 + + # If the first fields are equal, try the 2nd fields + a1_f2 = int(a1[a1.find(':')+1:]) + a2_f2 = int(a2[a2.find(':')+1:]) + + if a1_f2 < a2_f2: + return -1 + if a1_f2 > a2_f2: + return 1 + + # All fields are equal + return 0 + +def smart_sort_alleles(a1, a2): + """ + Natural sort 2 given alleles. + + Python sorts strings lexicographically but HLA alleles need + to be sorted by numerical values in each field of the HLA nomenclature. 
+ + :param a1: first allele + :param a2: second allele + """ + # Check to see if they are the same alleles + if a1 == a2: + return [a1, a2] + + # Extract and Compare first fields first + a1_f1 = int(a1[a1.find('*')+1:a1.find(':')]) + a2_f1 = int(a2[a2.find('*')+1:a2.find(':')]) + + if a1_f1 < a2_f1: + return [a1, a2] + if a1_f1 > a2_f1: + return [a2, a1] + + # If the first fields are equal, try the 2nd fields + a1_f2 = int(a1[a1.find(':')+1:]) + a2_f2 = int(a2[a2.find(':')+1:]) + + if a1_f2 < a2_f2: + return [a1, a2] + if a1_f2 > a2_f2: + return [a2, a1] + + # All fields are equal + return [a1, a2] From 789d4bec867ee25a751b660577325122ad5eb71f Mon Sep 17 00:00:00 2001 From: Martin Maiers Date: Mon, 9 Mar 2020 23:45:42 -0500 Subject: [PATCH 6/8] gitignore and performance enhancements to pyard.py --- .gitignore | 5 +++++ pyard/pyard.py | 54 ++++++++++++++++++++------------------------------ 2 files changed, 27 insertions(+), 32 deletions(-) diff --git a/.gitignore b/.gitignore index 7bbc71c..227a0bd 100644 --- a/.gitignore +++ b/.gitignore @@ -99,3 +99,8 @@ ENV/ # mypy .mypy_cache/ + +# downloaded +*.txt +*.pickle +*.zip diff --git a/pyard/pyard.py b/pyard/pyard.py index 652d34c..a545421 100644 --- a/pyard/pyard.py +++ b/pyard/pyard.py @@ -27,6 +27,7 @@ import urllib.request import pandas as pd import functools +from .smart_sort import smart_sort_comparator from .util import pandas_explode from .util import all_macs from operator import is_not @@ -109,7 +110,7 @@ def __init__(self, dbversion: str='Latest', self._download_mac = download_mac self._remove_invalid = remove_invalid - self.rHLA = re.compile("HLA-") + self.HLA_regex = re.compile("^HLA-") # TODO: add check for valid ARD type # TODO: add check for valid db version @@ -221,9 +222,9 @@ def __init__(self, dbversion: str='Latest', + allele_df['2d'].tolist() + allele_df['3d'].tolist())) # use a dict - self.validd={} + self.valid_dict={} for i in self.valid: - self.validd[i]=True + self.valid_dict[i]=True # Loading 
ARS file into pandas # TODO: Make skip dynamic in case the files are not consistent @@ -299,17 +300,6 @@ def __init__(self, dbversion: str='Latest', df[['A', 'lgx']]], ignore_index=True).set_index('A').to_dict()['lgx'] - # use a dict - self._Gd = {} - for i in self._G: - self._Gd[i]=True - self._lgd = {} - for i in self._lg: - self._lgd[i]=True - self._lgxd = {} - for i in self._lgx: - self._lgxd[i]=True - @property def dbversion(self) -> str: """ @@ -381,7 +371,7 @@ def lgx(self): """ return self._lgx - @functools.lru_cache(maxsize=1000) + @functools.lru_cache(maxsize=None) def redux(self, allele: str, ars_type: str) -> str: """ Does ARS reduction with allele and ARS type @@ -397,19 +387,18 @@ def redux(self, allele: str, ars_type: str) -> str: # PERFORMANCE: precompiled regex # dealing with leading HLA- - if self.rHLA.search(allele): + if self.HLA_regex.search(allele): hla, allele_name = allele.split("-") return "-".join(["HLA", self.redux(allele_name, ars_type)]) - # PERFORMANCE: use hash instead of allele in "list" - if ars_type == "G" and self._Gd.get(allele): + if ars_type == "G" and allele in self._G: if allele in self.dup_g: return self.dup_g[allele] else: return self.G[allele] - elif ars_type == "lg" and self._lgd.get(allele): + elif ars_type == "lg" and allele in self._lg: return self.lg[allele] - elif ars_type == "lgx" and self._lgx.get(allele): + elif ars_type == "lgx" and allele in self._lgx: return self.lgx[allele] else: if self.remove_invalid: @@ -420,6 +409,7 @@ def redux(self, allele: str, ars_type: str) -> str: else: return allele + @functools.lru_cache(maxsize=None) def redux_gl(self, glstring: str, redux_type: str) -> str: """ Does ARS reduction with allele and ARS type @@ -436,19 +426,19 @@ def redux_gl(self, glstring: str, redux_type: str) -> str: return "" if re.search("\^", glstring): - return "^".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("^")]), key=functools.cmp_to_key(loci_sort))) + return 
"^".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("^")]), key=functools.cmp_to_key(smart_sort_comparator))) if re.search("\|", glstring): - return "|".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("|")]), key=functools.cmp_to_key(loci_sort))) + return "|".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("|")]), key=functools.cmp_to_key(smart_sort_comparator))) if re.search("\+", glstring): - return "+".join(sorted([self.redux_gl(a, redux_type) for a in glstring.split("+")], key=functools.cmp_to_key(loci_sort))) + return "+".join(sorted([self.redux_gl(a, redux_type) for a in glstring.split("+")], key=functools.cmp_to_key(smart_sort_comparator))) if re.search("\~", glstring): return "~".join([self.redux_gl(a, redux_type) for a in glstring.split("~")]) if re.search("/", glstring): - return "/".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("/")]), key=functools.cmp_to_key(loci_sort))) + return "/".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("/")]), key=functools.cmp_to_key(smart_sort_comparator))) loc_allele = glstring.split(":") loc_name, code = loc_allele[0], loc_allele[1] @@ -456,7 +446,7 @@ def redux_gl(self, glstring: str, redux_type: str) -> str: # handle XX codes if(ismac(glstring) and glstring.split(":")[1] == "XX"): loc, n = loc_name.split("*") - return self.redux_gl("/".join(sorted(self.xxcodes[loc_name], key=functools.cmp_to_key(loci_sort))), redux_type) + return self.redux_gl("/".join(sorted(self.xxcodes[loc_name], key=functools.cmp_to_key(smart_sort_comparator))), redux_type) if ismac(glstring) and code in self.mac: if re.search("HLA-", glstring): @@ -467,39 +457,39 @@ def redux_gl(self, glstring: str, redux_type: str) -> str: [loc_name + ":" + a if len(a) <= 3 else loc + "*" + a for a in self.mac[code]['Alleles']])) - return self.redux_gl("/".join(sorted(["HLA-" + a for a in alleles], key=functools.cmp_to_key(loci_sort))), redux_type) + 
return self.redux_gl("/".join(sorted(["HLA-" + a for a in alleles], key=functools.cmp_to_key(smart_sort_comparator))), redux_type) else: loc, n = loc_name.split("*") alleles = list(filter(lambda a: a in self.valid, [loc_name + ":" + a if len(a) <= 3 else loc + "*" + a for a in self.mac[code]['Alleles']])) - return self.redux_gl("/".join(sorted(alleles, key=functools.cmp_to_key(loci_sort))), redux_type) + return self.redux_gl("/".join(sorted(alleles, key=functools.cmp_to_key(smart_sort_comparator))), redux_type) return self.redux(glstring, redux_type) - def isvalid(self, allele: str) -> str: + def isvalid(self, allele: str) -> bool: """ Determines validity of an allele :param allele: An HLA allele. :type: str :return: allele or empty - :rtype: boolean + :rtype: bool """ if not ismac(allele): # PERFORMANCE: use hash instead of allele in "list" # return allele in self.valid - return self.validd.get(allele) + return self.valid_dict.get(allele, False) return True - def isvalid_gl(self, glstring: str) -> str: + def isvalid_gl(self, glstring: str) -> bool: """ Determine validity of glstring :param glstring :type: str :return: result - :rtype: boolean + :rtype: bool """ if re.search("\^", glstring): From 5e914b5d02d99688fe4ed5e69791fee03673bb34 Mon Sep 17 00:00:00 2001 From: Martin Maiers Date: Fri, 20 Mar 2020 00:00:37 -0500 Subject: [PATCH 7/8] validate XX codes --- pyard/pyard.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyard/pyard.py b/pyard/pyard.py index a545421..c81dc3f 100644 --- a/pyard/pyard.py +++ b/pyard/pyard.py @@ -444,7 +444,8 @@ def redux_gl(self, glstring: str, redux_type: str) -> str: loc_name, code = loc_allele[0], loc_allele[1] # handle XX codes - if(ismac(glstring) and glstring.split(":")[1] == "XX"): + # test that they are valid + if(ismac(glstring) and glstring.split(":")[1] == "XX") and glstring in self.xxcodes: loc, n = loc_name.split("*") return self.redux_gl("/".join(sorted(self.xxcodes[loc_name], 
key=functools.cmp_to_key(smart_sort_comparator))), redux_type) From 32c72b62eb84efcbab1ca2fa2cd632ba1d59e9d0 Mon Sep 17 00:00:00 2001 From: Martin Maiers Date: Fri, 20 Mar 2020 00:56:53 -0500 Subject: [PATCH 8/8] minor fix to XX validation --- pyard/pyard.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyard/pyard.py b/pyard/pyard.py index c81dc3f..01bda4d 100644 --- a/pyard/pyard.py +++ b/pyard/pyard.py @@ -205,7 +205,7 @@ def __init__(self, dbversion: str='Latest', locusbroad="*".join([locus,broad]) for split in dictbroad[(locus,broad)]: locussplit="*".join([locus,split]) - if locusbroad in self.xxcodes.keys(): + if locusbroad in self.xxcodes: self.xxcodes[locusbroad].extend(self.xxcodes[locussplit]) else: self.xxcodes[locusbroad] = self.xxcodes[locussplit] @@ -445,7 +445,7 @@ def redux_gl(self, glstring: str, redux_type: str) -> str: # handle XX codes # test that they are valid - if(ismac(glstring) and glstring.split(":")[1] == "XX") and glstring in self.xxcodes: + if(ismac(glstring) and glstring.split(":")[1] == "XX") and loc_name in self.xxcodes: loc, n = loc_name.split("*") return self.redux_gl("/".join(sorted(self.xxcodes[loc_name], key=functools.cmp_to_key(smart_sort_comparator))), redux_type)