diff --git a/.gitignore b/.gitignore index 7bbc71c..227a0bd 100644 --- a/.gitignore +++ b/.gitignore @@ -99,3 +99,8 @@ ENV/ # mypy .mypy_cache/ + +# downloaded +*.txt +*.pickle +*.zip diff --git a/MANIFEST.in b/MANIFEST.in index 63895e7..902c778 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,13 +1,13 @@ include AUTHORS.rst - include CONTRIBUTING.rst include HISTORY.rst include LICENSE include README.rst +include pyard/*.csv recursive-include tests * recursive-exclude * __pycache__ recursive-exclude * *.py[co] -recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif +recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif *.csv diff --git a/pyard/dna_relshp.csv b/pyard/dna_relshp.csv new file mode 100644 index 0000000..a1e026b --- /dev/null +++ b/pyard/dna_relshp.csv @@ -0,0 +1,34 @@ +loc,broad_fam,fam +A,09,23 +A,09,24 +A,10,25 +A,10,26 +A,10,34 +A,10,66 +A,19,29 +A,19,30 +A,19,31 +A,19,32 +A,19,33 +A,19,74 +A,28,68 +A,28,69 +B,05,51 +B,05,52 +B,12,44 +B,12,45 +B,16,38 +B,16,39 +B,17,57 +B,17,58 +B,21,49 +B,21,50 +B,22,54 +B,22,55 +B,22,56 +DQB1,01,05 +DQB1,01,06 +DRB1,02,15 +DRB1,02,16 +DRB1,06,13 +DRB1,06,14 diff --git a/pyard/pyard.py b/pyard/pyard.py index 3690525..a545421 100644 --- a/pyard/pyard.py +++ b/pyard/pyard.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- # -# pyars pyARS. -# Copyright (c) 2018 Be The Match operated by National Marrow Donor Program. All Rights Reserved. +# pyard +# Copyright (c) 2020 Be The Match operated by National Marrow Donor Program. All Rights Reserved. 
# # This library is free software; you can redistribute it and/or modify it # under the terms of the GNU Lesser General Public License as published @@ -26,6 +26,8 @@ import pickle import urllib.request import pandas as pd +import functools +from .smart_sort import smart_sort_comparator from .util import pandas_explode from .util import all_macs from operator import is_not @@ -37,9 +39,9 @@ ismac = lambda x: True if re.search(":\D+", x) else False -logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - datefmt='%m/%d/%Y %I:%M:%S %p', - level=logging.INFO) +# a module shouldn't decide the logging config; that's up to the calling program + +#logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) import string @@ -108,6 +110,8 @@ def __init__(self, dbversion: str='Latest', self._download_mac = download_mac self._remove_invalid = remove_invalid + self.HLA_regex = re.compile("^HLA-") + # TODO: add check for valid ARD type # TODO: add check for valid db version @@ -120,6 +124,7 @@ def __init__(self, dbversion: str='Latest', allele_file = data_dir + '/AlleleList.' 
+ str(dbversion) + ".txt" mac_file = data_dir + "/mac.txt" mac_pickle = data_dir + "/mac.pickle" + broad_file = data_dir + "/dna_relshp.csv" allele_url = "https://raw.githubusercontent.com/ANHIG/IMGTHLA/" \ + dbversion + "/Allelelist.txt" @@ -184,10 +189,27 @@ def __init__(self, dbversion: str='Latest', dfxx = pd.DataFrame(pd.Series(allele_df['2d'].unique().tolist()), columns=['Allele']) dfxx['1d'] = dfxx['Allele'].apply(lambda x: x.split(":")[0]) + + # xxcodes maps a first field name to its expansion self.xxcodes = dfxx.groupby(['1d'])\ .apply(lambda x: list(x['Allele']))\ .to_dict() + # defined broad XX codes + dfbroad = pd.read_csv(broad_file, skiprows=1, dtype=str, + names=["Locus", "Broad", "Fam"], sep=",").dropna() + + dictbroad = dfbroad.groupby(['Locus','Broad']).apply(lambda x: list(x['Fam'])).to_dict() + + for (locus,broad) in dictbroad.keys(): + locusbroad="*".join([locus,broad]) + for split in dictbroad[(locus,broad)]: + locussplit="*".join([locus,split]) + if locusbroad in self.xxcodes.keys(): + self.xxcodes[locusbroad].extend(self.xxcodes[locussplit]) + else: + self.xxcodes[locusbroad] = self.xxcodes[locussplit] + allele_df['3d'] = allele_df['Allele'].apply(lambda a: ":".join(a.split(":")[0:3]) + list(a)[-1] if list(a)[-1] @@ -195,9 +217,14 @@ def __init__(self, dbversion: str='Latest', len(a.split(":")) > 3 else ":".join(a.split(":")[0:3])) + # all alleles are valid and also shortening to 3 and 2 fields self.valid = list(set(allele_df['Allele'].tolist() + allele_df['2d'].tolist() + allele_df['3d'].tolist())) + # use a dict + self.valid_dict={} + for i in self.valid: + self.valid_dict[i]=True # Loading ARS file into pandas # TODO: Make skip dynamic in case the files are not consistent @@ -344,6 +371,7 @@ def lgx(self): """ return self._lgx + @functools.lru_cache(maxsize=None) def redux(self, allele: str, ars_type: str) -> str: """ Does ARS reduction with allele and ARS type @@ -356,18 +384,21 @@ def redux(self, allele: str, ars_type: str) -> str: 
:rtype: str """ - if re.search("HLA-", allele): + # PERFORMANCE: precompiled regex + # dealing with leading HLA- + + if self.HLA_regex.search(allele): hla, allele_name = allele.split("-") return "-".join(["HLA", self.redux(allele_name, ars_type)]) - if ars_type == "G" and allele in self.G: + if ars_type == "G" and allele in self._G: if allele in self.dup_g: return self.dup_g[allele] else: return self.G[allele] - elif ars_type == "lg" and allele in self.lg: + elif ars_type == "lg" and allele in self._lg: return self.lg[allele] - elif ars_type == "lgx" and allele in self.lgx: + elif ars_type == "lgx" and allele in self._lgx: return self.lgx[allele] else: if self.remove_invalid: @@ -378,6 +409,7 @@ def redux(self, allele: str, ars_type: str) -> str: else: return allele + @functools.lru_cache(maxsize=None) def redux_gl(self, glstring: str, redux_type: str) -> str: """ Does ARS reduction with allele and ARS type @@ -394,25 +426,27 @@ def redux_gl(self, glstring: str, redux_type: str) -> str: return "" if re.search("\^", glstring): - return "^".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("^")]), key=functools.cmp_to_key(loci_sort))) + return "^".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("^")]), key=functools.cmp_to_key(smart_sort_comparator))) if re.search("\|", glstring): - return "|".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("|")]), key=functools.cmp_to_key(loci_sort))) + return "|".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("|")]), key=functools.cmp_to_key(smart_sort_comparator))) if re.search("\+", glstring): - return "+".join(sorted([self.redux_gl(a, redux_type) for a in glstring.split("+")], key=functools.cmp_to_key(loci_sort))) + return "+".join(sorted([self.redux_gl(a, redux_type) for a in glstring.split("+")], key=functools.cmp_to_key(smart_sort_comparator))) if re.search("\~", glstring): return "~".join([self.redux_gl(a, redux_type) for a in 
glstring.split("~")]) if re.search("/", glstring): - return "/".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("/")]), key=functools.cmp_to_key(loci_sort))) + return "/".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("/")]), key=functools.cmp_to_key(smart_sort_comparator))) loc_allele = glstring.split(":") loc_name, code = loc_allele[0], loc_allele[1] + + # handle XX codes if(ismac(glstring) and glstring.split(":")[1] == "XX"): loc, n = loc_name.split("*") - return self.redux_gl("/".join(sorted(self.xxcodes[loc_name], key=functools.cmp_to_key(loci_sort))), redux_type) + return self.redux_gl("/".join(sorted(self.xxcodes[loc_name], key=functools.cmp_to_key(smart_sort_comparator))), redux_type) if ismac(glstring) and code in self.mac: if re.search("HLA-", glstring): @@ -423,37 +457,39 @@ def redux_gl(self, glstring: str, redux_type: str) -> str: [loc_name + ":" + a if len(a) <= 3 else loc + "*" + a for a in self.mac[code]['Alleles']])) - return self.redux_gl("/".join(sorted(["HLA-" + a for a in alleles], key=functools.cmp_to_key(loci_sort))), redux_type) + return self.redux_gl("/".join(sorted(["HLA-" + a for a in alleles], key=functools.cmp_to_key(smart_sort_comparator))), redux_type) else: loc, n = loc_name.split("*") alleles = list(filter(lambda a: a in self.valid, [loc_name + ":" + a if len(a) <= 3 else loc + "*" + a for a in self.mac[code]['Alleles']])) - return self.redux_gl("/".join(sorted(alleles, key=functools.cmp_to_key(loci_sort))), redux_type) + return self.redux_gl("/".join(sorted(alleles, key=functools.cmp_to_key(smart_sort_comparator))), redux_type) return self.redux(glstring, redux_type) - def isvalid(self, allele: str) -> str: + def isvalid(self, allele: str) -> bool: """ Determines validity of an allele :param allele: An HLA allele. 
import functools
import re

# Expression-suffix characters that earlier versions of the comparator
# stripped before parsing.  The comparator no longer needs it, but it is kept
# as a module attribute so existing imports of ``expr_regex`` keep working.
expr_regex = re.compile('[NQLS]')

# Leading run of digits inside one ':'-separated field, e.g. "01" in "01N".
_LEADING_DIGITS = re.compile(r'\d+')


def _numeric_fields(allele):
    """Return the numeric value of every ':'-separated field of *allele*.

    Everything up to and including the first ``*`` (the locus, and an
    optional ``HLA-`` prefix) is ignored; if there is no ``*`` the whole
    string is parsed.  For each field only the leading digits count, so
    trailing suffix letters (``N``, ``Q``, ``L``, ``S``, ``G``, ``g``, ...)
    are ignored.  A field with no leading digits (an empty string, a MAC
    code such as ``AB``, ``XX``) maps to ``-1`` so that it sorts before any
    real numeric field instead of raising.

    :param allele: allele name, e.g. ``"A*01:01:01G"``
    :return: tuple of ints, one per field, e.g. ``(1, 1, 1)``
    """
    name = allele[allele.find('*') + 1:]
    fields = []
    for field in name.split(':'):
        digits = _LEADING_DIGITS.match(field)
        fields.append(int(digits.group()) if digits else -1)
    return tuple(fields)


# Bounded cache: the key space is every *pair* of alleles ever compared, so an
# unbounded cache would grow quadratically in a long-running process.
@functools.lru_cache(maxsize=65536)
def smart_sort_comparator(a1, a2):
    """
    Natural-sort comparator for two alleles, usable with
    ``functools.cmp_to_key``.

    Python sorts strings lexicographically, but HLA alleles need to be
    sorted by the numerical value of each field of the HLA nomenclature
    (``A*02:01`` before ``A*11:01``).

    Fields are compared left to right as integers.  Any number of fields is
    supported; when all shared fields tie, the allele with fewer fields
    sorts first.  Suffix letters are ignored, so ``A*01:01`` and
    ``A*01:01N`` compare equal.  The locus name is not part of the
    comparison.  Names that cannot be parsed (e.g. the empty string) sort
    first rather than raising ``ValueError``.

    :param a1: first allele
    :param a2: second allele
    :return: -1 if a1 sorts before a2, 1 if after, 0 if they tie
    """
    # Identical strings tie without any parsing.
    if a1 == a2:
        return 0

    fields_1 = _numeric_fields(a1)
    fields_2 = _numeric_fields(a2)

    if fields_1 < fields_2:
        return -1
    if fields_1 > fields_2:
        return 1

    # All fields are equal
    return 0


def smart_sort_alleles(a1, a2):
    """
    Return the two given alleles as a list in natural sort order.

    Uses the same ordering as :func:`smart_sort_comparator`, so suffix
    letters and alleles with more than two fields are handled consistently.
    When the alleles tie, the input order is preserved.

    :param a1: first allele
    :param a2: second allele
    :return: ``[a1, a2]`` or ``[a2, a1]``
    """
    if smart_sort_comparator(a1, a2) > 0:
        return [a2, a1]
    return [a1, a2]