Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,8 @@ ENV/

# mypy
.mypy_cache/

# downloaded
*.txt
*.pickle
*.zip
4 changes: 2 additions & 2 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@

include AUTHORS.rst

include CONTRIBUTING.rst
include HISTORY.rst
include LICENSE
include README.rst
include pyard/*.csv

recursive-include tests *
recursive-exclude * __pycache__
recursive-exclude * *.py[co]

recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif *.csv
34 changes: 34 additions & 0 deletions pyard/dna_relshp.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
loc,broad_fam,fam
A,09,23
A,09,24
A,10,25
A,10,26
A,10,34
A,10,66
A,19,29
A,19,30
A,19,31
A,19,32
A,19,33
A,19,74
A,28,68
A,28,69
B,05,51
B,05,52
B,12,44
B,12,45
B,16,38
B,16,39
B,17,57
B,17,58
B,21,49
B,21,50
B,22,54
B,22,55
B,22,56
DQB1,01,05
DQB1,01,06
DRB1,02,15
DRB1,02,16
DRB1,06,13
DRB1,06,14
78 changes: 57 additions & 21 deletions pyard/pyard.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# -*- coding: utf-8 -*-

#
# pyars pyARS.
# Copyright (c) 2018 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
# pyard
# Copyright (c) 2020 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
#
# This library is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published
Expand All @@ -26,6 +26,8 @@
import pickle
import urllib.request
import pandas as pd
import functools
from .smart_sort import smart_sort_comparator
from .util import pandas_explode
from .util import all_macs
from operator import is_not
Expand All @@ -37,9 +39,9 @@
ismac = lambda x: True if re.search(":\D+", x) else False


logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO)
# a module shouldn't decide the logging config; that's up to the calling program

#logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)

import string

Expand Down Expand Up @@ -108,6 +110,8 @@ def __init__(self, dbversion: str='Latest',
self._download_mac = download_mac
self._remove_invalid = remove_invalid

self.HLA_regex = re.compile("^HLA-")

# TODO: add check for valid ARD type
# TODO: add check for valid db version

Expand All @@ -120,6 +124,7 @@ def __init__(self, dbversion: str='Latest',
allele_file = data_dir + '/AlleleList.' + str(dbversion) + ".txt"
mac_file = data_dir + "/mac.txt"
mac_pickle = data_dir + "/mac.pickle"
broad_file = data_dir + "/dna_relshp.csv"

allele_url = "https://raw.githubusercontent.com/ANHIG/IMGTHLA/" \
+ dbversion + "/Allelelist.txt"
Expand Down Expand Up @@ -184,20 +189,42 @@ def __init__(self, dbversion: str='Latest',
dfxx = pd.DataFrame(pd.Series(allele_df['2d'].unique().tolist()),
columns=['Allele'])
dfxx['1d'] = dfxx['Allele'].apply(lambda x: x.split(":")[0])

# xxcodes maps a first field name to its expansion
self.xxcodes = dfxx.groupby(['1d'])\
.apply(lambda x: list(x['Allele']))\
.to_dict()

# defined broad XX codes
dfbroad = pd.read_csv(broad_file, skiprows=1, dtype=str,
names=["Locus", "Broad", "Fam"], sep=",").dropna()

dictbroad = dfbroad.groupby(['Locus','Broad']).apply(lambda x: list(x['Fam'])).to_dict()

for (locus,broad) in dictbroad.keys():
locusbroad="*".join([locus,broad])
for split in dictbroad[(locus,broad)]:
locussplit="*".join([locus,split])
if locusbroad in self.xxcodes.keys():
self.xxcodes[locusbroad].extend(self.xxcodes[locussplit])
else:
self.xxcodes[locusbroad] = self.xxcodes[locussplit]

allele_df['3d'] = allele_df['Allele'].apply(lambda a:
":".join(a.split(":")[0:3]) +
list(a)[-1] if list(a)[-1]
in expre_chars and
len(a.split(":")) > 3
else ":".join(a.split(":")[0:3]))

# all alleles are valid and also shortening to 3 and 2 fields
self.valid = list(set(allele_df['Allele'].tolist()
+ allele_df['2d'].tolist()
+ allele_df['3d'].tolist()))
# use a dict
self.valid_dict={}
for i in self.valid:
self.valid_dict[i]=True

# Loading ARS file into pandas
# TODO: Make skip dynamic in case the files are not consistent
Expand Down Expand Up @@ -344,6 +371,7 @@ def lgx(self):
"""
return self._lgx

@functools.lru_cache(maxsize=None)
def redux(self, allele: str, ars_type: str) -> str:
"""
Does ARS reduction with allele and ARS type
Expand All @@ -356,18 +384,21 @@ def redux(self, allele: str, ars_type: str) -> str:
:rtype: str
"""

if re.search("HLA-", allele):
# PERFORMANCE: precompiled regex
# dealing with leading HLA-

if self.HLA_regex.search(allele):
hla, allele_name = allele.split("-")
return "-".join(["HLA", self.redux(allele_name, ars_type)])

if ars_type == "G" and allele in self.G:
if ars_type == "G" and allele in self._G:
if allele in self.dup_g:
return self.dup_g[allele]
else:
return self.G[allele]
elif ars_type == "lg" and allele in self.lg:
elif ars_type == "lg" and allele in self._lg:
return self.lg[allele]
elif ars_type == "lgx" and allele in self.lgx:
elif ars_type == "lgx" and allele in self._lgx:
return self.lgx[allele]
else:
if self.remove_invalid:
Expand All @@ -378,6 +409,7 @@ def redux(self, allele: str, ars_type: str) -> str:
else:
return allele

@functools.lru_cache(maxsize=None)
def redux_gl(self, glstring: str, redux_type: str) -> str:
"""
Does ARS reduction with allele and ARS type
Expand All @@ -394,25 +426,27 @@ def redux_gl(self, glstring: str, redux_type: str) -> str:
return ""

if re.search("\^", glstring):
return "^".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("^")]), key=functools.cmp_to_key(loci_sort)))
return "^".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("^")]), key=functools.cmp_to_key(smart_sort_comparator)))

if re.search("\|", glstring):
return "|".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("|")]), key=functools.cmp_to_key(loci_sort)))
return "|".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("|")]), key=functools.cmp_to_key(smart_sort_comparator)))

if re.search("\+", glstring):
return "+".join(sorted([self.redux_gl(a, redux_type) for a in glstring.split("+")], key=functools.cmp_to_key(loci_sort)))
return "+".join(sorted([self.redux_gl(a, redux_type) for a in glstring.split("+")], key=functools.cmp_to_key(smart_sort_comparator)))

if re.search("\~", glstring):
return "~".join([self.redux_gl(a, redux_type) for a in glstring.split("~")])

if re.search("/", glstring):
return "/".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("/")]), key=functools.cmp_to_key(loci_sort)))
return "/".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("/")]), key=functools.cmp_to_key(smart_sort_comparator)))

loc_allele = glstring.split(":")
loc_name, code = loc_allele[0], loc_allele[1]

# handle XX codes
if(ismac(glstring) and glstring.split(":")[1] == "XX"):
loc, n = loc_name.split("*")
return self.redux_gl("/".join(sorted(self.xxcodes[loc_name], key=functools.cmp_to_key(loci_sort))), redux_type)
return self.redux_gl("/".join(sorted(self.xxcodes[loc_name], key=functools.cmp_to_key(smart_sort_comparator))), redux_type)

if ismac(glstring) and code in self.mac:
if re.search("HLA-", glstring):
Expand All @@ -423,37 +457,39 @@ def redux_gl(self, glstring: str, redux_type: str) -> str:
[loc_name + ":" + a if len(a) <= 3
else loc + "*" + a
for a in self.mac[code]['Alleles']]))
return self.redux_gl("/".join(sorted(["HLA-" + a for a in alleles], key=functools.cmp_to_key(loci_sort))), redux_type)
return self.redux_gl("/".join(sorted(["HLA-" + a for a in alleles], key=functools.cmp_to_key(smart_sort_comparator))), redux_type)
else:
loc, n = loc_name.split("*")
alleles = list(filter(lambda a: a in self.valid,
[loc_name + ":" + a if len(a) <= 3
else loc + "*" + a
for a in self.mac[code]['Alleles']]))
return self.redux_gl("/".join(sorted(alleles, key=functools.cmp_to_key(loci_sort))), redux_type)
return self.redux_gl("/".join(sorted(alleles, key=functools.cmp_to_key(smart_sort_comparator))), redux_type)
return self.redux(glstring, redux_type)

def isvalid(self, allele: str) -> str:
def isvalid(self, allele: str) -> bool:
"""
Determines validity of an allele

:param allele: An HLA allele.
:type: str
:return: True if the allele is valid, False otherwise
:rtype: boolean
:rtype: bool
"""
if not ismac(allele):
return allele in self.valid
# PERFORMANCE: use hash instead of allele in "list"
# return allele in self.valid
return self.valid_dict.get(allele, False)
return True

def isvalid_gl(self, glstring: str) -> str:
def isvalid_gl(self, glstring: str) -> bool:
"""
Determine validity of glstring

:param glstring
:type: str
:return: result
:rtype: boolean
:rtype: bool
"""

if re.search("\^", glstring):
Expand Down
80 changes: 80 additions & 0 deletions pyard/smart_sort.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import functools
import re

# Expression characters that may trail a field of an allele name
# (e.g. the "N" in "A*01:01N"); they are ignored when comparing fields.
expr_regex = re.compile('[NQLS]')

# Leading run of digits of a field, e.g. "01" in "01", "01g" or "01G".
_leading_digits_regex = re.compile(r'\d+')


def _allele_sort_key(allele):
    """
    Build a tuple that orders an allele name by the numeric value of its fields.

    Everything up to and including the first ``*`` (the locus) is ignored.
    The remainder is split on ``:`` and every field becomes one entry:

    * a field that starts with digits once the expression characters are
      removed is keyed by that integer, so ``"09" < "10" < "100"`` and
      suffixes such as ``N``, ``G`` or ``g`` do not take part;
    * any other field (for example a MAC code such as ``AB`` or ``XX``)
      sorts after all numeric fields and is compared as plain text.

    A name with fewer fields sorts before a longer name sharing its prefix.

    :param allele: allele name, e.g. ``"A*01:01"`` or ``"A*01:01:01G"``
    :return: tuple of per-field keys, comparable with ``<`` and ``>``
    """
    # str.find returns -1 when there is no '*', which makes the slice start
    # at 0, i.e. the whole string is treated as the field list.
    fields = allele[allele.find('*') + 1:].split(':')

    key = []
    for field in fields:
        digits = _leading_digits_regex.match(expr_regex.sub('', field))
        if digits:
            key.append((0, int(digits.group()), ''))
        else:
            # Keep the raw text so distinct non-numeric codes stay distinct.
            key.append((1, 0, field))
    return tuple(key)


@functools.lru_cache(maxsize=None)
def smart_sort_comparator(a1, a2):
    """
    Compare two alleles for a natural (numeric, field-by-field) sort.

    Python sorts strings lexicographically but HLA alleles need
    to be sorted by numerical values in each field of the HLA nomenclature.
    Intended to be used with ``functools.cmp_to_key``.

    Any number of fields is supported, as are names carrying a trailing
    expression character or group suffix (``A*01:01N``, ``A*01:01:01G``,
    ``A*01:01g``) and non-numeric second fields such as MAC codes. The locus
    is not compared, and names that differ only by such a suffix compare
    equal.

    :param a1: first allele
    :param a2: second allele
    :return: -1 if a1 sorts before a2, 1 if it sorts after, 0 otherwise
    """
    # Check to see if they are the same alleles
    if a1 == a2:
        return 0

    key1 = _allele_sort_key(a1)
    key2 = _allele_sort_key(a2)

    if key1 < key2:
        return -1
    if key1 > key2:
        return 1

    # All fields are equal
    return 0


def smart_sort_alleles(a1, a2):
    """
    Return two alleles as a list in natural sort order.

    Python sorts strings lexicographically but HLA alleles need
    to be sorted by numerical values in each field of the HLA nomenclature.
    Uses the same field-by-field ordering as ``smart_sort_comparator``.

    :param a1: first allele
    :param a2: second allele
    :return: ``[a1, a2]`` if a1 sorts before or equal to a2,
             otherwise ``[a2, a1]``
    """
    # Check to see if they are the same alleles
    if a1 == a2:
        return [a1, a2]

    if _allele_sort_key(a1) > _allele_sort_key(a2):
        return [a2, a1]

    # a1 sorts first, or all fields are equal: keep the given order
    return [a1, a2]
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,5 +65,6 @@
'Programming Language :: Python :: 3.7',
],
test_suite='tests',
tests_require=test_requirements
tests_require=test_requirements,
include_package_data=True
)