Skip to content

Commit

Permalink
Merge pull request #50 from pbashyal-nmdp/fix_4th_field_comparison
Browse files Browse the repository at this point in the history
Smart Sort Fix and cleanup
  • Loading branch information
mmaiers-nmdp authored Sep 9, 2020
2 parents 5c11598 + 279f679 commit f034eb8
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 55 deletions.
2 changes: 1 addition & 1 deletion pyard/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@
from .pyard import ARD

__author__ = """NMDP Bioinformatics"""
__version__ = '0.0.13'
__version__ = '0.0.21'
73 changes: 21 additions & 52 deletions pyard/smart_sort.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@

# Expression/group suffix characters stripped from alleles before comparing.
# NOTE: the characters are removed wherever they occur in the string, not only
# at the end; the comparison below never reads the text before '*'.
expr_regex = re.compile('[NQLSGg]')


@functools.lru_cache(maxsize=None)
def smart_sort_comparator(a1, a2):
    """
    Natural sort 2 given alleles.

    Python sorts strings lexicographically but HLA alleles need
    to be sorted by numerical values in each field of the HLA nomenclature.

    Only the colon-separated numeric fields are compared (up to four); the
    text before ``*`` is ignored. When every field the two alleles share is
    equal and one allele has fewer fields, the shorter allele sorts first.
    Results are memoized per ``(a1, a2)`` pair in an unbounded cache.

    :param a1: first allele
    :param a2: second allele
    :return: -1 if a1 sorts before a2, 1 if a1 sorts after a2, 0 otherwise
    :raises ValueError: if a compared field is not an integer once the
        characters matched by ``expr_regex`` have been removed
    """
    # Check to see if they are the same alleles
    if a1 == a2:
        return 0

    # Remove the expression/group characters so every field parses as an int
    a1 = expr_regex.sub('', a1)
    a2 = expr_regex.sub('', a2)

    # Check to see if they are still the same alleles
    if a1 == a2:
        return 0

    # Extract and compare first fields first (the text between '*' and ':')
    a1_f1 = int(a1[a1.find('*') + 1:a1.find(':')])
    a2_f1 = int(a2[a2.find('*') + 1:a2.find(':')])

    if a1_f1 < a2_f1:
        return -1
    if a1_f1 > a2_f1:
        return 1

    a1_fields = a1.split(':')
    a2_fields = a2.split(':')

    # If the first fields are equal, walk the 2nd, 3rd and 4th fields in turn.
    for index in range(1, 4):
        a1_has_field = index < len(a1_fields)
        a2_has_field = index < len(a2_fields)

        if not (a1_has_field and a2_has_field):
            # At least one allele has run out of fields: indexing here used
            # to raise IndexError. A missing field sorts before a present one.
            if a1_has_field == a2_has_field:
                return 0
            return 1 if a1_has_field else -1

        a1_value = int(a1_fields[index])
        a2_value = int(a2_fields[index])

        if a1_value < a2_value:
            return -1
        if a1_value > a2_value:
            return 1

    # All fields are considered equal after 4th field
    return 0

def smart_sort_alleles(a1, a2):
    """
    Natural sort 2 given alleles.

    Python sorts strings lexicographically but HLA alleles need
    to be sorted by numerical values in each field of the HLA nomenclature.

    Every colon-separated field after ``*`` is compared numerically, so
    alleles with more than two fields are supported. If one allele's fields
    are a prefix of the other's, the shorter allele comes first.

    :param a1: first allele
    :param a2: second allele
    :return: two-element list holding both alleles in sorted order; the
        input order is kept when the alleles compare equal
    :raises ValueError: if a field is not an integer once trailing
        ``N``/``Q``/``L``/``S``/``G``/``g`` characters are removed
    """
    # Check to see if they are the same alleles
    if a1 == a2:
        return [a1, a2]

    def numeric_fields(allele):
        # Drop everything up to and including '*', then read each field as an
        # int; trailing expression/group letters would otherwise break int().
        fields = allele[allele.find('*') + 1:].split(':')
        return [int(field.rstrip('NQLSGg')) for field in fields]

    # Lists compare element by element, which is the field-by-field order
    # wanted here; only swap when a2 is strictly smaller.
    if numeric_fields(a2) < numeric_fields(a1):
        return [a2, a1]

    return [a1, a2]
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.0.13
current_version = 0.0.21
commit = True
tag = True

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@

setup(
name='py-ard',
version='0.0.20',
version='0.0.21',
description="ARD reduction for HLA with python",
long_description=readme + '\n\n' + history,
author="CIBMTR",
Expand Down
89 changes: 89 additions & 0 deletions tests/test_smart_sort.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import unittest

from pyard.smart_sort import smart_sort_comparator


class TestSmartSort(unittest.TestCase):
    """Checks the -1 / 0 / 1 result of smart_sort_comparator for allele pairs."""

    def _check(self, first, second, expected):
        # Single place for the comparator call so each test only lists data.
        self.assertEqual(smart_sort_comparator(first, second), expected)

    def test_same_comparator(self):
        """The very same string object compares equal to itself."""
        allele = "HLA-A*01:01"
        self._check(allele, allele, 0)

    def test_equal_comparator(self):
        """Two equal allele strings compare equal."""
        self._check("HLA-A*01:01", "HLA-A*01:01", 0)

    def test_equal_comparator_G(self):
        """Should compare without G."""
        self._check("HLA-A*01:01G", "HLA-A*01:01", 0)

    def test_equal_comparator_NG(self):
        """Should compare without N and G."""
        self._check("HLA-A*01:01G", "HLA-A*01:01N", 0)

    def test_first_field_comparator_le(self):
        """Smaller first field sorts first."""
        self._check("HLA-A*01:01", "HLA-A*02:01", -1)

    def test_first_field_comparator_ge(self):
        """Larger first field sorts last."""
        self._check("HLA-A*02:01", "HLA-A*01:01", 1)

    def test_second_field_comparator_le(self):
        """Smaller second field sorts first."""
        self._check("HLA-A*01:01", "HLA-A*01:02", -1)

    def test_second_field_comparator_le_smart(self):
        """Second field is compared as a number: 29 before 100."""
        self._check("HLA-A*01:29", "HLA-A*01:100", -1)

    def test_second_field_comparator_ge(self):
        """Larger second field sorts last."""
        self._check("HLA-A*01:02", "HLA-A*01:01", 1)

    def test_third_field_comparator_le(self):
        """Smaller third field sorts first."""
        self._check("HLA-A*01:01:01", "HLA-A*01:01:20", -1)

    def test_third_field_comparator_le_smart(self):
        """Third field is compared as a number: 29 before 100."""
        self._check("HLA-A*01:01:29", "HLA-A*01:01:100", -1)

    def test_third_field_comparator_ge(self):
        """Larger third field sorts last."""
        self._check("HLA-A*01:01:02", "HLA-A*01:01:01", 1)

    def test_fourth_field_comparator_le(self):
        """Smaller fourth field sorts first."""
        self._check("HLA-A*01:01:01:01", "HLA-A*01:01:01:20", -1)

    def test_fourth_field_comparator_le_smart(self):
        """Fourth field is compared as a number: 39 before 200."""
        self._check("HLA-A*01:01:01:39", "HLA-A*01:01:01:200", -1)

    def test_fourth_field_comparator_ge(self):
        """Larger fourth field sorts last."""
        self._check("HLA-A*01:01:01:30", "HLA-A*01:01:01:09", 1)


# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit f034eb8

Please sign in to comment.