From 63a11a5c685160c77764fb5e89b86939cba7e50c Mon Sep 17 00:00:00 2001
From: pbashyal-nmdp <pbashyal@nmdp.org>
Date: Thu, 22 Oct 2020 14:30:22 -0500
Subject: [PATCH] Support Serology

  - Gets WMDA `rel_dna_ser.txt` for the corresponding version of IMGT database to produce serology mapping. 0 and ? are ignored.
  - Save mapped data to the `serology_mapping` table
  - Add Serology Gherkin test
---
 README.rst                      |   8 +-
 pyard/data_repository.py        | 166 ++++++++++++++++++++------------
 pyard/db.py                     |  21 +++-
 pyard/pyard.py                  |  51 +++++++---
 tests/features/serology.feature |  24 +++++
 tests/steps/redux_allele.py     |   5 +
 6 files changed, 198 insertions(+), 77 deletions(-)
 create mode 100644 tests/features/serology.feature

diff --git a/README.rst b/README.rst
index acff308..2158f5c 100644
--- a/README.rst
+++ b/README.rst
@@ -72,10 +72,10 @@ Example
     allele = "A*01:01:01"
 
     ard.redux(allele, 'G')
-    # >> 'A*01:01:01G'
+    # 'A*01:01:01G'
 
     ard.redux(allele, 'lg')
-    # >> 'A*01:01g'
+    # 'A*01:01g'
 
     ard.redux(allele, 'lgx')
     # 'A*01:01'
@@ -83,4 +83,8 @@ Example
     ard.redux_gl("A*01:01/A*01:01N+A*02:AB^B*07:02+B*07:AB", "G")
     # 'B*07:02:01G+B*07:02:01G^A*01:01:01G+A*02:01:01G/A*02:02'
 
+    # py-ard can also reduce serology based typings
+    ard.redux_gl('HLA-A*10^HLA-A*9', 'lg')
+    # 'HLA-A*24:19g/HLA-A*24:22g^HLA-A*26:01g/HLA-A*26:10g/HLA-A*26:15g/HLA-A*26:92g/HLA-A*66:01g/HLA-A*66:03g'
+
 
diff --git a/pyard/data_repository.py b/pyard/data_repository.py
index e91cf8a..81364fd 100644
--- a/pyard/data_repository.py
+++ b/pyard/data_repository.py
@@ -1,3 +1,4 @@
+import functools
 import sqlite3
 
 import pandas as pd
@@ -6,6 +7,8 @@
 from pyard.broad_splits import broad_splits_mapping
 
 # GitHub URL where IMGT HLA files are downloaded.
+from pyard.smart_sort import smart_sort_comparator
+
 IMGT_HLA_URL = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/'
 
 # List of expression characters
@@ -97,65 +100,6 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
     return dup_g, g_group, lg_group, lgx_group
 
 
-def generate_mac_codes(db_connection: sqlite3.Connection, refresh_mac: bool):
-    """
-    MAC files come in 2 different versions:
-
-    Martin: when they’re printed, the first is better for encoding and the
-    second is better for decoding. The entire list was maintained both as an
-    excel spreadsheet and also as a sybase database table. The excel was the
-    one that was printed and distributed.
-
-        **==> numer.v3.txt <==**
-
-        Sorted by the length and the the values in the list
-        ```
-        "LAST UPDATED: 09/30/20"
-        CODE	SUBTYPE
-
-        AB	01/02
-        AC	01/03
-        AD	01/04
-        AE	01/05
-        AG	01/06
-        AH	01/07
-        AJ	01/08
-        ```
-
-        **==> alpha.v3.txt <==**
-
-        Sorted by the code
-
-        ```
-        "LAST UPDATED: 10/01/20"
-        *	CODE	SUBTYPE
-
-            AA	01/02/03/05
-            AB	01/02
-            AC	01/03
-            AD	01/04
-            AE	01/05
-            AF	01/09
-            AG	01/06
-        ```
-
-    :param db_connection:
-    :param data_dir:
-    :return:
-    """
-    mac_table_name = 'mac_codes'
-    if refresh_mac or not db.table_exists(db_connection, mac_table_name):
-        # Load the MAC file to a DataFrame
-        mac_url = 'https://hml.nmdp.org/mac/files/numer.v3.zip'
-        df_mac = pd.read_csv(mac_url, sep='\t', compression='zip',
-                             skiprows=3, names=['Code', 'Alleles'])
-        # Create a dict from code to alleles
-        mac = df_mac.set_index("Code")["Alleles"].to_dict()
-        # Save the mac dict to db
-        db.save_dict(db_connection, table_name=mac_table_name,
-                     dictionary=mac, columns=('code', 'alleles'))
-
-
 def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_version):
     """
     Checks to see if there's already an allele list file for the `imgt_version`
@@ -226,10 +170,110 @@ def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_versio
             else:
                 xx_codes[broad] = xx_codes[split]
 
-    # Save this version of the valid alleles and xx codes
+    # Save this version of the valid alleles
     db.save_set(db_connection, 'alleles', valid_alleles, 'allele')
-    flat_xx_codes = {k: '/'.join(v) for k, v in xx_codes.items()}
+    # Save this version of xx codes
+    flat_xx_codes = {k: '/'.join(sorted(v, key=functools.cmp_to_key(smart_sort_comparator)))
+                     for k, v in xx_codes.items()}
     db.save_dict(db_connection, 'xx_codes', flat_xx_codes,
                  ('allele_1d', 'allele_list'))
 
     return valid_alleles, xx_codes
+
+
+def generate_mac_codes(db_connection: sqlite3.Connection, refresh_mac: bool):
+    """
+    MAC files come in 2 different versions:
+
+    Martin: when they’re printed, the first is better for encoding and the
+    second is better for decoding. The entire list was maintained both as an
+    excel spreadsheet and also as a sybase database table. The excel was the
+    one that was printed and distributed.
+
+        **==> numer.v3.txt <==**
+
+        Sorted by the length and the the values in the list
+        ```
+        "LAST UPDATED: 09/30/20"
+        CODE	SUBTYPE
+
+        AB	01/02
+        AC	01/03
+        AD	01/04
+        AE	01/05
+        AG	01/06
+        AH	01/07
+        AJ	01/08
+        ```
+
+        **==> alpha.v3.txt <==**
+
+        Sorted by the code
+
+        ```
+        "LAST UPDATED: 10/01/20"
+        *	CODE	SUBTYPE
+
+            AA	01/02/03/05
+            AB	01/02
+            AC	01/03
+            AD	01/04
+            AE	01/05
+            AF	01/09
+            AG	01/06
+        ```
+
+    :param db_connection: Database connection to the sqlite database
+    :param refresh_mac: Refresh the database with newer MAC data ?
+    :return: None
+    """
+    mac_table_name = 'mac_codes'
+    if refresh_mac or not db.table_exists(db_connection, mac_table_name):
+        # Load the MAC file to a DataFrame
+        mac_url = 'https://hml.nmdp.org/mac/files/numer.v3.zip'
+        df_mac = pd.read_csv(mac_url, sep='\t', compression='zip',
+                             skiprows=3, names=['Code', 'Alleles'])
+        # Create a dict from code to alleles
+        mac = df_mac.set_index("Code")["Alleles"].to_dict()
+        # Save the mac dict to db
+        db.save_dict(db_connection, table_name=mac_table_name,
+                     dictionary=mac, columns=('code', 'alleles'))
+
+
+def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version):
+    if not db.table_exists(db_connection, 'serology_mapping'):
+        # Load WMDA serology mapping data
+        rel_dna_ser_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/rel_dna_ser.txt'
+        df_sero = pd.read_csv(rel_dna_ser_url, sep=';', skiprows=6,
+                              names=['Locus', 'Allele', 'USA', 'PSA', 'ASA'],
+                              index_col=False)
+
+        # Remove 0 and ?
+        df_sero = df_sero[(df_sero != '0') & (df_sero != '?')]
+        df_sero['Allele'] = df_sero['Locus'] + df_sero['Allele']
+
+        usa = df_sero[['Locus', 'Allele', 'USA']].dropna()
+        usa['Sero'] = usa['Locus'] + usa['USA']
+
+        psa = df_sero[['Locus', 'Allele', 'PSA']].dropna()
+        psa['PSA'] = psa['PSA'].apply(lambda row: row.split('/'))
+        psa = psa.explode('PSA')
+        psa = psa[(psa != '0') & (psa != '?')].dropna()
+        psa['Sero'] = psa['Locus'] + psa['PSA']
+
+        asa = df_sero[['Locus', 'Allele', 'ASA']].dropna()
+        asa['ASA'] = asa['ASA'].apply(lambda x: x.split('/'))
+        asa = asa.explode('ASA')
+        asa = asa[(asa != '0') & (asa != '?')].dropna()
+        asa['Sero'] = asa['Locus'] + asa['ASA']
+
+        sero_mapping_combined = pd.concat([usa[['Sero', 'Allele']],
+                                           psa[['Sero', 'Allele']],
+                                           asa[['Sero', 'Allele']]])
+        sero_mapping = sero_mapping_combined.groupby('Sero').\
+            apply(lambda x: '/'.join(sorted(x['Allele']))).\
+            to_dict()
+
+        # Save the serology mapping to db
+        db.save_dict(db_connection, table_name='serology_mapping',
+                     dictionary=sero_mapping, columns=('serology', 'allele_list'))
diff --git a/pyard/db.py b/pyard/db.py
index 27a109f..6988473 100644
--- a/pyard/db.py
+++ b/pyard/db.py
@@ -83,6 +83,25 @@ def mac_code_to_alleles(connection: sqlite3.Connection, code: str) -> List[str]:
     return alleles
 
 
+def serology_to_alleles(connection: sqlite3.Connection, serology: str) -> List[str]:
+    """
+    Look up Serology in the database and return corresponding list of alleles.
+
+    :param connection: db connection of type sqlite.Connection
+    :param serology: Serology
+    :return: List of alleles
+    """
+    serology_query = "SELECT allele_list from serology_mapping where serology = ?"
+    cursor = connection.execute(serology_query, (serology, ))
+    result = cursor.fetchone()
+    cursor.close()
+    if result:
+        alleles = result[0].split('/')
+    else:
+        alleles = None
+    return alleles
+
+
 def is_valid_mac_code(connection: sqlite3.Connection, code: str) -> bool:
     """
     Check db if the MAC code exists.
@@ -196,4 +215,4 @@ def load_dict(connection: sqlite3.Connection, table_name: str, columns: Tuple[st
     cursor.execute(select_all_query)
     table_as_dict = {k: v for k, v in cursor.fetchall()}
     cursor.close()
-    return table_as_dict
+    return table_as_dict
\ No newline at end of file
diff --git a/pyard/pyard.py b/pyard/pyard.py
index 58c74a5..96a4487 100644
--- a/pyard/pyard.py
+++ b/pyard/pyard.py
@@ -26,7 +26,8 @@
 from typing import Iterable
 
 from . import db
-from .data_repository import generate_ars_mapping, generate_mac_codes, generate_alleles_and_xx_codes
+from .data_repository import generate_ars_mapping, generate_mac_codes, generate_alleles_and_xx_codes, \
+    generate_serology_mapping
 from .db import is_valid_mac_code, mac_code_to_alleles
 from .smart_sort import smart_sort_comparator
 
@@ -63,6 +64,8 @@ def __init__(self, imgt_version: str = 'Latest',
         self.valid_alleles, self.xx_codes = generate_alleles_and_xx_codes(self.db_connection, imgt_version)
         # Load ARS mappings
         self.dup_g, self._G, self._lg, self._lgx = generate_ars_mapping(self.db_connection, imgt_version)
+        # Load Serology mappings
+        generate_serology_mapping(self.db_connection, imgt_version)
 
         # Close the current read-write db connection
         self.db_connection.close()
@@ -169,36 +172,54 @@ def redux_gl(self, glstring: str, redux_type: str) -> str:
             return "/".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("/")]),
                                    key=functools.cmp_to_key(smart_sort_comparator)))
 
+        # Handle Serology
+        if self.is_serology(glstring):
+            if HLA_regex.search(glstring):
+                # Remove HLA- prefix
+                serology = glstring.split("-")[1]
+                alleles = self._get_alleles_from_serology(serology)
+                alleles = ['HLA-' + a for a in alleles]
+            else:
+                alleles = self._get_alleles_from_serology(glstring)
+            return self.redux_gl("/".join(alleles), redux_type)
+
         loc_allele = glstring.split(":")
         loc_name, code = loc_allele[0], loc_allele[1]
 
-        # handle XX codes
-        # test that they are valid_alleles
+        # Handle XX codes
         if (self.is_mac(glstring) and glstring.split(":")[1] == "XX") and loc_name in self.xx_codes:
-            return self.redux_gl(
-                "/".join(sorted(self.xx_codes[loc_name], key=functools.cmp_to_key(smart_sort_comparator))), redux_type)
+            return self.redux_gl("/".join(self.xx_codes[loc_name]), redux_type)
 
+        # Handle MAC
         if self.is_mac(glstring) and is_valid_mac_code(self.db_connection, code):
             if HLA_regex.search(glstring):
-                hla, allele_name = glstring.split("-")
+                # Remove HLA- prefix
+                allele_name = glstring.split("-")[1]
                 loc_name, code = allele_name.split(":")
                 alleles = self._get_alleles(code, loc_name)
-                return self.redux_gl(
-                    "/".join(sorted(["HLA-" + a for a in alleles], key=functools.cmp_to_key(smart_sort_comparator))),
-                    redux_type)
+                alleles = ["HLA-" + a for a in alleles]
             else:
                 alleles = self._get_alleles(code, loc_name)
-                return self.redux_gl("/".join(sorted(alleles, key=functools.cmp_to_key(smart_sort_comparator))),
-                                     redux_type)
+            return self.redux_gl("/".join(alleles), redux_type)
+
         return self.redux(glstring, redux_type)
 
+    @staticmethod
+    def is_serology(allele: str) -> bool:
+        """
+        An allele is serology if the allele name after * is numeral only, no ':'
+        :param allele: The allele to test for serology
+        :return: True if serology
+        """
+        return allele.split('*')[1].isdigit()
+
     @staticmethod
     def is_mac(gl: str) -> bool:
         """
         MAC has there are non-digit characters after the : character,
         then it's a MAC.
         :param gl: glstring to test if it has a MAC code
-        :return: bool
+        :return: True if MAC
         """
         return re.search(r":\D+", gl) is not None
 
@@ -221,6 +242,10 @@ def _get_alleles(self, code, loc_name) -> Iterable[str]:
         return filter(self._is_valid_allele,
                       [f'{loc_name}:{a}' for a in alleles])
 
+    def _get_alleles_from_serology(self, serology) -> Iterable[str]:
+        alleles = db.serology_to_alleles(self.db_connection, serology)
+        return filter(self._is_valid_allele, alleles)
+
     def isvalid(self, allele: str) -> bool:
         """
         Determines validity of an allele
@@ -230,7 +255,7 @@ def isvalid(self, allele: str) -> bool:
         :return: allele or empty
         :rtype: bool
         """
-        if not self.is_mac(allele):
+        if not self.is_mac(allele) and not self.is_serology(allele):
             # Alleles ending with P or G are valid_alleles
             if allele.endswith(('P', 'G')):
                 # remove the last character
diff --git a/tests/features/serology.feature b/tests/features/serology.feature
new file mode 100644
index 0000000..296c3ba
--- /dev/null
+++ b/tests/features/serology.feature
@@ -0,0 +1,24 @@
+Feature: Serology
+
+  py-ard is able to map serology to the corresponding alleles and reduce to the desired
+  level.
+
+  Scenario Outline:
+
+    Given the serology typing is <Serology>
+    When reducing on the <Level> level (ambiguous)
+    Then the reduced allele is found to be <Redux Allele>
+
+
+    Examples: Valid A serology typings
+      | Serology | Level | Redux Allele                                                |
+      | A*10     | G     | A*26:01:01G/A*26:10/A*26:15/A*26:92/A*66:01:01G/A*66:03:01G |
+      | A*10     | lg    | A*26:01g/A*26:10g/A*26:15g/A*26:92g/A*66:01g/A*66:03g       |
+      | A*10     | lgx   | A*26:01/A*26:10/A*26:15/A*26:92/A*66:01/A*66:03             |
+
+    Examples: With HLA- prefix
+      | Serology    | Level | Redux Allele                                                                        |
+      | HLA-A*10    | G     | HLA-A*26:01:01G/HLA-A*26:10/HLA-A*26:15/HLA-A*26:92/HLA-A*66:01:01G/HLA-A*66:03:01G |
+      | HLA-B*15:03 | G     | HLA-B*15:03:01G                                                                     |
+      | HLA-DQB1*1  | G     | HLA-DQB1*06:11:01/HLA-DQB1*06:11:02/HLA-DQB1*06:11:03/HLA-DQB1*06:12                |
+      | HLA-DQB1*1  | lg    | HLA-DQB1*06:11g/HLA-DQB1*06:12g                                                     |
diff --git a/tests/steps/redux_allele.py b/tests/steps/redux_allele.py
index d5055a4..7715272 100644
--- a/tests/steps/redux_allele.py
+++ b/tests/steps/redux_allele.py
@@ -22,3 +22,8 @@ def step_impl(context, level):
 @then('the reduced allele is found to be {redux_allele}')
 def step_impl(context, redux_allele):
     assert_that(context.redux_allele, is_(redux_allele))
+
+
+@given("the serology typing is {serology}")
+def step_impl(context, serology):
+    context.allele = serology