From 63a11a5c685160c77764fb5e89b86939cba7e50c Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Thu, 22 Oct 2020 14:30:22 -0500 Subject: [PATCH] Support Serology - Gets WMDA `rel_dna_ser.txt` for the corresponding version of IMGT database to produce serology mapping. 0 and ? are ignored. - Save mapped data to the `serology_mapping` table - Add Serology Gherkin test --- README.rst | 8 +- pyard/data_repository.py | 166 ++++++++++++++++++++------------ pyard/db.py | 21 +++- pyard/pyard.py | 51 +++++++--- tests/features/serology.feature | 24 +++++ tests/steps/redux_allele.py | 5 + 6 files changed, 198 insertions(+), 77 deletions(-) create mode 100644 tests/features/serology.feature diff --git a/README.rst b/README.rst index acff308..2158f5c 100644 --- a/README.rst +++ b/README.rst @@ -72,10 +72,10 @@ Example allele = "A*01:01:01" ard.redux(allele, 'G') - # >> 'A*01:01:01G' + # 'A*01:01:01G' ard.redux(allele, 'lg') - # >> 'A*01:01g' + # 'A*01:01g' ard.redux(allele, 'lgx') # 'A*01:01' @@ -83,4 +83,8 @@ Example ard.redux_gl("A*01:01/A*01:01N+A*02:AB^B*07:02+B*07:AB", "G") # 'B*07:02:01G+B*07:02:01G^A*01:01:01G+A*02:01:01G/A*02:02' + # py-ard can also reduce serology based typings + ard.redux_gl('HLA-A*10^HLA-A*9', 'lg') + # 'HLA-A*24:19g/HLA-A*24:22g^HLA-A*26:01g/HLA-A*26:10g/HLA-A*26:15g/HLA-A*26:92g/HLA-A*66:01g/HLA-A*66:03g' + diff --git a/pyard/data_repository.py b/pyard/data_repository.py index e91cf8a..81364fd 100644 --- a/pyard/data_repository.py +++ b/pyard/data_repository.py @@ -1,3 +1,4 @@ +import functools import sqlite3 import pandas as pd @@ -6,6 +7,8 @@ from pyard.broad_splits import broad_splits_mapping # GitHub URL where IMGT HLA files are downloaded. +from pyard.smart_sort import smart_sort_comparator + IMGT_HLA_URL = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/' # List of expression characters @@ -97,65 +100,6 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version): return dup_g, g_group, lg_group, lgx_group -def generate_mac_codes(db_connection: sqlite3.Connection, refresh_mac: bool): - """ - MAC files come in 2 different versions: - - Martin: when they’re printed, the first is better for encoding and the - second is better for decoding. The entire list was maintained both as an - excel spreadsheet and also as a sybase database table. The excel was the - one that was printed and distributed. - - **==> numer.v3.txt <==** - - Sorted by the length and the the values in the list - ``` - "LAST UPDATED: 09/30/20" - CODE SUBTYPE - - AB 01/02 - AC 01/03 - AD 01/04 - AE 01/05 - AG 01/06 - AH 01/07 - AJ 01/08 - ``` - - **==> alpha.v3.txt <==** - - Sorted by the code - - ``` - "LAST UPDATED: 10/01/20" - * CODE SUBTYPE - - AA 01/02/03/05 - AB 01/02 - AC 01/03 - AD 01/04 - AE 01/05 - AF 01/09 - AG 01/06 - ``` - - :param db_connection: - :param data_dir: - :return: - """ - mac_table_name = 'mac_codes' - if refresh_mac or not db.table_exists(db_connection, mac_table_name): - # Load the MAC file to a DataFrame - mac_url = 'https://hml.nmdp.org/mac/files/numer.v3.zip' - df_mac = pd.read_csv(mac_url, sep='\t', compression='zip', - skiprows=3, names=['Code', 'Alleles']) - # Create a dict from code to alleles - mac = df_mac.set_index("Code")["Alleles"].to_dict() - # Save the mac dict to db - db.save_dict(db_connection, table_name=mac_table_name, - dictionary=mac, columns=('code', 'alleles')) - - def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_version): """ Checks to see if there's already an allele list file for the `imgt_version` @@ -226,10 +170,110 @@ def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_versio else: xx_codes[broad] = xx_codes[split] - # Save this version of the valid alleles and xx codes + # Save this version of the valid alleles db.save_set(db_connection, 'alleles', valid_alleles, 'allele') - flat_xx_codes = {k: '/'.join(v) for k, v in xx_codes.items()} + # Save this version of xx codes + flat_xx_codes = {k: '/'.join(sorted(v, key=functools.cmp_to_key(smart_sort_comparator))) + for k, v in xx_codes.items()} db.save_dict(db_connection, 'xx_codes', flat_xx_codes, ('allele_1d', 'allele_list')) return valid_alleles, xx_codes + + +def generate_mac_codes(db_connection: sqlite3.Connection, refresh_mac: bool): + """ + MAC files come in 2 different versions: + + Martin: when they’re printed, the first is better for encoding and the + second is better for decoding. The entire list was maintained both as an + excel spreadsheet and also as a sybase database table. The excel was the + one that was printed and distributed. + + **==> numer.v3.txt <==** + + Sorted by the length and the the values in the list + ``` + "LAST UPDATED: 09/30/20" + CODE SUBTYPE + + AB 01/02 + AC 01/03 + AD 01/04 + AE 01/05 + AG 01/06 + AH 01/07 + AJ 01/08 + ``` + + **==> alpha.v3.txt <==** + + Sorted by the code + + ``` + "LAST UPDATED: 10/01/20" + * CODE SUBTYPE + + AA 01/02/03/05 + AB 01/02 + AC 01/03 + AD 01/04 + AE 01/05 + AF 01/09 + AG 01/06 + ``` + + :param db_connection: Database connection to the sqlite database + :param refresh_mac: Refresh the database with newer MAC data ? + :return: None + """ + mac_table_name = 'mac_codes' + if refresh_mac or not db.table_exists(db_connection, mac_table_name): + # Load the MAC file to a DataFrame + mac_url = 'https://hml.nmdp.org/mac/files/numer.v3.zip' + df_mac = pd.read_csv(mac_url, sep='\t', compression='zip', + skiprows=3, names=['Code', 'Alleles']) + # Create a dict from code to alleles + mac = df_mac.set_index("Code")["Alleles"].to_dict() + # Save the mac dict to db + db.save_dict(db_connection, table_name=mac_table_name, + dictionary=mac, columns=('code', 'alleles')) + + +def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version): + if not db.table_exists(db_connection, 'serology_mapping'): + # Load WMDA serology mapping data + rel_dna_ser_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/rel_dna_ser.txt' + df_sero = pd.read_csv(rel_dna_ser_url, sep=';', skiprows=6, + names=['Locus', 'Allele', 'USA', 'PSA', 'ASA'], + index_col=False) + + # Remove 0 and ? + df_sero = df_sero[(df_sero != '0') & (df_sero != '?')] + df_sero['Allele'] = df_sero['Locus'] + df_sero['Allele'] + + usa = df_sero[['Locus', 'Allele', 'USA']].dropna() + usa['Sero'] = usa['Locus'] + usa['USA'] + + psa = df_sero[['Locus', 'Allele', 'PSA']].dropna() + psa['PSA'] = psa['PSA'].apply(lambda row: row.split('/')) + psa = psa.explode('PSA') + psa = psa[(psa != '0') & (psa != '?')].dropna() + psa['Sero'] = psa['Locus'] + psa['PSA'] + + asa = df_sero[['Locus', 'Allele', 'ASA']].dropna() + asa['ASA'] = asa['ASA'].apply(lambda x: x.split('/')) + asa = asa.explode('ASA') + asa = asa[(asa != '0') & (asa != '?')].dropna() + asa['Sero'] = asa['Locus'] + asa['ASA'] + + sero_mapping_combined = pd.concat([usa[['Sero', 'Allele']], + psa[['Sero', 'Allele']], + asa[['Sero', 'Allele']]]) + sero_mapping = sero_mapping_combined.groupby('Sero').\ + apply(lambda x: '/'.join(sorted(x['Allele']))).\ + to_dict() + + # Save the serology mapping to db + db.save_dict(db_connection, table_name='serology_mapping', + dictionary=sero_mapping, columns=('serology', 'allele_list')) diff --git a/pyard/db.py b/pyard/db.py index 27a109f..6988473 100644 --- a/pyard/db.py +++ b/pyard/db.py @@ -83,6 +83,25 @@ def mac_code_to_alleles(connection: sqlite3.Connection, code: str) -> List[str]: return alleles +def serology_to_alleles(connection: sqlite3.Connection, serology: str) -> List[str]: + """ + Look up Serology in the database and return corresponding list of alleles. + + :param connection: db connection of type sqlite.Connection + :param serology: Serology + :return: List of alleles + """ + serology_query = "SELECT allele_list from serology_mapping where serology = ?" + cursor = connection.execute(serology_query, (serology, )) + result = cursor.fetchone() + cursor.close() + if result: + alleles = result[0].split('/') + else: + alleles = None + return alleles + + def is_valid_mac_code(connection: sqlite3.Connection, code: str) -> bool: """ Check db if the MAC code exists. @@ -196,4 +215,4 @@ def load_dict(connection: sqlite3.Connection, table_name: str, columns: Tuple[st cursor.execute(select_all_query) table_as_dict = {k: v for k, v in cursor.fetchall()} cursor.close() - return table_as_dict + return table_as_dict \ No newline at end of file diff --git a/pyard/pyard.py b/pyard/pyard.py index 58c74a5..96a4487 100644 --- a/pyard/pyard.py +++ b/pyard/pyard.py @@ -26,7 +26,8 @@ from typing import Iterable from . import db -from .data_repository import generate_ars_mapping, generate_mac_codes, generate_alleles_and_xx_codes +from .data_repository import generate_ars_mapping, generate_mac_codes, generate_alleles_and_xx_codes, \ + generate_serology_mapping from .db import is_valid_mac_code, mac_code_to_alleles from .smart_sort import smart_sort_comparator @@ -63,6 +64,8 @@ def __init__(self, imgt_version: str = 'Latest', self.valid_alleles, self.xx_codes = generate_alleles_and_xx_codes(self.db_connection, imgt_version) # Load ARS mappings self.dup_g, self._G, self._lg, self._lgx = generate_ars_mapping(self.db_connection, imgt_version) + # Load Serology mappings + generate_serology_mapping(self.db_connection, imgt_version) # Close the current read-write db connection self.db_connection.close() @@ -169,36 +172,54 @@ def redux_gl(self, glstring: str, redux_type: str) -> str: return "/".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("/")]), key=functools.cmp_to_key(smart_sort_comparator))) + # Handle Serology + if self.is_serology(glstring): + if HLA_regex.search(glstring): + # Remove HLA- prefix + serology = glstring.split("-")[1] + alleles = self._get_alleles_from_serology(serology) + alleles = ['HLA-' + a for a in alleles] + else: + alleles = self._get_alleles_from_serology(glstring) + return self.redux_gl("/".join(alleles), redux_type) + loc_allele = glstring.split(":") loc_name, code = loc_allele[0], loc_allele[1] - # handle XX codes - # test that they are valid_alleles + # Handle XX codes if (self.is_mac(glstring) and glstring.split(":")[1] == "XX") and loc_name in self.xx_codes: - return self.redux_gl( - "/".join(sorted(self.xx_codes[loc_name], key=functools.cmp_to_key(smart_sort_comparator))), redux_type) + return self.redux_gl("/".join(self.xx_codes[loc_name]), redux_type) + # Handle MAC if self.is_mac(glstring) and is_valid_mac_code(self.db_connection, code): if HLA_regex.search(glstring): - hla, allele_name = glstring.split("-") + # Remove HLA- prefix + allele_name = glstring.split("-")[1] loc_name, code = allele_name.split(":") alleles = self._get_alleles(code, loc_name) - return self.redux_gl( - "/".join(sorted(["HLA-" + a for a in alleles], key=functools.cmp_to_key(smart_sort_comparator))), - redux_type) + alleles = ["HLA-" + a for a in alleles] else: alleles = self._get_alleles(code, loc_name) - return self.redux_gl("/".join(sorted(alleles, key=functools.cmp_to_key(smart_sort_comparator))), - redux_type) + return self.redux_gl("/".join(alleles), redux_type) + return self.redux(glstring, redux_type) + @staticmethod + def is_serology(allele: str) -> bool: + """ + An allele is serology if the allele name after * is numeral only, no ':' + :param allele: The allele to test for serology + :return: True if serology + """ + return allele.split('*')[1].isdigit() + @staticmethod def is_mac(gl: str) -> bool: """ MAC has there are non-digit characters after the : character, then it's a MAC. :param gl: glstring to test if it has a MAC code - :return: bool + :return: True if MAC """ return re.search(r":\D+", gl) is not None @@ -221,6 +242,10 @@ def _get_alleles(self, code, loc_name) -> Iterable[str]: return filter(self._is_valid_allele, [f'{loc_name}:{a}' for a in alleles]) + def _get_alleles_from_serology(self, serology) -> Iterable[str]: + alleles = db.serology_to_alleles(self.db_connection, serology) + return filter(self._is_valid_allele, alleles) + def isvalid(self, allele: str) -> bool: """ Determines validity of an allele @@ -230,7 +255,7 @@ def isvalid(self, allele: str) -> bool: :return: allele or empty :rtype: bool """ - if not self.is_mac(allele): + if not self.is_mac(allele) and not self.is_serology(allele): # Alleles ending with P or G are valid_alleles if allele.endswith(('P', 'G')): # remove the last character diff --git a/tests/features/serology.feature b/tests/features/serology.feature new file mode 100644 index 0000000..296c3ba --- /dev/null +++ b/tests/features/serology.feature @@ -0,0 +1,24 @@ +Feature: Serology + + py-ard is able to map serology to the corresponding alleles and reduce to the desired + level. + + Scenario Outline: + + Given the serology typing is + When reducing on the level (ambiguous) + Then the reduced allele is found to be + + + Examples: Valid A serology typings + | Serology | Level | Redux Allele | + | A*10 | G | A*26:01:01G/A*26:10/A*26:15/A*26:92/A*66:01:01G/A*66:03:01G | + | A*10 | lg | A*26:01g/A*26:10g/A*26:15g/A*26:92g/A*66:01g/A*66:03g | + | A*10 | lgx | A*26:01/A*26:10/A*26:15/A*26:92/A*66:01/A*66:03 | + + Examples: With HLA- prefix + | Serology | Level | Redux Allele | + | HLA-A*10 | G | HLA-A*26:01:01G/HLA-A*26:10/HLA-A*26:15/HLA-A*26:92/HLA-A*66:01:01G/HLA-A*66:03:01G | + | HLA-B*15:03 | G | HLA-B*15:03:01G | + | HLA-DQB1*1 | G | HLA-DQB1*06:11:01/HLA-DQB1*06:11:02/HLA-DQB1*06:11:03/HLA-DQB1*06:12 | + | HLA-DQB1*1 | lg | HLA-DQB1*06:11g/HLA-DQB1*06:12g | diff --git a/tests/steps/redux_allele.py b/tests/steps/redux_allele.py index d5055a4..7715272 100644 --- a/tests/steps/redux_allele.py +++ b/tests/steps/redux_allele.py @@ -22,3 +22,8 @@ def step_impl(context, level): @then('the reduced allele is found to be {redux_allele}') def step_impl(context, redux_allele): assert_that(context.redux_allele, is_(redux_allele)) + + +@given("the serology typing is {serology}") +def step_impl(context, serology): + context.allele = serology