Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,19 @@ Example
allele = "A*01:01:01"

ard.redux(allele, 'G')
# >> 'A*01:01:01G'
# 'A*01:01:01G'

ard.redux(allele, 'lg')
# >> 'A*01:01g'
# 'A*01:01g'

ard.redux(allele, 'lgx')
# 'A*01:01'

ard.redux_gl("A*01:01/A*01:01N+A*02:AB^B*07:02+B*07:AB", "G")
# 'B*07:02:01G+B*07:02:01G^A*01:01:01G+A*02:01:01G/A*02:02'

# py-ard can also reduce serology based typings
ard.redux_gl('HLA-A*10^HLA-A*9', 'lg')
# 'HLA-A*24:19g/HLA-A*24:22g^HLA-A*26:01g/HLA-A*26:10g/HLA-A*26:15g/HLA-A*26:92g/HLA-A*66:01g/HLA-A*66:03g'


166 changes: 105 additions & 61 deletions pyard/data_repository.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import functools
import sqlite3

import pandas as pd
Expand All @@ -6,6 +7,8 @@
from pyard.broad_splits import broad_splits_mapping

# GitHub URL where IMGT HLA files are downloaded.
from pyard.smart_sort import smart_sort_comparator

IMGT_HLA_URL = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/'

# List of expression characters
Expand Down Expand Up @@ -97,65 +100,6 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
return dup_g, g_group, lg_group, lgx_group


def generate_mac_codes(db_connection: sqlite3.Connection, refresh_mac: bool) -> None:
    """
    Populate the `mac_codes` table with the MAC code to allele-list mapping.

    The table is (re)built when `refresh_mac` is true or when the table does
    not exist yet; otherwise the function does nothing.

    MAC files come in 2 different versions:

    Martin: when they’re printed, the first is better for encoding and the
    second is better for decoding. The entire list was maintained both as an
    excel spreadsheet and also as a sybase database table. The excel was the
    one that was printed and distributed.

    **==> numer.v3.txt <==**

    Sorted by the length and the values in the list
    ```
    "LAST UPDATED: 09/30/20"
    CODE	SUBTYPE

    AB	01/02
    AC	01/03
    AD	01/04
    AE	01/05
    AG	01/06
    AH	01/07
    AJ	01/08
    ```

    **==> alpha.v3.txt <==**

    Sorted by the code

    ```
    "LAST UPDATED: 10/01/20"
    *	CODE	SUBTYPE

    AA	01/02/03/05
    AB	01/02
    AC	01/03
    AD	01/04
    AE	01/05
    AF	01/09
    AG	01/06
    ```

    :param db_connection: Database connection to the sqlite database
    :param refresh_mac: When true, rebuild the table even if it already exists
    :return: None
    """
    mac_table_name = 'mac_codes'
    if refresh_mac or not db.table_exists(db_connection, mac_table_name):
        # Load the MAC file to a DataFrame. Only the numeric-sorted file
        # (numer.v3) is read; the first 3 lines are skipped
        # (presumably the header lines shown in the docstring sample).
        mac_url = 'https://hml.nmdp.org/mac/files/numer.v3.zip'
        df_mac = pd.read_csv(mac_url, sep='\t', compression='zip',
                             skiprows=3, names=['Code', 'Alleles'])
        # Create a dict from code to alleles
        mac = df_mac.set_index("Code")["Alleles"].to_dict()
        # Save the mac dict to db
        db.save_dict(db_connection, table_name=mac_table_name,
                     dictionary=mac, columns=('code', 'alleles'))


def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_version):
"""
Checks to see if there's already an allele list file for the `imgt_version`
Expand Down Expand Up @@ -226,10 +170,110 @@ def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_versio
else:
xx_codes[broad] = xx_codes[split]

# Save this version of the valid alleles and xx codes
# Save this version of the valid alleles
db.save_set(db_connection, 'alleles', valid_alleles, 'allele')
flat_xx_codes = {k: '/'.join(v) for k, v in xx_codes.items()}
# Save this version of xx codes
flat_xx_codes = {k: '/'.join(sorted(v, key=functools.cmp_to_key(smart_sort_comparator)))
for k, v in xx_codes.items()}
db.save_dict(db_connection, 'xx_codes', flat_xx_codes,
('allele_1d', 'allele_list'))

return valid_alleles, xx_codes


def generate_mac_codes(db_connection: sqlite3.Connection, refresh_mac: bool):
    """
    Store the MAC code -> allele list mapping in the `mac_codes` table.

    Nothing is downloaded when the table is already present and no refresh
    was requested.

    The MAC list is published in two layouts. Martin: when they’re printed,
    the first is better for encoding and the second is better for decoding.
    The entire list was maintained both as an excel spreadsheet and also as
    a sybase database table. The excel was the one that was printed and
    distributed.

    **==> numer.v3.txt <==** (the file read here)

    Sorted by the length and the values in the list
    ```
    "LAST UPDATED: 09/30/20"
    CODE	SUBTYPE

    AB	01/02
    AC	01/03
    AD	01/04
    ```

    **==> alpha.v3.txt <==**

    Sorted by the code
    ```
    "LAST UPDATED: 10/01/20"
    *	CODE	SUBTYPE

    AA	01/02/03/05
    AB	01/02
    AC	01/03
    ```

    :param db_connection: Database connection to the sqlite database
    :param refresh_mac: Rebuild the table even when it already exists
    :return: None
    """
    mac_table_name = 'mac_codes'

    # Up-to-date enough: the table is there and nobody asked for a refresh.
    if not refresh_mac and db.table_exists(db_connection, mac_table_name):
        return

    # Read the zipped, tab-separated file; the leading 3 lines are skipped
    # and the two remaining columns are named explicitly.
    code_frame = pd.read_csv('https://hml.nmdp.org/mac/files/numer.v3.zip',
                             sep='\t', compression='zip',
                             skiprows=3, names=['Code', 'Alleles'])

    # Index by code so the frame collapses into {code: alleles}.
    alleles_by_code = code_frame.set_index("Code")["Alleles"].to_dict()

    db.save_dict(db_connection,
                 table_name=mac_table_name,
                 dictionary=alleles_by_code,
                 columns=('code', 'alleles'))


def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version):
    """
    Build the `serology_mapping` table (serology -> '/'-joined allele list)
    from the WMDA `rel_dna_ser.txt` file of the given IMGT version.

    The function returns immediately when the table already exists.

    :param db_connection: Database connection to the sqlite database
    :param imgt_version: IMGT database version used to build the download URL
    :return: None
    """
    if db.table_exists(db_connection, 'serology_mapping'):
        return

    # Load WMDA serology mapping data
    rel_dna_ser_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/rel_dna_ser.txt'
    df_sero = pd.read_csv(rel_dna_ser_url, sep=';', skiprows=6,
                          names=['Locus', 'Allele', 'USA', 'PSA', 'ASA'],
                          index_col=False)

    # Cells equal to '0' or '?' are masked out (become NaN) so that the
    # dropna() calls below discard them.
    is_known = (df_sero != '0') & (df_sero != '?')
    df_sero = df_sero[is_known]
    # Full allele name = locus prefix + allele field
    df_sero['Allele'] = df_sero['Locus'] + df_sero['Allele']

    def antigen_rows(column):
        # One row per (allele, antigen): the column holds '/'-separated
        # values which are split and exploded, then cleaned again.
        rows = df_sero[['Locus', 'Allele', column]].dropna()
        rows[column] = rows[column].apply(lambda cell: cell.split('/'))
        rows = rows.explode(column)
        rows = rows[(rows != '0') & (rows != '?')].dropna()
        rows['Sero'] = rows['Locus'] + rows[column]
        return rows

    # USA holds a single value per row, so no splitting is needed.
    # NOTE(review): USA/PSA/ASA presumably stand for unambiguous/possible/
    # assumed serological antigen - confirm against the WMDA file header.
    usa = df_sero[['Locus', 'Allele', 'USA']].dropna()
    usa['Sero'] = usa['Locus'] + usa['USA']
    psa = antigen_rows('PSA')
    asa = antigen_rows('ASA')

    sero_mapping_combined = pd.concat(
        [frame[['Sero', 'Allele']] for frame in (usa, psa, asa)])

    # Collapse to {serology: 'allele1/allele2/...'} with alleles sorted.
    alleles_by_sero = sero_mapping_combined.groupby('Sero').apply(
        lambda group: '/'.join(sorted(group['Allele'])))
    sero_mapping = alleles_by_sero.to_dict()

    # Save the serology mapping to db
    db.save_dict(db_connection,
                 table_name='serology_mapping',
                 dictionary=sero_mapping,
                 columns=('serology', 'allele_list'))
21 changes: 20 additions & 1 deletion pyard/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,25 @@ def mac_code_to_alleles(connection: sqlite3.Connection, code: str) -> List[str]:
return alleles


def serology_to_alleles(connection: sqlite3.Connection, serology: str) -> List[str]:
    """
    Look up Serology in the database and return corresponding list of alleles.

    The stored value is a '/'-separated string which is split into a list.
    An unknown serology yields an empty list (never ``None``), so the result
    always honours the ``List[str]`` annotation and can be iterated or passed
    to ``filter()`` without a prior check.

    :param connection: db connection of type sqlite.Connection
    :param serology: Serology
    :return: List of alleles; empty when the serology is not in the table
    """
    serology_query = "SELECT allele_list from serology_mapping where serology = ?"
    cursor = connection.execute(serology_query, (serology, ))
    try:
        result = cursor.fetchone()
    finally:
        # Release the cursor even if fetching fails
        cursor.close()
    if not result:
        return []
    return result[0].split('/')


def is_valid_mac_code(connection: sqlite3.Connection, code: str) -> bool:
"""
Check db if the MAC code exists.
Expand Down Expand Up @@ -196,4 +215,4 @@ def load_dict(connection: sqlite3.Connection, table_name: str, columns: Tuple[st
cursor.execute(select_all_query)
table_as_dict = {k: v for k, v in cursor.fetchall()}
cursor.close()
return table_as_dict
return table_as_dict
51 changes: 38 additions & 13 deletions pyard/pyard.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
from typing import Iterable

from . import db
from .data_repository import generate_ars_mapping, generate_mac_codes, generate_alleles_and_xx_codes
from .data_repository import generate_ars_mapping, generate_mac_codes, generate_alleles_and_xx_codes, \
generate_serology_mapping
from .db import is_valid_mac_code, mac_code_to_alleles
from .smart_sort import smart_sort_comparator

Expand Down Expand Up @@ -63,6 +64,8 @@ def __init__(self, imgt_version: str = 'Latest',
self.valid_alleles, self.xx_codes = generate_alleles_and_xx_codes(self.db_connection, imgt_version)
# Load ARS mappings
self.dup_g, self._G, self._lg, self._lgx = generate_ars_mapping(self.db_connection, imgt_version)
# Load Serology mappings
generate_serology_mapping(self.db_connection, imgt_version)

# Close the current read-write db connection
self.db_connection.close()
Expand Down Expand Up @@ -169,36 +172,54 @@ def redux_gl(self, glstring: str, redux_type: str) -> str:
return "/".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("/")]),
key=functools.cmp_to_key(smart_sort_comparator)))

# Handle Serology
if self.is_serology(glstring):
if HLA_regex.search(glstring):
# Remove HLA- prefix
serology = glstring.split("-")[1]
alleles = self._get_alleles_from_serology(serology)
alleles = ['HLA-' + a for a in alleles]
else:
alleles = self._get_alleles_from_serology(glstring)
return self.redux_gl("/".join(alleles), redux_type)

loc_allele = glstring.split(":")
loc_name, code = loc_allele[0], loc_allele[1]

# handle XX codes
# test that they are valid_alleles
# Handle XX codes
if (self.is_mac(glstring) and glstring.split(":")[1] == "XX") and loc_name in self.xx_codes:
return self.redux_gl(
"/".join(sorted(self.xx_codes[loc_name], key=functools.cmp_to_key(smart_sort_comparator))), redux_type)
return self.redux_gl("/".join(self.xx_codes[loc_name]), redux_type)

# Handle MAC
if self.is_mac(glstring) and is_valid_mac_code(self.db_connection, code):
if HLA_regex.search(glstring):
hla, allele_name = glstring.split("-")
# Remove HLA- prefix
allele_name = glstring.split("-")[1]
loc_name, code = allele_name.split(":")
alleles = self._get_alleles(code, loc_name)
return self.redux_gl(
"/".join(sorted(["HLA-" + a for a in alleles], key=functools.cmp_to_key(smart_sort_comparator))),
redux_type)
alleles = ["HLA-" + a for a in alleles]
else:
alleles = self._get_alleles(code, loc_name)
return self.redux_gl("/".join(sorted(alleles, key=functools.cmp_to_key(smart_sort_comparator))),
redux_type)
return self.redux_gl("/".join(alleles), redux_type)

return self.redux(glstring, redux_type)

@staticmethod
def is_serology(allele: str) -> bool:
    """
    An allele is serology if the allele name after * is numeral only, no ':'

    Typings without a '*' are not serology; they return False instead of
    raising IndexError.

    :param allele: The allele to test for serology
    :return: True if serology
    """
    parts = allele.split('*')
    # No '*' means there is no name part to inspect
    if len(parts) < 2:
        return False
    return parts[1].isdigit()

@staticmethod
def is_mac(gl: str) -> bool:
    """
    Tell whether the typing contains a MAC code.

    A typing is treated as a MAC when a ':' is immediately followed by one
    or more non-digit characters.

    :param gl: glstring to test if it has a MAC code
    :return: True if MAC
    """
    mac_match = re.search(r":\D+", gl)
    if mac_match is None:
        return False
    return True

Expand All @@ -221,6 +242,10 @@ def _get_alleles(self, code, loc_name) -> Iterable[str]:
return filter(self._is_valid_allele,
[f'{loc_name}:{a}' for a in alleles])

def _get_alleles_from_serology(self, serology) -> Iterable[str]:
    """
    Return the valid alleles mapped to a serology typing.

    :param serology: Serology typing to look up in the database
    :return: Iterable of the mapped alleles that pass `_is_valid_allele`;
             empty when the serology has no mapping
    """
    alleles = db.serology_to_alleles(self.db_connection, serology)
    # The lookup may give None for an unknown serology; filter() would raise
    # TypeError on None, so treat it as "no alleles".
    return filter(self._is_valid_allele, alleles or [])

def isvalid(self, allele: str) -> bool:
"""
Determines validity of an allele
Expand All @@ -230,7 +255,7 @@ def isvalid(self, allele: str) -> bool:
:return: allele or empty
:rtype: bool
"""
if not self.is_mac(allele):
if not self.is_mac(allele) and not self.is_serology(allele):
# Alleles ending with P or G are valid_alleles
if allele.endswith(('P', 'G')):
# remove the last character
Expand Down
24 changes: 24 additions & 0 deletions tests/features/serology.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
Feature: Serology

py-ard is able to map serology to the corresponding alleles and reduce to the desired
level.

Scenario Outline:

Given the serology typing is <Serology>
When reducing on the <Level> level (ambiguous)
Then the reduced allele is found to be <Redux Allele>


Examples: Valid A serology typings
| Serology | Level | Redux Allele |
| A*10 | G | A*26:01:01G/A*26:10/A*26:15/A*26:92/A*66:01:01G/A*66:03:01G |
| A*10 | lg | A*26:01g/A*26:10g/A*26:15g/A*26:92g/A*66:01g/A*66:03g |
| A*10 | lgx | A*26:01/A*26:10/A*26:15/A*26:92/A*66:01/A*66:03 |

Examples: With HLA- prefix
| Serology | Level | Redux Allele |
| HLA-A*10 | G | HLA-A*26:01:01G/HLA-A*26:10/HLA-A*26:15/HLA-A*26:92/HLA-A*66:01:01G/HLA-A*66:03:01G |
| HLA-B*15:03 | G | HLA-B*15:03:01G |
| HLA-DQB1*1 | G | HLA-DQB1*06:11:01/HLA-DQB1*06:11:02/HLA-DQB1*06:11:03/HLA-DQB1*06:12 |
| HLA-DQB1*1 | lg | HLA-DQB1*06:11g/HLA-DQB1*06:12g |
5 changes: 5 additions & 0 deletions tests/steps/redux_allele.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,8 @@ def step_impl(context, level):
@then('the reduced allele is found to be {redux_allele}')
def step_impl(context, redux_allele):
    """
    Assert that `context.redux_allele` equals the expected `redux_allele`
    text captured from the step.
    """
    assert_that(context.redux_allele, is_(redux_allele))


@given("the serology typing is {serology}")
def step_impl(context, serology):
    """
    Store the serology typing captured from the step as `context.allele`.
    """
    # Stored under `allele` - presumably so the existing reducing step can
    # consume it like any other typing; verify against the 'when' step.
    context.allele = serology