diff --git a/pyard/data_repository.py b/pyard/data_repository.py
index 61b3de2..c8a80ea 100644
--- a/pyard/data_repository.py
+++ b/pyard/data_repository.py
@@ -23,14 +23,13 @@
 from collections import namedtuple
 import functools
 import sqlite3
-
 import pandas as pd
 
 from . import db
 from .broad_splits import broad_splits_dna_mapping
 from .broad_splits import broad_splits_ser_mapping
 from .misc import get_2field_allele, get_3field_allele, number_of_fields
-from .misc import expression_chars
+from .misc import expression_chars, get_G_name, get_P_name
 
 # GitHub URL where IMGT HLA files are downloaded.
 from pyard.smart_sort import smart_sort_comparator
@@ -46,6 +45,7 @@
     "lgx_group",
     "exon_group",
     "p_group",
+    "p_not_g",
 ]
 
 ARSMapping = namedtuple("ARSMapping", ars_mapping_tables)
@@ -102,6 +102,9 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
         p_group = db.load_dict(
             db_connection, table_name="p_group", columns=("allele", "p")
         )
+        p_not_g = db.load_dict(
+            db_connection, table_name="p_not_g", columns=("allele", "lgx")
+        )
         return ARSMapping(
             dup_g=dup_g,
             dup_lg=dup_lg,
@@ -111,13 +114,46 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
             lgx_group=lgx_group,
             exon_group=exon_group,
             p_group=p_group,
+            p_not_g=p_not_g,
         )
 
+    # load the hla_nom_g.txt
     ars_G_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt"
     df = pd.read_csv(ars_G_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna()
+    # the G-group is named for its first allele
+    df["G"] = df["A"].apply(get_G_name)
+
+    # load the hla_nom_p.txt
+    ars_P_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt"
+    # example: C*;06:06:01:01/06:06:01:02/06:271;06:06P
+    df_P = pd.read_csv(
+        ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";"
+    ).dropna()
+
+    # the P-group is named for its first allele
+    df_P["P"] = df_P["A"].apply(get_P_name)
+
+    # convert slash delimited string to a list
+    df_P["A"] = df_P["A"].apply(lambda a: a.split("/"))
+    df_P = df_P.explode("A")
+
+    # C*   06:06:01:01/06:06:01:02/06:271    06:06P
+    df_P["A"] = df_P["Locus"] + df_P["A"]
+    df_P["P"] = df_P["Locus"] + df_P["P"]
+    # C*   06:06:01:01   06:06P
+    # C*   06:06:01:02   06:06P
+    # C*   06:271        06:06P
+    p_group = df_P.set_index("A")["P"].to_dict()
+    df_P["2d"] = df_P["A"].apply(get_2field_allele)
+    # lgx has the P-group name without the P for comparison
+    df_P["lgx"] = df_P["P"].apply(get_2field_allele)
 
+    # convert slash delimited string to a list
     df["A"] = df["A"].apply(lambda a: a.split("/"))
+    # convert the list into separate rows for each element
     df = df.explode("A")
+
+    # A* + 02:01 = A*02:01
     df["A"] = df["Locus"] + df["A"]
     df["G"] = df["Locus"] + df["G"]
 
@@ -126,8 +162,24 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
     df["lg"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2]) + "g")
     df["lgx"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2]))
 
+    # compare df_P["2d"] with df["2d"] to find 2-field alleles in the
+    # P-group that aren't in the G-group
+    PnotinG = set(df_P["2d"]) - set(df["2d"])
+
+    # filter to find these 2-field alleles (2d) in the P-group data frame
+    df_PnotG = df_P[df_P["2d"].isin(PnotinG)]
+
+    # dictionary which will define the table
+    p_not_g = df_PnotG.set_index("A")["lgx"].to_dict()
+
     # multiple Gs
+    # goal: identify 2-field alleles that are in multiple G-groups
+
+    # group by 2d and G, and select the 2d column and count the columns
     mg = df.drop_duplicates(["2d", "G"])["2d"].value_counts()
+    # filter out the mg with count > 1, leaving only duplicates
+    # take the index from the 2d version the data frame, make that a column
+    # and turn that into a list
     multiple_g_list = mg[mg > 1].reset_index()["index"].to_list()
 
     # Keep only the alleles that have more than 1 mapping
@@ -202,18 +254,13 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
     )
     exon_group = df_exon.set_index("A")["exon"].to_dict()
 
-    # P groups
-    ars_P_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt"
-    df_P = pd.read_csv(
-        ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";"
-    ).dropna()
-    df_P["A"] = df_P["A"].apply(lambda a: a.split("/"))
-    df_P = df_P.explode("A")
-    df_P["A"] = df_P["Locus"] + df_P["A"]
-    df_P["P"] = df_P["Locus"] + df_P["P"]
-    p_group = df_P.set_index("A")["P"].to_dict()
-
     # save
+    db.save_dict(
+        db_connection,
+        table_name="p_not_g",
+        dictionary=p_not_g,
+        columns=("allele", "lgx"),
+    )
     db.save_dict(
         db_connection,
         table_name="dup_g",
@@ -256,7 +303,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
     db.save_dict(
         db_connection,
         table_name="p_group",
-        dictionary=exon_group,
+        dictionary=p_group,
         columns=("allele", "p"),
     )
 
@@ -269,6 +316,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
         lgx_group=lgx_group,
         exon_group=exon_group,
         p_group=p_group,
+        p_not_g=p_not_g,
     )
 
 
diff --git a/pyard/misc.py b/pyard/misc.py
index b13a4d5..4d0addb 100644
--- a/pyard/misc.py
+++ b/pyard/misc.py
@@ -1,5 +1,7 @@
 # List of expression characters
 expression_chars = ["N", "Q", "L", "S"]
+# List of P and G characters
+PandG_chars = ["P", "G"]
 
 
 def get_n_field_allele(allele: str, n: int, preserve_expression=False) -> str:
@@ -20,12 +22,40 @@
 
 
 def get_3field_allele(a: str) -> str:
+    last_char = a[-1]
+    if last_char in PandG_chars:
+        a = a[:-1]
+
     return get_n_field_allele(a, 3)
 
 
 def get_2field_allele(a: str) -> str:
+    last_char = a[-1]
+    if last_char in PandG_chars:
+        a = a[:-1]
     return get_n_field_allele(a, 2)
 
 
 def number_of_fields(allele: str) -> int:
     return len(allele.split(":"))
+
+
+# computes a valid G name based on the ambiguity string
+def get_G_name(a: str) -> str:
+    a = a.split("/")[0]
+    last_char = a[-1]
+    if last_char in PandG_chars + expression_chars:
+        a = a[:-1]
+    if len(a.split(":")) == 2:
+        return ":".join([a, "01"]) + "G"
+    else:
+        return ":".join(a.split(":")[0:3]) + "G"
+
+
+# computes a valid P name based on the ambiguity string
+def get_P_name(a: str) -> str:
+    a = a.split("/")[0]
+    last_char = a[-1]
+    if last_char in PandG_chars + expression_chars:
+        a = a[:-1]
+    return ":".join(a.split(":")[0:2]) + "P"
diff --git a/pyard/pyard.py b/pyard/pyard.py
index cabf43f..1381806 100644
--- a/pyard/pyard.py
+++ b/pyard/pyard.py
@@ -48,6 +48,7 @@
     "reduce_XX": True,
     "reduce_MAC": True,
     "reduce_shortnull": True,
+    "ping": False,
     "map_drb345_to_drbx": True,
     "verbose_log": True,
 }
@@ -140,7 +141,7 @@ def __del__(self):
         self.db_connection.close()
 
     @functools.lru_cache(maxsize=max_cache_size)
-    def redux(self, allele: str, redux_type: VALID_REDUCTION_TYPES) -> str:
+    def redux(self, allele: str, redux_type: VALID_REDUCTION_TYPES, reping=True) -> str:
         """
         Does ARS reduction with allele and ARS type
 
@@ -172,6 +173,13 @@
         if allele.endswith(("P", "G")):
             if redux_type in ["lg", "lgx", "G"]:
                 allele = allele[:-1]
+        if self._config["ping"] and reping:
+            if redux_type in ("lg", "lgx", "U2"):
+                if allele in self.ars_mappings.p_not_g:
+                    return self.ars_mappings.p_not_g[allele]
+                else:
+                    return self.redux(allele, redux_type, False)
+
         if redux_type == "G" and allele in self.ars_mappings.g_group:
             if allele in self.ars_mappings.dup_g:
                 return self.ars_mappings.dup_g[allele]
diff --git a/tests/environment.py b/tests/environment.py
index 4b93784..f778cd1 100644
--- a/tests/environment.py
+++ b/tests/environment.py
@@ -3,3 +3,9 @@
 
 def before_all(context):
     context.ard = ARD("3440", data_dir="/tmp/py-ard")
+
+    # an ard with ping set to True
+    my_config = {
+        "ping": True,
+    }
+    context.ard_ping = ARD("3440", data_dir="/tmp/py-ard", config=my_config)
diff --git a/tests/features/allele.feature b/tests/features/allele.feature
index 21d0564..2e78a79 100644
--- a/tests/features/allele.feature
+++ b/tests/features/allele.feature
@@ -1,6 +1,20 @@
 Feature: Alleles
 
-  Scenario Outline:
+  Scenario Outline: allele reduction with ping
+
+    Given the allele as <Allele>
+    When reducing on the <Level> level with ping
+    Then the reduced allele is found to be <Redux Allele>
+
+    Examples:
+      | Allele  | Level | Redux Allele |
+      | C*02:02 | lg    | C*02:02g     |
+      | C*02:02 | lgx   | C*02:02      |
+      | C*02:10 | lg    | C*02:02g     |
+      | C*02:10 | lgx   | C*02:02      |
+      | C*06:17 | lgx   | C*06:02      |
+
+  Scenario Outline: allele reduction
 
     Given the allele as <Allele>
     When reducing on the <Level> level
@@ -21,5 +35,8 @@
       | DRB1*14:06:01 | lgx   | DRB1*14:06   |
       | DRB1*14:06:01 | lg    | DRB1*14:06g  |
-      | C*02:02       | lg    | C*02:02g/C*02:10g |
-      | C*02:02       | lgx   | C*02:02/C*02:10   |
+      | C*02:02       | lg    | C*02:02g     |
+      | C*02:02       | lgx   | C*02:02      |
+      | C*02:10       | lg    | C*02:02g     |
+      | C*02:10       | lgx   | C*02:02      |
+      | C*06:17       | lgx   | C*06:17      |
 
diff --git a/tests/steps/redux_allele.py b/tests/steps/redux_allele.py
index f7da113..d0d15e0 100644
--- a/tests/steps/redux_allele.py
+++ b/tests/steps/redux_allele.py
@@ -15,6 +15,12 @@ def step_impl(context, level):
     context.redux_allele = context.ard.redux(context.allele, level)
 
 
+@when("reducing on the {level} level with ping")
+def step_impl(context, level):
+    context.level = level
+    context.redux_allele = context.ard_ping.redux(context.allele, level)
+
+
 @when("reducing on the {level} level (ambiguous)")
 def step_impl(context, level):
     context.level = level
diff --git a/tests/test_pyard.py b/tests/test_pyard.py
index c011667..ff2847b 100644
--- a/tests/test_pyard.py
+++ b/tests/test_pyard.py
@@ -156,4 +156,4 @@ def test_allele_duplicated(self):
         # https://github.com/nmdp-bioinformatics/py-ard/issues/135
         allele_code = "C*02:ACMGS"
         allele_code_rx = self.ard.redux_gl(allele_code, "lgx")
-        self.assertEqual(allele_code_rx, "C*02:02/C*02:10")
+        self.assertEqual(allele_code_rx, "C*02:02")