From ce9aabb236e0448e4199c23a8d6607a1ed12a15a Mon Sep 17 00:00:00 2001 From: Martin Maiers Date: Tue, 31 May 2022 07:36:02 -0500 Subject: [PATCH 1/7] works but not for G mode --- pyard/data_repository.py | 35 ++++++++++++++++++++++------------- pyard/misc.py | 8 ++++++++ pyard/pyard.py | 3 ++- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/pyard/data_repository.py b/pyard/data_repository.py index 806ffe7..5700cba 100644 --- a/pyard/data_repository.py +++ b/pyard/data_repository.py @@ -62,7 +62,7 @@ def expression_reduce(df): return None -def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version): +def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version, ping): if db.tables_exist(db_connection, ars_mapping_tables): dup_g = db.load_dict(db_connection, table_name='dup_g', columns=('allele', 'g_group')) dup_lg = db.load_dict(db_connection, table_name='dup_lg', columns=('allele', 'lg_group')) @@ -76,8 +76,24 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version): g_group=g_group, lg_group=lg_group, lgx_group=lgx_group, exon_group=exon_group, p_group=p_group) + # P groups + ars_P_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt' + df_P = pd.read_csv(ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";").dropna() + df_P['A'] = df_P['A'].apply(lambda a: a.split('/')) + df_P = df_P.explode('A') + df_P['A'] = df_P['Locus'] + df_P['A'] + df_P['P'] = df_P['Locus'] + df_P['P'] + p_group = df_P.set_index('A')['P'].to_dict() + + ars_G_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt' - df = pd.read_csv(ars_G_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna() + df_G = pd.read_csv(ars_G_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna() + if ping: + # put the P codes in the G-codes early to catch C*06:17 -> C*06:02 + df_PinG = pd.read_csv(ars_P_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna() + df = pd.concat([df_PinG, df_G]) + else: + df = df_G df['A'] = df['A'].apply(lambda a: a.split('/')) df = df.explode('A') @@ -86,8 +102,10 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version): df['2d'] = df['A'].apply(get_2field_allele) df['3d'] = df['A'].apply(get_3field_allele) - df['lg'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]) + "g") - df['lgx'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2])) + #df['lg'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]) + "g") + df['lg'] = df['G'].apply(lambda a: get_2field_allele(a) + "g") + #df['lgx'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2])) + df['lgx'] = df['G'].apply(get_2field_allele) # multiple Gs mg = df.drop_duplicates(['2d', 'G'])['2d'].value_counts() @@ -145,15 +163,6 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version): df_exon = pd.concat([df[['A', '3d']].rename(columns={'3d': 'exon'}), ]) exon_group = df_exon.set_index('A')['exon'].to_dict() - # P groups - ars_P_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt' - df_P = pd.read_csv(ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";").dropna() - df_P['A'] = df_P['A'].apply(lambda a: a.split('/')) - df_P = df_P.explode('A') - df_P['A'] = df_P['Locus'] + df_P['A'] - df_P['P'] = df_P['Locus'] + df_P['P'] - p_group = df_P.set_index('A')['P'].to_dict() - # save db.save_dict(db_connection, table_name='dup_g', dictionary=dup_g, columns=('allele', 'g_group')) db.save_dict(db_connection, table_name='dup_lg', dictionary=dup_lg, columns=('allele', 'lg_group')) diff --git a/pyard/misc.py b/pyard/misc.py index 5e212c9..c2886a3 100644 --- a/pyard/misc.py +++ b/pyard/misc.py @@ -1,5 +1,6 @@ # List of expression characters expression_chars = ['N', 'Q', 'L', 'S'] +PandG_chars = ['P', 'G'] def get_n_field_allele(allele: str, n: int, preserve_expression=False) -> str: @@ -21,10 +22,17 @@ def get_n_field_allele(allele: str, n: int, preserve_expression=False) -> str: def get_3field_allele(a: str) -> str: + last_char = a[-1] + if last_char in PandG_chars: + a = a[:-1] + return get_n_field_allele(a, 3) def get_2field_allele(a: str) -> str: + last_char = a[-1] + if last_char in PandG_chars: + a = a[:-1] return get_n_field_allele(a, 2) diff --git a/pyard/pyard.py b/pyard/pyard.py index 7d2945e..2284728 100644 --- a/pyard/pyard.py +++ b/pyard/pyard.py @@ -48,6 +48,7 @@ "reduce_XX": True, "reduce_MAC": True, "reduce_shortnull": True, + "ping": False, "map_drb345_to_drbx": True, "verbose_log": True } @@ -99,7 +100,7 @@ def __init__(self, imgt_version: str = 'Latest', data_dir: str = None, config: d # Load MAC codes dr.generate_mac_codes(self.db_connection, False) # Load ARS mappings - self.ars_mappings = dr.generate_ars_mapping(self.db_connection, imgt_version) + self.ars_mappings = dr.generate_ars_mapping(self.db_connection, imgt_version, self._config['ping']) # Load Alleles and XX Codes self.valid_alleles, self.who_alleles, self.xx_codes, self.who_group, self.shortnulls, self.exp_alleles = \ dr.generate_alleles_and_xx_codes_and_who(self.db_connection, imgt_version, self.ars_mappings) From 65105ce01e14bae33658c6ec0244c61419e2d942 Mon Sep 17 00:00:00 2001 From: Martin Maiers Date: Tue, 27 Sep 2022 08:20:08 -0500 Subject: [PATCH 2/7] ping mode --- pyard/data_repository.py | 75 ++++++++++++++++++++++++++++++++-------- pyard/misc.py | 24 +++++++++++++ pyard/pyard.py | 13 +++++-- 3 files changed, 95 insertions(+), 17 deletions(-) diff --git a/pyard/data_repository.py b/pyard/data_repository.py index edd1563..8b2638b 100644 --- a/pyard/data_repository.py +++ b/pyard/data_repository.py @@ -30,7 +30,7 @@ from .broad_splits import broad_splits_dna_mapping from .broad_splits import broad_splits_ser_mapping from .misc import get_2field_allele, get_3field_allele, number_of_fields -from .misc import expression_chars +from .misc import expression_chars, get_G_name, get_P_name # GitHub URL where IMGT HLA files are downloaded. from pyard.smart_sort import smart_sort_comparator @@ -46,6 +46,7 @@ "lgx_group", "exon_group", "p_group", + "p_not_g", ] ARSMapping = namedtuple("ARSMapping", ars_mapping_tables) @@ -76,7 +77,7 @@ def expression_reduce(df): return None -def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version, ping): +def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version): if db.tables_exist(db_connection, ars_mapping_tables): dup_g = db.load_dict( db_connection, table_name="dup_g", columns=("allele", "g_group") @@ -102,6 +103,9 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version, ping): p_group = db.load_dict( db_connection, table_name="p_group", columns=("allele", "p") ) + p_not_g = db.load_dict( + db_connection, table_name="p_not_g", columns=("allele", "lgx") + ) return ARSMapping( dup_g=dup_g, dup_lg=dup_lg, @@ -111,28 +115,45 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version, ping): lgx_group=lgx_group, exon_group=exon_group, p_group=p_group, + p_not_g=p_not_g, ) - # P groups + + # load the hla_nom_g.txt + ars_G_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt' + df = pd.read_csv(ars_G_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna() + + # the G-group is named for its first allele + df["G"] = df["A"].apply(lambda a: get_G_name(a)) + + # load the hla_nom_p.txt ars_P_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt' + # example: C*;06:06:01:01/06:06:01:02/06:271;06:06P df_P = pd.read_csv(ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";").dropna() - df_P['A'] = df_P['A'].apply(lambda a: a.split('/')) - df_P = df_P.explode('A') + + # the P-group is named for its first allele + df_P["P"] = df_P["A"].apply(lambda a: get_P_name(a)) + + # convert slash delimited string to a list + df_P["A"] = df_P["A"].apply(lambda a: a.split("/")) + df_P = df_P.explode("A") + # C* 06:06:01:01/06:06:01:02/06:271 06:06P df_P['A'] = df_P['Locus'] + df_P['A'] df_P['P'] = df_P['Locus'] + df_P['P'] + # C* 06:06:01:01 06:06P + # C* 06:06:01:02 06:06P + # C* 06:271 06:06P p_group = df_P.set_index('A')['P'].to_dict() + df_P["2d"] = df_P["A"].apply(get_2field_allele) + # lgx has the P-group name without the P for comparison + df_P["lgx"] = df_P["P"].apply(get_2field_allele) - ars_G_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt' - df_G = pd.read_csv(ars_G_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna() - if ping: - # put the P codes in the G-codes early to catch C*06:17 -> C*06:02 - df_PinG = pd.read_csv(ars_P_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna() - df = pd.concat([df_PinG, df_G]) - else: - df = df_G - + # convert slash delimited string to a list df["A"] = df["A"].apply(lambda a: a.split("/")) + # convert the list into separate rows for each element df = df.explode("A") + + # A* + 02:01 = A*02:01 df["A"] = df["Locus"] + df["A"] df["G"] = df["Locus"] + df["G"] @@ -141,8 +162,25 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version, ping): df["lg"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2]) + "g") df["lgx"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2])) + # compare df_P["2d"] with df["2d"] to find 2-field alleles in the + # P-group that aren't in the G-group + PnotinG = set(df_P["2d"]) - set(df["2d"]) + + # filter to find these 2-field alleles (2d) in the P-group data frame + df_PnotG = df_P[df_P["2d"].isin(PnotinG)] + + # dictionary which will define the table + p_not_g = df_PnotG.set_index("A")["lgx"].to_dict() + + # multiple Gs + # goal: identify 2-field alleles that are in multiple G-groups + + # group by 2d and G, and select the 2d column and count the columns mg = df.drop_duplicates(["2d", "G"])["2d"].value_counts() + # filter out the mg with count > 1, leaving only duplicates + # take the index from the 2d version the data frame, make that a column + # and turn that into a list multiple_g_list = mg[mg > 1].reset_index()["index"].to_list() # Keep only the alleles that have more than 1 mapping @@ -218,6 +256,12 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version, ping): exon_group = df_exon.set_index("A")["exon"].to_dict() # save + db.save_dict( + db_connection, + table_name="p_not_g", + dictionary=p_not_g, + columns=("allele", "lgx"), + ) db.save_dict( db_connection, table_name="dup_g", @@ -260,7 +304,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version, ping): db.save_dict( db_connection, table_name="p_group", - dictionary=exon_group, + dictionary=p_group, columns=("allele", "p"), ) @@ -273,6 +317,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version, ping): lgx_group=lgx_group, exon_group=exon_group, p_group=p_group, + p_not_g=p_not_g, ) diff --git a/pyard/misc.py b/pyard/misc.py index 976f775..58a0229 100644 --- a/pyard/misc.py +++ b/pyard/misc.py @@ -37,3 +37,27 @@ def get_2field_allele(a: str) -> str: def number_of_fields(allele: str) -> int: return len(allele.split(":")) + +# computes a valid G name based on the ambiguity string +def get_G_name(a: str) -> str: + a = a.split('/')[0] + last_char = a[-1] + if last_char in PandG_chars + expression_chars: + a = a[:-1] + if len(a.split(':')) ==2: + return ':'.join([a,"01"]) + else: + return ':'.join(a.split(':')[0:3]) + "G" + +# computes a valid P name based on the ambiguity string +def get_P_name(a: str) -> str: + a = a.split('/')[0] + last_char = a[-1] + if last_char in PandG_chars + expression_chars: + a = a[:-1] + return ':'.join(a.split(':')[0:2]) + "P" + + +def number_of_fields(allele: str) -> int: + return len(allele.split(":")) + diff --git a/pyard/pyard.py b/pyard/pyard.py index 71d1ff3..c86afee 100644 --- a/pyard/pyard.py +++ b/pyard/pyard.py @@ -102,7 +102,7 @@ def __init__( # Load MAC codes dr.generate_mac_codes(self.db_connection, False) # Load ARS mappings - self.ars_mappings = dr.generate_ars_mapping(self.db_connection, imgt_version, self._config['ping']) + self.ars_mappings = dr.generate_ars_mapping(self.db_connection, imgt_version) # Load Alleles and XX Codes ( self.valid_alleles, @@ -141,7 +141,7 @@ def __del__(self): self.db_connection.close() @functools.lru_cache(maxsize=max_cache_size) - def redux(self, allele: str, redux_type: VALID_REDUCTION_TYPES) -> str: + def redux(self, allele: str, redux_type: VALID_REDUCTION_TYPES, reping=True) -> str: """ Does ARS reduction with allele and ARS type @@ -173,6 +173,15 @@ def redux(self, allele: str, redux_type: VALID_REDUCTION_TYPES) -> str: if allele.endswith(("P", "G")): if redux_type in ["lg", "lgx", "G"]: allele = allele[:-1] + if self._config["ping"] and reping: + if redux_type in ("lg", "lgx", "U2"): + if allele in self.ars_mappings.p_not_g: + # return a joined + return self.ars_mappings.p_not_g[allele] + else: + return self.redux(allele, redux_type, False) + + if redux_type == "G" and allele in self.ars_mappings.g_group: if allele in self.ars_mappings.dup_g: return self.ars_mappings.dup_g[allele] From 8f173f802beb3dcd2c7a8f8d2e45e1520fd2467a Mon Sep 17 00:00:00 2001 From: Martin Maiers Date: Tue, 27 Sep 2022 08:34:50 -0500 Subject: [PATCH 3/7] updated test to fix its behavior --- pyard/misc.py | 2 +- tests/features/allele.feature | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pyard/misc.py b/pyard/misc.py index 58a0229..2313c7a 100644 --- a/pyard/misc.py +++ b/pyard/misc.py @@ -45,7 +45,7 @@ def get_G_name(a: str) -> str: if last_char in PandG_chars + expression_chars: a = a[:-1] if len(a.split(':')) ==2: - return ':'.join([a,"01"]) + return ':'.join([a,"01"]) + "G" else: return ':'.join(a.split(':')[0:3]) + "G" diff --git a/tests/features/allele.feature b/tests/features/allele.feature index 21d0564..732b369 100644 --- a/tests/features/allele.feature +++ b/tests/features/allele.feature @@ -21,5 +21,7 @@ Feature: Alleles | DRB1*14:06:01 | lgx | DRB1*14:06 | | DRB1*14:06:01 | lg | DRB1*14:06g | - | C*02:02 | lg | C*02:02g/C*02:10g | - | C*02:02 | lgx | C*02:02/C*02:10 | + | C*02:02 | lg | C*02:02g | + | C*02:02 | lgx | C*02:02 | + | C*02:10 | lg | C*02:02g | + | C*02:10 | lgx | C*02:02 | From b8b56e195ac344698da41c4cb4c85b8f8b9b84d0 Mon Sep 17 00:00:00 2001 From: Martin Maiers Date: Wed, 28 Sep 2022 09:54:59 -0500 Subject: [PATCH 4/7] remove lambdas, add and remove comments --- pyard/data_repository.py | 4 ++-- pyard/misc.py | 1 + pyard/pyard.py | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyard/data_repository.py b/pyard/data_repository.py index 8b2638b..45dd160 100644 --- a/pyard/data_repository.py +++ b/pyard/data_repository.py @@ -124,7 +124,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version): df = pd.read_csv(ars_G_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna() # the G-group is named for its first allele - df["G"] = df["A"].apply(lambda a: get_G_name(a)) + df["G"] = df["A"].apply(get_G_name) # load the hla_nom_p.txt ars_P_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt' @@ -132,7 +132,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version): df_P = pd.read_csv(ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";").dropna() # the P-group is named for its first allele - df_P["P"] = df_P["A"].apply(lambda a: get_P_name(a)) + df_P["P"] = df_P["A"].apply(get_P_name) # convert slash delimited string to a list df_P["A"] = df_P["A"].apply(lambda a: a.split("/")) diff --git a/pyard/misc.py b/pyard/misc.py index 2313c7a..0725892 100644 --- a/pyard/misc.py +++ b/pyard/misc.py @@ -1,5 +1,6 @@ # List of expression characters expression_chars = ['N', 'Q', 'L', 'S'] +# List of P and G characters PandG_chars = ['P', 'G'] diff --git a/pyard/pyard.py b/pyard/pyard.py index c86afee..8d05845 100644 --- a/pyard/pyard.py +++ b/pyard/pyard.py @@ -176,7 +176,6 @@ def redux(self, allele: str, redux_type: VALID_REDUCTION_TYPES, reping=True) -> if self._config["ping"] and reping: if redux_type in ("lg", "lgx", "U2"): if allele in self.ars_mappings.p_not_g: - # return a joined return self.ars_mappings.p_not_g[allele] else: return self.redux(allele, redux_type, False) From 0c756706943486bf32d9357d8cde157813bbd2d2 Mon Sep 17 00:00:00 2001 From: Martin Maiers Date: Wed, 28 Sep 2022 13:59:26 -0500 Subject: [PATCH 5/7] fixed test expected results to new correct answer --- tests/test_pyard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_pyard.py b/tests/test_pyard.py index c011667..ff2847b 100644 --- a/tests/test_pyard.py +++ b/tests/test_pyard.py @@ -156,4 +156,4 @@ def test_allele_duplicated(self): # https://github.com/nmdp-bioinformatics/py-ard/issues/135 allele_code = "C*02:ACMGS" allele_code_rx = self.ard.redux_gl(allele_code, "lgx") - self.assertEqual(allele_code_rx, "C*02:02/C*02:10") + self.assertEqual(allele_code_rx, "C*02:02") From 1a5c148414f7b5d822f22d2d40347e1888122b8f Mon Sep 17 00:00:00 2001 From: Martin Maiers Date: Thu, 29 Sep 2022 14:12:47 -0500 Subject: [PATCH 6/7] ping tests --- tests/environment.py | 6 ++++++ tests/features/allele.feature | 17 ++++++++++++++++- tests/steps/redux_allele.py | 6 ++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/tests/environment.py b/tests/environment.py index 4b93784..f778cd1 100644 --- a/tests/environment.py +++ b/tests/environment.py @@ -3,3 +3,9 @@ def before_all(context): context.ard = ARD("3440", data_dir="/tmp/py-ard") + + # an ard with ping set to True + my_config = { + "ping": True, + } + context.ard_ping = ARD("3440", data_dir="/tmp/py-ard", config=my_config) diff --git a/tests/features/allele.feature b/tests/features/allele.feature index 732b369..2e78a79 100644 --- a/tests/features/allele.feature +++ b/tests/features/allele.feature @@ -1,6 +1,20 @@ Feature: Alleles - Scenario Outline: + Scenario Outline: allele reduction with ping + + Given the allele as + When reducing on the level with ping + Then the reduced allele is found to be + + Examples: + | Allele | Level | Redux Allele | + | C*02:02 | lg | C*02:02g | + | C*02:02 | lgx | C*02:02 | + | C*02:10 | lg | C*02:02g | + | C*02:10 | lgx | C*02:02 | + | C*06:17 | lgx | C*06:02 | + + Scenario Outline: allele reduction Given the allele as When reducing on the level @@ -25,3 +39,4 @@ Feature: Alleles | C*02:02 | lgx | C*02:02 | | C*02:10 | lg | C*02:02g | | C*02:10 | lgx | C*02:02 | + | C*06:17 | lgx | C*06:17 | diff --git a/tests/steps/redux_allele.py b/tests/steps/redux_allele.py index f7da113..d0d15e0 100644 --- a/tests/steps/redux_allele.py +++ b/tests/steps/redux_allele.py @@ -15,6 +15,12 @@ def step_impl(context, level): context.redux_allele = context.ard.redux(context.allele, level) +@when("reducing on the {level} level with ping") +def step_impl(context, level): + context.level = level + context.redux_allele = context.ard_ping.redux(context.allele, level) + + @when("reducing on the {level} level (ambiguous)") def step_impl(context, level): context.level = level From ffebb519c170d60e9f7f42f75caf98333500f9b9 Mon Sep 17 00:00:00 2001 From: Martin Maiers Date: Thu, 29 Sep 2022 14:19:38 -0500 Subject: [PATCH 7/7] lint --- pyard/data_repository.py | 25 ++++++++++++------------- pyard/misc.py | 23 ++++++++++++----------- pyard/pyard.py | 5 ++--- 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/pyard/data_repository.py b/pyard/data_repository.py index 45dd160..c8a80ea 100644 --- a/pyard/data_repository.py +++ b/pyard/data_repository.py @@ -23,7 +23,6 @@ from collections import namedtuple import functools import sqlite3 - import pandas as pd from . import db @@ -118,34 +117,35 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version): p_not_g=p_not_g, ) - # load the hla_nom_g.txt - ars_G_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt' + ars_G_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt" df = pd.read_csv(ars_G_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna() # the G-group is named for its first allele - df["G"] = df["A"].apply(get_G_name) + df["G"] = df["A"].apply(get_G_name) # load the hla_nom_p.txt - ars_P_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt' + ars_P_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt" # example: C*;06:06:01:01/06:06:01:02/06:271;06:06P - df_P = pd.read_csv(ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";").dropna() + df_P = pd.read_csv( + ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";" + ).dropna() # the P-group is named for its first allele - df_P["P"] = df_P["A"].apply(get_P_name) + df_P["P"] = df_P["A"].apply(get_P_name) # convert slash delimited string to a list df_P["A"] = df_P["A"].apply(lambda a: a.split("/")) df_P = df_P.explode("A") # C* 06:06:01:01/06:06:01:02/06:271 06:06P - df_P['A'] = df_P['Locus'] + df_P['A'] - df_P['P'] = df_P['Locus'] + df_P['P'] + df_P["A"] = df_P["Locus"] + df_P["A"] + df_P["P"] = df_P["Locus"] + df_P["P"] # C* 06:06:01:01 06:06P # C* 06:06:01:02 06:06P # C* 06:271 06:06P - p_group = df_P.set_index('A')['P'].to_dict() + p_group = df_P.set_index("A")["P"].to_dict() df_P["2d"] = df_P["A"].apply(get_2field_allele) - # lgx has the P-group name without the P for comparison + # lgx has the P-group name without the P for comparison df_P["lgx"] = df_P["P"].apply(get_2field_allele) # convert slash delimited string to a list @@ -162,7 +162,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version): df["lg"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2]) + "g") df["lgx"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2])) - # compare df_P["2d"] with df["2d"] to find 2-field alleles in the + # compare df_P["2d"] with df["2d"] to find 2-field alleles in the # P-group that aren't in the G-group PnotinG = set(df_P["2d"]) - set(df["2d"]) @@ -172,7 +172,6 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version): # dictionary which will define the table p_not_g = df_PnotG.set_index("A")["lgx"].to_dict() - # multiple Gs # goal: identify 2-field alleles that are in multiple G-groups diff --git a/pyard/misc.py b/pyard/misc.py index 0725892..4d0addb 100644 --- a/pyard/misc.py +++ b/pyard/misc.py @@ -1,7 +1,7 @@ # List of expression characters -expression_chars = ['N', 'Q', 'L', 'S'] +expression_chars = ["N", "Q", "L", "S"] # List of P and G characters -PandG_chars = ['P', 'G'] +PandG_chars = ["P", "G"] def get_n_field_allele(allele: str, n: int, preserve_expression=False) -> str: @@ -25,7 +25,7 @@ def get_3field_allele(a: str) -> str: last_char = a[-1] if last_char in PandG_chars: a = a[:-1] - + return get_n_field_allele(a, 3) @@ -39,26 +39,27 @@ def get_2field_allele(a: str) -> str: def number_of_fields(allele: str) -> int: return len(allele.split(":")) + # computes a valid G name based on the ambiguity string def get_G_name(a: str) -> str: - a = a.split('/')[0] + a = a.split("/")[0] last_char = a[-1] if last_char in PandG_chars + expression_chars: a = a[:-1] - if len(a.split(':')) ==2: - return ':'.join([a,"01"]) + "G" + if len(a.split(":")) == 2: + return ":".join([a, "01"]) + "G" else: - return ':'.join(a.split(':')[0:3]) + "G" - + return ":".join(a.split(":")[0:3]) + "G" + + # computes a valid P name based on the ambiguity string def get_P_name(a: str) -> str: - a = a.split('/')[0] + a = a.split("/")[0] last_char = a[-1] if last_char in PandG_chars + expression_chars: a = a[:-1] - return ':'.join(a.split(':')[0:2]) + "P" + return ":".join(a.split(":")[0:2]) + "P" def number_of_fields(allele: str) -> int: return len(allele.split(":")) - diff --git a/pyard/pyard.py b/pyard/pyard.py index 8d05845..1381806 100644 --- a/pyard/pyard.py +++ b/pyard/pyard.py @@ -177,10 +177,9 @@ def redux(self, allele: str, redux_type: VALID_REDUCTION_TYPES, reping=True) -> if redux_type in ("lg", "lgx", "U2"): if allele in self.ars_mappings.p_not_g: return self.ars_mappings.p_not_g[allele] - else: + else: return self.redux(allele, redux_type, False) - - + if redux_type == "G" and allele in self.ars_mappings.g_group: if allele in self.ars_mappings.dup_g: return self.ars_mappings.dup_g[allele]