Skip to content
Merged

PnotG #178

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 62 additions & 14 deletions pyard/data_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,13 @@
from collections import namedtuple
import functools
import sqlite3

import pandas as pd

from . import db
from .broad_splits import broad_splits_dna_mapping
from .broad_splits import broad_splits_ser_mapping
from .misc import get_2field_allele, get_3field_allele, number_of_fields
from .misc import expression_chars
from .misc import expression_chars, get_G_name, get_P_name

# GitHub URL where IMGT HLA files are downloaded.
from pyard.smart_sort import smart_sort_comparator
Expand All @@ -46,6 +45,7 @@
"lgx_group",
"exon_group",
"p_group",
"p_not_g",
]
ARSMapping = namedtuple("ARSMapping", ars_mapping_tables)

Expand Down Expand Up @@ -102,6 +102,9 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
p_group = db.load_dict(
db_connection, table_name="p_group", columns=("allele", "p")
)
p_not_g = db.load_dict(
db_connection, table_name="p_not_g", columns=("allele", "lgx")
)
return ARSMapping(
dup_g=dup_g,
dup_lg=dup_lg,
Expand All @@ -111,13 +114,46 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
lgx_group=lgx_group,
exon_group=exon_group,
p_group=p_group,
p_not_g=p_not_g,
)

# load the hla_nom_g.txt
ars_G_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt"
df = pd.read_csv(ars_G_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna()

# the G-group is named for its first allele
df["G"] = df["A"].apply(get_G_name)

# load the hla_nom_p.txt
ars_P_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt"
# example: C*;06:06:01:01/06:06:01:02/06:271;06:06P
df_P = pd.read_csv(
ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";"
).dropna()

# the P-group is named for its first allele
df_P["P"] = df_P["A"].apply(get_P_name)

# convert slash delimited string to a list
df_P["A"] = df_P["A"].apply(lambda a: a.split("/"))
df_P = df_P.explode("A")
# C* 06:06:01:01/06:06:01:02/06:271 06:06P
df_P["A"] = df_P["Locus"] + df_P["A"]
df_P["P"] = df_P["Locus"] + df_P["P"]
# C* 06:06:01:01 06:06P
# C* 06:06:01:02 06:06P
# C* 06:271 06:06P
p_group = df_P.set_index("A")["P"].to_dict()
df_P["2d"] = df_P["A"].apply(get_2field_allele)
# lgx has the P-group name without the P for comparison
df_P["lgx"] = df_P["P"].apply(get_2field_allele)

# convert slash delimited string to a list
df["A"] = df["A"].apply(lambda a: a.split("/"))
# convert the list into separate rows for each element
df = df.explode("A")

# A* + 02:01 = A*02:01
df["A"] = df["Locus"] + df["A"]
df["G"] = df["Locus"] + df["G"]

Expand All @@ -126,8 +162,24 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
df["lg"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2]) + "g")
df["lgx"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2]))

# compare df_P["2d"] with df["2d"] to find 2-field alleles in the
# P-group that aren't in the G-group
PnotinG = set(df_P["2d"]) - set(df["2d"])

# filter to find these 2-field alleles (2d) in the P-group data frame
df_PnotG = df_P[df_P["2d"].isin(PnotinG)]

# dictionary which will define the table
p_not_g = df_PnotG.set_index("A")["lgx"].to_dict()

# multiple Gs
# goal: identify 2-field alleles that are in multiple G-groups

# group by 2d and G, and select the 2d column and count the columns
mg = df.drop_duplicates(["2d", "G"])["2d"].value_counts()
# filter out the mg with count > 1, leaving only duplicates
# take the index from the 2d version the data frame, make that a column
# and turn that into a list
multiple_g_list = mg[mg > 1].reset_index()["index"].to_list()

# Keep only the alleles that have more than 1 mapping
Expand Down Expand Up @@ -202,18 +254,13 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
)
exon_group = df_exon.set_index("A")["exon"].to_dict()

# P groups
ars_P_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt"
df_P = pd.read_csv(
ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";"
).dropna()
df_P["A"] = df_P["A"].apply(lambda a: a.split("/"))
df_P = df_P.explode("A")
df_P["A"] = df_P["Locus"] + df_P["A"]
df_P["P"] = df_P["Locus"] + df_P["P"]
p_group = df_P.set_index("A")["P"].to_dict()

# save
db.save_dict(
db_connection,
table_name="p_not_g",
dictionary=p_not_g,
columns=("allele", "lgx"),
)
db.save_dict(
db_connection,
table_name="dup_g",
Expand Down Expand Up @@ -256,7 +303,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
db.save_dict(
db_connection,
table_name="p_group",
dictionary=exon_group,
dictionary=p_group,
columns=("allele", "p"),
)

Expand All @@ -269,6 +316,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
lgx_group=lgx_group,
exon_group=exon_group,
p_group=p_group,
p_not_g=p_not_g,
)


Expand Down
34 changes: 34 additions & 0 deletions pyard/misc.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# List of expression characters
expression_chars = ["N", "Q", "L", "S"]
# List of P and G characters
PandG_chars = ["P", "G"]


def get_n_field_allele(allele: str, n: int, preserve_expression=False) -> str:
Expand All @@ -20,12 +22,44 @@ def get_n_field_allele(allele: str, n: int, preserve_expression=False) -> str:


def get_3field_allele(a: str) -> str:
    """Reduce *a* to at most 3 colon-separated fields.

    A trailing P/G group designator (e.g. "A*02:01:01G") is stripped
    before truncation so it is not mistaken for part of the last field.
    """
    if a[-1] in PandG_chars:
        return get_n_field_allele(a[:-1], 3)
    return get_n_field_allele(a, 3)


def get_2field_allele(a: str) -> str:
    """Reduce *a* to at most 2 colon-separated fields.

    A trailing P/G group designator (e.g. "C*06:06P") is stripped
    before truncation so it is not mistaken for part of the last field.
    """
    if a[-1] in PandG_chars:
        return get_n_field_allele(a[:-1], 2)
    return get_n_field_allele(a, 2)


def number_of_fields(allele: str) -> int:
    """Return the number of colon-separated fields in *allele*.

    A string with no ":" separator counts as a single field.
    """
    return allele.count(":") + 1


def get_G_name(a: str) -> str:
    """Compute a valid G-group name from a slash-delimited ambiguity string.

    The G-group is named for its first allele: take the first entry,
    drop any trailing P/G or expression character, then truncate to
    3 fields (padding 2-field names with ":01") and append "G".
    """
    first = a.split("/")[0]
    if first[-1] in PandG_chars + expression_chars:
        first = first[:-1]
    fields = first.split(":")
    if len(fields) == 2:
        return f"{first}:01G"
    return ":".join(fields[:3]) + "G"


def get_P_name(a: str) -> str:
    """Compute a valid P-group name from a slash-delimited ambiguity string.

    The P-group is named for its first allele: take the first entry,
    drop any trailing P/G or expression character, then truncate to
    2 fields and append "P".
    """
    first = a.split("/")[0]
    if first[-1] in PandG_chars + expression_chars:
        first = first[:-1]
    return ":".join(first.split(":")[:2]) + "P"


# NOTE(review): this is an identical duplicate of number_of_fields defined
# earlier in this module; one of the two definitions should be removed.
def number_of_fields(allele: str) -> int:
    """Return the number of colon-separated fields in *allele*."""
    return len(allele.split(":"))
10 changes: 9 additions & 1 deletion pyard/pyard.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
"reduce_XX": True,
"reduce_MAC": True,
"reduce_shortnull": True,
"ping": False,
"map_drb345_to_drbx": True,
"verbose_log": True,
}
Expand Down Expand Up @@ -140,7 +141,7 @@ def __del__(self):
self.db_connection.close()

@functools.lru_cache(maxsize=max_cache_size)
def redux(self, allele: str, redux_type: VALID_REDUCTION_TYPES) -> str:
def redux(self, allele: str, redux_type: VALID_REDUCTION_TYPES, reping=True) -> str:
"""
Does ARS reduction with allele and ARS type

Expand Down Expand Up @@ -172,6 +173,13 @@ def redux(self, allele: str, redux_type: VALID_REDUCTION_TYPES) -> str:
if allele.endswith(("P", "G")):
if redux_type in ["lg", "lgx", "G"]:
allele = allele[:-1]
if self._config["ping"] and reping:
if redux_type in ("lg", "lgx", "U2"):
if allele in self.ars_mappings.p_not_g:
return self.ars_mappings.p_not_g[allele]
else:
return self.redux(allele, redux_type, False)

if redux_type == "G" and allele in self.ars_mappings.g_group:
if allele in self.ars_mappings.dup_g:
return self.ars_mappings.dup_g[allele]
Expand Down
6 changes: 6 additions & 0 deletions tests/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,9 @@

def before_all(context):
    """Behave hook: build the shared ARD fixtures once before all scenarios."""
    imgt_version = "3440"
    data_dir = "/tmp/py-ard"
    # Default reducer (ping disabled).
    context.ard = ARD(imgt_version, data_dir=data_dir)
    # Reducer with "ping" (P-not-G) handling enabled via config.
    context.ard_ping = ARD(imgt_version, data_dir=data_dir, config={"ping": True})
23 changes: 20 additions & 3 deletions tests/features/allele.feature
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
Feature: Alleles

Scenario Outline:
Scenario Outline: allele reduction with ping

Given the allele as <Allele>
When reducing on the <Level> level with ping
Then the reduced allele is found to be <Redux Allele>

Examples:
| Allele | Level | Redux Allele |
| C*02:02 | lg | C*02:02g |
| C*02:02 | lgx | C*02:02 |
| C*02:10 | lg | C*02:02g |
| C*02:10 | lgx | C*02:02 |
| C*06:17 | lgx | C*06:02 |

Scenario Outline: allele reduction

Given the allele as <Allele>
When reducing on the <Level> level
Expand All @@ -21,5 +35,8 @@ Feature: Alleles

| DRB1*14:06:01 | lgx | DRB1*14:06 |
| DRB1*14:06:01 | lg | DRB1*14:06g |
| C*02:02 | lg | C*02:02g/C*02:10g |
| C*02:02 | lgx | C*02:02/C*02:10 |
| C*02:02 | lg | C*02:02g |
| C*02:02 | lgx | C*02:02 |
| C*02:10 | lg | C*02:02g |
| C*02:10 | lgx | C*02:02 |
| C*06:17 | lgx | C*06:17 |
6 changes: 6 additions & 0 deletions tests/steps/redux_allele.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ def step_impl(context, level):
context.redux_allele = context.ard.redux(context.allele, level)


@when("reducing on the {level} level with ping")
def step_impl(context, level):
    # Reduce using the ping-enabled ARD fixture (context.ard_ping),
    # which is created in the environment's before_all hook.
    context.level = level
    context.redux_allele = context.ard_ping.redux(context.allele, level)


@when("reducing on the {level} level (ambiguous)")
def step_impl(context, level):
context.level = level
Expand Down
2 changes: 1 addition & 1 deletion tests/test_pyard.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,4 @@ def test_allele_duplicated(self):
# https://github.com/nmdp-bioinformatics/py-ard/issues/135
allele_code = "C*02:ACMGS"
allele_code_rx = self.ard.redux_gl(allele_code, "lgx")
self.assertEqual(allele_code_rx, "C*02:02/C*02:10")
self.assertEqual(allele_code_rx, "C*02:02")