-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #57 from pbashyal-nmdp/refactor_data_to_sqlite
Refactor reference data to db
- Loading branch information
Showing 7 changed files with 532 additions and 373 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,4 +24,4 @@ | |
from .pyard import ARD | ||
|
||
__author__ = """NMDP Bioinformatics""" | ||
__version__ = '0.2.0' | ||
__version__ = '0.3.0' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,233 @@ | ||
import sqlite3 | ||
|
||
import pandas as pd | ||
|
||
from pyard import db | ||
from pyard.broad_splits import broad_splits_mapping | ||
|
||
# GitHub URL where IMGT HLA files are downloaded.
IMGT_HLA_URL = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/'

# Suffix characters that mark an allele's expression status
# (Null, Questionable, Low, Secreted).
expression_chars = ['N', 'Q', 'L', 'S']


def get_n_field_allele(allele: str, n: int) -> str:
    """
    Trim an HLA allele with >= n fields down to n fields.

    A trailing expression character (one of ``expression_chars``) is
    carried over to the trimmed allele when fields are actually dropped.

    :param allele: original allele name
    :param n: number of fields to keep
    :return: allele reduced to n fields
    """
    fields = allele.split(':')
    reduced = ':'.join(fields[:n])
    suffix = allele[-1]
    # Re-attach the expression suffix only if trimming removed it.
    if suffix in expression_chars and len(fields) > n:
        reduced += suffix
    return reduced
|
||
|
||
def get_3field_allele(a: str) -> str:
    """Reduce allele `a` to at most 3 fields, keeping any expression suffix."""
    return get_n_field_allele(a, 3)
|
||
|
||
def get_2field_allele(a: str) -> str:
    """Reduce allele `a` to at most 2 fields, keeping any expression suffix."""
    return get_n_field_allele(a, 2)
|
||
|
||
def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
    """
    Build the ARS group mappings for the given IMGT/HLA database version
    from the hla_nom_g.txt file, persist them to the sqlite cache, and
    return them.

    Returns 4 dictionaries keyed by allele name:
      - dup_g: 2-field alleles belonging to multiple G groups -> "/"-joined G groups
      - g_group: allele (full, 3-field and 2-field) -> G group
      - lg_group: allele -> lg group (first 2 fields of the G group + "g")
      - lgx_group: allele -> lgx group (first 2 fields of the G group)

    If the 4 tables already exist in the database, the cached mappings
    are loaded and returned without re-downloading.

    :param db_connection: connection to the sqlite cache database
    :param imgt_version: IMGT/HLA database version (e.g. '3290' or 'Latest')
    :return: (dup_g, g_group, lg_group, lgx_group) tuple of dicts
    """
    # Fast path: reuse previously generated mappings from the db cache.
    if db.tables_exists(db_connection, ['dup_g', 'g_group', 'lg_group', 'lgx_group']):
        dup_g = db.load_dict(db_connection, table_name='dup_g', columns=('allele', 'g_group'))
        g_group = db.load_dict(db_connection, table_name='g_group', columns=('allele', 'g'))
        lg_group = db.load_dict(db_connection, table_name='lg_group', columns=('allele', 'lg'))
        lgx_group = db.load_dict(db_connection, table_name='lgx_group', columns=('allele', 'lgx'))
        return dup_g, g_group, lg_group, lgx_group

    # hla_nom_g.txt is ';'-separated with a 6-line comment header:
    # Locus ; '/'-separated allele list ; G group name
    ars_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt'
    df = pd.read_csv(ars_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna()

    # One row per allele: split the '/'-separated allele list and explode,
    # then prefix both the allele and the G group with the locus name.
    df['A'] = df['A'].apply(lambda a: a.split('/'))
    df = df.explode('A')
    df['A'] = df['Locus'] + df['A']
    df['G'] = df['Locus'] + df['G']

    df['2d'] = df['A'].apply(get_2field_allele)
    df['3d'] = df['A'].apply(get_3field_allele)

    # 2-field alleles that belong to more than one G group.
    # NOTE(review): `.reset_index()['index']` relies on the older pandas
    # value_counts() column naming; newer pandas names the index column
    # after the original Series ('2d') — verify against the pinned pandas.
    mg = df.drop_duplicates(['2d', 'G'])['2d'].value_counts()
    multiple_g_list = mg[mg > 1].reset_index()['index'].to_list()

    # For those, map the 2-field allele -> "/"-joined list of its G groups.
    dup_g = df[df['2d'].isin(multiple_g_list)][['G', '2d']] \
        .drop_duplicates() \
        .groupby('2d', as_index=True).agg("/".join) \
        .to_dict()['G']

    # lg = first 2 fields of the G group + 'g' suffix; lgx = first 2 fields.
    df['lg'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]) + "g")
    df['lgx'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]))

    # Creating dictionaries with mac_code->ARS group mapping
    # (each maps 2-field, 3-field and full allele names alike).
    df_g = pd.concat([
        df[['2d', 'G']].rename(columns={'2d': 'A'}),
        df[['3d', 'G']].rename(columns={'3d': 'A'}),
        df[['A', 'G']]
    ], ignore_index=True)
    g_group = df_g.set_index('A')['G'].to_dict()

    df_lg = pd.concat([
        df[['2d', 'lg']].rename(columns={'2d': 'A'}),
        df[['3d', 'lg']].rename(columns={'3d': 'A'}),
        df[['A', 'lg']]
    ])
    lg_group = df_lg.set_index('A')['lg'].to_dict()

    df_lgx = pd.concat([
        df[['2d', 'lgx']].rename(columns={'2d': 'A'}),
        df[['3d', 'lgx']].rename(columns={'3d': 'A'}),
        df[['A', 'lgx']]
    ])
    lgx_group = df_lgx.set_index('A')['lgx'].to_dict()

    # Cache all 4 mappings so subsequent runs take the fast path above.
    db.save_dict(db_connection, table_name='dup_g', dictionary=dup_g, columns=('allele', 'g_group'))
    db.save_dict(db_connection, table_name='g_group', dictionary=g_group, columns=('allele', 'g'))
    db.save_dict(db_connection, table_name='lg_group', dictionary=lg_group, columns=('allele', 'lg'))
    db.save_dict(db_connection, table_name='lgx_group', dictionary=lgx_group, columns=('allele', 'lgx'))

    return dup_g, g_group, lg_group, lgx_group
|
||
|
||
def generate_mac_codes(db_connection: sqlite3.Connection):
    """
    Download the NMDP Multiple Allele Code (MAC) list and store the
    code -> allele-list mapping in the `mac_codes` table. The table is
    built only once; if it already exists this is a no-op.

    MAC files come in 2 different versions:

    **numer.v3.txt** — sorted by the length and then the values in the list::

        "LAST UPDATED: 09/30/20"
        CODE SUBTYPE

        AB 01/02
        AC 01/03
        AD 01/04
        AE 01/05
        AG 01/06
        AH 01/07
        AJ 01/08

    **alpha.v3.txt** — sorted by the code::

        "LAST UPDATED: 10/01/20"
        * CODE SUBTYPE

        AA 01/02/03/05
        AB 01/02
        AC 01/03
        AD 01/04
        AE 01/05
        AF 01/09
        AG 01/06

    (Martin: when they're printed, the first is better for encoding and
    the second is better for decoding. The entire list was maintained
    both as an excel spreadsheet and as a sybase database table; the
    excel was the one printed and distributed.)

    :param db_connection: connection to the sqlite cache database
    :return: None
    """
    mac_table_name = 'mac_codes'
    # Guard clause: nothing to do when the table was built previously.
    if db.table_exists(db_connection, mac_table_name):
        return

    # numer.v3.zip has a 3-line header; pandas reads the zip straight
    # from the URL.
    mac_url = 'https://hml.nmdp.org/mac/files/numer.v3.zip'
    df_mac = pd.read_csv(mac_url, sep='\t', compression='zip',
                         skiprows=3, names=['Code', 'Alleles'])
    # Map each code to its '/'-separated allele list and persist it.
    mac_mapping = df_mac.set_index("Code")["Alleles"].to_dict()
    db.save_dict(db_connection, table_name=mac_table_name,
                 dictionary=mac_mapping, columns=('code', 'alleles'))
|
||
|
||
def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_version):
    """
    Create the set of valid alleles and the XX code expansions for the
    given IMGT/HLA version, caching both in the sqlite database. If the
    tables already exist, the cached data is loaded and returned.

    The AlleleList file has a 6-line comment header with a CSV header
    on the 7th line::

        # file: Allelelist.3290.txt
        # date: 2017-07-10
        # version: IPD-IMGT/HLA 3.29.0
        # origin: https://github.com/ANHIG/IMGTHLA/Allelelist.3290.txt
        # repository: https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/Allelelist.3290.txt
        # author: WHO, Steven G. E. Marsh (steven.marsh@ucl.ac.uk)
        AlleleID,Allele
        HLA00001,A*01:01:01:01
        HLA02169,A*01:01:01:02N
        HLA14798,A*01:01:01:03
        HLA15760,A*01:01:01:04
        HLA16415,A*01:01:01:05
        HLA16417,A*01:01:01:06
        HLA16436,A*01:01:01:07

    :param db_connection: Database connection to the sqlite database
    :param imgt_version: IMGT database version
    :return: (valid_alleles, xx_codes) — valid_alleles is a set of all
             full/3-field/2-field allele names; xx_codes maps a 1-field
             name to the list of its 2-field expansions
    """

    # Fast path: load the cached version if it was generated before.
    if db.table_exists(db_connection, 'alleles'):
        valid_alleles = db.load_set(db_connection, 'alleles')
        xx_codes = db.load_dict(db_connection, 'xx_codes',
                                ('allele_1d', 'allele_list'))
        # Stored flattened as '/'-joined strings; re-expand to lists.
        xx_codes = {k: v.split('/') for k, v in xx_codes.items()}
        return valid_alleles, xx_codes

    # Create a Pandas DataFrame from the AlleleList file.
    # Skip the header (first 6 lines) and use only the Allele column.
    if imgt_version == "Latest":
        allele_list_url = f'{IMGT_HLA_URL}Latest/Allelelist.txt'
    else:
        # Versioned snapshots are published on the Latest branch as
        # Allelelist.<version>.txt (see the `repository:` header above).
        allele_list_url = f'{IMGT_HLA_URL}Latest/Allelelist.{imgt_version}.txt'
    allele_df = pd.read_csv(allele_list_url, header=6, usecols=['Allele'])

    # Create a set of valid alleles:
    # all 2-field, 3-field and the original alleles are considered valid.
    allele_df['2d'] = allele_df['Allele'].apply(get_2field_allele)
    allele_df['3d'] = allele_df['Allele'].apply(get_3field_allele)
    valid_alleles = set(allele_df['Allele']). \
        union(set(allele_df['2d'])). \
        union(set(allele_df['3d']))

    # Create xx_codes mapping from the unique alleles in 2-field column
    xx_df = pd.DataFrame(allele_df['2d'].unique(), columns=['Allele'])
    # Also create a first-field column
    xx_df['1d'] = xx_df['Allele'].apply(lambda x: x.split(":")[0])
    # xx_codes maps a first field name to its 2 field expansion
    xx_codes = xx_df.groupby(['1d']) \
        .apply(lambda x: list(x['Allele'])) \
        .to_dict()

    # Update xx codes with broads and splits
    for broad, splits in broad_splits_mapping.items():
        for split in splits:
            if broad in xx_codes:
                xx_codes[broad].extend(xx_codes[split])
            else:
                # BUGFIX: copy the list. Assigning xx_codes[split] directly
                # aliases the split's own list, so the extend() performed for
                # a subsequent split would also corrupt xx_codes[split].
                xx_codes[broad] = xx_codes[split].copy()

    # Save this version of the valid alleles and xx codes
    db.save_set(db_connection, 'alleles', valid_alleles, 'allele')
    flat_xx_codes = {k: '/'.join(v) for k, v in xx_codes.items()}
    db.save_dict(db_connection, 'xx_codes', flat_xx_codes,
                 ('allele_1d', 'allele_list'))

    return valid_alleles, xx_codes
Oops, something went wrong.