Skip to content

Commit

Permalink
Merge pull request #57 from pbashyal-nmdp/refactor_data_to_sqlite
Browse files Browse the repository at this point in the history
Refactor reference data to db
  • Loading branch information
mmaiers-nmdp committed Oct 15, 2020
2 parents b5f951c + 916d1c2 commit e06cea7
Show file tree
Hide file tree
Showing 7 changed files with 532 additions and 373 deletions.
2 changes: 1 addition & 1 deletion pyard/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@
from .pyard import ARD

__author__ = """NMDP Bioinformatics"""
__version__ = '0.2.0'
__version__ = '0.3.0'
233 changes: 233 additions & 0 deletions pyard/data_repository.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
import sqlite3

import pandas as pd

from pyard import db
from pyard.broad_splits import broad_splits_mapping

# Base GitHub (raw content) URL from which the IMGT HLA files are downloaded.
# Callers append the branch/version and the file path to it.
IMGT_HLA_URL = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/'

# Expression characters; checked against the last character of an allele name
expression_chars = ['N', 'Q', 'L', 'S']


def get_n_field_allele(allele: str, n: int) -> str:
    """
    Given an HLA allele of >= n field, return n field allele.
    Preserve the expression character if it exists.

    An empty `allele` returns an empty string rather than raising.

    :param allele: Original allele
    :param n: n number of fields to reduce to
    :return: trimmed to n fields of the original allele
    """
    fields = allele.split(':')
    trimmed = ':'.join(fields[:n])
    # Slice instead of index so that an empty allele does not raise IndexError
    last_char = allele[-1:]
    # The expression character is only re-attached when fields were dropped;
    # otherwise it is still part of the last field that was kept.
    if last_char in expression_chars and len(fields) > n:
        return trimmed + last_char
    return trimmed


def get_3field_allele(a: str) -> str:
    """
    Reduce allele `a` to its 3-field form.

    :param a: Original allele
    :return: result of `get_n_field_allele` for 3 fields
    """
    return get_n_field_allele(allele=a, n=3)


def get_2field_allele(a: str) -> str:
    """
    Reduce allele `a` to its 2-field form.

    :param a: Original allele
    :return: result of `get_n_field_allele` for 2 fields
    """
    return get_n_field_allele(allele=a, n=2)


def _allele_to_group_dict(df, group_column):
    """
    Map the 2-field, 3-field and full allele names in `df` to `group_column`.

    Rows are stacked in the order 2-field, 3-field, full allele; when the same
    allele name occurs more than once the last occurrence wins.
    """
    stacked = pd.concat([
        df[['2d', group_column]].rename(columns={'2d': 'A'}),
        df[['3d', group_column]].rename(columns={'3d': 'A'}),
        df[['A', group_column]],
    ], ignore_index=True)
    return stacked.set_index('A')[group_column].to_dict()


def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
    """
    Load the allele -> G / lg / lgx group mappings, building them if needed.

    If the tables `dup_g`, `g_group`, `lg_group` and `lgx_group` all exist in
    the database, the mappings are loaded from them. Otherwise the
    `wmda/hla_nom_g.txt` file for `imgt_version` is downloaded, the mappings
    are derived from it and saved to those tables.

    :param db_connection: Database connection to the sqlite database
    :param imgt_version: IMGT database version (used as the path component
        of the download URL)
    :return: tuple of dicts `(dup_g, g_group, lg_group, lgx_group)`
    """
    if db.tables_exists(db_connection, ['dup_g', 'g_group', 'lg_group', 'lgx_group']):
        dup_g = db.load_dict(db_connection, table_name='dup_g', columns=('allele', 'g_group'))
        g_group = db.load_dict(db_connection, table_name='g_group', columns=('allele', 'g'))
        lg_group = db.load_dict(db_connection, table_name='lg_group', columns=('allele', 'lg'))
        lgx_group = db.load_dict(db_connection, table_name='lgx_group', columns=('allele', 'lgx'))
        return dup_g, g_group, lg_group, lgx_group

    # File is ';' separated: locus, '/' separated allele list, G group.
    # Rows with any missing value are dropped.
    ars_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt'
    df = pd.read_csv(ars_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna()

    # One row per allele, with the locus prefixed to both allele and G group
    df['A'] = df['A'].apply(lambda a: a.split('/'))
    df = df.explode('A')
    df['A'] = df['Locus'] + df['A']
    df['G'] = df['Locus'] + df['G']

    df['2d'] = df['A'].apply(get_2field_allele)
    df['3d'] = df['A'].apply(get_3field_allele)

    # 2-field alleles that belong to more than one G group.
    # Read the alleles straight from the index: the column produced by
    # `reset_index()` is not named 'index' on pandas >= 2.0.
    mg = df.drop_duplicates(['2d', 'G'])['2d'].value_counts()
    multiple_g_list = mg[mg > 1].index.to_list()

    # 2-field allele -> '/' joined list of all its G groups
    dup_g = df[df['2d'].isin(multiple_g_list)][['G', '2d']] \
        .drop_duplicates() \
        .groupby('2d', as_index=True).agg("/".join) \
        .to_dict()['G']

    # lg: first two fields of the G group with a 'g' suffix; lgx: without it
    df['lg'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]) + "g")
    df['lgx'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]))

    # Creating dictionaries with allele->ARS group mapping
    g_group = _allele_to_group_dict(df, 'G')
    lg_group = _allele_to_group_dict(df, 'lg')
    lgx_group = _allele_to_group_dict(df, 'lgx')

    db.save_dict(db_connection, table_name='dup_g', dictionary=dup_g, columns=('allele', 'g_group'))
    db.save_dict(db_connection, table_name='g_group', dictionary=g_group, columns=('allele', 'g'))
    db.save_dict(db_connection, table_name='lg_group', dictionary=lg_group, columns=('allele', 'lg'))
    db.save_dict(db_connection, table_name='lgx_group', dictionary=lgx_group, columns=('allele', 'lgx'))

    return dup_g, g_group, lg_group, lgx_group


def generate_mac_codes(db_connection: sqlite3.Connection):
    """
    Download the MAC codes and store them in the `mac_codes` table, unless
    that table already exists.

    MAC files come in 2 different versions:

    Martin: when they’re printed, the first is better for encoding and the
    second is better for decoding. The entire list was maintained both as an
    excel spreadsheet and also as a sybase database table. The excel was the
    one that was printed and distributed.

    **==> numer.v3.txt <==**

    Sorted by the length and then the values in the list
    ```
    "LAST UPDATED: 09/30/20"
    CODE    SUBTYPE

    AB      01/02
    AC      01/03
    AD      01/04
    AE      01/05
    AG      01/06
    AH      01/07
    AJ      01/08
    ```

    **==> alpha.v3.txt <==**

    Sorted by the code
    ```
    "LAST UPDATED: 10/01/20"
    *   CODE    SUBTYPE

        AA      01/02/03/05
        AB      01/02
        AC      01/03
        AD      01/04
        AE      01/05
        AF      01/09
        AG      01/06
    ```

    Only the `numer.v3` version is downloaded here.

    :param db_connection: Database connection to the sqlite database
    :return: None
    """
    mac_table_name = 'mac_codes'
    # Nothing to do when the codes have already been saved
    if db.table_exists(db_connection, mac_table_name):
        return

    # Read the zipped, tab separated MAC file into a DataFrame
    mac_url = 'https://hml.nmdp.org/mac/files/numer.v3.zip'
    mac_frame = pd.read_csv(
        mac_url,
        sep='\t',
        compression='zip',
        skiprows=3,
        names=['Code', 'Alleles'],
    )
    # code -> alleles lookup
    code_to_alleles = mac_frame.set_index("Code")["Alleles"].to_dict()
    # Persist the lookup in the database
    db.save_dict(
        db_connection,
        table_name=mac_table_name,
        dictionary=code_to_alleles,
        columns=('code', 'alleles'),
    )


def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_version):
    """
    Load the set of valid alleles and the xx codes, building them if needed.

    If the `alleles` table already exists in the database, the valid alleles
    and xx codes are loaded from the `alleles` and `xx_codes` tables.
    Otherwise the allele list file for `imgt_version` is downloaded, the
    valid allele set and corresponding xx codes are created from it and
    saved to those tables.

    The format of the AlleleList file has a 6-line header with a header
    on the 7th line
    ```
    # file: Allelelist.3290.txt
    # date: 2017-07-10
    # version: IPD-IMGT/HLA 3.29.0
    # origin: https://github.com/ANHIG/IMGTHLA/Allelelist.3290.txt
    # repository: https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/Allelelist.3290.txt
    # author: WHO, Steven G. E. Marsh (steven.marsh@ucl.ac.uk)
    AlleleID,Allele
    HLA00001,A*01:01:01:01
    HLA02169,A*01:01:01:02N
    HLA14798,A*01:01:01:03
    HLA15760,A*01:01:01:04
    HLA16415,A*01:01:01:05
    HLA16417,A*01:01:01:06
    HLA16436,A*01:01:01:07
    ```

    :param db_connection: Database connection to the sqlite database
    :param imgt_version: IMGT database version
    :return: tuple `(valid_alleles, xx_codes)` where `valid_alleles` is a set
        of allele names and `xx_codes` maps a first-field name to the list of
        its 2-field alleles
    """

    if db.table_exists(db_connection, 'alleles'):
        valid_alleles = db.load_set(db_connection, 'alleles')
        xx_codes = db.load_dict(db_connection, 'xx_codes',
                                ('allele_1d', 'allele_list'))
        # The allele lists are stored '/' joined; turn them back into lists
        xx_codes = {k: v.split('/') for k, v in xx_codes.items()}
        return valid_alleles, xx_codes

    # Create a Pandas DataFrame from the allele list file
    # Skip the header (first 6 lines) and use only the Allele column
    # Versioned allele lists are read from the `Latest` branch as well.
    if imgt_version == "Latest":
        allele_list_url = f'{IMGT_HLA_URL}Latest/Allelelist.txt'
    else:
        allele_list_url = f'{IMGT_HLA_URL}Latest/Allelelist.{imgt_version}.txt'
    allele_df = pd.read_csv(allele_list_url, header=6, usecols=['Allele'])

    # Create a set of valid alleles
    # All 2-field, 3-field and the original Alleles are considered valid alleles
    allele_df['2d'] = allele_df['Allele'].apply(get_2field_allele)
    allele_df['3d'] = allele_df['Allele'].apply(get_3field_allele)
    valid_alleles = set(allele_df['Allele']) \
        | set(allele_df['2d']) \
        | set(allele_df['3d'])

    # Create xx_codes mapping from the unique alleles in 2-field column
    xx_df = pd.DataFrame(allele_df['2d'].unique(), columns=['Allele'])
    # Also create a first-field column
    xx_df['1d'] = xx_df['Allele'].apply(lambda x: x.split(":")[0])
    # xx_codes maps a first field name to its 2 field expansion
    xx_codes = xx_df.groupby('1d')['Allele'].apply(list).to_dict()

    # Update xx codes with broads and splits
    for broad, splits in broad_splits_mapping.items():
        for split in splits:
            if broad in xx_codes:
                xx_codes[broad].extend(xx_codes[split])
            else:
                # Copy the list: sharing it with the split would make the
                # `extend` for the following splits modify the split's own
                # entry as well.
                xx_codes[broad] = list(xx_codes[split])

    # Save this version of the valid alleles and xx codes
    db.save_set(db_connection, 'alleles', valid_alleles, 'allele')
    flat_xx_codes = {k: '/'.join(v) for k, v in xx_codes.items()}
    db.save_dict(db_connection, 'xx_codes', flat_xx_codes,
                 ('allele_1d', 'allele_list'))

    return valid_alleles, xx_codes
Loading

0 comments on commit e06cea7

Please sign in to comment.