Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions extras/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Extras

# Batch Script for CSV File

**Example Scripts to batch reduce HLA typings from a CSV File**

The `reduce_csv.py` and `conf.py` scripts take a CSV file with HLA typing data,
reduce the configured columns, and produce a new CSV or Excel file.

For most use cases, installing `py-ard`, specifying the changes in the `conf.py` file,
and running `python reduce_csv.py` will produce a result based on the configuration
in `conf.py`.


```python
#
# configurations for processing CSV files
#

# The column names that are in CSV
# The output file will have these columns
all_columns_in_csv = [
"nmdp_id", "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2",
"r_dpb1_typ1", "r_dpb1_typ2"
]

#
# List of columns which have typing information and need to be reduced.
# The locus is the 2nd term in the column name
# Eg: For column r_drb1_typ1, DRB1 is the locus name
#
columns_to_reduce_in_csv = [
"r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2", "r_dpb1_typ1",
"r_dpb1_typ2"
]

#
# Configuration options to ARD reduction of a CSV file
#
ard_config = {
# All Columns in the CSV file
"csv_in_column_names": all_columns_in_csv,

# Columns to check for typings
"columns_to_check": columns_to_reduce_in_csv,

# How should the typings be reduced
# Valid Options:
# - G
# - lg
# - lgx
"redux_type": "lgx",

# Input CSV filename
"in_csv_filename": "sample.csv",

# Output CSV filename
"out_csv_filename": 'clean_sample.csv',

# Use compression
# Valid options
# - 'gzip'
# - 'zip'
# - None
"apply_compression": 'gzip',

# Show verbose log
# Valid options:
# - True
# - False
"verbose_log": True,

# What to reduce ?
"reduce_serology": False,
"reduce_v2": True,
"reduce_3field": True,
"reduce_P": True,
"reduce_XX": False,
"reduce_MAC": True,

# Is locus name present in allele
# Eg. A*01:01 vs 01:01
"locus_in_allele_name": False,

# Format
# Valid options:
# - csv
# - xlsx
"output_file_format": 'csv',

# Add a separate column for processed column
"new_column_for_redux": False,
}
```

The included sample CSV file `sample.csv` can be processed using the script.

```shell
python reduce_csv.py
```
78 changes: 78 additions & 0 deletions extras/conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#
# configurations for processing CSV files
#

# Names of the columns present in the input CSV.
# The output file is written with these same columns.
all_columns_in_csv = [
    "nmdp_id",
    "r_a_typ1",
    "r_a_typ2",
    "r_b_typ1",
    "r_b_typ2",
    "r_c_typ1",
    "r_c_typ2",
    "r_drb1_typ1",
    "r_drb1_typ2",
    "r_dpb1_typ1",
    "r_dpb1_typ2",
]

#
# Columns that hold typing information and need to be reduced.
# The locus is the 2nd underscore-separated term of the column name.
# Eg: For column r_drb1_typ1, DRB1 is the locus name
#
columns_to_reduce_in_csv = [
    "r_a_typ1",
    "r_a_typ2",
    "r_b_typ1",
    "r_b_typ2",
    "r_c_typ1",
    "r_c_typ2",
    "r_drb1_typ1",
    "r_drb1_typ2",
    "r_dpb1_typ1",
    "r_dpb1_typ2",
]

#
# Options controlling the ARD reduction of a CSV file
#
ard_config = {
    # Every column present in the CSV file
    "csv_in_column_names": all_columns_in_csv,

    # Columns whose typings are checked and reduced
    "columns_to_check": columns_to_reduce_in_csv,

    # Reduction type. Valid options: G, lg, lgx
    "redux_type": "lgx",

    # Input CSV filename
    "in_csv_filename": "sample.csv",

    # Output CSV filename
    "out_csv_filename": "clean_sample.csv",

    # Compression for the CSV output. Valid options: 'gzip', 'zip', None
    "apply_compression": "gzip",

    # Show verbose log. Valid options: True, False
    "verbose_log": True,

    # Which kinds of typings to reduce
    "reduce_serology": False,
    "reduce_v2": True,
    "reduce_3field": True,
    "reduce_P": True,
    "reduce_XX": False,
    "reduce_MAC": True,

    # Is the locus name present in the allele? Eg. A*01:01 vs 01:01
    "locus_in_allele_name": False,

    # Output format. Valid options: csv, xlsx
    "output_file_format": "csv",

    # Put the reduced value in a separate column next to the original one
    "new_column_for_redux": False,
}
127 changes: 127 additions & 0 deletions extras/reduce_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#
#
# Quick script to reduce alleles from a CSV file
#
# Use `conf.py` to setup configurations that's used here
# For Excel output, openpyxl library needs to be installed.
# pip install openpyxl
#

import pandas as pd
import pyard
import re

from conf import ard_config

# Matches any run of whitespace; used to strip whitespace out of typings.
white_space_regex = re.compile(r"\s+")
# When true, progress and per-allele results are printed.
verbose = ard_config["verbose_log"]


def is_serology(allele: str) -> bool:
    """Return True when the typing has no ':' field separator.

    A typing consisting of a single field (no ':') is treated as serology.
    Always returns a real bool; the previous version fell through and
    returned None for non-serology typings.
    """
    return len(allele.split(':')) == 1


def is_3field(allele: str) -> bool:
    """Return True when the allele has more than two ':'-separated fields."""
    # N separators give N + 1 fields, so two or more colons means 3+ fields.
    separator_count = allele.count(':')
    return separator_count >= 2


def is_P(allele: str) -> bool:
    """Return True when the allele looks like a P group, e.g. ``01:01P``.

    A P group has exactly two fields, both numeric, followed by a trailing
    ``P``. An optional locus prefix (``A*01:01P``) is ignored.

    The previous version tested the first field twice and never looked at
    the second one, so two-field codes that merely end in ``P`` (such as
    ``01:AP``) were misclassified as P groups.
    """
    if not allele.endswith('P'):
        return False
    # Drop an optional "LOCUS*" prefix so only the numeric fields remain.
    fields = allele.rpartition('*')[2].split(':')
    if len(fields) != 2:  # Ps are 2 fields
        return False
    # The second field carries the trailing 'P'; strip it before checking.
    return fields[0].isdigit() and fields[1][:-1].isdigit()


def clean_locus(allele: str, column_name: str = 'Unknown') -> str:
    """Strip whitespace from a typing cell and reduce every allele in it.

    The locus is the second ``_``-separated term of ``column_name``,
    upper-cased (``r_drb1_typ1`` -> ``DRB1``).

    Args:
        allele: Cell value; a single allele or a ``/``-separated allele list.
            Falsy values (e.g. the empty string) are returned unchanged.
        column_name: Name of the column the value came from.

    Returns:
        The reduced alleles, joined with ``/``.

    Raises:
        IndexError: If ``column_name`` contains no ``_`` (as the default does).
    """
    if not allele:
        return allele

    # Remove all white spaces
    allele = white_space_regex.sub('', allele)
    locus = column_name.split('_')[1].upper()

    # Apply reduce to each allele of an allele list with the SAME locus.
    # map(reduce, alleles, locus) must not be used here: it pairs the alleles
    # with the individual characters of the locus string and silently drops
    # alleles once those characters run out.
    return "/".join(reduce(single, locus) for single in allele.split('/'))


def should_be_reduced(allele, locus_allele):
    """Decide from ``ard_config`` whether this typing should be reduced.

    ``allele`` is the typing as found in the CSV; ``locus_allele`` is the
    same typing in the form built by ``reduce`` (locus-prefixed where
    needed). Relies on the module-level ``ard`` object for the v2/XX/MAC
    checks.
    """
    # Serology is decided by its own switch alone; no other check applies.
    if is_serology(allele):
        return ard_config["reduce_serology"]

    # Each remaining category is consulted only when its switch is on.
    if ard_config["reduce_v2"] and ard.is_v2(locus_allele):
        return True

    if ard_config["reduce_3field"] and is_3field(locus_allele):
        return True

    if ard_config["reduce_P"] and is_P(allele):
        return True

    if ard_config["reduce_XX"] and ard.is_XX(locus_allele):
        return True

    # A typing that is_XX accepts is never treated as a MAC here.
    if ard_config["reduce_MAC"] and ard.is_mac(locus_allele) and not ard.is_XX(locus_allele):
        return True

    return False


def reduce(allele, locus):
    """Reduce a single allele and return it without the locus prefix.

    The allele is reduced with ``ard.redux_gl`` only when
    ``should_be_reduced`` says so; otherwise it is returned unchanged.
    With verbose logging on, the mapping is printed for every allele.
    """
    # Use the allele as-is when it already carries a locus (or the config
    # says it does); otherwise prefix it with the column's locus.
    already_has_locus = '*' in allele or ard_config["locus_in_allele_name"]
    locus_allele = allele if already_has_locus else f"{locus}*{allele}"

    # Check the config if this allele should be reduced
    if should_be_reduced(allele, locus_allele):
        reduced_allele = ard.redux_gl(locus_allele, ard_config["redux_type"])
        if reduced_allele:
            # Keep only the part after the locus for each reduced allele.
            without_locus = [part.split('*')[1] for part in reduced_allele.split('/')]
            allele = "/".join(without_locus)
        elif verbose:
            print(f"Failed to reduce {locus_allele}")

    if verbose:
        print(f"\t{locus_allele} => {allele}")
    return allele


if __name__ == '__main__':
    # `ard` is intentionally module-level: should_be_reduced() and reduce()
    # look it up as a global.
    ard = pyard.ARD(remove_invalid=False)

    # Read everything as strings; header=0 with names= replaces the file's
    # own header row with the configured column names.
    df = pd.read_csv(ard_config["in_csv_filename"], names=ard_config["csv_in_column_names"], header=0, dtype=str)
    df.fillna('', inplace=True)

    for column in ard_config["columns_to_check"]:
        if verbose:
            print(f"Column:{column} =>")
        reduced_values = df[column].apply(clean_locus, column_name=column)
        if ard_config["new_column_for_redux"]:
            # insert a new column right after the original one
            new_column_name = f"reduced_{column}"
            new_column_index = df.columns.get_loc(column) + 1
            df.insert(new_column_index, new_column_name, reduced_values)
        else:
            df[column] = reduced_values

    if ard_config["output_file_format"] == 'xlsx':
        out_file_name = f"{ard_config['out_csv_filename']}.xlsx"
        df.to_excel(out_file_name, index=False)
    else:
        compression = ard_config["apply_compression"]
        # Pick the extension that matches the compression. Previously the
        # name became '' when compression was None (the conditional applied
        # to the whole expression) and '.gz' was used even for 'zip'.
        extension = {'gzip': '.gz', 'zip': '.zip'}.get(compression, '')
        out_file_name = f"{ard_config['out_csv_filename']}{extension}"
        df.to_csv(out_file_name, index=False, compression=compression)
    if verbose:
        print(f"Saved result to file:{out_file_name}")
4 changes: 4 additions & 0 deletions extras/sample.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
nmdp_id,r_a_typ1,r_a_typ2,r_b_typ1,r_b_typ2,r_c_typ1,r_c_typ2,r_drb1_typ1,r_drb1_typ2,r_dpb1_typ1,r_dpb1_typ2
123,A*23:71,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01
456,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01
789,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01