From 508ce799c759ed3cc5480e9724a04d2f8176e6f0 Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Wed, 3 Feb 2021 08:44:14 -0600 Subject: [PATCH] Extra Helper Scripts for Batch processing CSV File `reduce_csv.py` : the driver file `conf.py`: configuration `sample.csv`: sample CSV file --- extras/README.md | 100 ++++++++++++++++++++++++++++++++++ extras/conf.py | 78 ++++++++++++++++++++++++++ extras/reduce_csv.py | 127 +++++++++++++++++++++++++++++++++++++++++++ extras/sample.csv | 4 ++ 4 files changed, 309 insertions(+) create mode 100644 extras/README.md create mode 100644 extras/conf.py create mode 100644 extras/reduce_csv.py create mode 100644 extras/sample.csv diff --git a/extras/README.md b/extras/README.md new file mode 100644 index 0000000..9a69c57 --- /dev/null +++ b/extras/README.md @@ -0,0 +1,100 @@ +# Extras + +# Batch Script for CSV File + +**Example Scripts to batch reduce HLA typings from a CSV File** + +`reduce_csv.py` and `conf.py` scripts can be used to take a CSV file with HLA +typing data and reduce certain columns and produce a new CSV and Excel file. + +For most use case, installing `py-ard`, specifying the changes in `conf.py` file +and running `python reduce_csv.py` will produce result based on the configuration +in the `conf.py`. + + +```python +# +# configurations for processing CSV files +# + +# The column names that are in CSV +# The output file will have these columns +all_columns_in_csv = [ + "nmdp_id", "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2", + "r_dpb1_typ1", "r_dpb1_typ2" +] + +# +# List of columns which have typing information and need to be reduced. 
+# The locus is the 2nd term in the column name
+# Eg: For column R_DRB1_typ1, DRB1 is the locus name
+#
+columns_to_reduce_in_csv = [
+    "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2", "r_dpb1_typ1",
+    "r_dpb1_typ2"
+]
+
+#
+# Configuration options for ARD reduction of a CSV file
+#
+ard_config = {
+    # All Columns in the CSV file
+    "csv_in_column_names": all_columns_in_csv,
+
+    # Columns to check for typings
+    "columns_to_check": columns_to_reduce_in_csv,
+
+    # How should the typings be reduced
+    # Valid Options:
+    # - G
+    # - lg
+    # - lgx
+    "redux_type": "lgx",
+
+    # Input CSV filename
+    "in_csv_filename": "sample.csv",
+
+    # Output CSV filename
+    "out_csv_filename": 'clean_sample.csv',
+
+    # Use compression
+    # Valid options
+    # - 'gzip'
+    # - 'zip'
+    # - None
+    "apply_compression": 'gzip',
+
+    # Show verbose log
+    # Valid options:
+    # - True
+    # - False
+    "verbose_log": True,
+
+    # What to reduce ?
+    "reduce_serology": False,
+    "reduce_v2": True,
+    "reduce_3field": True,
+    "reduce_P": True,
+    "reduce_XX": False,
+    "reduce_MAC": True,
+
+    # Is locus name present in allele
+    # Eg. A*01:01 vs 01:01
+    "locus_in_allele_name": False,
+
+    # Format
+    # Valid options:
+    # - csv
+    # - xlsx
+    "output_file_format": 'csv',
+
+    # Add a separate column for processed column
+    "new_column_for_redux": False,
+}
+```
+
+The included sample CSV file `sample.csv` can be processed using the script.
+
+```shell
+python reduce_csv.py
+```
diff --git a/extras/conf.py b/extras/conf.py
new file mode 100644
index 0000000..82fbc56
--- /dev/null
+++ b/extras/conf.py
@@ -0,0 +1,78 @@
+#
+# configurations for processing CSV files
+#
+
+# The column names that are in CSV
+# The output file will have these columns
+all_columns_in_csv = [
+    "nmdp_id", "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2",
+    "r_dpb1_typ1", "r_dpb1_typ2"
+]
+
+#
+# List of columns which have typing information and need to be reduced.
+# The locus is the 2nd term in the column name
+# Eg: For column R_DRB1_typ1, DRB1 is the locus name
+#
+columns_to_reduce_in_csv = [
+    "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2", "r_dpb1_typ1",
+    "r_dpb1_typ2"
+]
+
+#
+# Configuration options for ARD reduction of a CSV file
+#
+ard_config = {
+    # All Columns in the CSV file
+    "csv_in_column_names": all_columns_in_csv,
+
+    # Columns to check for typings
+    "columns_to_check": columns_to_reduce_in_csv,
+
+    # How should the typings be reduced
+    # Valid Options:
+    # - G
+    # - lg
+    # - lgx
+    "redux_type": "lgx",
+
+    # Input CSV filename
+    "in_csv_filename": "sample.csv",
+
+    # Output CSV filename
+    "out_csv_filename": 'clean_sample.csv',
+
+    # Use compression
+    # Valid options
+    # - 'gzip'
+    # - 'zip'
+    # - None
+    "apply_compression": 'gzip',
+
+    # Show verbose log
+    # Valid options:
+    # - True
+    # - False
+    "verbose_log": True,
+
+    # What to reduce ?
+    "reduce_serology": False,
+    "reduce_v2": True,
+    "reduce_3field": True,
+    "reduce_P": True,
+    "reduce_XX": False,
+    "reduce_MAC": True,
+
+    # Is locus name present in allele
+    # Eg. A*01:01 vs 01:01
+    "locus_in_allele_name": False,
+
+    # Format
+    # Valid options:
+    # - csv
+    # - xlsx
+    "output_file_format": 'csv',
+
+    # Add a separate column for processed column
+    "new_column_for_redux": False,
+}
diff --git a/extras/reduce_csv.py b/extras/reduce_csv.py
new file mode 100644
index 0000000..41ae1f6
--- /dev/null
+++ b/extras/reduce_csv.py
@@ -0,0 +1,127 @@
+#
+#
+# Quick script to reduce alleles from a CSV file
+#
+# Use `conf.py` to set up the configuration used here
+# For Excel output, the openpyxl library needs to be installed.
+# pip install openpyxl
+#
+
+import pandas as pd
+import pyard
+import re
+
+from conf import ard_config
+
+verbose = ard_config["verbose_log"]
+white_space_regex = re.compile(r"\s+")
+
+
+def is_serology(allele: str) -> bool:
+    # Serology typings have a single field (no ':' separator)
+    return len(allele.split(':')) == 1
+
+
+def is_3field(allele: str) -> bool:
+    return len(allele.split(':')) > 2
+
+
+def is_P(allele: str) -> bool:
+    if allele.endswith('P'):
+        fields = allele.split(':')
+        if len(fields) == 2:  # Ps are 2 fields
+            return fields[0].isdigit() and fields[1][:-1].isdigit()  # e.g. "01:01P"
+    return False
+
+
+def clean_locus(allele: str, column_name: str = 'Unknown') -> str:
+    if allele:
+        # Remove all white spaces
+        allele = white_space_regex.sub('', allele)
+        locus = column_name.split('_')[1].upper()
+        # If the allele comes in as an allele list, apply reduce to all alleles
+        if '/' in allele:
+            return "/".join(reduce(a, locus) for a in allele.split('/'))
+        else:
+            return reduce(allele, locus)
+    return allele
+
+
+def should_be_reduced(allele, locus_allele):
+    if is_serology(allele):
+        return ard_config["reduce_serology"]
+
+    if ard_config["reduce_v2"]:
+        if ard.is_v2(locus_allele):
+            return True
+
+    if ard_config["reduce_3field"]:
+        if is_3field(locus_allele):
+            return True
+
+    if ard_config["reduce_P"]:
+        if is_P(allele):
+            return True
+
+    if ard_config["reduce_XX"]:
+        if ard.is_XX(locus_allele):
+            return True
+
+    if ard_config["reduce_MAC"]:
+        if ard.is_mac(locus_allele) and not ard.is_XX(locus_allele):
+            return True
+
+    return False
+
+
+def reduce(allele, locus):
+    # Does the allele name have the locus in it ?
+    if '*' in allele:
+        locus_allele = allele
+    elif ard_config["locus_in_allele_name"]:
+        locus_allele = allele
+    else:
+        locus_allele = f"{locus}*{allele}"
+
+    # Check the config if this allele should be reduced
+    if should_be_reduced(allele, locus_allele):
+        # print(f"reducing '{locus_allele}'")
+        reduced_allele = ard.redux_gl(locus_allele, ard_config["redux_type"])
+        # print(f"reduced to '{reduced_allele}'")
+        if reduced_allele:
+            allele = "/".join(map(lambda a: a.split('*')[1],
+                                  reduced_allele.split('/')))
+        else:
+            if verbose:
+                print(f"Failed to reduce {locus_allele}")
+
+    if verbose:
+        print(f"\t{locus_allele} => {allele}")
+    return allele
+
+
+if __name__ == '__main__':
+    ard = pyard.ARD(remove_invalid=False)
+
+    df = pd.read_csv(ard_config["in_csv_filename"], names=ard_config["csv_in_column_names"], header=0, dtype=str)
+    df.fillna('', inplace=True)
+
+    for column in ard_config["columns_to_check"]:
+        if verbose:
+            print(f"Column:{column} =>")
+        if ard_config["new_column_for_redux"]:
+            # insert a new column
+            new_column_name = f"reduced_{column}"
+            new_column_index = df.columns.get_loc(column) + 1
+            df.insert(new_column_index, new_column_name, df[column].apply(clean_locus, column_name=column))
+        else:
+            df[column] = df[column].apply(clean_locus, column_name=column)
+
+    if ard_config["output_file_format"] == 'xlsx':
+        out_file_name = f"{ard_config['out_csv_filename']}.xlsx"
+        df.to_excel(out_file_name, index=False)
+    else:
+        out_file_name = ard_config['out_csv_filename'] + {'gzip': '.gz', 'zip': '.zip'}.get(ard_config['apply_compression'], '')
+        df.to_csv(out_file_name, index=False, compression=ard_config["apply_compression"])
+        if verbose:
+            print(f"Saved result to file:{out_file_name}")
diff --git a/extras/sample.csv b/extras/sample.csv
new file mode 100644
index 0000000..01238df
--- /dev/null
+++ b/extras/sample.csv
@@ -0,0 +1,4 @@
+nmdp_id,r_a_typ1,r_a_typ2,r_b_typ1,r_b_typ2,r_c_typ1,r_c_typ2,r_drb1_typ1,r_drb1_typ2,r_dpb1_typ1,r_dpb1_typ2
+123,A*23:71,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01 +456,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01 +789,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01