From 823abdcaa9c740931e6067a05fb011f21f88df22 Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Thu, 1 Jul 2021 10:30:36 -0500 Subject: [PATCH 1/3] Handle cases when there is no typing and when redux fails. - If typing is '' return it as empty - If redux_gl fails, print it as problematic Produce a summary of all problematic typings with column name. ``` Summary ------- Failed reducing 'A*0.559722222' in column d_a_typ1 Failed reducing 'A*0.613194444' in column d_a_typ1 Failed reducing 'A*0.559722222' in column d_a_typ1 Failed reducing 'A*0.247916667' in column d_a_typ1 Failed reducing 'A*0.215972222' in column d_a_typ1 Failed reducing 'A*0.45994213' in column d_a_typ1 Failed reducing 'A*0.529166667' in column d_a_typ1 Failed reducing 'A*0.529166667' in column d_a_typ1 Failed reducing 'A*0.559722222' in column d_a_typ1 Failed reducing 'A*0.529166667' in column d_a_typ2 ``` Fixes #96 --- scripts/pyard-reduce-csv | 139 ++++++++++++++++++++++----------------- 1 file changed, 77 insertions(+), 62 deletions(-) mode change 100644 => 100755 scripts/pyard-reduce-csv diff --git a/scripts/pyard-reduce-csv b/scripts/pyard-reduce-csv old mode 100644 new mode 100755 index 7aa7ba7..da66359 --- a/scripts/pyard-reduce-csv +++ b/scripts/pyard-reduce-csv @@ -56,19 +56,6 @@ def is_P(allele: str) -> bool: return False -def clean_locus(allele: str, column_name: str = 'Unknown') -> str: - if allele: - # Remove all white spaces - allele = white_space_regex.sub('', allele) - locus = column_name.split('_')[1].upper() - # If the allele comes in as an allele list, apply reduce to all alleles - if '/' in allele: - return "/".join(map(reduce, allele.split('/'), locus)) - else: - return reduce(allele, locus) - return allele - - def should_be_reduced(allele, locus_allele): if is_serology(allele): return ard_config["reduce_serology"] @@ -96,8 +83,10 @@ def should_be_reduced(allele, locus_allele): return False -def reduce(allele, locus): +def reduce(allele, locus, column_name): # Does the allele name have the locus in it ? + if allele == '': + return allele if '*' in allele: locus_allele = allele elif ard_config["locus_in_allele_name"]: @@ -108,7 +97,15 @@ def reduce(allele, locus): # Check the config if this allele should be reduced if should_be_reduced(allele, locus_allele): # print(f"reducing '{locus_allele}'") - reduced_allele = ard.redux_gl(locus_allele, ard_config["redux_type"]) + try: + reduced_allele = ard.redux_gl(locus_allele, ard_config["redux_type"]) + except RuntimeError as e: + if verbose: + print(e) + message = f"Failed reducing '{locus_allele}' in column {column_name}" + print(message) + failure_summary_messages.append(message) + return allele # print(f"reduced to '{reduced_allele}'") if reduced_allele: if ard_config["keep_locus_in_allele_name"]: @@ -129,6 +126,19 @@ def reduce(allele, locus): return allele +def clean_locus(allele: str, column_name: str = 'Unknown') -> str: + if allele: + # Remove all white spaces + allele = white_space_regex.sub('', allele) + locus = column_name.split('_')[1].upper() + # If the allele comes in as an allele list, apply reduce to all alleles + if '/' in allele: + return "/".join(map(reduce, allele.split('/'), locus, column_name)) + else: + return reduce(allele, locus, column_name) + return allele + + def create_drbx(row, locus_in_allele_name): return drbx.map_drbx(row.values, locus_in_allele_name) @@ -159,51 +169,56 @@ if __name__ == '__main__': print(" pip install openpyxl") sys.exit(1) -# Instantiate py-ard object with the latest -ard = pyard.ARD(remove_invalid=False) - -# Read the Input File -# Read only the columns to be saved. -# Header is the first row -# Don't convert to NAs -df = pd.read_csv(ard_config["in_csv_filename"], - usecols=ard_config["columns_from_csv"], - header=0, dtype=str, - keep_default_na=False) - -# Reduce each of the specified columns -for column in ard_config["columns_to_reduce_in_csv"]: - if verbose: - print(f"Column:{column} =>") - if ard_config["new_column_for_redux"]: - # insert a new column - new_column_name = f"reduced_{column}" - new_column_index = df.columns.get_loc(column) + 1 - # Apply clean_locus function to the column and insert as a new column - df.insert(new_column_index, new_column_name, - df[column].apply(clean_locus, column_name=column)) + # Instantiate py-ard object with the latest + ard = pyard.ARD(remove_invalid=False) + + # Read the Input File + # Read only the columns to be saved. + # Header is the first row + # Don't convert to NAs + df = pd.read_csv(ard_config["in_csv_filename"], + usecols=ard_config["columns_from_csv"], + header=0, dtype=str, + keep_default_na=False) + + failure_summary_messages = [] + # Reduce each of the specified columns + for column in ard_config["columns_to_reduce_in_csv"]: + if verbose: + print(f"Column:{column} =>") + if ard_config["new_column_for_redux"]: + # insert a new column + new_column_name = f"reduced_{column}" + new_column_index = df.columns.get_loc(column) + 1 + # Apply clean_locus function to the column and insert as a new column + df.insert(new_column_index, new_column_name, + df[column].apply(clean_locus, column_name=column)) + else: + # Apply clean_locus function to the column and replace the column + df[column] = df[column].apply(clean_locus, column_name=column) + + # Map DRB3,DRB4,DRB5 to DRBX if specified + # New columns DRBX_1 and DRBX_2 are created + if ard_config['map_drb345_to_drbx']: + drbx_loci = ['DRB3', 'DRB4', 'DRB5'] + drbx_columns = [col_name for col_name in df.columns if col_name.split('_')[1] in drbx_loci] + if len(drbx_columns) == len(drbx_loci) * 2: # For Type1/Type2 + locus_in_allele_name = ard_config["keep_locus_in_allele_name"] + df_drbx = df[drbx_columns].apply(create_drbx, axis=1, args=(locus_in_allele_name,)) + df['DRBX_1'], df['DRBX_2'] = zip(*df_drbx) + + # Save as XLSX if specified + if ard_config["output_file_format"] == 'xlsx': + out_file_name = f"{ard_config['out_csv_filename']}.xlsx" + df.to_excel(out_file_name, index=False) else: - # Apply clean_locus function to the column and replace the column - df[column] = df[column].apply(clean_locus, column_name=column) - -# Map DRB3,DRB4,DRB5 to DRBX if specified -# New columns DRBX_1 and DRBX_2 are created -if ard_config['map_drb345_to_drbx']: - drbx_loci = ['DRB3', 'DRB4', 'DRB5'] - drbx_columns = [col_name for col_name in df.columns if col_name.split('_')[1] in drbx_loci] - if len(drbx_columns) == len(drbx_loci) * 2: # For Type1/Type2 - locus_in_allele_name = ard_config["keep_locus_in_allele_name"] - df_drbx = df[drbx_columns].apply(create_drbx, axis=1, args=(locus_in_allele_name,)) - df['DRBX_1'], df['DRBX_2'] = zip(*df_drbx) - -# Save as XLSX if specified -if ard_config["output_file_format"] == 'xlsx': - out_file_name = f"{ard_config['out_csv_filename']}.xlsx" - df.to_excel(out_file_name, index=False) -else: - # Save as compressed CSV - out_file_name = f"{ard_config['out_csv_filename'] + '.gz' if ard_config['apply_compression'] else ''}" - df.to_csv(out_file_name, index=False, compression=ard_config["apply_compression"]) - -# Done -print(f"Saved result to file:{out_file_name}") + # Save as compressed CSV + out_file_name = f"{ard_config['out_csv_filename'] + '.gz' if ard_config['apply_compression'] else ''}" + df.to_csv(out_file_name, index=False, compression=ard_config["apply_compression"]) + + print("Summary") + print("-------") + for message in failure_summary_messages: + print("\t", message) + # Done + print(f"Saved result to file:{out_file_name}") From f33987670ba1b5493478185ca66153d67707c3eb Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Thu, 1 Jul 2021 10:44:33 -0500 Subject: [PATCH 2/3] =?UTF-8?q?Bump=20version:=200.6.5=20=E2=86=92=200.6.6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyard/__init__.py b/pyard/__init__.py index 7c35c79..6822be9 100644 --- a/pyard/__init__.py +++ b/pyard/__init__.py @@ -24,4 +24,4 @@ from .pyard import ARD __author__ = """NMDP Bioinformatics""" -__version__ = '0.6.5' +__version__ = '0.6.6' diff --git a/setup.cfg b/setup.cfg index 0c1d453..a841f5e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.6.5 +current_version = 0.6.6 commit = True tag = True diff --git a/setup.py b/setup.py index 2ef0b55..fe8844c 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ setup( name='py-ard', - version='0.6.5', + version='0.6.6', description="ARD reduction for HLA with Python", long_description=readme + '\n\n' + history, author="CIBMTR", From c02fce6810c9cd99edf2832c952d8e60fb37c82b Mon Sep 17 00:00:00 2001 From: pbashyal-nmdp Date: Thu, 1 Jul 2021 13:26:25 -0500 Subject: [PATCH 3/3] Added message when no errors --- scripts/pyard-reduce-csv | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/pyard-reduce-csv b/scripts/pyard-reduce-csv index da66359..59e453c 100755 --- a/scripts/pyard-reduce-csv +++ b/scripts/pyard-reduce-csv @@ -216,9 +216,12 @@ if __name__ == '__main__': out_file_name = f"{ard_config['out_csv_filename'] + '.gz' if ard_config['apply_compression'] else ''}" df.to_csv(out_file_name, index=False, compression=ard_config["apply_compression"]) - print("Summary") - print("-------") - for message in failure_summary_messages: - print("\t", message) + if len(failure_summary_messages) == 0: + print("No Errors") + else: + print("Summary") + print("-------") + for message in failure_summary_messages: + print("\t", message) # Done print(f"Saved result to file:{out_file_name}")