From 823abdcaa9c740931e6067a05fb011f21f88df22 Mon Sep 17 00:00:00 2001
From: pbashyal-nmdp <pbashyal@nmdp.org>
Date: Thu, 1 Jul 2021 10:30:36 -0500
Subject: [PATCH 1/3] Handle cases when there is no typing and when redux
 fails.

- If the typing is an empty string (''), return it unchanged
- If redux_gl fails with a RuntimeError, report the typing as problematic and keep the original value

Produce a summary of all problematic typings with column name.

```
Summary
-------
	 Failed reducing 'A*0.559722222' in column d_a_typ1
	 Failed reducing 'A*0.613194444' in column d_a_typ1
	 Failed reducing 'A*0.559722222' in column d_a_typ1
	 Failed reducing 'A*0.247916667' in column d_a_typ1
	 Failed reducing 'A*0.215972222' in column d_a_typ1
	 Failed reducing 'A*0.45994213' in column d_a_typ1
	 Failed reducing 'A*0.529166667' in column d_a_typ1
	 Failed reducing 'A*0.529166667' in column d_a_typ1
	 Failed reducing 'A*0.559722222' in column d_a_typ1
	 Failed reducing 'A*0.529166667' in column d_a_typ2
```

Fixes #96
---
 scripts/pyard-reduce-csv | 139 ++++++++++++++++++++++-----------------
 1 file changed, 77 insertions(+), 62 deletions(-)
 mode change 100644 => 100755 scripts/pyard-reduce-csv

diff --git a/scripts/pyard-reduce-csv b/scripts/pyard-reduce-csv
old mode 100644
new mode 100755
index 7aa7ba7..da66359
--- a/scripts/pyard-reduce-csv
+++ b/scripts/pyard-reduce-csv
@@ -56,19 +56,6 @@ def is_P(allele: str) -> bool:
     return False
 
 
-def clean_locus(allele: str, column_name: str = 'Unknown') -> str:
-    if allele:
-        # Remove all white spaces
-        allele = white_space_regex.sub('', allele)
-        locus = column_name.split('_')[1].upper()
-        # If the allele comes in as an allele list, apply reduce to all alleles
-        if '/' in allele:
-            return "/".join(map(reduce, allele.split('/'), locus))
-        else:
-            return reduce(allele, locus)
-    return allele
-
-
 def should_be_reduced(allele, locus_allele):
     if is_serology(allele):
         return ard_config["reduce_serology"]
@@ -96,8 +83,10 @@ def should_be_reduced(allele, locus_allele):
     return False
 
 
-def reduce(allele, locus):
+def reduce(allele, locus, column_name):
     # Does the allele name have the locus in it ?
+    if allele == '':
+        return allele
     if '*' in allele:
         locus_allele = allele
     elif ard_config["locus_in_allele_name"]:
@@ -108,7 +97,15 @@ def reduce(allele, locus):
     # Check the config if this allele should be reduced
     if should_be_reduced(allele, locus_allele):
         # print(f"reducing '{locus_allele}'")
-        reduced_allele = ard.redux_gl(locus_allele, ard_config["redux_type"])
+        try:
+            reduced_allele = ard.redux_gl(locus_allele, ard_config["redux_type"])
+        except RuntimeError as e:
+            if verbose:
+                print(e)
+            message = f"Failed reducing '{locus_allele}' in column {column_name}"
+            print(message)
+            failure_summary_messages.append(message)
+            return allele
         # print(f"reduced to '{reduced_allele}'")
         if reduced_allele:
             if ard_config["keep_locus_in_allele_name"]:
@@ -129,6 +126,19 @@ def reduce(allele, locus):
     return allele
 
 
+def clean_locus(allele: str, column_name: str = 'Unknown') -> str:
+    """Strip white space from a typing and reduce each '/'-separated allele."""
+    if allele:
+        allele = white_space_regex.sub('', allele)
+        # The locus is the 2nd '_'-separated field of the column name
+        locus = column_name.split('_')[1].upper()
+        # Call reduce per allele: map(reduce, alleles, locus, column_name)
+        # zips over the *characters* of locus/column_name and drops alleles
+        alleles = allele.split('/')
+        return "/".join(reduce(a, locus, column_name) for a in alleles)
+    return allele
+
+
 def create_drbx(row, locus_in_allele_name):
     return drbx.map_drbx(row.values, locus_in_allele_name)
 
@@ -159,51 +169,56 @@ if __name__ == '__main__':
         print("  pip install openpyxl")
         sys.exit(1)
 
-# Instantiate py-ard object with the latest
-ard = pyard.ARD(remove_invalid=False)
-
-# Read the Input File
-# Read only the columns to be saved.
-# Header is the first row
-# Don't convert to NAs
-df = pd.read_csv(ard_config["in_csv_filename"],
-                 usecols=ard_config["columns_from_csv"],
-                 header=0, dtype=str,
-                 keep_default_na=False)
-
-# Reduce each of the specified columns
-for column in ard_config["columns_to_reduce_in_csv"]:
-    if verbose:
-        print(f"Column:{column} =>")
-    if ard_config["new_column_for_redux"]:
-        # insert a new column
-        new_column_name = f"reduced_{column}"
-        new_column_index = df.columns.get_loc(column) + 1
-        # Apply clean_locus function to the column and insert as a new column
-        df.insert(new_column_index, new_column_name,
-                  df[column].apply(clean_locus, column_name=column))
+    # Instantiate py-ard object with the latest
+    ard = pyard.ARD(remove_invalid=False)
+
+    # Read the Input File
+    # Read only the columns to be saved.
+    # Header is the first row
+    # Don't convert to NAs
+    df = pd.read_csv(ard_config["in_csv_filename"],
+                     usecols=ard_config["columns_from_csv"],
+                     header=0, dtype=str,
+                     keep_default_na=False)
+
+    failure_summary_messages = []
+    # Reduce each of the specified columns
+    for column in ard_config["columns_to_reduce_in_csv"]:
+        if verbose:
+            print(f"Column:{column} =>")
+        if ard_config["new_column_for_redux"]:
+            # insert a new column
+            new_column_name = f"reduced_{column}"
+            new_column_index = df.columns.get_loc(column) + 1
+            # Apply clean_locus function to the column and insert as a new column
+            df.insert(new_column_index, new_column_name,
+                      df[column].apply(clean_locus, column_name=column))
+        else:
+            # Apply clean_locus function to the column and replace the column
+            df[column] = df[column].apply(clean_locus, column_name=column)
+
+    # Map DRB3,DRB4,DRB5 to DRBX if specified
+    # New columns DRBX_1 and DRBX_2 are created
+    if ard_config['map_drb345_to_drbx']:
+        drbx_loci = ['DRB3', 'DRB4', 'DRB5']
+        drbx_columns = [col_name for col_name in df.columns if col_name.split('_')[1] in drbx_loci]
+        if len(drbx_columns) == len(drbx_loci) * 2:  # For Type1/Type2
+            locus_in_allele_name = ard_config["keep_locus_in_allele_name"]
+            df_drbx = df[drbx_columns].apply(create_drbx, axis=1, args=(locus_in_allele_name,))
+            df['DRBX_1'], df['DRBX_2'] = zip(*df_drbx)
+
+    # Save as XLSX if specified
+    if ard_config["output_file_format"] == 'xlsx':
+        out_file_name = f"{ard_config['out_csv_filename']}.xlsx"
+        df.to_excel(out_file_name, index=False)
     else:
-        # Apply clean_locus function to the column and replace the column
-        df[column] = df[column].apply(clean_locus, column_name=column)
-
-# Map DRB3,DRB4,DRB5 to DRBX if specified
-# New columns DRBX_1 and DRBX_2 are created
-if ard_config['map_drb345_to_drbx']:
-    drbx_loci = ['DRB3', 'DRB4', 'DRB5']
-    drbx_columns = [col_name for col_name in df.columns if col_name.split('_')[1] in drbx_loci]
-    if len(drbx_columns) == len(drbx_loci) * 2:  # For Type1/Type2
-        locus_in_allele_name = ard_config["keep_locus_in_allele_name"]
-        df_drbx = df[drbx_columns].apply(create_drbx, axis=1, args=(locus_in_allele_name,))
-        df['DRBX_1'], df['DRBX_2'] = zip(*df_drbx)
-
-# Save as XLSX if specified
-if ard_config["output_file_format"] == 'xlsx':
-    out_file_name = f"{ard_config['out_csv_filename']}.xlsx"
-    df.to_excel(out_file_name, index=False)
-else:
-    # Save as compressed CSV
-    out_file_name = f"{ard_config['out_csv_filename'] + '.gz' if ard_config['apply_compression'] else ''}"
-    df.to_csv(out_file_name, index=False, compression=ard_config["apply_compression"])
-
-# Done
-print(f"Saved result to file:{out_file_name}")
+        # Save as compressed CSV
+        out_file_name = f"{ard_config['out_csv_filename'] + '.gz' if ard_config['apply_compression'] else ''}"
+        df.to_csv(out_file_name, index=False, compression=ard_config["apply_compression"])
+
+    print("Summary")
+    print("-------")
+    for message in failure_summary_messages:
+        print("\t", message)
+    # Done
+    print(f"Saved result to file:{out_file_name}")

From f33987670ba1b5493478185ca66153d67707c3eb Mon Sep 17 00:00:00 2001
From: pbashyal-nmdp <pbashyal@nmdp.org>
Date: Thu, 1 Jul 2021 10:44:33 -0500
Subject: [PATCH 2/3] =?UTF-8?q?Bump=20version:=200.6.5=20=E2=86=92=200.6.6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pyard/__init__.py | 2 +-
 setup.cfg         | 2 +-
 setup.py          | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyard/__init__.py b/pyard/__init__.py
index 7c35c79..6822be9 100644
--- a/pyard/__init__.py
+++ b/pyard/__init__.py
@@ -24,4 +24,4 @@
 from .pyard import ARD
 
 __author__ = """NMDP Bioinformatics"""
-__version__ = '0.6.5'
+__version__ = '0.6.6'
diff --git a/setup.cfg b/setup.cfg
index 0c1d453..a841f5e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.6.5
+current_version = 0.6.6
 commit = True
 tag = True
 
diff --git a/setup.py b/setup.py
index 2ef0b55..fe8844c 100644
--- a/setup.py
+++ b/setup.py
@@ -42,7 +42,7 @@
 
 setup(
     name='py-ard',
-    version='0.6.5',
+    version='0.6.6',
     description="ARD reduction for HLA with Python",
     long_description=readme + '\n\n' + history,
     author="CIBMTR",

From c02fce6810c9cd99edf2832c952d8e60fb37c82b Mon Sep 17 00:00:00 2001
From: pbashyal-nmdp <pbashyal@nmdp.org>
Date: Thu, 1 Jul 2021 13:26:25 -0500
Subject: [PATCH 3/3] Added message when no errors

---
 scripts/pyard-reduce-csv | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/scripts/pyard-reduce-csv b/scripts/pyard-reduce-csv
index da66359..59e453c 100755
--- a/scripts/pyard-reduce-csv
+++ b/scripts/pyard-reduce-csv
@@ -216,9 +216,12 @@ if __name__ == '__main__':
         out_file_name = f"{ard_config['out_csv_filename'] + '.gz' if ard_config['apply_compression'] else ''}"
         df.to_csv(out_file_name, index=False, compression=ard_config["apply_compression"])
 
-    print("Summary")
-    print("-------")
-    for message in failure_summary_messages:
-        print("\t", message)
+    if len(failure_summary_messages) == 0:
+        print("No Errors")
+    else:
+        print("Summary")
+        print("-------")
+        for message in failure_summary_messages:
+            print("\t", message)
     # Done
     print(f"Saved result to file:{out_file_name}")