53 changes: 30 additions & 23 deletions README.md
@@ -3,6 +3,14 @@

ARD reduction for HLA with Python

`py-ard` works with Python 3.8 and higher.

## Install from PyPI

```shell
pip install py-ard
```

## Install from source

```shell
@@ -11,13 +19,6 @@ source venv/bin/activate

python setup.py install
```

## Testing

To run behavior-driven development (BDD) tests locally via the behave framework, you'll need to set up a virtual
@@ -30,10 +31,15 @@ pip install -r test-requirements.txt

# Running Behave and all BDD tests
behave

# Run unit-tests
python -m unittest tests.test_pyard
```

## Using `py-ard` from Python code

`py-ard` can be used in a program to reduce/expand HLA GL String representations. If `py-ard` encounters an invalid allele, it raises an exception rather than silently returning an empty result.
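For orientation, a GL String encodes allelic ambiguity with `/` and joins loci with `^` (with `|`, `+` and `~` for genotype lists and phasing). A minimal splitting helper, shown only to illustrate the format; this function is hypothetical and not part of the py-ard API:

```python
# Hypothetical illustration of GL String structure -- not part of the
# py-ard API. '^' separates loci, '/' separates ambiguous allele options.
def split_gl_string(gl):
    return [locus.split("/") for locus in gl.split("^")]

print(split_gl_string("A*01:01/A*01:02^B*08:01"))
# [['A*01:01', 'A*01:02'], ['B*08:01']]
```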

### Initialize `py-ard`

Import `pyard` package.
Expand All @@ -42,8 +48,7 @@ Import `pyard` package.
import pyard
```

The cache size of pre-computed reductions can be changed from the default of 1000 (_not working_: will be fixed in a later release).

```python
pyard.max_cache_size = 1_000_000
```
@@ -74,7 +79,7 @@ ard = pyard.ARD()

### Reduce Typings

Reduce a single locus HLA Typing.

```python
allele = "A*01:01:01"
@@ -107,13 +112,13 @@ ard.redux_gl('B14', 'lg')

## Valid Reduction Types

| Reduction Type | Description                                     |
|----------------|-------------------------------------------------|
| `G`            | Reduce to G Group Level                         |
| `lg`           | Reduce to 2 field ARD level (append `g`)        |
| `lgx`          | Reduce to 2 field ARD level                     |
| `W`            | Reduce/Expand to 3 field WHO nomenclature level |
| `exon`         | Reduce/Expand to exon level                     |
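The 2 field and 3 field levels can be pictured as field truncation. The sketch below is illustrative only: actual `G`/`lg`/`lgx` reduction relies on the IMGT/HLA G-group tables that py-ard loads into its database, and `W`/`exon` may expand rather than truncate.

```python
# Illustrative sketch only: shows the field counts the levels target,
# not py-ard's actual table-driven reduction.
def truncate_fields(allele, n_fields):
    locus, fields = allele.split("*")
    return locus + "*" + ":".join(fields.split(":")[:n_fields])

allele = "A*01:01:01:01"
print(truncate_fields(allele, 2))        # A*01:01    (2 field, like 'lgx')
print(truncate_fields(allele, 2) + "g")  # A*01:01g   (2 field + 'g', like 'lg')
print(truncate_fields(allele, 3))        # A*01:01:01 (3 field, like 'W')
```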

# Command Line Tools

@@ -160,6 +165,12 @@ $ pyard-import --v2-to-v3-mapping map2to3.csv
$ pyard-import --db-version 3450 --refresh-mac
```

### Show the status of all `py-ard` databases

```shell
$ pyard-status
```

### Reduce a GL String from command line

```shell
@@ -172,10 +183,6 @@ DRB1*08:01:01G/DRB1*08:02:01G/DRB1*08:03:02G/DRB1*08:04:01G/DRB1*08:05/ ...
$ pyard -v 3290 --gl 'A1' -r lgx # For a particular version of DB
A*01:01/A*01:02/A*01:03/A*01:06/A*01:07/A*01:08/A*01:09/A*01:10/A*01:12/ ...
```
### Batch Reduce a CSV file

`pyard-csv-reduce` can be used to batch process a CSV file with HLA typings. See [documentation](extras/README.md) for instructions on how to configure and run.
138 changes: 80 additions & 58 deletions extras/README.md
@@ -4,111 +4,133 @@

**Example Scripts to batch reduce HLA typings from a CSV File**

The `pyard-reduce-csv` command takes a CSV file with HLA typing data and a config file that describes how to reduce it, reduces the specified columns, and produces a new CSV or Excel file.

Install `py-ard`, describe the changes in a JSON config file, and run `pyard-reduce-csv -c <config-file>` to produce a result file based on the configuration in the config file.

See [Example JSON config file](reduce_conf.json).


### Input CSV filename

`in_csv_filename` Directory path and file name of the Input CSV file

### Output CSV filename

`out_csv_filename` Directory path and file name of the Reduced Output CSV file

### CSV Columns to read

`columns_from_csv` The column names to read from CSV file

```json
[
  "nmdp_id",
  "r_a_typ1",
  "r_a_typ2",
  "r_b_typ1",
  "r_b_typ2",
  "r_c_typ1",
  "r_c_typ2",
  "r_drb1_typ1",
  "r_drb1_typ2",
  "r_dpb1_typ1",
  "r_dpb1_typ2"
]
```

### CSV Columns to reduce

`columns_to_reduce_in_csv` List of columns which have typing information and need to be reduced.

**Important**: The locus is the 2nd term in the column name, separated by `_`. The program uses this to figure out the locus for the typings in that column.

E.g., for column `R_DRB1_type1`, `DRB1` is the locus name.
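That column-name convention can be sketched as follows (a hypothetical helper, not the actual `pyard-reduce-csv` implementation):

```python
# Hypothetical sketch of the column-name convention -- not the actual
# pyard-reduce-csv implementation. The locus is the 2nd '_'-separated
# term of the column name, upper-cased to match HLA locus names.
def locus_from_column(column_name):
    return column_name.split("_")[1].upper()

print(locus_from_column("r_drb1_typ1"))  # DRB1
print(locus_from_column("r_dpb1_typ2"))  # DPB1
```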

```json
[
  "r_a_typ1",
  "r_a_typ2",
  "r_b_typ1",
  "r_b_typ2",
  "r_c_typ1",
  "r_c_typ2",
  "r_drb1_typ1",
  "r_drb1_typ2",
  "r_dpb1_typ1",
  "r_dpb1_typ2"
]
```


### Redux Options

`redux_type` Reduction Type

Valid options are:

| Reduction Type | Description                                     |
|----------------|-------------------------------------------------|
| `G`            | Reduce to G Group Level                         |
| `lg`           | Reduce to 2 field ARD level (append `g`)        |
| `lgx`          | Reduce to 2 field ARD level                     |
| `W`            | Reduce/Expand to 3 field WHO nomenclature level |
| `exon`         | Reduce/Expand to exon level                     |

### Kinds of typings to reduce

```json
"verbose_log": true,
"reduce_serology": false,
"reduce_v2": true,
"reduce_3field": true,
"reduce_P": true,
"reduce_XX": false,
"reduce_MAC": true,
"reduce_serology": false,
"reduce_v2": true,
"convert_v2_to_v3": false,
"reduce_3field": true,
"reduce_P": true,
"reduce_XX": false,
"reduce_MAC": true,
```
Valid options: `true` or `false`
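For intuition, the kinds of typings these flags govern look roughly like this (illustrative examples only; py-ard's actual recognition of each kind is database-driven):

```python
# Illustrative examples of the typing kinds the flags above govern --
# not py-ard's actual validation logic.
typing_kinds = {
    "reduce_serology": "A1",          # serological typing
    "reduce_v2":       "A*0101",      # pre-2010 v2 nomenclature
    "reduce_3field":   "A*01:01:01",  # 3 field allele
    "reduce_P":        "A*01:01P",    # P group
    "reduce_XX":       "A*01:XX",     # XX code
    "reduce_MAC":      "A*01:AB",     # multiple allele code (MAC)
}
for flag, example in typing_kinds.items():
    print(f"{flag}: {example}")
```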


### Locus Name in Allele

`locus_in_allele_name`
Is the locus name present in the allele? E.g. `A*01:01` vs `01:01`

Valid options: `true` or `false`

### Output Format

`output_file_format` Format of the output file

Valid options: `csv` or `xlsx`

For Excel output, `openpyxl` library needs to be installed. Install with:
```shell
pip install openpyxl
```


### Create New Column

`new_column_for_redux` Add a separate column for the reduced values instead of replacing the current column. When `true`, a `reduced_` version of the column is created; otherwise, the same column is replaced with the reduced version.

Valid options: `true` or `false`

### Map to DRBX
`map_drb345_to_drbx` Map to DRBX Typings based on DRB3, DRB4 and DRB5 typings using the [WMDA method](https://www.nature.com/articles/1705672).

Valid options: `true` or `false`

### Compression Options

`apply_compression` Compression to use for output file. Applies only to CSV files.

Valid options: `'gzip'`, `'zip'` or `null`

### Verbose log Options

`verbose_log` Show verbose log?

Valid options: `true` or `false`
2 changes: 1 addition & 1 deletion extras/reduce_conf.json
@@ -27,7 +27,6 @@
"r_dpb1_typ2"
],
"redux_type": "lgx",
"apply_compression": "gzip",
"reduce_serology": false,
"reduce_v2": true,
"convert_v2_to_v3": false,
@@ -40,5 +39,6 @@
"output_file_format": "csv",
"new_column_for_redux": false,
"map_drb345_to_drbx": false,
"apply_compression": "gzip",
"verbose_log": true
}
19 changes: 16 additions & 3 deletions pyard/data_repository.py
@@ -387,15 +387,28 @@ def to_serological_name(locus_name: str):

def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version):
if not db.table_exists(db_connection, 'serology_mapping'):
"""
Read `rel_dna_ser.txt` file that contains alleles and their serological equivalents.

The fields of the Alleles->Serological mapping file are:
Locus - HLA Locus
Allele - HLA Allele Name
USA - Unambiguous Serological Antigen associated with allele
PSA - Possible Serological Antigen associated with allele
ASA - Assumed Serological Antigen associated with allele
EAE - Expert Assigned Exceptions in search determinants of some registries

EAE is ignored when generating the serology map.
"""
rel_dna_ser_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/rel_dna_ser.txt'
# Load WMDA serology mapping data from URL
df_sero = pd.read_csv(rel_dna_ser_url, sep=';', skiprows=6,
names=['Locus', 'Allele', 'USA', 'PSA', 'ASA', 'EAE'],
index_col=False)

# Remove 0 and ? from USA
df_sero = df_sero[(df_sero['USA'] != '0') & (df_sero['USA'] != '?')]
df_sero['Allele'] = df_sero.loc[:, 'Locus'] + df_sero.loc[:, 'Allele']

usa = df_sero[['Locus', 'Allele', 'USA']].dropna()
usa['Sero'] = usa['Locus'] + usa['USA']