Merged
6 changes: 6 additions & 0 deletions .github/workflows/python-tests.yml
Expand Up @@ -34,7 +34,13 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Run BDD Tests
run: |
# When run the first time, it'll build the library
behave
# When run the second time, it should use the already installed library
behave
- name: Run Unit Tests
run: |
# When run the first time, it'll build the library
python -m unittest tests.test_pyard tests.test_smart_sort
# When run the second time, it should use the already installed library
python -m unittest tests.test_pyard tests.test_smart_sort
192 changes: 103 additions & 89 deletions extras/README.md
@@ -1,100 +1,114 @@
# Extras

# Script to batch process a CSV File

The `pyard-reduce-csv` command can be used with a config file (that describes how
to reduce the file) to take a CSV file with HLA typing data, reduce certain
columns, and produce a new CSV or Excel file.

Install `py-ard`, specify the changes in a JSON config file, and run
`pyard-reduce-csv -c <config-file>` to produce results based on the configuration
in the config file.


See [Example JSON config file](reduce_conf.json).
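Because the config is plain JSON, it can be inspected with standard tools. A minimal sketch (the key names come from the example config; the loading code is illustrative, since the tool parses the file itself):

```python
import json

# A minimal config fragment mirroring reduce_conf.json (illustrative only;
# in practice you would read the real file with open("reduce_conf.json")).
config_text = """
{
    "in_csv_filename": "sample.csv",
    "out_csv_filename": "clean_sample.csv",
    "redux_type": "lgx",
    "verbose_log": true
}
"""

config = json.loads(config_text)
print(config["in_csv_filename"])  # sample.csv
print(config["redux_type"])       # lgx
```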


### Input CSV filename
`in_csv_filename` Directory path and file name of the input CSV file

### Output CSV filename
`out_csv_filename` Directory path and file name of the reduced output CSV file

### CSV Columns to read
`columns_from_csv` The column names to read from the CSV file

```json
[
"nmdp_id",
"r_a_typ1",
"r_a_typ2",
"r_b_typ1",
"r_b_typ2",
"r_c_typ1",
"r_c_typ2",
"r_drb1_typ1",
"r_drb1_typ2",
"r_dpb1_typ1",
"r_dpb1_typ2"
]
```

### CSV Columns to reduce
`columns_to_reduce_in_csv` List of columns which have typing information and need to be reduced.

**NOTE**: The locus is the 2nd term in the column name.
E.g., for column `r_drb1_typ1`, `DRB1` is the locus name.
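Since the locus can be derived from the column name, the extraction is a one-liner; a sketch (the real tool's internals may differ):

```python
# The locus is the 2nd underscore-separated term of the column name.
def locus_from_column(column_name: str) -> str:
    return column_name.split("_")[1].upper()

print(locus_from_column("r_drb1_typ1"))  # DRB1
print(locus_from_column("r_a_typ2"))     # A
```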

```json
[
"r_a_typ1",
"r_a_typ2",
"r_b_typ1",
"r_b_typ2",
"r_c_typ1",
"r_c_typ2",
"r_drb1_typ1",
"r_drb1_typ2",
"r_dpb1_typ1",
"r_dpb1_typ2"
]
```

The included sample CSV file `sample.csv` can be processed using the
`pyard-reduce-csv` command.

### Redux Options
`redux_type` Reduction Type

Valid Options: `G`, `lg` and `lgx`

### Compression Options
`apply_compression` Compression to use for output file

Valid options: `'gzip'`, `'zip'` or `null`
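A sketch of what gzip compression of the output means in practice, using only the standard library (the tool itself may implement this differently, e.g. via pandas):

```python
import csv
import gzip

rows = [["nmdp_id", "r_a_typ1"], ["12345", "A*01:01"]]

# Write the output CSV gzip-compressed (e.g. clean_sample.csv.gz) ...
with gzip.open("clean_sample.csv.gz", "wt", newline="") as f:
    csv.writer(f).writerows(rows)

# ... and read it back to verify the round trip.
with gzip.open("clean_sample.csv.gz", "rt", newline="") as f:
    data = list(csv.reader(f))

print(data[1])  # ['12345', 'A*01:01']
```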

### Verbose log Options
`verbose_log` Show verbose log?

Valid options: `true` or `false`

### Types of typings to reduce
```json
"reduce_serology": false,
"reduce_v2": true,
"reduce_3field": true,
"reduce_P": true,
"reduce_XX": false,
"reduce_MAC": true,
```
Valid options: `true` or `false`


### Locus Name in Allele
`locus_in_allele_name`
Is the locus name present in the allele name? E.g., `A*01:01` vs `01:01`

Valid options: `true` or `false`
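As a sketch of the difference (hypothetical helper; the tool handles this internally):

```python
def strip_locus(allele: str) -> str:
    # "A*01:01" -> "01:01"; a bare "01:01" is returned unchanged.
    return allele.split("*")[-1]

print(strip_locus("A*01:01"))  # 01:01
print(strip_locus("01:01"))    # 01:01
```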

### Output Format
`output_file_format` Format of the output file

Valid options: `csv` or `xlsx`

### Create New Column
`new_column_for_redux` Add a separate column for the reduced value instead of
replacing the current column. When `true`, creates a `reduced_` version of the column.

Valid options: `true`, `false`
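A sketch of the two behaviors (assumed semantics; the column and reduced values here are hypothetical):

```python
row = {"r_a_typ1": "A*01:01:01"}
reduced_value = "A*01:01"  # hypothetical reduced typing

# new_column_for_redux = true: keep the original, add a reduced_ column
with_new_column = {**row, "reduced_r_a_typ1": reduced_value}

# new_column_for_redux = false: replace the value in place
replaced = {**row, "r_a_typ1": reduced_value}

print(sorted(with_new_column))  # ['r_a_typ1', 'reduced_r_a_typ1']
print(replaced["r_a_typ1"])     # A*01:01
```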

### Map to DRBX
`map_drb345_to_drbx` Map to DRBX Typings based on DRB3, DRB4 and DRB5 typings.

Valid options: `true` or `false`
78 changes: 0 additions & 78 deletions extras/conf.py

This file was deleted.

43 changes: 43 additions & 0 deletions extras/reduce_conf.json
@@ -0,0 +1,43 @@
{
"in_csv_filename": "sample.csv",
"out_csv_filename": "clean_sample.csv",
"columns_from_csv": [
"nmdp_id",
"r_a_typ1",
"r_a_typ2",
"r_b_typ1",
"r_b_typ2",
"r_c_typ1",
"r_c_typ2",
"r_drb1_typ1",
"r_drb1_typ2",
"r_dpb1_typ1",
"r_dpb1_typ2"
],
"columns_to_reduce_in_csv": [
"r_a_typ1",
"r_a_typ2",
"r_b_typ1",
"r_b_typ2",
"r_c_typ1",
"r_c_typ2",
"r_drb1_typ1",
"r_drb1_typ2",
"r_dpb1_typ1",
"r_dpb1_typ2"
],
"redux_type": "lgx",
"apply_compression": "gzip",
"reduce_serology": false,
"reduce_v2": true,
"reduce_3field": true,
"reduce_P": true,
"reduce_XX": false,
"reduce_MAC": true,
"locus_in_allele_name": false,
"keep_locus_in_allele_name": false,
"output_file_format": "csv",
"new_column_for_redux": false,
"map_drb345_to_drbx": false,
"verbose_log": true
}