diff --git a/calibration/BraTS/brats_cal_images_list.py b/calibration/BraTS/brats_cal_images_list.py index c0a43403c..a87539198 100644 --- a/calibration/BraTS/brats_cal_images_list.py +++ b/calibration/BraTS/brats_cal_images_list.py @@ -1,12 +1,16 @@ import numpy as np + np.random.seed(0) images = [] for i in [0, 2, 3, 4]: - with open("../../v0.7/medical_imaging/3d-unet/folds/fold{:d}_validation.txt".format(i)) as f: + with open( + "../../v0.7/medical_imaging/3d-unet/folds/fold{:d}_validation.txt".format( + i) + ) as f: for line in f: images.append(line.rstrip()) indices = np.random.permutation(len(images))[:40] selected = sorted([images[idx] for idx in indices]) with open("brats_cal_images_list.txt", "w") as f: for img in selected: - print(img, file=f) \ No newline at end of file + print(img, file=f) diff --git a/compliance/nvidia/TEST01/run_verification.py b/compliance/nvidia/TEST01/run_verification.py index 4923ca750..51f265e20 100644 --- a/compliance/nvidia/TEST01/run_verification.py +++ b/compliance/nvidia/TEST01/run_verification.py @@ -28,36 +28,45 @@ "byte": np.byte, "float32": np.float32, "int32": np.int32, - "int64": np.int64 + "int64": np.int64, } -def main(): +def main(): - py3 = sys.version_info >= (3,0) + py3 = sys.version_info >= (3, 0) # Parse arguments to identify the path to the accuracy logs from # the accuracy and performance runs parser = argparse.ArgumentParser() parser.add_argument( - "--results_dir", "-r", + "--results_dir", + "-r", help="Specifies the path to the corresponding results directory that contains the accuracy and performance subdirectories containing the submission logs, i.e. inference_results_v0.7/closed/NVIDIA/results/T4x8/resnet/Offline.", - required=True + required=True, ) parser.add_argument( - "--compliance_dir", "-c", + "--compliance_dir", + "-c", help="Specifies the path to the directory containing the logs from the compliance test run.", - required=True + required=True, ) parser.add_argument( - "--output_dir", "-o", + "--output_dir", + "-o", help="Specifies the path to the output directory where compliance logs will be uploaded from, i.e. 
inference_results_v0.7/closed/NVIDIA/compliance/T4x8/resnet/Offline.", - required=True + required=True, ) parser.add_argument( - "--dtype", default="byte", choices=["byte", "float32", "int32", "int64"], help="data type of the label (not needed in unixmode") + "--dtype", + default="byte", + choices=["byte", "float32", "int32", "int64"], + help="data type of the label (not needed in unixmode", + ) parser.add_argument( - "--unixmode", action="store_true", - help="Use UNIX commandline utilities to verify accuracy (uses less memory but much slower.") + "--unixmode", + action="store_true", + help="Use UNIX commandline utilities to verify accuracy (uses less memory but much slower.", + ) args = parser.parse_args() @@ -70,44 +79,81 @@ def main(): unixmode = " --unixmode" for binary in ["wc", "md5sum", "grep", "awk", "sed", "head", "tail"]: missing_binary = False - if shutil.which(binary) == None: - print("Error: This script requires the {:} commandline utility".format(binary)) + if shutil.which(binary) is None: + print( + "Error: This script requires the {:} commandline utility".format( + binary + ) + ) missing_binary = True if missing_binary: exit() dtype = args.dtype - verify_accuracy_binary = os.path.join(os.path.dirname(__file__),"verify_accuracy.py") + verify_accuracy_binary = os.path.join( + os.path.dirname(__file__), "verify_accuracy.py" + ) # run verify accuracy - verify_accuracy_command = "python3 " + verify_accuracy_binary + " --dtype " + args.dtype + unixmode + " -r " + results_dir + "/accuracy/mlperf_log_accuracy.json" + " -t " + compliance_dir + "/mlperf_log_accuracy.json | tee verify_accuracy.txt" + verify_accuracy_command = ( + "python3 " + + verify_accuracy_binary + + " --dtype " + + args.dtype + + unixmode + + " -r " + + results_dir + + "/accuracy/mlperf_log_accuracy.json" + + " -t " + + compliance_dir + + "/mlperf_log_accuracy.json | tee verify_accuracy.txt" + ) try: os.system(verify_accuracy_command) except Exception: - print("Exception occurred trying to execute:\n " + verify_accuracy_command) + print( + "Exception occurred trying to execute:\n " + + verify_accuracy_command) # check if verify accuracy script passes accuracy_pass_command = "grep PASS verify_accuracy.txt" try: - accuracy_pass = "TEST PASS" in subprocess.check_output(accuracy_pass_command, shell=True).decode("utf-8") + accuracy_pass = "TEST PASS" in subprocess.check_output( + accuracy_pass_command, shell=True + ).decode("utf-8") except Exception: accuracy_pass = False # run verify performance - verify_performance_binary = os.path.join(os.path.dirname(__file__),"verify_performance.py") - verify_performance_command = "python3 " + verify_performance_binary + " -r " + results_dir + "/performance/run_1/mlperf_log_summary.txt" + " -t " + compliance_dir + "/mlperf_log_summary.txt | tee verify_performance.txt" + verify_performance_binary = os.path.join( + os.path.dirname(__file__), "verify_performance.py" + ) + verify_performance_command = ( + "python3 " + + verify_performance_binary + + " -r " + + results_dir + + "/performance/run_1/mlperf_log_summary.txt" + + " -t " + + compliance_dir + + "/mlperf_log_summary.txt | tee verify_performance.txt" + ) try: os.system(verify_performance_command) except Exception: - print("Exception occurred trying to execute:\n " + verify_performance_command) + print( + "Exception occurred trying to execute:\n " + + verify_performance_command) # check if verify performance script passes performance_pass_command = "grep PASS verify_performance.txt" try: - performance_pass = "TEST PASS" in 
subprocess.check_output(performance_pass_command, shell=True).decode("utf-8") + performance_pass = "TEST PASS" in subprocess.check_output( + performance_pass_command, shell=True + ).decode("utf-8") except Exception: performance_pass = False - + # setup output compliance directory structure output_accuracy_dir = os.path.join(output_dir, "accuracy") output_performance_dir = os.path.join(output_dir, "performance", "run_1") @@ -123,28 +169,44 @@ def main(): print("Exception occurred trying to create " + output_performance_dir) # copy compliance logs to output compliance directory - shutil.copy2("verify_accuracy.txt",output_dir) - shutil.copy2("verify_performance.txt",output_dir) - accuracy_file = os.path.join(compliance_dir,"mlperf_log_accuracy.json") - summary_file = os.path.join(compliance_dir,"mlperf_log_summary.txt") - detail_file = os.path.join(compliance_dir,"mlperf_log_detail.txt") + shutil.copy2("verify_accuracy.txt", output_dir) + shutil.copy2("verify_performance.txt", output_dir) + accuracy_file = os.path.join(compliance_dir, "mlperf_log_accuracy.json") + summary_file = os.path.join(compliance_dir, "mlperf_log_summary.txt") + detail_file = os.path.join(compliance_dir, "mlperf_log_detail.txt") try: - shutil.copy2(accuracy_file,output_accuracy_dir) + shutil.copy2(accuracy_file, output_accuracy_dir) except Exception: - print("Exception occured trying to copy " + accuracy_file + " to " + output_accuracy_dir) + print( + "Exception occured trying to copy " + + accuracy_file + + " to " + + output_accuracy_dir + ) try: - shutil.copy2(summary_file,output_performance_dir) + shutil.copy2(summary_file, output_performance_dir) except Exception: - print("Exception occured trying to copy " + summary_file + " to " + output_performance_dir) + print( + "Exception occured trying to copy " + + summary_file + + " to " + + output_performance_dir + ) try: - shutil.copy2(detail_file,output_performance_dir) + shutil.copy2(detail_file, output_performance_dir) except Exception: - print("Exception occured trying to copy " + detail_file + " to " + output_performance_dir) + print( + "Exception occured trying to copy " + + detail_file + + " to " + + output_performance_dir + ) print("Accuracy check pass: {:}".format(accuracy_pass)) print("Performance check pass: {:}".format(performance_pass)) print("TEST01 verification complete") -if __name__ == '__main__': - main() + +if __name__ == "__main__": + main() diff --git a/compliance/nvidia/TEST01/verify_accuracy.py b/compliance/nvidia/TEST01/verify_accuracy.py index cfaca5dec..7f8a750a3 100644 --- a/compliance/nvidia/TEST01/verify_accuracy.py +++ b/compliance/nvidia/TEST01/verify_accuracy.py @@ -13,54 +13,65 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================= +import numpy as np +import json +import argparse import os import subprocess import sys import shutil -sys.path.append(os.getcwd()) -import argparse -import json +sys.path.append(os.getcwd()) -import numpy as np dtype_map = { "byte": np.byte, "float32": np.float32, "int32": np.int32, - "int64": np.int64 + "int64": np.int64, } + def main(): - py3 = sys.version_info >= (3,0) + py3 = sys.version_info >= (3, 0) # Parse arguments to identify the path to the accuracy logs from # the accuracy and performance runs parser = argparse.ArgumentParser() parser.add_argument( - "--reference_accuracy", "-r", + "--reference_accuracy", + "-r", help="Specifies the path to the accuracy log from a submission/accuracy run.", - default="" + default="", ) parser.add_argument( - "--test_accuracy", "-t", + "--test_accuracy", + "-t", help="Specifies the path to the accuracy log from a performance run with accuracy log sampling enabled.", - default="" + default="", ) parser.add_argument( - "--dtype", default="byte", choices=["byte", "float32", "int32", "int64"], help="data type of the label") + "--dtype", + default="byte", + choices=["byte", "float32", "int32", "int64"], + help="data type of the label", + ) parser.add_argument( - "--unixmode", action="store_true", - help="Use unix commandline utilities instead of python JSON library (uses less memory but much slower.") + "--unixmode", + action="store_true", + help="Use unix commandline utilities instead of python JSON library (uses less memory but much slower.", + ) parser.add_argument( - "--fastmode", action="store_true", - help="This flag has been deprecated. This script runs in fastmode by default. Use --unixmode to run in low memory consumption mode.") + "--fastmode", + action="store_true", + help="This flag has been deprecated. This script runs in fastmode by default. Use --unixmode to run in low memory consumption mode.", + ) args = parser.parse_args() print("Verifying accuracy. 
This might take a while...") - acc_log = args.reference_accuracy + acc_log = args.reference_accuracy perf_log = args.test_accuracy if not args.unixmode: @@ -80,10 +91,10 @@ def main(): print("Reading accuracy mode results...") for sample in acc_data: - #print sample["qsl_idx"] + # print sample["qsl_idx"] qsl_idx = sample["qsl_idx"] data = sample["data"] - if data == '': + if data == "": data = "" if qsl_idx in results_dict.keys(): num_acc_log_duplicate_keys += 1 @@ -95,13 +106,28 @@ def main(): print("Reading performance mode results...") for sample in perf_data: qsl_idx = sample["qsl_idx"] - data = np.frombuffer(bytes.fromhex(sample['data']), dtype_map[args.dtype]) if py3 == True \ - else np.frombuffer(bytearray.fromhex(sample['data']), dtype_map[args.dtype]) + data = ( + np.frombuffer(bytes.fromhex( + sample["data"]), dtype_map[args.dtype]) + if py3 == True + else np.frombuffer( + bytearray.fromhex(sample["data"]), dtype_map[args.dtype] + ) + ) if qsl_idx in results_dict.keys(): num_perf_log_qsl_idx_match += 1 - data_perf = np.frombuffer(bytes.fromhex(results_dict[qsl_idx]), dtype_map[args.dtype]) \ - if py3 == True else np.frombuffer(bytearray.fromhex(results_dict[qsl_idx]), dtype_map[args.dtype]) + data_perf = ( + np.frombuffer( + bytes.fromhex( + results_dict[qsl_idx]), dtype_map[args.dtype] + ) + if py3 == True + else np.frombuffer( + bytearray.fromhex( + results_dict[qsl_idx]), dtype_map[args.dtype] + ) + ) if data_perf.size == 0 or data.size == 0: if data_perf.size != data.size: num_perf_log_data_mismatch += 1 @@ -112,13 +138,16 @@ def main(): results_dict[sample["qsl_idx"]] = sample["data"] - print("num_acc_log_entries = {:}".format(len(acc_data))) - print("num_acc_log_duplicate_keys = {:}".format(num_acc_log_duplicate_keys)) - print("num_acc_log_data_mismatch = {:}".format(num_acc_log_data_mismatch)) + print("num_acc_log_duplicate_keys = {:}".format( + num_acc_log_duplicate_keys)) + print("num_acc_log_data_mismatch = {:}".format( + num_acc_log_data_mismatch)) print("num_perf_log_entries = {:}".format(len(perf_data))) - print("num_perf_log_qsl_idx_match = {:}".format(num_perf_log_qsl_idx_match)) - print("num_perf_log_data_mismatch = {:}".format(num_perf_log_data_mismatch)) + print("num_perf_log_qsl_idx_match = {:}".format( + num_perf_log_qsl_idx_match)) + print("num_perf_log_data_mismatch = {:}".format( + num_perf_log_data_mismatch)) print("num_missing_qsl_idxs = {:}".format(num_missing_qsl_idxs)) if num_perf_log_data_mismatch == 0 and num_perf_log_qsl_idx_match > 0: print("TEST PASS\n") @@ -126,48 +155,76 @@ def main(): print("TEST FAIL\n") exit() - py33 = sys.version_info >= (3,3) + py33 = sys.version_info >= (3, 3) if not py33: print("Error: This script requires Python v3.3 or later") exit() - get_perf_lines_cmd = "wc -l " + perf_log + "| awk '{print $1}'" - num_perf_lines = int(subprocess.check_output(get_perf_lines_cmd, shell=True).decode("utf-8")) + num_perf_lines = int( + subprocess.check_output(get_perf_lines_cmd, shell=True).decode("utf-8") + ) get_acc_lines_cmd = "wc -l " + acc_log + "| awk '{print $1}'" - num_acc_lines = int(subprocess.check_output(get_acc_lines_cmd, shell=True).decode("utf-8")) + num_acc_lines = int( + subprocess.check_output(get_acc_lines_cmd, shell=True).decode("utf-8") + ) num_acc_log_entries = num_acc_lines - 2 num_perf_log_entries = num_perf_lines - 2 - #print(perf_qsl_idx) - #print(get_perf_lines_cmd) - #print(num_perf_lines) - + # print(perf_qsl_idx) + # print(get_perf_lines_cmd) + # print(num_perf_lines) + num_perf_log_data_mismatch = 0 
num_perf_log_data_match = 0 print("Each dot represents 1% completion:") for perf_line in range(0, num_perf_lines): - if perf_line % int(num_perf_lines/100) == 0: + if perf_line % int(num_perf_lines / 100) == 0: sys.stdout.write(".") sys.stdout.flush() # first and last line are brackets - if perf_line == 0 or perf_line == int(num_perf_lines)-1: + if perf_line == 0 or perf_line == int(num_perf_lines) - 1: continue # calculate md5sum of line in perf mode accuracy_log - perf_md5sum_cmd = "head -n " + str(perf_line + 1) + " " + perf_log + "| tail -n 1| sed -r 's/,//g' | sed -r 's/\"seq_id\" : \S+//g' | md5sum" - #print(perf_md5sum_cmd) - perf_md5sum = subprocess.check_output(perf_md5sum_cmd, shell=True).decode("utf-8") + perf_md5sum_cmd = ( + "head -n " + + str(perf_line + 1) + + " " + + perf_log + + "| tail -n 1| sed -r 's/,//g' | sed -r 's/\"seq_id\" : \\S+//g' | md5sum" + ) + # print(perf_md5sum_cmd) + perf_md5sum = subprocess.check_output(perf_md5sum_cmd, shell=True).decode( + "utf-8" + ) # get qsl idx - get_qsl_idx_cmd = "head -n " + str(perf_line + 1) + " " + perf_log + "| tail -n 1| awk -F\": |,\" '{print $4}'" - qsl_idx = subprocess.check_output(get_qsl_idx_cmd, shell=True).decode("utf-8").rstrip() + get_qsl_idx_cmd = ( + "head -n " + + str(perf_line + 1) + + " " + + perf_log + + "| tail -n 1| awk -F\": |,\" '{print $4}'" + ) + qsl_idx = ( + subprocess.check_output(get_qsl_idx_cmd, shell=True) + .decode("utf-8") + .rstrip() + ) # calculate md5sum of line in acc mode accuracy_log - acc_md5sum_cmd = "grep \"qsl_idx\\\" : " + qsl_idx + ",\" " + acc_log + "| sed -r 's/,//g' | sed -r 's/\"seq_id\" : \S+//g' | md5sum" - acc_md5sum = subprocess.check_output(acc_md5sum_cmd, shell=True).decode("utf-8") + acc_md5sum_cmd = ( + 'grep "qsl_idx\\" : ' + + qsl_idx + + '," ' + + acc_log + + "| sed -r 's/,//g' | sed -r 's/\"seq_id\" : \\S+//g' | md5sum" + ) + acc_md5sum = subprocess.check_output( + acc_md5sum_cmd, shell=True).decode("utf-8") if perf_md5sum != acc_md5sum: num_perf_log_data_mismatch += 1 @@ -176,12 +233,14 @@ def main(): print("") print("num_acc_log_entries = {:}".format(num_acc_log_entries)) - print("num_perf_log_data_mismatch = {:}".format(num_perf_log_data_mismatch)) + print("num_perf_log_data_mismatch = {:}".format( + num_perf_log_data_mismatch)) print("num_perf_log_entries = {:}".format(num_perf_log_entries)) if num_perf_log_data_mismatch == 0 and num_perf_log_data_match > 0: print("TEST PASS\n") else: print("TEST FAIL\n") -if __name__ == '__main__': - main() + +if __name__ == "__main__": + main() diff --git a/compliance/nvidia/TEST01/verify_performance.py b/compliance/nvidia/TEST01/verify_performance.py index 44fabddee..02ee8753b 100644 --- a/compliance/nvidia/TEST01/verify_performance.py +++ b/compliance/nvidia/TEST01/verify_performance.py @@ -13,27 +13,30 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================= +import json +import argparse import os import sys import re + sys.path.append(os.getcwd()) -import argparse -import json def main(): # Parse arguments to identify the path to the accuracy logs from # the accuracy and performance runs parser = argparse.ArgumentParser() parser.add_argument( - "--reference_summary", "-r", + "--reference_summary", + "-r", help="Specifies the path to the summary log for the performance run.", - default="" + default="", ) parser.add_argument( - "--test_summary", "-t", + "--test_summary", + "-t", help="Specifies the path to the summary log for this test.", - default="" + default="", ) args = parser.parse_args() @@ -42,68 +45,67 @@ def main(): test_file = open(args.test_summary, "r") ref_score = 0 test_score = 0 - ref_mode = '' - test_mode = '' + ref_mode = "" + test_mode = "" for line in ref_file: if re.match("Scenario", line): - ref_mode = line.split(": ",1)[1].strip() + ref_mode = line.split(": ", 1)[1].strip() continue if ref_mode == "SingleStream": if re.match(".*Early stopping 90th percentile estimate", line): - ref_score = line.split(": ",1)[1].strip() + ref_score = line.split(": ", 1)[1].strip() continue if ref_mode == "MultiStream": if re.match(".*Early stopping 99th percentile estimate", line): - ref_score = line.split(": ",1)[1].strip() + ref_score = line.split(": ", 1)[1].strip() continue if ref_mode == "Server": if re.match("Completed samples per second", line): - ref_score = line.split(": ",1)[1].strip() + ref_score = line.split(": ", 1)[1].strip() continue if re.match("target_latency (ns)", line): - ref_target_latency = line.split(": ",1)[1].strip() + ref_target_latency = line.split(": ", 1)[1].strip() continue if ref_mode == "Offline": if re.match("Samples per second", line): - ref_score = line.split(": ",1)[1].strip() + ref_score = line.split(": ", 1)[1].strip() continue if re.match("Result is", line): - valid = line.split(": ",1)[1].strip() - if valid == 'INVALID': + valid = line.split(": ", 1)[1].strip() + if valid == "INVALID": sys.exit("TEST FAIL: Reference results are invalid") - if re.match("\d+ ERROR", line): - error = line.split(" ",1)[0].strip() + if re.match("\\d+ ERROR", line): + error = line.split(" ", 1)[0].strip() print("WARNING: " + error + " ERROR reported in reference results") - for line in test_file: if re.match("Scenario", line): - test_mode = line.split(": ",1)[1].strip() + test_mode = line.split(": ", 1)[1].strip() continue if test_mode == "SingleStream": if re.match(".*Early stopping 90th percentile estimate", line): - test_score = line.split(": ",1)[1].strip() + test_score = line.split(": ", 1)[1].strip() continue if test_mode == "MultiStream": if re.match(".*Early stopping 99th percentile estimate", line): - test_score = line.split(": ",1)[1].strip() + test_score = line.split(": ", 1)[1].strip() continue if test_mode == "Server": if re.match("Completed samples per second", line): - test_score = line.split(": ",1)[1].strip() + test_score = line.split(": ", 1)[1].strip() continue if re.match("target_latency (ns)", line): - test_target_latency = line.split(": ",1)[1].strip() + test_target_latency = line.split(": ", 1)[1].strip() if test_target_latency != ref_target_latency: print("TEST FAIL: Server target latency mismatch") sys.exit() @@ -111,16 +113,16 @@ def main(): if test_mode == "Offline": if re.match("Samples per second", line): - test_score = line.split(": ",1)[1].strip() + test_score = line.split(": ", 1)[1].strip() continue if 
re.match("Result is", line): - valid = line.split(": ",1)[1].strip() - if valid == 'INVALID': + valid = line.split(": ", 1)[1].strip() + if valid == "INVALID": sys.exit("TEST FAIL: Test results are invalid") - - if re.match("\d+ ERROR", line): - error = line.split(" ",1)[0].strip() + + if re.match("\\d+ ERROR", line): + error = line.split(" ", 1)[0].strip() print("WARNING: " + error + " ERROR reported in test results") if test_mode != ref_mode: @@ -129,22 +131,23 @@ def main(): print("reference score = {}".format(ref_score)) print("test score = {}".format(test_score)) - threshold = 0.10 # In single-/multi-stream mode, latencies can be very short for high performance systems # and run-to-run variation due to external disturbances (OS) can be significant. # In this case we relax pass threshold to 20% - if (ref_mode == "SingleStream" and float(ref_score) <= 200000) or\ - (ref_mode == "MultiStream" and float(ref_score) <= 1600000): + if (ref_mode == "SingleStream" and float(ref_score) <= 200000) or ( + ref_mode == "MultiStream" and float(ref_score) <= 1600000 + ): threshold = 0.20 - - if float(test_score) < float(ref_score) * (1 + threshold) and\ - float(test_score) > float(ref_score) * (1 - threshold): + + if float(test_score) < float(ref_score) * (1 + threshold) and float( + test_score + ) > float(ref_score) * (1 - threshold): print("TEST PASS") else: print("TEST FAIL: Test score invalid") -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/compliance/nvidia/TEST04/run_verification.py b/compliance/nvidia/TEST04/run_verification.py index 027cf7aa8..647f71f82 100644 --- a/compliance/nvidia/TEST04/run_verification.py +++ b/compliance/nvidia/TEST04/run_verification.py @@ -21,26 +21,29 @@ sys.path.append(os.getcwd()) -def main(): +def main(): # Parse arguments to identify the path to the logs from # the performance runs parser = argparse.ArgumentParser() parser.add_argument( - "--results_dir", "-r", + "--results_dir", + "-r", help="Specifies the path to the corresponding results directory that contains the performance subdirectories containing the submission logs, i.e. inference_results_v0.7/closed/NVIDIA/results/T4x8/resnet/Offline.", - required=True + required=True, ) parser.add_argument( - "--compliance_dir", "-c", + "--compliance_dir", + "-c", help="Specifies the path to the directory containing the logs from the compliance test run.", - required=True + required=True, ) parser.add_argument( - "--output_dir", "-o", + "--output_dir", + "-o", help="Specifies the path to the output directory where compliance logs will be uploaded from, i.e. 
inference_results_v0.7/closed/NVIDIA/compliance/T4x8/resnet/Offline.", - required=True + required=True, ) args = parser.parse_args() @@ -51,20 +54,35 @@ def main(): output_dir = os.path.join(args.output_dir, "TEST04") # run verify performance - verify_performance_binary = os.path.join(os.path.dirname(__file__),"verify_performance.py") - verify_performance_command = "python3 " + verify_performance_binary + " -r " + results_dir + "/performance/run_1/mlperf_log_summary.txt" + " -t " + compliance_dir + "/mlperf_log_summary.txt | tee verify_performance.txt" + verify_performance_binary = os.path.join( + os.path.dirname(__file__), "verify_performance.py" + ) + verify_performance_command = ( + "python3 " + + verify_performance_binary + + " -r " + + results_dir + + "/performance/run_1/mlperf_log_summary.txt" + + " -t " + + compliance_dir + + "/mlperf_log_summary.txt | tee verify_performance.txt" + ) try: os.system(verify_performance_command) except Exception: - print("Exception occurred trying to execute:\n " + verify_performance_command) + print( + "Exception occurred trying to execute:\n " + + verify_performance_command) # check if verify performance script passes performance_pass_command = "grep PASS verify_performance.txt" try: - performance_pass = "TEST PASS" in subprocess.check_output(performance_pass_command, shell=True).decode("utf-8") + performance_pass = "TEST PASS" in subprocess.check_output( + performance_pass_command, shell=True + ).decode("utf-8") except Exception: performance_pass = False - + # setup output compliance directory structure output_performance_dir = os.path.join(output_dir, "performance", "run_1") try: @@ -74,21 +92,32 @@ def main(): print("Exception occurred trying to create " + output_performance_dir) # copy compliance logs to output compliance directory - shutil.copy2("verify_performance.txt",output_dir) - summary_file = os.path.join(compliance_dir,"mlperf_log_summary.txt") - detail_file = os.path.join(compliance_dir,"mlperf_log_detail.txt") + shutil.copy2("verify_performance.txt", output_dir) + summary_file = os.path.join(compliance_dir, "mlperf_log_summary.txt") + detail_file = os.path.join(compliance_dir, "mlperf_log_detail.txt") try: - shutil.copy2(summary_file,output_performance_dir) + shutil.copy2(summary_file, output_performance_dir) except Exception: - print("Exception occured trying to copy " + summary_file + " to " + output_performance_dir) + print( + "Exception occured trying to copy " + + summary_file + + " to " + + output_performance_dir + ) try: - shutil.copy2(detail_file,output_performance_dir) + shutil.copy2(detail_file, output_performance_dir) except Exception: - print("Exception occured trying to copy " + detail_file + " to " + output_performance_dir) + print( + "Exception occured trying to copy " + + detail_file + + " to " + + output_performance_dir + ) print("Performance check pass: {:}".format(performance_pass)) print("TEST04 verification complete") -if __name__ == '__main__': - main() + +if __name__ == "__main__": + main() diff --git a/compliance/nvidia/TEST04/verify_performance.py b/compliance/nvidia/TEST04/verify_performance.py index 0aec33ace..861ec609c 100644 --- a/compliance/nvidia/TEST04/verify_performance.py +++ b/compliance/nvidia/TEST04/verify_performance.py @@ -13,26 +13,29 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================= +import argparse import os import sys import re + sys.path.append(os.getcwd()) -import argparse def main(): # Parse arguments to identify the path to the accuracy logs from # the accuracy and performance runs parser = argparse.ArgumentParser() parser.add_argument( - "--reference_summary", "-r", + "--reference_summary", + "-r", help="Specifies the path to the summary log for the performance run.", - default="" + default="", ) parser.add_argument( - "--test_summary", "-t", + "--test_summary", + "-t", help="Specifies the path to the summary log for this test.", - default="" + default="", ) args = parser.parse_args() @@ -41,72 +44,71 @@ def main(): test_file = open(args.test_summary, "r") ref_score = 0 test_score = 0 - ref_mode = '' - test_mode = '' + ref_mode = "" + test_mode = "" for line in ref_file: if re.match("Scenario", line): - ref_mode = line.split(": ",1)[1].strip() + ref_mode = line.split(": ", 1)[1].strip() continue if ref_mode == "SingleStream": if re.match(".*Early stopping 90th percentile estimate", line): - ref_score = line.split(": ",1)[1].strip() + ref_score = line.split(": ", 1)[1].strip() ref_score = 1e9 / float(ref_score) continue if ref_mode == "MultiStream": if re.match(".*Early stopping 99th percentile estimate", line): - ref_score = line.split(": ",1)[1].strip() + ref_score = line.split(": ", 1)[1].strip() ref_score = 1e9 / float(ref_score) continue if ref_mode == "Server": if re.match("Completed samples per second", line): - ref_score = line.split(": ",1)[1].strip() + ref_score = line.split(": ", 1)[1].strip() continue if re.match("target_latency (ns)", line): - ref_target_latency = line.split(": ",1)[1].strip() + ref_target_latency = line.split(": ", 1)[1].strip() continue if ref_mode == "Offline": if re.match("Samples per second", line): - ref_score = line.split(": ",1)[1].strip() + ref_score = line.split(": ", 1)[1].strip() continue if re.match("Result is", line): - valid = line.split(": ",1)[1].strip() - if valid == 'INVALID': + valid = line.split(": ", 1)[1].strip() + if valid == "INVALID": sys.exit("TEST FAIL: Reference results are invalid") - if re.match("\d+ ERROR", line): - error = line.split(" ",1)[0].strip() + if re.match("\\d+ ERROR", line): + error = line.split(" ", 1)[0].strip() print("WARNING: " + error + " ERROR reported in reference results") - for line in test_file: if re.match("Scenario", line): - test_mode = line.split(": ",1)[1].strip() + test_mode = line.split(": ", 1)[1].strip() continue if test_mode == "SingleStream": if re.match(".*Early stopping 90th percentile estimate", line): - test_score = line.split(": ",1)[1].strip() + test_score = line.split(": ", 1)[1].strip() test_score = 1e9 / float(test_score) continue if test_mode == "MultiStream": if re.match(".*Early stopping 99th percentile estimate", line): - test_score = line.split(": ",1)[1].strip() + test_score = line.split(": ", 1)[1].strip() test_score = 1e9 / float(test_score) continue if test_mode == "Server": if re.match("Completed samples per second", line): - test_score = line.split(": ",1)[1].strip() + test_score = line.split(": ", 1)[1].strip() continue if re.match("target_latency (ns)", line): - test_target_latency = line.split(": ",1)[1].strip() + test_target_latency = line.split(": ", 1)[1].strip() if test_target_latency != ref_target_latency: print("TEST FAIL: Server target latency mismatch") sys.exit() @@ -114,16 +116,16 @@ def main(): if test_mode == "Offline": if re.match("Samples per second", 
line): - test_score = line.split(": ",1)[1].strip() + test_score = line.split(": ", 1)[1].strip() continue if re.match("Result is", line): - valid = line.split(": ",1)[1].strip() - if valid == 'INVALID': + valid = line.split(": ", 1)[1].strip() + if valid == "INVALID": sys.exit("TEST FAIL: Test results are invalid") - - if re.match("\d+ ERROR", line): - error = line.split(" ",1)[0].strip() + + if re.match("\\d+ ERROR", line): + error = line.split(" ", 1)[0].strip() print("WARNING: " + error + " ERROR reported in test results") if test_mode != ref_mode: @@ -132,21 +134,21 @@ def main(): print("reference score = {}".format(ref_score)) print("test score = {}".format(test_score)) - threshold = 0.10 # In single-/multi-stream mode, latencies can be very short for high performance systems # and run-to-run variation due to external disturbances (OS) can be significant. # In this case we relax pass threshold to 20% - if (ref_mode == "SingleStream" and float(ref_score) <= 200000) or\ - (ref_mode == "MultiStream" and float(ref_score) <= 1600000): + if (ref_mode == "SingleStream" and float(ref_score) <= 200000) or ( + ref_mode == "MultiStream" and float(ref_score) <= 1600000 + ): threshold = 0.20 - + if float(test_score) < float(ref_score) * (1 + threshold): print("TEST PASS") else: print("TEST FAIL: Test score invalid") -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/compliance/nvidia/TEST05/run_verification.py b/compliance/nvidia/TEST05/run_verification.py index 7bf32fabc..804155187 100644 --- a/compliance/nvidia/TEST05/run_verification.py +++ b/compliance/nvidia/TEST05/run_verification.py @@ -24,26 +24,30 @@ sys.path.append(os.getcwd()) -def main(): +def main(): - py3 = sys.version_info >= (3,0) - # Parse arguments to identify the path to the logs from the performance runs + py3 = sys.version_info >= (3, 0) + # Parse arguments to identify the path to the logs from the performance + # runs parser = argparse.ArgumentParser() parser.add_argument( - "--results_dir", "-r", + "--results_dir", + "-r", help="Specifies the path to the corresponding results directory that contains the performance subdirectories containing the submission logs, i.e. inference_results_v0.7/closed/NVIDIA/results/T4x8/resnet/Offline.", - required=True + required=True, ) parser.add_argument( - "--compliance_dir", "-c", + "--compliance_dir", + "-c", help="Specifies the path to the directory containing the logs from the compliance test run.", - required=True + required=True, ) parser.add_argument( - "--output_dir", "-o", + "--output_dir", + "-o", help="Specifies the path to the output directory where compliance logs will be uploaded from, i.e. 
inference_results_v0.7/closed/NVIDIA/compliance/T4x8/resnet/Offline.", - required=True + required=True, ) args = parser.parse_args() @@ -54,20 +58,35 @@ def main(): output_dir = os.path.join(args.output_dir, "TEST05") # run verify performance - verify_performance_binary = os.path.join(os.path.dirname(__file__),"verify_performance.py") - verify_performance_command = "python3 " + verify_performance_binary + " -r " + results_dir + "/performance/run_1/mlperf_log_summary.txt" + " -t " + compliance_dir + "/mlperf_log_summary.txt | tee verify_performance.txt" + verify_performance_binary = os.path.join( + os.path.dirname(__file__), "verify_performance.py" + ) + verify_performance_command = ( + "python3 " + + verify_performance_binary + + " -r " + + results_dir + + "/performance/run_1/mlperf_log_summary.txt" + + " -t " + + compliance_dir + + "/mlperf_log_summary.txt | tee verify_performance.txt" + ) try: os.system(verify_performance_command) except Exception: - print("Exception occurred trying to execute:\n " + verify_performance_command) + print( + "Exception occurred trying to execute:\n " + + verify_performance_command) # check if verify performance script passes performance_pass_command = "grep PASS verify_performance.txt" try: - performance_pass = "TEST PASS" in subprocess.check_output(performance_pass_command, shell=True).decode("utf-8") + performance_pass = "TEST PASS" in subprocess.check_output( + performance_pass_command, shell=True + ).decode("utf-8") except Exception: performance_pass = False - + # setup output compliance directory structure output_performance_dir = os.path.join(output_dir, "performance", "run_1") try: @@ -77,21 +96,32 @@ def main(): print("Exception occurred trying to create " + output_performance_dir) # copy compliance logs to output compliance directory - shutil.copy2("verify_performance.txt",output_dir) - summary_file = os.path.join(compliance_dir,"mlperf_log_summary.txt") - detail_file = os.path.join(compliance_dir,"mlperf_log_detail.txt") + shutil.copy2("verify_performance.txt", output_dir) + summary_file = os.path.join(compliance_dir, "mlperf_log_summary.txt") + detail_file = os.path.join(compliance_dir, "mlperf_log_detail.txt") try: - shutil.copy2(summary_file,output_performance_dir) + shutil.copy2(summary_file, output_performance_dir) except Exception: - print("Exception occured trying to copy " + summary_file + " to " + output_performance_dir) + print( + "Exception occured trying to copy " + + summary_file + + " to " + + output_performance_dir + ) try: - shutil.copy2(detail_file,output_performance_dir) + shutil.copy2(detail_file, output_performance_dir) except Exception: - print("Exception occured trying to copy " + detail_file + " to " + output_performance_dir) + print( + "Exception occured trying to copy " + + detail_file + + " to " + + output_performance_dir + ) print("Performance check pass: {:}".format(performance_pass)) print("TEST05 verification complete") -if __name__ == '__main__': - main() + +if __name__ == "__main__": + main() diff --git a/compliance/nvidia/TEST05/verify_performance.py b/compliance/nvidia/TEST05/verify_performance.py index 763737f77..864a8af44 100644 --- a/compliance/nvidia/TEST05/verify_performance.py +++ b/compliance/nvidia/TEST05/verify_performance.py @@ -13,27 +13,30 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================= +import json +import argparse import os import sys import re + sys.path.append(os.getcwd()) -import argparse -import json def main(): # Parse arguments to identify the path to the accuracy logs from # the accuracy and performance runs parser = argparse.ArgumentParser() parser.add_argument( - "--reference_summary", "-r", + "--reference_summary", + "-r", help="Specifies the path to the summary log for the performance run.", - default="" + default="", ) parser.add_argument( - "--test_summary", "-t", + "--test_summary", + "-t", help="Specifies the path to the summary log for this test.", - default="" + default="", ) args = parser.parse_args() @@ -42,68 +45,67 @@ def main(): test_file = open(args.test_summary, "r") ref_score = 0 test_score = 0 - ref_mode = '' - test_mode = '' + ref_mode = "" + test_mode = "" for line in ref_file: if re.match("Scenario", line): - ref_mode = line.split(": ",1)[1].strip() + ref_mode = line.split(": ", 1)[1].strip() continue if ref_mode == "SingleStream": if re.match(".*Early stopping 90th percentile estimate", line): - ref_score = line.split(": ",1)[1].strip() + ref_score = line.split(": ", 1)[1].strip() continue if ref_mode == "MultiStream": if re.match(".*Early stopping 99th percentile estimate", line): - ref_score = line.split(": ",1)[1].strip() + ref_score = line.split(": ", 1)[1].strip() continue if ref_mode == "Server": if re.match("Completed samples per second", line): - ref_score = line.split(": ",1)[1].strip() + ref_score = line.split(": ", 1)[1].strip() continue if re.match("target_latency (ns)", line): - ref_target_latency = line.split(": ",1)[1].strip() + ref_target_latency = line.split(": ", 1)[1].strip() continue if ref_mode == "Offline": if re.match("Samples per second", line): - ref_score = line.split(": ",1)[1].strip() + ref_score = line.split(": ", 1)[1].strip() continue if re.match("Result is", line): - valid = line.split(": ",1)[1].strip() - if valid == 'INVALID': + valid = line.split(": ", 1)[1].strip() + if valid == "INVALID": sys.exit("TEST FAIL: Reference results are invalid") - if re.match("\d+ ERROR", line): - error = line.split(" ",1)[0].strip() + if re.match("\\d+ ERROR", line): + error = line.split(" ", 1)[0].strip() print("WARNING: " + error + " ERROR reported in reference results") - for line in test_file: if re.match("Scenario", line): - test_mode = line.split(": ",1)[1].strip() + test_mode = line.split(": ", 1)[1].strip() continue if test_mode == "SingleStream": if re.match(".*Early stopping 90th percentile estimate", line): - test_score = line.split(": ",1)[1].strip() + test_score = line.split(": ", 1)[1].strip() continue if test_mode == "MultiStream": if re.match(".*Early stopping 99th percentile estimate", line): - test_score = line.split(": ",1)[1].strip() + test_score = line.split(": ", 1)[1].strip() continue if test_mode == "Server": if re.match("Completed samples per second", line): - test_score = line.split(": ",1)[1].strip() + test_score = line.split(": ", 1)[1].strip() continue if re.match("target_latency (ns)", line): - test_target_latency = line.split(": ",1)[1].strip() + test_target_latency = line.split(": ", 1)[1].strip() if test_target_latency != ref_target_latency: print("TEST FAIL: Server target latency mismatch") sys.exit() @@ -111,16 +113,16 @@ def main(): if test_mode == "Offline": if re.match("Samples per second", line): - test_score = line.split(": ",1)[1].strip() + test_score = line.split(": ", 1)[1].strip() continue if 
re.match("Result is", line): - valid = line.split(": ",1)[1].strip() - if valid == 'INVALID': + valid = line.split(": ", 1)[1].strip() + if valid == "INVALID": sys.exit("TEST FAIL: Test results are invalid") - - if re.match("\d+ ERROR", line): - error = line.split(" ",1)[0].strip() + + if re.match("\\d+ ERROR", line): + error = line.split(" ", 1)[0].strip() print("WARNING: " + error + " ERROR reported in test results") if test_mode != ref_mode: @@ -129,21 +131,27 @@ def main(): print("reference score = {}".format(ref_score)) print("test score = {}".format(test_score)) - threshold = 0.05 # In single-/multi-stream mode, latencies can be very short for high performance systems # and run-to-run variation due to external disturbances (OS) can be significant. # In this case we relax pass threshold to 20% - if (ref_mode == "SingleStream" and float(ref_score) <= 200000) or\ - (ref_mode == "MultiStream" and float(ref_score) <= 1600000): + if (ref_mode == "SingleStream" and float(ref_score) <= 200000) or ( + ref_mode == "MultiStream" and float(ref_score) <= 1600000 + ): threshold = 0.20 - - if (ref_mode in [ "Offline", "Server" ] and float(test_score) > float(ref_score) * (1 - threshold)) or ("Stream" in ref_mode and float(test_score) < float(ref_score) * (1 + threshold)): + + if ( + ref_mode in ["Offline", "Server"] + and float(test_score) > float(ref_score) * (1 - threshold) + ) or ( + "Stream" in ref_mode and float(test_score) < float( + ref_score) * (1 + threshold) + ): print("TEST PASS") else: print("TEST FAIL: Test score invalid") -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/compliance/nvidia/TEST06/run_verification.py b/compliance/nvidia/TEST06/run_verification.py index 03c60c2ef..ece424121 100644 --- a/compliance/nvidia/TEST06/run_verification.py +++ b/compliance/nvidia/TEST06/run_verification.py @@ -26,47 +26,67 @@ "int64": np.int64, "int32": np.int32, "int16": np.int16, - "float32": np.float32 + "float32": np.float32, } + def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--compliance_dir", "-c", - help="Specifies the path to the directory containing the logs from the compliance test run.", - required=True) - parser.add_argument("--output_dir", "-o", - help="Specifies the path to the output directory where compliance logs will be uploaded from, i.e. inference_results_v0.7/closed/NVIDIA/compliance/T4x8/resnet/Offline.", - required=True) - parser.add_argument("--dtype", "-d", default="int64", choices=["int64", "int32", "int16", "float32"]) - parser.add_argument("--scenario", "-s", required=True, choices=["Offline", "Server", "SingleStream", "MultiStream"]) + parser.add_argument( + "--compliance_dir", + "-c", + help="Specifies the path to the directory containing the logs from the compliance test run.", + required=True, + ) + parser.add_argument( + "--output_dir", + "-o", + help="Specifies the path to the output directory where compliance logs will be uploaded from, i.e. 
inference_results_v0.7/closed/NVIDIA/compliance/T4x8/resnet/Offline.", + required=True, + ) + parser.add_argument( + "--dtype", "-d", default="int64", choices=["int64", "int32", "int16", "float32"] + ) + parser.add_argument( + "--scenario", + "-s", + required=True, + choices=["Offline", "Server", "SingleStream", "MultiStream"], + ) args = parser.parse_args() return args + def eos_check(acc_data, dtype): for sample in acc_data: data = np.frombuffer(bytes.fromhex(sample["data"]), dtype=dtype) i = data.shape[0] - 1 n_eos_tokens = 0 - while (i > 0): + while i > 0: if data[i] == EOS_TOKEN: n_eos_tokens += 1 if n_eos_tokens >= 2: return False if data[i] != EOS_TOKEN: break - i-=1 + i -= 1 return True + def first_token_check(acc_data, dtype): for sample in acc_data: data = np.frombuffer(bytes.fromhex(sample["data"]), dtype=dtype) - token_data = np.frombuffer(bytes.fromhex(sample["token_data"]), dtype=dtype) + token_data = np.frombuffer( + bytes.fromhex( + sample["token_data"]), + dtype=dtype) for t1, t2 in zip(data, token_data): if t1 != t2: return False - + return True + def sample_len_check(acc_data, dtype): for sample in acc_data: data = np.frombuffer(bytes.fromhex(sample["data"]), dtype=dtype) @@ -78,22 +98,25 @@ def sample_len_check(acc_data, dtype): def main(): args = get_args() - accuracy_file = os.path.join(args.compliance_dir, "mlperf_log_accuracy.json") - + accuracy_file = os.path.join( + args.compliance_dir, + "mlperf_log_accuracy.json") + with open(accuracy_file, "r") as acc_json: acc_data = json.load(acc_json) - + try: eos_pass = eos_check(acc_data, DTYPE_MAP[args.dtype]) except Exception: print("Unexpected error occured while doing the EOS check") eos_pass = False - need_first_token_check = (args.scenario != "Offline") + need_first_token_check = args.scenario != "Offline" first_token_pass = True if need_first_token_check: try: - first_token_pass = first_token_check(acc_data, DTYPE_MAP[args.dtype]) + first_token_pass = first_token_check( + acc_data, DTYPE_MAP[args.dtype]) except Exception: print("Unexpected error occured while doing the first token check") first_token_pass = False @@ -107,7 +130,7 @@ def main(): output += f"First token check pass: {first_token_pass}\n" else: output += f"First token check pass: Skipped\n" - + # Add EOS check output += f"EOS check pass: {eos_pass}\n" @@ -122,7 +145,7 @@ def main(): # Output test output to console and folder output_dir = os.path.join(args.output_dir, "TEST06") output_accuracy_dir = os.path.join(output_dir, "accuracy") - + if not os.path.isdir(output_dir): os.makedirs(output_dir) if not os.path.isdir(output_accuracy_dir): @@ -132,10 +155,16 @@ def main(): f.write(output) try: - shutil.copy2(accuracy_file,output_accuracy_dir) + shutil.copy2(accuracy_file, output_accuracy_dir) except Exception: - print("Exception occured trying to copy " + accuracy_file + " to " + output_accuracy_dir) + print( + "Exception occured trying to copy " + + accuracy_file + + " to " + + output_accuracy_dir + ) print(output) + if __name__ == "__main__": main() diff --git a/language/bert/accuracy-squad.py b/language/bert/accuracy-squad.py index 3f3a01c08..780cd5a9a 100644 --- a/language/bert/accuracy-squad.py +++ b/language/bert/accuracy-squad.py @@ -15,6 +15,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from create_squad_data import convert_examples_to_features, read_squad_examples +import tokenization import argparse import collections import json @@ -36,30 +38,39 @@ installed = {pkg.key for pkg in pkg_resources.working_set} if "tensorflow" in installed: import tensorflow + sys.path.insert( - 0, os.path.join( + 0, + os.path.join( os.path.dirname(__file__), - "DeepLearningExamples", "TensorFlow", "LanguageModeling", "BERT" - ) + "DeepLearningExamples", + "TensorFlow", + "LanguageModeling", + "BERT", + ), ) elif "torch" in installed: import torch + sys.path.insert( - 0, os.path.join( + 0, + os.path.join( os.path.dirname(__file__), - "DeepLearningExamples", "PyTorch", "LanguageModeling", "BERT" - ) + "DeepLearningExamples", + "PyTorch", + "LanguageModeling", + "BERT", + ), ) -import tokenization -from create_squad_data import convert_examples_to_features, read_squad_examples max_seq_length = 384 max_query_length = 64 doc_stride = 128 RawResult = collections.namedtuple( - "RawResult", ["unique_id", "start_logits", "end_logits"]) + "RawResult", ["unique_id", "start_logits", "end_logits"] +) dtype_map = { "int8": np.int8, @@ -68,7 +79,7 @@ "int64": np.int64, "float16": np.float16, "float32": np.float32, - "float64": np.float64 + "float64": np.float64, } @@ -103,7 +114,7 @@ def get_final_text(pred_text, orig_text, do_lower_case): def _strip_spaces(text): ns_chars = [] ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): + for i, c in enumerate(text): if c == " ": continue ns_to_s_map[len(ns_chars)] = i @@ -133,7 +144,7 @@ def _strip_spaces(text): # We then project the characters in `pred_text` back to `orig_text` using # the character-to-character alignment. tok_s_to_ns_map = {} - for (i, tok_index) in six.iteritems(tok_ns_to_s_map): + for i, tok_index in six.iteritems(tok_ns_to_s_map): tok_s_to_ns_map[tok_index] = i orig_start_position = None @@ -154,14 +165,16 @@ def _strip_spaces(text): if orig_end_position is None: return orig_text - output_text = orig_text[orig_start_position:(orig_end_position + 1)] + output_text = orig_text[orig_start_position: (orig_end_position + 1)] return output_text def _get_best_indexes(logits, n_best_size): """Get the n-best logits from a list.""" index_and_score = sorted( - enumerate(logits), key=lambda x: x[1], reverse=True) + enumerate(logits), + key=lambda x: x[1], + reverse=True) best_indexes = [] for i in range(len(index_and_score)): @@ -194,8 +207,16 @@ def _compute_softmax(scores): return probs -def write_predictions(all_examples, all_features, all_results, n_best_size, - max_answer_length, do_lower_case, output_prediction_file, max_examples=None): +def write_predictions( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + do_lower_case, + output_prediction_file, + max_examples=None, +): """Write final predictions to the json file and log-odds of null if needed.""" print("Writing predictions to: %s" % (output_prediction_file)) @@ -209,13 +230,14 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"], + ) all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() - for (example_index, example) in enumerate(all_examples): + for example_index, example in 
enumerate(all_examples): if max_examples and example_index == max_examples: break @@ -227,7 +249,7 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, min_null_feature_index = 0 # the paragraph slice with min mull score null_start_logit = 0 # the start logit at the slice with min null score null_end_logit = 0 # the end logit at the slice with min null score - for (feature_index, feature) in enumerate(features): + for feature_index, feature in enumerate(features): # FIX: During compliance/audit runs, we only generate a small subset of # all entries from the dataset. As a result, sometimes dict retrieval # fails because a key is missing. @@ -237,7 +259,8 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, continue start_indexes = _get_best_indexes(result.start_logits, n_best_size) end_indexes = _get_best_indexes(result.end_logits, n_best_size) - # if we could have irrelevant answers, get the min score of irrelevant + # if we could have irrelevant answers, get the min score of + # irrelevant for start_index in start_indexes: for end_index in end_indexes: # We could hypothetically create invalid predictions, e.g., predict @@ -251,7 +274,8 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, continue if end_index not in feature.token_to_orig_map: continue - if not feature.token_is_max_context.get(start_index, False): + if not feature.token_is_max_context.get( + start_index, False): continue if end_index < start_index: continue @@ -264,15 +288,19 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, start_index=start_index, end_index=end_index, start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) + end_logit=result.end_logits[end_index], + ) + ) prelim_predictions = sorted( prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), - reverse=True) + reverse=True, + ) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"]) + "NbestPrediction", ["text", "start_logit", "end_logit"] + ) seen_predictions = {} nbest = [] @@ -280,10 +308,11 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, if len(nbest) >= n_best_size: break feature = features[pred.feature_index] - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index: (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start: ( + orig_doc_end + 1)] tok_text = " ".join(tok_tokens) # De-tokenize WordPieces that have been split off. @@ -305,13 +334,18 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, _NbestPrediction( text=final_text, start_logit=pred.start_logit, - end_logit=pred.end_logit)) + end_logit=pred.end_logit, + ) + ) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. 
if not nbest: nbest.append( - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + _NbestPrediction( + text="empty", + start_logit=0.0, + end_logit=0.0)) assert len(nbest) >= 1 @@ -326,7 +360,7 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, probs = _compute_softmax(total_scores) nbest_json = [] - for (i, entry) in enumerate(nbest): + for i, entry in enumerate(nbest): output = collections.OrderedDict() output["text"] = entry.text output["probability"] = probs[i] @@ -342,7 +376,9 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, writer.write(json.dumps(all_predictions, indent=4) + "\n") -def load_loadgen_log(log_path, eval_features, dtype=np.float32, output_transposed=False): +def load_loadgen_log( + log_path, eval_features, dtype=np.float32, output_transposed=False +): with open(log_path) as f: predictions = json.load(f) @@ -350,23 +386,27 @@ def load_loadgen_log(log_path, eval_features, dtype=np.float32, output_transpose for prediction in predictions: qsl_idx = prediction["qsl_idx"] if output_transposed: - logits = np.frombuffer(bytes.fromhex( - prediction["data"]), dtype).reshape(2, -1) + logits = np.frombuffer(bytes.fromhex(prediction["data"]), dtype).reshape( + 2, -1 + ) logits = np.transpose(logits) else: - logits = np.frombuffer(bytes.fromhex( - prediction["data"]), dtype).reshape(-1, 2) + logits = np.frombuffer(bytes.fromhex(prediction["data"]), dtype).reshape( + -1, 2 + ) # Pad logits to max_seq_length seq_length = logits.shape[0] start_logits = np.ones(max_seq_length) * -10000.0 end_logits = np.ones(max_seq_length) * -10000.0 start_logits[:seq_length] = logits[:, 0] end_logits[:seq_length] = logits[:, 1] - results.append(RawResult( - unique_id=eval_features[qsl_idx].unique_id, - start_logits=start_logits.tolist(), - end_logits=end_logits.tolist() - )) + results.append( + RawResult( + unique_id=eval_features[qsl_idx].unique_id, + start_logits=start_logits.tolist(), + end_logits=end_logits.tolist(), + ) + ) return results @@ -374,38 +414,62 @@ def load_loadgen_log(log_path, eval_features, dtype=np.float32, output_transpose def main(): parser = argparse.ArgumentParser() parser.add_argument( - "--vocab_file", default="build/data/bert_tf_v1_1_large_fp32_384_v2/vocab.txt", help="Path to vocab.txt") + "--vocab_file", + default="build/data/bert_tf_v1_1_large_fp32_384_v2/vocab.txt", + help="Path to vocab.txt", + ) + parser.add_argument( + "--val_data", default="build/data/dev-v1.1.json", help="Path to validation data" + ) + parser.add_argument( + "--log_file", + default="build/logs/mlperf_log_accuracy.json", + help="Path to LoadGen accuracy log", + ) + parser.add_argument( + "--out_file", + default="build/result/predictions.json", + help="Path to output predictions file", + ) + parser.add_argument( + "--features_cache_file", + default="eval_features.pickle", + help="Path to features' cache file", + ) + parser.add_argument( + "--output_transposed", action="store_true", help="Transpose the output" + ) + parser.add_argument( + "--output_dtype", + default="float32", + choices=dtype_map.keys(), + help="Output data type", + ) parser.add_argument( - "--val_data", default="build/data/dev-v1.1.json", help="Path to validation data") - parser.add_argument("--log_file", default="build/logs/mlperf_log_accuracy.json", - help="Path to LoadGen accuracy log") - parser.add_argument("--out_file", default="build/result/predictions.json", - help="Path to output predictions file") - parser.add_argument("--features_cache_file", - 
default="eval_features.pickle", help="Path to features' cache file") - parser.add_argument("--output_transposed", - action="store_true", help="Transpose the output") - parser.add_argument("--output_dtype", default="float32", - choices=dtype_map.keys(), help="Output data type") - parser.add_argument("--max_examples", type=int, - help="Maximum number of examples to consider (not limited by default)") + "--max_examples", + type=int, + help="Maximum number of examples to consider (not limited by default)", + ) args = parser.parse_args() output_dtype = dtype_map[args.output_dtype] print("Reading examples...") - eval_examples = read_squad_examples(input_file=args.val_data, - is_training=False, version_2_with_negative=False) + eval_examples = read_squad_examples( + input_file=args.val_data, is_training=False, version_2_with_negative=False + ) eval_features = [] # Load features if cached, convert from examples otherwise. cache_path = args.features_cache_file if os.path.exists(cache_path): print("Loading cached features from '%s'..." % cache_path) - with open(cache_path, 'rb') as cache_file: + with open(cache_path, "rb") as cache_file: eval_features = pickle.load(cache_file) else: - print("No cached features at '%s'... converting from examples..." % cache_path) + print( + "No cached features at '%s'... converting from examples..." % + cache_path) print("Creating tokenizer...") tokenizer = BertTokenizer(args.vocab_file) @@ -423,25 +487,38 @@ def append_feature(feature): max_query_length=max_query_length, is_training=False, output_fn=append_feature, - verbose_logging=False) + verbose_logging=False, + ) print("Caching features at '%s'..." % cache_path) - with open(cache_path, 'wb') as cache_file: + with open(cache_path, "wb") as cache_file: pickle.dump(eval_features, cache_file) print("Loading LoadGen logs...") results = load_loadgen_log( - args.log_file, eval_features, output_dtype, args.output_transposed) + args.log_file, eval_features, output_dtype, args.output_transposed + ) print("Post-processing predictions...") - write_predictions(eval_examples, eval_features, results, - 20, 30, True, args.out_file, args.max_examples) + write_predictions( + eval_examples, + eval_features, + results, + 20, + 30, + True, + args.out_file, + args.max_examples, + ) print("Evaluating predictions...") cmd = "python3 {:}/evaluate_v1.1.py {:} {:} {}".format( - os.path.dirname(os.path.abspath(__file__)), args.val_data, - args.out_file, '--max_examples {}'.format( - args.max_examples) if args.max_examples else '') + os.path.dirname(os.path.abspath(__file__)), + args.val_data, + args.out_file, + "--max_examples {}".format( + args.max_examples) if args.max_examples else "", + ) subprocess.check_call(cmd, shell=True) diff --git a/language/bert/bert_QDL.py b/language/bert/bert_QDL.py index 87d93b4fc..6e804a799 100644 --- a/language/bert/bert_QDL.py +++ b/language/bert/bert_QDL.py @@ -5,7 +5,7 @@ # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -14,18 +14,19 @@ # ============================================================================= +from time import sleep +import squad_QSL +import mlperf_loadgen as lg +import numpy as np import threading import requests import array import time import os import sys + sys.path.insert(0, os.getcwd()) -import numpy as np -import mlperf_loadgen as lg -import squad_QSL -from time import sleep class bert_QDL: """QDL acting as a proxy to the SUT. @@ -44,10 +45,11 @@ def __init__(self, qsl: squad_QSL.SQuAD_v1_QSL, sut_server_addr: list): """ self.qsl = qsl self.quantized = False - + # Construct QDL from the python binding self.qdl = lg.ConstructQDL( - self.issue_query, self.flush_queries, self.client_get_name) + self.issue_query, self.flush_queries, self.client_get_name + ) self.sut_server_addr = sut_server_addr self.num_nodes = len(sut_server_addr) @@ -57,8 +59,9 @@ def __init__(self, qsl: squad_QSL.SQuAD_v1_QSL, sut_server_addr: list): def issue_query(self, query_samples): """Process the query to send to the SUT""" - threading.Thread(target=self.process_query_async, - args=[query_samples]).start() + threading.Thread( + target=self.process_query_async, + args=[query_samples]).start() def flush_queries(self): """Flush the queries. Dummy implementation.""" @@ -77,22 +80,26 @@ def process_query_async(self, query_samples): query_samples: A list of QuerySample objects. """ - max_num_threads = int(os.environ.get('CM_MAX_NUM_THREADS', os.cpu_count())) + max_num_threads = int( + os.environ.get( + "CM_MAX_NUM_THREADS", + os.cpu_count())) for i in range(len(query_samples)): eval_features = self.qsl.get_features(query_samples[i].index) encoded_eval_features = { - "input_ids": eval_features.input_ids, - "input_mask": eval_features.input_mask, - "segment_ids": eval_features.segment_ids - } + "input_ids": eval_features.input_ids, + "input_mask": eval_features.input_mask, + "segment_ids": eval_features.segment_ids, + } n = threading.active_count() while n >= max_num_threads: sleep(0.0001) n = threading.active_count() - threading.Thread(target=self.client_predict_worker, - args=[encoded_eval_features, query_samples[i].id]).start() - + threading.Thread( + target=self.client_predict_worker, + args=[encoded_eval_features, query_samples[i].id], + ).start() def get_sut_id_round_robin(self): """Get the SUT id in round robin.""" @@ -103,10 +110,11 @@ def get_sut_id_round_robin(self): def client_predict_worker(self, query, query_id): """Serialize the query, send it to the SUT in round robin, and return the deserialized response.""" - url = '{}/predict/'.format(self.sut_server_addr[self.get_sut_id_round_robin()]) + url = "{}/predict/".format( + self.sut_server_addr[self.get_sut_id_round_robin()]) responses = [] - response = requests.post(url, json={'query': query}) - output = response.json()['result'] + response = requests.post(url, json={"query": query}) + output = response.json()["result"] output = np.array(output).astype(np.float32) response_array = array.array("B", output.tobytes()) bi = response_array.buffer_info() @@ -117,12 +125,14 @@ def client_predict_worker(self, query, query_id): def client_get_name(self): """Get the name of the SUT from ALL the SUTS.""" if len(self.sut_server_addr) == 1: - return requests.post(f'{self.sut_server_addr[0]}/getname/').json()['name'] - - sut_names = [requests.post(f'{addr}/getname/').json()['name'] for addr in self.sut_server_addr] - return "Multi-node SUT: " + ', '.join(sut_names) + return requests.post( + f"{self.sut_server_addr[0]}/getname/").json()["name"] + + sut_names = 
[ + requests.post(f"{addr}/getname/").json()["name"] + for addr in self.sut_server_addr + ] + return "Multi-node SUT: " + ", ".join(sut_names) def __del__(self): lg.DestroyQDL(self.qdl) - - diff --git a/language/bert/bert_tf_to_pytorch.py b/language/bert/bert_tf_to_pytorch.py index 4ec6d9ad6..76cd269a6 100644 --- a/language/bert/bert_tf_to_pytorch.py +++ b/language/bert/bert_tf_to_pytorch.py @@ -15,6 +15,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import tensorflow as tf +from transformers import BertConfig, BertTokenizer, BertForQuestionAnswering +import torch +import numpy as np import collections import json import math @@ -25,19 +29,21 @@ import sys import time -sys.path.insert(0, os.path.join(os.getcwd(), "DeepLearningExamples", "TensorFlow", "LanguageModeling", "BERT")) +sys.path.insert( + 0, + os.path.join( + os.getcwd(), "DeepLearningExamples", "TensorFlow", "LanguageModeling", "BERT" + ), +) sys.path.insert(0, os.getcwd()) -import numpy as np -import torch -from transformers import BertConfig, BertTokenizer, BertForQuestionAnswering -import tensorflow as tf def load_from_tf(config, tf_path): model = BertForQuestionAnswering(config) model.classifier = model.qa_outputs - # This part is copied from HuggingFace Transformers with a fix to bypass an error + # This part is copied from HuggingFace Transformers with a fix to bypass + # an error init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] @@ -67,7 +73,9 @@ def load_from_tf(config, tf_path): elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") elif scope_names[0] == "squad": - pointer = getattr(pointer, "classifier") # This line is causing the issue + pointer = getattr( + pointer, "classifier" + ) # This line is causing the issue else: try: pointer = getattr(pointer, scope_names[0]) @@ -93,8 +101,11 @@ def load_from_tf(config, tf_path): del model.classifier return model + def save_to_onnx(model): - tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") + tokenizer = BertTokenizer.from_pretrained( + "bert-large-uncased-whole-word-masking-finetuned-squad" + ) model.eval() dummy_input = torch.ones((1, 384), dtype=torch.int64) @@ -103,13 +114,21 @@ def save_to_onnx(model): (dummy_input, dummy_input, dummy_input), "build/data/bert_tf_v1_1_large_fp32_384_v2/model.onnx", verbose=True, - input_names = ["input_ids", "input_mask", "segment_ids"], - output_names = ["output_start_logits", "output_end_logits"], + input_names=["input_ids", "input_mask", "segment_ids"], + output_names=["output_start_logits", "output_end_logits"], opset_version=11, - dynamic_axes=({"input_ids": {0: "batch_size"}, "input_mask": {0: "batch_size"}, "segment_ids": {0: "batch_size"}, - "output_start_logits": {0: "batch_size"}, "output_end_logits": {0: "batch_size"}}) + dynamic_axes=( + { + "input_ids": {0: "batch_size"}, + "input_mask": {0: "batch_size"}, + "segment_ids": {0: "batch_size"}, + "output_start_logits": {0: "batch_size"}, + "output_end_logits": {0: "batch_size"}, + } + ), ) + def main(): with open("build/data/bert_tf_v1_1_large_fp32_384_v2/bert_config.json") as f: config_json = json.load(f) @@ -125,11 +144,17 @@ def main(): num_attention_heads=config_json["num_attention_heads"], num_hidden_layers=config_json["num_hidden_layers"], type_vocab_size=config_json["type_vocab_size"], - vocab_size=config_json["vocab_size"]) + vocab_size=config_json["vocab_size"], + ) - model = load_from_tf(config, 
"build/data/bert_tf_v1_1_large_fp32_384_v2/model.ckpt-5474") - torch.save(model.state_dict(), "build/data/bert_tf_v1_1_large_fp32_384_v2/model.pytorch") + model = load_from_tf( + config, "build/data/bert_tf_v1_1_large_fp32_384_v2/model.ckpt-5474" + ) + torch.save( + model.state_dict(), "build/data/bert_tf_v1_1_large_fp32_384_v2/model.pytorch" + ) save_to_onnx(model) + if __name__ == "__main__": main() diff --git a/language/bert/create_squad_data.py b/language/bert/create_squad_data.py index 6792e7587..c5e085644 100644 --- a/language/bert/create_squad_data.py +++ b/language/bert/create_squad_data.py @@ -24,390 +24,443 @@ import tokenization import six + class SquadExample(object): - """A single training/test example for simple sequence classification. - - For examples without an answer, the start and end position are -1. - """ - - def __init__(self, - qas_id, - question_text, - doc_tokens, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=False): - self.qas_id = qas_id - self.question_text = question_text - self.doc_tokens = doc_tokens - self.orig_answer_text = orig_answer_text - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = "" - s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) - s += ", question_text: %s" % ( - tokenization.printable_text(self.question_text)) - s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) - if self.start_position: - s += ", start_position: %d" % (self.start_position) - if self.start_position: - s += ", end_position: %d" % (self.end_position) - if self.start_position: - s += ", is_impossible: %r" % (self.is_impossible) - return s + """A single training/test example for simple sequence classification. + + For examples without an answer, the start and end position are -1. 
+ """ + + def __init__( + self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=False, + ): + self.qas_id = qas_id + self.question_text = question_text + self.doc_tokens = doc_tokens + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) + s += ", question_text: %s" % ( + tokenization.printable_text(self.question_text)) + s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.start_position: + s += ", end_position: %d" % (self.end_position) + if self.start_position: + s += ", is_impossible: %r" % (self.is_impossible) + return s + class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - start_position=None, - end_position=None, - is_impossible=None): - self.unique_id = unique_id - self.example_index = example_index - self.doc_span_index = doc_span_index - self.tokens = tokens - self.token_to_orig_map = token_to_orig_map - self.token_is_max_context = token_is_max_context - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - -def read_squad_examples(input_file, is_training, version_2_with_negative=False): - """Read a SQuAD json file into a list of SquadExample.""" - with open(input_file) as reader: - input_data = json.load(reader)["data"] - - def is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - examples = [] - for entry in input_data: - for paragraph in entry["paragraphs"]: - paragraph_text = paragraph["context"] - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - for c in paragraph_text: - if is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) - - for qa in paragraph["qas"]: - qas_id = qa["id"] - question_text = qa["question"] - start_position = None - end_position = None - orig_answer_text = None - is_impossible = False - if is_training: - - if version_2_with_negative: - is_impossible = qa["is_impossible"] - if (len(qa["answers"]) != 1) and (not is_impossible): - raise ValueError( - "For training, each question should have exactly 1 answer.") - if not is_impossible: - answer = qa["answers"][0] - orig_answer_text = answer["text"] - answer_offset = answer["answer_start"] - answer_length = len(orig_answer_text) - start_position = char_to_word_offset[answer_offset] - end_position = char_to_word_offset[answer_offset + answer_length - - 1] - # Only add answers where the text can be exactly recovered from the - # document. If this CAN'T happen it's likely due to weird Unicode - # stuff so we will just skip the example. - # - # Note that this means for training mode, every example is NOT - # guaranteed to be preserved. 
- actual_text = " ".join( - doc_tokens[start_position:(end_position + 1)]) - cleaned_answer_text = " ".join( - tokenization.whitespace_tokenize(orig_answer_text)) - if actual_text.find(cleaned_answer_text) == -1: - print("Could not find answer: '%s' vs. '%s'", - actual_text, cleaned_answer_text) - continue - else: - start_position = -1 - end_position = -1 - orig_answer_text = "" - - example = SquadExample( - qas_id=qas_id, - question_text=question_text, - doc_tokens=doc_tokens, - orig_answer_text=orig_answer_text, - start_position=start_position, - end_position=end_position, - is_impossible=is_impossible) - examples.append(example) - - return examples + """A single set of features of data.""" + + def __init__( + self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + start_position=None, + end_position=None, + is_impossible=None, + ): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + +def read_squad_examples(input_file, is_training, + version_2_with_negative=False): + """Read a SQuAD json file into a list of SquadExample.""" + with open(input_file) as reader: + input_data = json.load(reader)["data"] + + def is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + is_impossible = False + if is_training: + + if version_2_with_negative: + is_impossible = qa["is_impossible"] + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer." + ) + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + answer_offset = answer["answer_start"] + answer_length = len(orig_answer_text) + start_position = char_to_word_offset[answer_offset] + end_position = char_to_word_offset[ + answer_offset + answer_length - 1 + ] + # Only add answers where the text can be exactly recovered from the + # document. If this CAN'T happen it's likely due to weird Unicode + # stuff so we will just skip the example. + # + # Note that this means for training mode, every example is NOT + # guaranteed to be preserved. + actual_text = " ".join( + doc_tokens[start_position: (end_position + 1)] + ) + cleaned_answer_text = " ".join( + tokenization.whitespace_tokenize(orig_answer_text) + ) + if actual_text.find(cleaned_answer_text) == -1: + print( + "Could not find answer: '%s' vs. 
'%s'", + actual_text, + cleaned_answer_text, + ) + continue + else: + start_position = -1 + end_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position, + is_impossible=is_impossible, + ) + examples.append(example) + + return examples + def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - - # Because of the sliding window approach taken to scoring documents, a single - # token can appear in multiple documents. E.g. - # Doc: the man went to the store and bought a gallon of milk - # Span A: the man went to the - # Span B: to the store and bought - # Span C: and bought a gallon of - # ... - # - # Now the word 'bought' will have two scores from spans B and C. We only - # want to consider the score with "maximum context", which we define as - # the *minimum* of its left and right context (the *sum* of left and - # right context will always be the same, of course). - # - # In the example the maximum context for 'bought' would be span C since - # it has 1 left context and 3 right context, while span B has 4 left context - # and 0 right context. - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text): - """Returns tokenized answer spans that better match the annotated answer.""" - - # The SQuAD annotations are character based. We first project them to - # whitespace-tokenized words. But then after WordPiece tokenization, we can - # often find a "better match". For example: - # - # Question: What year was John Smith born? - # Context: The leader was John Smith (1895-1943). - # Answer: 1895 - # - # The original whitespace-tokenized answer will be "(1895-1943).". However - # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match - # the exact answer, 1895. - # - # However, this is not always possible. Consider the following: - # - # Question: What country is the top exporter of electornics? - # Context: The Japanese electronics industry is the lagest in the world. - # Answer: Japan - # - # In this case, the annotator chose "Japan" as a character sub-span of - # the word "Japanese". Since our WordPiece tokenizer does not split - # "Japanese", we just use "Japanese" as the annotation. This is fairly rare - # in SQuAD, but does happen. 
- tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) - - for new_start in range(input_start, input_end + 1): - for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) - if text_span == tok_answer_text: - return (new_start, new_end) - - return (input_start, input_end) - - -def convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training, - output_fn, verbose_logging=False): - """Loads a data file into a list of `InputBatch`s.""" - - unique_id = 1000000000 - - for (example_index, example) in enumerate(examples): - query_tokens = tokenizer.tokenize(example.question_text) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - tok_start_position = None - tok_end_position = None - if is_training and example.is_impossible: - tok_start_position = -1 - tok_end_position = -1 - if is_training and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text) - - # The -3 accounts for [CLS], [SEP] and [SEP] - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - - # We can have documents that are longer than the maximum sequence length. - # To deal with this we do a sliding window approach, where we take chunks - # of the up to our max length with a stride of `doc_stride`. - _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in query_tokens: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - is_max_context = _check_is_max_context(doc_spans, doc_span_index, - split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. 
- while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - start_position = None - end_position = None - if is_training and not example.is_impossible: - # For training, if our document chunk does not contain an annotation - # we throw it out, since there is nothing to predict. - doc_start = doc_span.start - doc_end = doc_span.start + doc_span.length - 1 - out_of_span = False - if not (tok_start_position >= doc_start and - tok_end_position <= doc_end): - out_of_span = True - if out_of_span: - start_position = 0 - end_position = 0 - else: - doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - - if is_training and example.is_impossible: - start_position = 0 - end_position = 0 - - if verbose_logging and example_index < 20: - print("*** Example ***") - print("unique_id: %s" % (unique_id)) - print("example_index: %s" % (example_index)) - print("doc_span_index: %s" % (doc_span_index)) - print("tokens: %s" % " ".join( - [tokenization.printable_text(x) for x in tokens])) - print("token_to_orig_map: %s" % " ".join( - ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)])) - print("token_is_max_context: %s" % " ".join([ - "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context) - ])) - print("input_ids: %s" % " ".join([str(x) for x in input_ids])) - print( - "input_mask: %s" % " ".join([str(x) for x in input_mask])) - print( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. + best_score = None + best_span_index = None + for span_index, doc_span in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + \ + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def _improve_answer_span( + doc_tokens, input_start, input_end, tokenizer, orig_answer_text +): + """Returns tokenized answer spans that better match the annotated answer.""" + + # The SQuAD annotations are character based. We first project them to + # whitespace-tokenized words. But then after WordPiece tokenization, we can + # often find a "better match". For example: + # + # Question: What year was John Smith born? + # Context: The leader was John Smith (1895-1943). 
+ # Answer: 1895 + # + # The original whitespace-tokenized answer will be "(1895-1943).". However + # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match + # the exact answer, 1895. + # + # However, this is not always possible. Consider the following: + # + # Question: What country is the top exporter of electornics? + # Context: The Japanese electronics industry is the lagest in the world. + # Answer: Japan + # + # In this case, the annotator chose "Japan" as a character sub-span of + # the word "Japanese". Since our WordPiece tokenizer does not split + # "Japanese", we just use "Japanese" as the annotation. This is fairly rare + # in SQuAD, but does happen. + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start: (new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def convert_examples_to_features( + examples, + tokenizer, + max_seq_length, + doc_stride, + max_query_length, + is_training, + output_fn, + verbose_logging=False, +): + """Loads a data file into a list of `InputBatch`s.""" + + unique_id = 1000000000 + + for example_index, example in enumerate(examples): + query_tokens = tokenizer.tokenize(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for i, token in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + tok_start_position = None + tok_end_position = None if is_training and example.is_impossible: - print("impossible example") + tok_start_position = -1 + tok_end_position = -1 if is_training and not example.is_impossible: - answer_text = " ".join(tokens[start_position:(end_position + 1)]) - print("start_position: %d" % (start_position)) - print("end_position: %d" % (end_position)) - print( - "answer: %s" % (tokenization.printable_text(answer_text))) - - feature = InputFeatures( - unique_id=unique_id, - example_index=example_index, - doc_span_index=doc_span_index, - tokens=tokens, - token_to_orig_map=token_to_orig_map, - token_is_max_context=token_is_max_context, - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - start_position=start_position, - end_position=end_position, - is_impossible=example.is_impossible) - - # Run callback - output_fn(feature) - - unique_id += 1 + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, + tok_start_position, + tok_end_position, + tokenizer, + example.orig_answer_text, + ) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. 
+ _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"] + ) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for doc_span_index, doc_span in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len( + tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context( + doc_spans, doc_span_index, split_token_index + ) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + start_position = None + end_position = None + if is_training and not example.is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. 
+ doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not ( + tok_start_position >= doc_start and tok_end_position <= doc_end + ): + out_of_span = True + if out_of_span: + start_position = 0 + end_position = 0 + else: + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + if is_training and example.is_impossible: + start_position = 0 + end_position = 0 + + if verbose_logging and example_index < 20: + print("*** Example ***") + print("unique_id: %s" % (unique_id)) + print("example_index: %s" % (example_index)) + print("doc_span_index: %s" % (doc_span_index)) + print( + "tokens: %s" + % " ".join([tokenization.printable_text(x) for x in tokens]) + ) + print( + "token_to_orig_map: %s" + % " ".join( + [ + "%d:%d" % (x, y) + for (x, y) in six.iteritems(token_to_orig_map) + ] + ) + ) + print( + "token_is_max_context: %s" + % " ".join( + [ + "%d:%s" % (x, y) + for (x, y) in six.iteritems(token_is_max_context) + ] + ) + ) + print("input_ids: %s" % " ".join([str(x) for x in input_ids])) + print("input_mask: %s" % + " ".join([str(x) for x in input_mask])) + print("segment_ids: %s" % + " ".join([str(x) for x in segment_ids])) + if is_training and example.is_impossible: + print("impossible example") + if is_training and not example.is_impossible: + answer_text = " ".join( + tokens[start_position: (end_position + 1)]) + print("start_position: %d" % (start_position)) + print("end_position: %d" % (end_position)) + print( + "answer: %s" % + (tokenization.printable_text(answer_text))) + + feature = InputFeatures( + unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + start_position=start_position, + end_position=end_position, + is_impossible=example.is_impossible, + ) + + # Run callback + output_fn(feature) + + unique_id += 1 diff --git a/language/bert/evaluate_v1.1.py b/language/bert/evaluate_v1.1.py index 0e89c9623..f309e47d8 100644 --- a/language/bert/evaluate_v1.1.py +++ b/language/bert/evaluate_v1.1.py @@ -10,7 +10,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # -# Source: https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py +# Source: +# https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py """ Official evaluation script for v1.1 of the SQuAD dataset. 
""" from __future__ import print_function @@ -24,15 +25,16 @@ def normalize_answer(s): """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): - return re.sub(r'\b(a|an|the)\b', ' ', text) + return re.sub(r"\b(a|an|the)\b", " ", text) def white_space_fix(text): - return ' '.join(text.split()) + return " ".join(text.split()) def remove_punc(text): exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) + return "".join(ch for ch in text if ch not in exclude) def lower(text): return text.lower() @@ -54,7 +56,7 @@ def f1_score(prediction, ground_truth): def exact_match_score(prediction, ground_truth): - return (normalize_answer(prediction) == normalize_answer(ground_truth)) + return normalize_answer(prediction) == normalize_answer(ground_truth) def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): @@ -68,46 +70,61 @@ def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): def evaluate(dataset, predictions, max_examples=None): f1 = exact_match = total = 0 for article in dataset: - if max_examples and max_examples==total: break - for paragraph in article['paragraphs']: - if max_examples and max_examples==total: break - for qa in paragraph['qas']: + if max_examples and max_examples == total: + break + for paragraph in article["paragraphs"]: + if max_examples and max_examples == total: + break + for qa in paragraph["qas"]: total += 1 - if max_examples and max_examples==total: break - - if qa['id'] not in predictions: - message = 'Unanswered question ' + qa['id'] + \ - ' will receive score 0.' + if max_examples and max_examples == total: + break + + if qa["id"] not in predictions: + message = ( + "Unanswered question " + + qa["id"] + " will receive score 0." 
+ ) print(message, file=sys.stderr) continue - ground_truths = list(map(lambda x: x['text'], qa['answers'])) - prediction = predictions[qa['id']] + ground_truths = list(map(lambda x: x["text"], qa["answers"])) + prediction = predictions[qa["id"]] exact_match += metric_max_over_ground_truths( - exact_match_score, prediction, ground_truths) + exact_match_score, prediction, ground_truths + ) f1 += metric_max_over_ground_truths( f1_score, prediction, ground_truths) exact_match = 100.0 * exact_match / total f1 = 100.0 * f1 / total - return {'exact_match': exact_match, 'f1': f1} + return {"exact_match": exact_match, "f1": f1} -if __name__ == '__main__': - expected_version = '1.1' +if __name__ == "__main__": + expected_version = "1.1" parser = argparse.ArgumentParser( - description='Evaluation for SQuAD ' + expected_version) - parser.add_argument('dataset_file', help='Dataset file') - parser.add_argument('prediction_file', help='Prediction File') - parser.add_argument('--max_examples', type=int, help='Maximum number of examples to consider (not limited by default)') + description="Evaluation for SQuAD " + expected_version + ) + parser.add_argument("dataset_file", help="Dataset file") + parser.add_argument("prediction_file", help="Prediction File") + parser.add_argument( + "--max_examples", + type=int, + help="Maximum number of examples to consider (not limited by default)", + ) args = parser.parse_args() with open(args.dataset_file) as dataset_file: dataset_json = json.load(dataset_file) - if (dataset_json['version'] != expected_version): - print('Evaluation expects v-' + expected_version + - ', but got dataset with v-' + dataset_json['version'], - file=sys.stderr) - dataset = dataset_json['data'] + if dataset_json["version"] != expected_version: + print( + "Evaluation expects v-" + + expected_version + + ", but got dataset with v-" + + dataset_json["version"], + file=sys.stderr, + ) + dataset = dataset_json["data"] with open(args.prediction_file) as prediction_file: predictions = json.load(prediction_file) print(json.dumps(evaluate(dataset, predictions, args.max_examples))) diff --git a/language/bert/network_LON.py b/language/bert/network_LON.py index fa6113e1a..92b53a411 100644 --- a/language/bert/network_LON.py +++ b/language/bert/network_LON.py @@ -5,7 +5,7 @@ # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -14,15 +14,26 @@ # ============================================================================= +import bert_QDL +import mlperf_loadgen as lg +import squad_QSL +from absl import app import sys import os + sys.path.insert(0, os.getcwd()) -from absl import app -import squad_QSL -import mlperf_loadgen as lg -import bert_QDL -def set_args(argv, g_settings, g_log_settings, g_audit_conf, g_sut_server, g_backend, g_total_count_override=None, g_perf_count_override=None): + +def set_args( + argv, + g_settings, + g_log_settings, + g_audit_conf, + g_sut_server, + g_backend, + g_total_count_override=None, + g_perf_count_override=None, +): global settings, log_settings, audit_conf, sut_server, total_count_override, perf_count_override, backend sys.argv = sys.argv[0:1] @@ -34,11 +45,17 @@ def set_args(argv, g_settings, g_log_settings, g_audit_conf, g_sut_server, g_bac perf_count_override = g_perf_count_override backend = g_backend -def main(argv): - qsl = squad_QSL.get_squad_QSL(total_count_override, perf_count_override) - qdl = bert_QDL.bert_QDL(qsl, sut_server_addr=sut_server) - lg.StartTestWithLogSettings(qdl.qdl, qsl.qsl, settings, log_settings, audit_conf) +def main(argv): + qsl = squad_QSL.get_squad_QSL(total_count_override, perf_count_override) + qdl = bert_QDL.bert_QDL(qsl, sut_server_addr=sut_server) + + lg.StartTestWithLogSettings( + qdl.qdl, + qsl.qsl, + settings, + log_settings, + audit_conf) if __name__ == "__main__": diff --git a/language/bert/onnxruntime_SUT.py b/language/bert/onnxruntime_SUT.py index a016c1f7b..598192509 100644 --- a/language/bert/onnxruntime_SUT.py +++ b/language/bert/onnxruntime_SUT.py @@ -14,21 +14,22 @@ # See the License for the specific language governing permissions and # limitations under the License. +from time import sleep +from squad_QSL import get_squad_QSL +from transformers import BertConfig, BertForQuestionAnswering +import onnxruntime +import numpy as np +import mlperf_loadgen as lg import threading import array import json import os import sys + sys.path.insert(0, os.getcwd()) -import mlperf_loadgen as lg -import numpy as np -import onnxruntime -from transformers import BertConfig, BertForQuestionAnswering -from squad_QSL import get_squad_QSL -from time import sleep -class BERT_ONNXRuntime_SUT(): +class BERT_ONNXRuntime_SUT: def __init__(self, args): self.profile = args.profile self.network = args.network @@ -44,11 +45,20 @@ def __init__(self, args): model_path = "build/data/bert_tf_v1_1_large_fp32_384_v2/bert_large_v1_1_fake_quant.onnx" else: model_path = "build/data/bert_tf_v1_1_large_fp32_384_v2/model.onnx" - if len(onnxruntime.get_all_providers()) > 1 and os.environ.get("USE_GPU", "yes").lower() not in [ "0", "false", "off", "no" ]: - preferred_execution_provider = os.environ.get("ONNXRUNTIME_PREFERRED_EXECUTION_PROVIDER", "CUDAExecutionProvider") - self.sess = onnxruntime.InferenceSession(model_path, self.options, providers=[ preferred_execution_provider ]) + if len(onnxruntime.get_all_providers()) > 1 and os.environ.get( + "USE_GPU", "yes" + ).lower() not in ["0", "false", "off", "no"]: + preferred_execution_provider = os.environ.get( + "ONNXRUNTIME_PREFERRED_EXECUTION_PROVIDER", "CUDAExecutionProvider" + ) + self.sess = onnxruntime.InferenceSession( + model_path, self.options, providers=[ + preferred_execution_provider] + ) else: - self.sess = onnxruntime.InferenceSession(model_path, self.options, providers=["CPUExecutionProvider"]) + self.sess = onnxruntime.InferenceSession( + model_path, self.options, providers=["CPUExecutionProvider"] + ) 
print("Constructing SUT...") self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) @@ -57,24 +67,28 @@ def __init__(self, args): self.qsl = get_squad_QSL(args.max_examples) def issue_queries(self, query_samples): - max_num_threads = int(os.environ.get('CM_MAX_NUM_THREADS', os.cpu_count())) + max_num_threads = int( + os.environ.get( + "CM_MAX_NUM_THREADS", + os.cpu_count())) for i in range(len(query_samples)): eval_features = self.qsl.get_features(query_samples[i].index) n = threading.active_count() while n >= max_num_threads: - #sleep(0.01) + # sleep(0.01) n = threading.active_count() - threading.Thread(target=self.process_sample, - args=[eval_features, query_samples[i].id]).start() + threading.Thread( + target=self.process_sample, args=[ + eval_features, query_samples[i].id] + ).start() def process_sample(self, eval_features, query_id=None): - - '''For Loadgen over the network''' + """For Loadgen over the network""" if self.network == "sut": - input_ids = eval_features['input_ids'] - input_mask = eval_features['input_mask'] - segment_ids = eval_features['segment_ids'] + input_ids = eval_features["input_ids"] + input_mask = eval_features["input_mask"] + segment_ids = eval_features["segment_ids"] else: input_ids = eval_features.input_ids input_mask = eval_features.input_mask @@ -84,13 +98,13 @@ def process_sample(self, eval_features, query_id=None): fd = { "input_ids": np.array(input_ids).astype(np.int64)[np.newaxis, :], "attention_mask": np.array(input_mask).astype(np.int64)[np.newaxis, :], - "token_type_ids": np.array(segment_ids).astype(np.int64)[np.newaxis, :] + "token_type_ids": np.array(segment_ids).astype(np.int64)[np.newaxis, :], } else: fd = { "input_ids": np.array(input_ids).astype(np.int64)[np.newaxis, :], "input_mask": np.array(input_mask).astype(np.int64)[np.newaxis, :], - "segment_ids": np.array(segment_ids).astype(np.int64)[np.newaxis, :] + "segment_ids": np.array(segment_ids).astype(np.int64)[np.newaxis, :], } scores = self.sess.run([o.name for o in self.sess.get_outputs()], fd) @@ -109,8 +123,12 @@ def flush_queries(self): def __del__(self): if self.profile: - print("ONNX runtime profile dumped to: '{}'".format(self.sess.end_profiling())) + print( + "ONNX runtime profile dumped to: '{}'".format( + self.sess.end_profiling()) + ) print("Finished destroying SUT.") + def get_onnxruntime_sut(args): return BERT_ONNXRuntime_SUT(args) diff --git a/language/bert/pytorch_SUT.py b/language/bert/pytorch_SUT.py index 81908ea6d..f433eb69f 100644 --- a/language/bert/pytorch_SUT.py +++ b/language/bert/pytorch_SUT.py @@ -15,21 +15,27 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from squad_QSL import get_squad_QSL +from transformers import BertConfig, BertForQuestionAnswering +import transformers +import torch +import numpy as np +import mlperf_loadgen as lg import array import json import os import sys -sys.path.insert(0, os.path.join(os.getcwd(), "DeepLearningExamples", "PyTorch", "LanguageModeling", "BERT")) + +sys.path.insert( + 0, + os.path.join( + os.getcwd(), "DeepLearningExamples", "PyTorch", "LanguageModeling", "BERT" + ), +) sys.path.insert(0, os.getcwd()) -import mlperf_loadgen as lg -import numpy as np -import torch -import transformers -from transformers import BertConfig, BertForQuestionAnswering -from squad_QSL import get_squad_QSL -class BERT_PyTorch_SUT(): +class BERT_PyTorch_SUT: def __init__(self, args): print("Loading BERT configs...") with open("bert_config.json") as f: @@ -46,17 +52,24 @@ def __init__(self, args): num_attention_heads=config_json["num_attention_heads"], num_hidden_layers=config_json["num_hidden_layers"], type_vocab_size=config_json["type_vocab_size"], - vocab_size=config_json["vocab_size"]) + vocab_size=config_json["vocab_size"], + ) self.network = args.network - self.dev = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") + self.dev = ( + torch.device("cuda:0") if torch.cuda.is_available( + ) else torch.device("cpu") + ) self.version = transformers.__version__ print("Loading PyTorch model...") self.model = BertForQuestionAnswering(config) self.model.to(self.dev) self.model.eval() - model_file = os.environ.get("ML_MODEL_FILE_WITH_PATH", "build/data/bert_tf_v1_1_large_fp32_384_v2/model.pytorch") + model_file = os.environ.get( + "ML_MODEL_FILE_WITH_PATH", + "build/data/bert_tf_v1_1_large_fp32_384_v2/model.pytorch", + ) self.model.load_state_dict(torch.load(model_file), strict=False) print("Constructing SUT...") @@ -70,42 +83,52 @@ def issue_queries(self, query_samples): eval_features = self.qsl.get_features(query_samples[i].index) self.process_sample(eval_features, query_samples[i].id) - def process_sample(self, sample_input, query_id = None): + def process_sample(self, sample_input, query_id=None): if self.network == "sut": - input_ids = sample_input['input_ids'] - input_mask = sample_input['input_mask'] - segment_ids = sample_input['segment_ids'] + input_ids = sample_input["input_ids"] + input_mask = sample_input["input_mask"] + segment_ids = sample_input["segment_ids"] else: input_ids = sample_input.input_ids input_mask = sample_input.input_mask segment_ids = sample_input.segment_ids with torch.no_grad(): - model_output = self.model.forward(input_ids=torch.LongTensor(input_ids).unsqueeze(0).to(self.dev), - attention_mask=torch.LongTensor(input_mask).unsqueeze(0).to(self.dev), - token_type_ids=torch.LongTensor(segment_ids).unsqueeze(0).to(self.dev)) - if self.version >= '4.0.0': + model_output = self.model.forward( + input_ids=torch.LongTensor( + input_ids).unsqueeze(0).to(self.dev), + attention_mask=torch.LongTensor( + input_mask).unsqueeze(0).to(self.dev), + token_type_ids=torch.LongTensor( + segment_ids).unsqueeze(0).to(self.dev), + ) + if self.version >= "4.0.0": start_scores = model_output.start_logits end_scores = model_output.end_logits else: start_scores, end_scores = model_output - output = torch.stack([start_scores, end_scores], axis=-1).squeeze(0).cpu().numpy() + output = ( + torch.stack([start_scores, end_scores], axis=-1) + .squeeze(0) + .cpu() + .numpy() + ) if self.network == "sut": return output.tolist() - + response_array = array.array("B", output.tobytes()) bi = 
response_array.buffer_info() response = lg.QuerySampleResponse(query_id, bi[0], bi[1]) lg.QuerySamplesComplete([response]) - def flush_queries(self): pass def __del__(self): print("Finished destroying SUT.") + def get_pytorch_sut(args): return BERT_PyTorch_SUT(args) diff --git a/language/bert/ray_SUT.py b/language/bert/ray_SUT.py index d03e52f50..0503b26d8 100644 --- a/language/bert/ray_SUT.py +++ b/language/bert/ray_SUT.py @@ -15,29 +15,36 @@ # See the License for the specific language governing permissions and # limitations under the License. +from ray.util.actor_pool import ActorPool +import ray +from squad_QSL import get_squad_QSL +from transformers import BertConfig, BertForQuestionAnswering +import transformers +import torch_tensorrt +import torch +import numpy as np +import mlperf_loadgen as lg import array import json import os import sys -sys.path.insert(0, os.path.join(os.getcwd(), "DeepLearningExamples", "PyTorch", "LanguageModeling", "BERT")) -sys.path.insert(0, os.getcwd()) -import mlperf_loadgen as lg -import numpy as np -import torch -import torch_tensorrt -import transformers -from transformers import BertConfig, BertForQuestionAnswering -from squad_QSL import get_squad_QSL - -import ray -from ray.util.actor_pool import ActorPool +sys.path.insert( + 0, + os.path.join( + os.getcwd(), "DeepLearningExamples", "PyTorch", "LanguageModeling", "BERT" + ), +) +sys.path.insert(0, os.getcwd()) # Adjustable Parameters -BATCH_SIZE = 16 # Note. num_samples (called "test_query_count" in CM) must be a multiple of batch_size +# Note. num_samples (called "test_query_count" in CM) must be a multiple +# of batch_size +BATCH_SIZE = 16 -@ray.remote(num_cpus=1,num_gpus=1) + +@ray.remote(num_cpus=1, num_gpus=1) class TorchPredictor: def __init__(self, config_json, model_file, batch_size): print("init", os.getpid(), torch.cuda.device_count()) @@ -55,7 +62,8 @@ def __init__(self, config_json, model_file, batch_size): num_attention_heads=config_json["num_attention_heads"], num_hidden_layers=config_json["num_hidden_layers"], type_vocab_size=config_json["type_vocab_size"], - vocab_size=config_json["vocab_size"]) + vocab_size=config_json["vocab_size"], + ) self.dev = torch.device("cuda") self.model = BertForQuestionAnswering(config) @@ -63,25 +71,46 @@ def __init__(self, config_json, model_file, batch_size): self.model.eval() self.model.load_state_dict(torch.load(model_file), strict=False) # tensor rt - batch_input_ids = torch.LongTensor(np.zeros((batch_size, 384))).to(self.dev) - traced_mlm_model = torch.jit.trace(self.model, [batch_input_ids, batch_input_ids, batch_input_ids], strict=False) - self.trt_model = torch_tensorrt.compile(traced_mlm_model, + batch_input_ids = torch.LongTensor( + np.zeros( + (batch_size, 384))).to( + self.dev) + traced_mlm_model = torch.jit.trace( + self.model, + [batch_input_ids, batch_input_ids, batch_input_ids], + strict=False, + ) + self.trt_model = torch_tensorrt.compile( + traced_mlm_model, inputs=[ - torch_tensorrt.Input(shape=[batch_size, 384], dtype=torch.int32), - torch_tensorrt.Input(shape=[batch_size, 384], dtype=torch.int32), - torch_tensorrt.Input(shape=[batch_size, 384], dtype=torch.int32), - ], - enabled_precisions= {torch.float32, torch.float16}, - workspace_size=2000000000, - truncate_long_and_double=True) + torch_tensorrt.Input( + shape=[ + batch_size, + 384], + dtype=torch.int32), + torch_tensorrt.Input( + shape=[ + batch_size, + 384], + dtype=torch.int32), + torch_tensorrt.Input( + shape=[ + batch_size, + 384], + dtype=torch.int32), + ], + 
enabled_precisions={torch.float32, torch.float16}, + workspace_size=2000000000, + truncate_long_and_double=True, + ) print("done loading") # Logic for inference on 1 batch of data. def forward(self, batch): - input_ids=torch.from_numpy(batch["input_ids"]).to(self.dev) - attention_mask=torch.from_numpy(batch["attention_mask"]).to(self.dev) - token_type_ids=torch.from_numpy(batch["token_type_ids"]).to(self.dev) + input_ids = torch.from_numpy(batch["input_ids"]).to(self.dev) + attention_mask = torch.from_numpy(batch["attention_mask"]).to(self.dev) + token_type_ids = torch.from_numpy(batch["token_type_ids"]).to(self.dev) with torch.inference_mode(): # pytorch # model_output = self.model.forward(input_ids=input_ids, @@ -89,25 +118,29 @@ def forward(self, batch): # token_type_ids=token_type_ids) # start_scores = model_output.start_logits # end_scores = model_output.end_logits - + # tensor rt - trt_output = self.trt_model(input_ids, attention_mask, token_type_ids) + trt_output = self.trt_model( + input_ids, attention_mask, token_type_ids) start_scores = trt_output["start_logits"] end_scores = trt_output["end_logits"] - - batch_ret = torch.stack([start_scores, end_scores], axis=-1).cpu().numpy() - return { - "output": batch_ret - } - + + batch_ret = torch.stack( + [start_scores, end_scores], axis=-1).cpu().numpy() + return {"output": batch_ret} + def ready(self): pass -class BERT_Ray_SUT(): + +class BERT_Ray_SUT: def __init__(self, args): with open("bert_config.json") as f: config_json = json.load(f) - model_file = os.environ.get("ML_MODEL_FILE_WITH_PATH", "build/data/bert_tf_v1_1_large_fp32_384_v2/model.pytorch") + model_file = os.environ.get( + "ML_MODEL_FILE_WITH_PATH", + "build/data/bert_tf_v1_1_large_fp32_384_v2/model.pytorch", + ) print("Constructing SUT...") self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) @@ -116,34 +149,45 @@ def __init__(self, args): try: ray.init(address="auto") - except: + except BaseException: print("WARN: Cannot connect to existing Ray cluster.") print("We are going to start a new RAY cluster, but pay attention that") print("the cluster contains only one node.") - print("If you want to use multiple nodes, please start the cluster manually via:") + print( + "If you want to use multiple nodes, please start the cluster manually via:" + ) print("\tOn the head node, run `ray start --head`") print("\tOn other nodes, run `ray start --address=:6379`") ray.init() - + self.batch_size = BATCH_SIZE resources = ray.cluster_resources() - num_gpus = int(resources.get('GPU', 0)) - + num_gpus = int(resources.get("GPU", 0)) + print(f"The cluster has {num_gpus} GPUs.") - - self.actor_list = [TorchPredictor.remote(config_json, model_file, self.batch_size) for _ in range(num_gpus)] + + self.actor_list = [ + TorchPredictor.remote(config_json, model_file, self.batch_size) + for _ in range(num_gpus) + ] self.pool = ActorPool(self.actor_list) samples = [] for i in range(self.qsl.count): sample = {} eval_features = self.qsl.get_features(i) - sample["input_ids"] = np.array(eval_features.input_ids).astype(np.int32) - sample["attention_mask"] = np.array(eval_features.input_mask).astype(np.int32) - sample["token_type_ids"] = np.array(eval_features.segment_ids).astype(np.int32) + sample["input_ids"] = np.array( + eval_features.input_ids).astype( + np.int32) + sample["attention_mask"] = np.array(eval_features.input_mask).astype( + np.int32 + ) + sample["token_type_ids"] = np.array(eval_features.segment_ids).astype( + np.int32 + ) samples.append(sample) self.samples = samples - + 
print("Waiting Actors init") for actor in self.actor_list: ray.get(actor.ready.remote()) @@ -153,26 +197,38 @@ def issue_queries(self, query_samples): if len(query_samples) % self.batch_size != 0: print("ERROR: batch size must be a multiple of the number of samples") sys.exit(1) - + batch_samples = [] i = 0 while i < len(query_samples): batch_sample = { - "input_ids": np.array([ - self.samples[query_sample.index]["input_ids"] - for query_sample in query_samples[i:i+self.batch_size]]), - "attention_mask": np.array([ - self.samples[query_sample.index]["attention_mask"] - for query_sample in query_samples[i:i+self.batch_size]]), - "token_type_ids": np.array([ - self.samples[query_sample.index]["token_type_ids"] - for query_sample in query_samples[i:i+self.batch_size]]), + "input_ids": np.array( + [ + self.samples[query_sample.index]["input_ids"] + for query_sample in query_samples[i: i + self.batch_size] + ] + ), + "attention_mask": np.array( + [ + self.samples[query_sample.index]["attention_mask"] + for query_sample in query_samples[i: i + self.batch_size] + ] + ), + "token_type_ids": np.array( + [ + self.samples[query_sample.index]["token_type_ids"] + for query_sample in query_samples[i: i + self.batch_size] + ] + ), } batch_samples.append(batch_sample) i = i + self.batch_size # print("samples len", len(batch_samples)) - batch_inference_results = list(self.pool.map_unordered(lambda a, v: a.forward.remote(v), batch_samples)) + batch_inference_results = list( + self.pool.map_unordered( + lambda a, v: a.forward.remote(v), batch_samples) + ) cur_query_index = 0 for batch_inference_result in batch_inference_results: @@ -180,7 +236,9 @@ def issue_queries(self, query_samples): for inference_result in batch_inference_result: response_array = array.array("B", inference_result.tobytes()) bi = response_array.buffer_info() - response = lg.QuerySampleResponse(query_samples[cur_query_index].id, bi[0], bi[1]) + response = lg.QuerySampleResponse( + query_samples[cur_query_index].id, bi[0], bi[1] + ) lg.QuerySamplesComplete([response]) cur_query_index += 1 @@ -190,5 +248,6 @@ def flush_queries(self): def __del__(self): print("Finished destroying SUT.") + def get_ray_sut(args): return BERT_Ray_SUT(args) diff --git a/language/bert/run.py b/language/bert/run.py index 2afbe56f6..487100a47 100644 --- a/language/bert/run.py +++ b/language/bert/run.py @@ -15,52 +15,90 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from absl import flags
+from absl import app
 import subprocess
 import mlperf_loadgen as lg
 import argparse
 import os
 import sys
+
 sys.path.insert(0, os.getcwd())
 sys.path.insert(0, os.path.join(os.getcwd(), "..", "..", "lon"))
-from absl import app
-from absl import flags
+
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--backend", choices=["tf", "pytorch", "onnxruntime", "tf_estimator", "ray"], default="tf", help="Backend")
-    parser.add_argument("--scenario", choices=["SingleStream", "Offline",
-                                               "Server", "MultiStream"], default="Offline", help="Scenario")
-    parser.add_argument("--accuracy", action="store_true",
-                        help="enable accuracy pass")
-    parser.add_argument("--quantized", action="store_true",
-                        help="use quantized model (only valid for onnxruntime backend)")
-    parser.add_argument("--profile", action="store_true",
-                        help="enable profiling (only valid for onnxruntime backend)")
+        "--backend",
+        choices=["tf", "pytorch", "onnxruntime", "tf_estimator", "ray"],
+        default="tf",
+        help="Backend",
+    )
+    parser.add_argument(
+        "--scenario",
+        choices=["SingleStream", "Offline", "Server", "MultiStream"],
+        default="Offline",
+        help="Scenario",
+    )
+    parser.add_argument(
+        "--accuracy",
+        action="store_true",
+        help="enable accuracy pass")
+    parser.add_argument(
+        "--quantized",
+        action="store_true",
+        help="use quantized model (only valid for onnxruntime backend)",
+    )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="enable profiling (only valid for onnxruntime backend)",
+    )
+    parser.add_argument(
+        "--mlperf_conf", default="build/mlperf.conf", help="mlperf rules config"
+    )
+    parser.add_argument(
+        "--user_conf",
+        default="user.conf",
+        help="user config for user LoadGen settings such as target QPS",
+    )
     parser.add_argument(
-        "--mlperf_conf", default="build/mlperf.conf", help="mlperf rules config")
-    parser.add_argument("--user_conf", default="user.conf",
-                        help="user config for user LoadGen settings such as target QPS")
-    parser.add_argument("--audit_conf", default="audit.conf",
-                        help="audit config for LoadGen settings during compliance runs")
-    parser.add_argument("--max_examples", type=int,
-                        help="Maximum number of examples to consider (not limited by default)")
-    parser.add_argument("--network", choices=["sut","lon",None], default=None, help="Loadgen network mode")
-    parser.add_argument('--node', type=str, default="")
-    parser.add_argument('--port', type=int, default=8000)
-    parser.add_argument('--sut_server', nargs="*", default= ['http://localhost:8000'],
-                        help='Address of the server(s) under test.')
+        "--audit_conf",
+        default="audit.conf",
+        help="audit config for LoadGen settings during compliance runs",
+    )
+    parser.add_argument(
+        "--max_examples",
+        type=int,
+        help="Maximum number of examples to consider (not limited by default)",
+    )
+    parser.add_argument(
+        "--network",
+        choices=["sut", "lon", None],
+        default=None,
+        help="Loadgen network mode",
+    )
+    parser.add_argument("--node", type=str, default="")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument(
+        "--sut_server",
+        nargs="*",
+        default=["http://localhost:8000"],
+        help="Address of the server(s) under test.",
+    )
     args = parser.parse_args()
     return args
 scenario_map = {
-    "SingleStream": lg.TestScenario.SingleStream,
-    "Offline": lg.TestScenario.Offline,
-    "Server": lg.TestScenario.Server,
-    "MultiStream": lg.TestScenario.MultiStream
-    }
+    "SingleStream": lg.TestScenario.SingleStream,
+    "Offline": lg.TestScenario.Offline,
+    "Server":
lg.TestScenario.Server, + "MultiStream": lg.TestScenario.MultiStream, +} + def main(): args = get_args() @@ -69,27 +107,48 @@ def main(): if not args.network or args.network == "sut": if args.backend == "pytorch": - assert not args.quantized, "Quantized model is only supported by onnxruntime backend!" - assert not args.profile, "Profiling is only supported by onnxruntime backend!" + assert ( + not args.quantized + ), "Quantized model is only supported by onnxruntime backend!" + assert ( + not args.profile + ), "Profiling is only supported by onnxruntime backend!" from pytorch_SUT import get_pytorch_sut + sut = get_pytorch_sut(args) elif args.backend == "tf": - assert not args.quantized, "Quantized model is only supported by onnxruntime backend!" - assert not args.profile, "Profiling is only supported by onnxruntime backend!" + assert ( + not args.quantized + ), "Quantized model is only supported by onnxruntime backend!" + assert ( + not args.profile + ), "Profiling is only supported by onnxruntime backend!" from tf_SUT import get_tf_sut + sut = get_tf_sut(args) elif args.backend == "tf_estimator": - assert not args.quantized, "Quantized model is only supported by onnxruntime backend!" - assert not args.profile, "Profiling is only supported by onnxruntime backend!" + assert ( + not args.quantized + ), "Quantized model is only supported by onnxruntime backend!" + assert ( + not args.profile + ), "Profiling is only supported by onnxruntime backend!" from tf_estimator_SUT import get_tf_estimator_sut + sut = get_tf_estimator_sut() elif args.backend == "onnxruntime": from onnxruntime_SUT import get_onnxruntime_sut + sut = get_onnxruntime_sut(args) elif args.backend == "ray": - assert not args.quantized, "Quantized model is only supported by onnxruntime backend!" - assert not args.profile, "Profiling is only supported by onnxruntime backend!" + assert ( + not args.quantized + ), "Quantized model is only supported by onnxruntime backend!" + assert ( + not args.profile + ), "Profiling is only supported by onnxruntime backend!" 
from ray_SUT import get_ray_sut + sut = get_ray_sut(args) else: raise ValueError("Unknown backend: {:}".format(args.backend)) @@ -117,23 +176,39 @@ def main(): if args.network == "lon": from network_LON import app, set_args, main as app_main - set_args(args, settings, log_settings, args.audit_conf, args.sut_server, args.backend, args.max_examples) + + set_args( + args, + settings, + log_settings, + args.audit_conf, + args.sut_server, + args.backend, + args.max_examples, + ) app.run(app_main) elif args.network == "sut": from network_SUT import app, node, set_backend + node = args.node set_backend(sut) app.run(debug=False, port=args.port, host="0.0.0.0") else: print("Running LoadGen test...") - lg.StartTestWithLogSettings(sut.sut, sut.qsl.qsl, settings, log_settings, args.audit_conf) + lg.StartTestWithLogSettings( + sut.sut, sut.qsl.qsl, settings, log_settings, args.audit_conf + ) if args.accuracy and not os.environ.get("SKIP_VERIFY_ACCURACY"): cmd = "python3 {:}/accuracy-squad.py {}".format( os.path.dirname(os.path.abspath(__file__)), - '--max_examples {}'.format( - args.max_examples) if args.max_examples else '') + ( + "--max_examples {}".format(args.max_examples) + if args.max_examples + else "" + ), + ) subprocess.check_call(cmd, shell=True) print("Done!") diff --git a/language/bert/squad_QSL.py b/language/bert/squad_QSL.py index c751c33c6..a97961fe0 100644 --- a/language/bert/squad_QSL.py +++ b/language/bert/squad_QSL.py @@ -15,33 +15,41 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pickle +import mlperf_loadgen as lg +from create_squad_data import read_squad_examples, convert_examples_to_features +from transformers import BertTokenizer import os import sys -sys.path.insert(0, os.getcwd()) -from transformers import BertTokenizer -from create_squad_data import read_squad_examples, convert_examples_to_features +sys.path.insert(0, os.getcwd()) -import mlperf_loadgen as lg # To support feature cache. -import pickle max_seq_length = 384 max_query_length = 64 doc_stride = 128 -class SQuAD_v1_QSL(): - def __init__(self, total_count_override=None, perf_count_override=None, cache_path='eval_features.pickle'): + +class SQuAD_v1_QSL: + def __init__( + self, + total_count_override=None, + perf_count_override=None, + cache_path="eval_features.pickle", + ): print("Constructing QSL...") eval_features = [] # Load features if cached, convert from examples otherwise. if os.path.exists(cache_path): print("Loading cached features from '%s'..." % cache_path) - with open(cache_path, 'rb') as cache_file: + with open(cache_path, "rb") as cache_file: eval_features = pickle.load(cache_file) else: - print("No cached features at '%s'... converting from examples..." % cache_path) + print( + "No cached features at '%s'... converting from examples..." 
% cache_path + ) print("Creating tokenizer...") vocab_file = os.environ.get("VOCAB_FILE") @@ -53,10 +61,14 @@ def __init__(self, total_count_override=None, perf_count_override=None, cache_pa dataset_file = os.environ.get("DATASET_FILE") if not dataset_file: dataset_file = "build/data/dev-v1.1.json" - eval_examples = read_squad_examples(input_file=dataset_file, - is_training=False, version_2_with_negative=False) + eval_examples = read_squad_examples( + input_file=dataset_file, + is_training=False, + version_2_with_negative=False, + ) print("Converting examples to features...") + def append_feature(feature): eval_features.append(feature) @@ -68,16 +80,22 @@ def append_feature(feature): max_query_length=max_query_length, is_training=False, output_fn=append_feature, - verbose_logging=False) + verbose_logging=False, + ) print("Caching features at '%s'..." % cache_path) - with open(cache_path, 'wb') as cache_file: + with open(cache_path, "wb") as cache_file: pickle.dump(eval_features, cache_file) self.eval_features = eval_features self.count = total_count_override or len(self.eval_features) self.perf_count = perf_count_override or self.count - self.qsl = lg.ConstructQSL(self.count, self.perf_count, self.load_query_samples, self.unload_query_samples) + self.qsl = lg.ConstructQSL( + self.count, + self.perf_count, + self.load_query_samples, + self.unload_query_samples, + ) print("Finished constructing QSL.") def load_query_samples(self, sample_list): @@ -92,5 +110,6 @@ def get_features(self, sample_id): def __del__(self): print("Finished destroying QSL.") + def get_squad_QSL(total_count_override=None, perf_count_override=None): return SQuAD_v1_QSL(total_count_override, perf_count_override) diff --git a/language/bert/tf_SUT.py b/language/bert/tf_SUT.py index 2956e6d97..382bae541 100644 --- a/language/bert/tf_SUT.py +++ b/language/bert/tf_SUT.py @@ -15,34 +15,49 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from squad_QSL import get_squad_QSL +from tensorflow.python.platform import gfile +import tensorflow as tf +import numpy as np +import mlperf_loadgen as lg import array import os import sys -sys.path.insert(0, os.path.join(os.getcwd(), "DeepLearningExamples", "TensorFlow", "LanguageModeling", "BERT")) + +sys.path.insert( + 0, + os.path.join( + os.getcwd(), "DeepLearningExamples", "TensorFlow", "LanguageModeling", "BERT" + ), +) sys.path.insert(0, os.getcwd()) -import mlperf_loadgen as lg -import numpy as np -import tensorflow as tf -from tensorflow.python.platform import gfile -from squad_QSL import get_squad_QSL -class BERT_TF_SUT(): +class BERT_TF_SUT: def __init__(self, args): print("Loading TF model...") infer_config = tf.compat.v1.ConfigProto() - infer_config.intra_op_parallelism_threads = int(os.environ['TF_INTRA_OP_PARALLELISM_THREADS']) \ - if 'TF_INTRA_OP_PARALLELISM_THREADS' in os.environ else os.cpu_count() - infer_config.inter_op_parallelism_threads = int(os.environ['TF_INTER_OP_PARALLELISM_THREADS']) \ - if 'TF_INTER_OP_PARALLELISM_THREADS' in os.environ else os.cpu_count() + infer_config.intra_op_parallelism_threads = ( + int(os.environ["TF_INTRA_OP_PARALLELISM_THREADS"]) + if "TF_INTRA_OP_PARALLELISM_THREADS" in os.environ + else os.cpu_count() + ) + infer_config.inter_op_parallelism_threads = ( + int(os.environ["TF_INTER_OP_PARALLELISM_THREADS"]) + if "TF_INTER_OP_PARALLELISM_THREADS" in os.environ + else os.cpu_count() + ) infer_config.use_per_session_threads = 1 self.sess = tf.compat.v1.Session(config=infer_config) - model_file = os.environ.get('ML_MODEL_FILE_WITH_PATH', 'build/data/bert_tf_v1_1_large_fp32_384_v2/model.pb') - with gfile.FastGFile(model_file, 'rb') as f: + model_file = os.environ.get( + "ML_MODEL_FILE_WITH_PATH", + "build/data/bert_tf_v1_1_large_fp32_384_v2/model.pb", + ) + with gfile.FastGFile(model_file, "rb") as f: graph_def = tf.compat.v1.GraphDef() graph_def.ParseFromString(f.read()) self.sess.graph.as_default() - tf.import_graph_def(graph_def, name='') + tf.import_graph_def(graph_def, name="") print("Constructing SUT...") self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) @@ -53,20 +68,23 @@ def __init__(self, args): def issue_queries(self, query_samples): for i in range(len(query_samples)): eval_features = self.qsl.get_features(query_samples[i].index) - input_ids = np.array([eval_features.input_ids]) - input_mask = np.array([eval_features.input_mask]) + input_ids = np.array([eval_features.input_ids]) + input_mask = np.array([eval_features.input_mask]) segment_ids = np.array([eval_features.segment_ids]) feeds = { - 'input_ids:0': input_ids, - 'input_mask:0': input_mask, - 'segment_ids:0': segment_ids + "input_ids:0": input_ids, + "input_mask:0": input_mask, + "segment_ids:0": segment_ids, } result = self.sess.run(["logits:0"], feed_dict=feeds) logits = [float(x) for x in result[0].flat] - response_array = array.array("B", np.array(logits).astype(np.float32).tobytes()) + response_array = array.array( + "B", np.array(logits).astype(np.float32).tobytes() + ) bi = response_array.buffer_info() - response = lg.QuerySampleResponse(query_samples[i].id, bi[0], bi[1]) + response = lg.QuerySampleResponse( + query_samples[i].id, bi[0], bi[1]) lg.QuerySamplesComplete([response]) def flush_queries(self): @@ -75,5 +93,6 @@ def flush_queries(self): def __del__(self): print("Finished destroying SUT.") + def get_tf_sut(args): return BERT_TF_SUT(args) diff --git a/language/bert/tf_estimator_SUT.py b/language/bert/tf_estimator_SUT.py index 
a795da52c..a5866aef0 100644 --- a/language/bert/tf_estimator_SUT.py +++ b/language/bert/tf_estimator_SUT.py @@ -14,32 +14,44 @@ # See the License for the specific language governing permissions and # limitations under the License. +from squad_QSL import get_squad_QSL +import tensorflow as tf +import numpy as np +import modeling +import mlperf_loadgen as lg import array import json import os import sys -sys.path.insert(0, os.path.join(os.getcwd(), "DeepLearningExamples", "TensorFlow", "LanguageModeling", "BERT")) + +sys.path.insert( + 0, + os.path.join( + os.getcwd(), "DeepLearningExamples", "TensorFlow", "LanguageModeling", "BERT" + ), +) sys.path.insert(0, os.getcwd()) -import mlperf_loadgen as lg -import modeling -import numpy as np -import tensorflow as tf -from squad_QSL import get_squad_QSL -# Allow TF to increase GPU memory usage dynamically to prevent cuBLAS init problems. +# Allow TF to increase GPU memory usage dynamically to prevent cuBLAS init +# problems. config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True session = tf.compat.v1.Session(config=config) -class BERT_TF_ESTIMATOR_SUT(): + +class BERT_TF_ESTIMATOR_SUT: def __init__(self, batch_size=8): print("Loading TF model...") bert_config = modeling.BertConfig.from_json_file("bert_config.json") model_fn = self.model_fn_builder( bert_config=bert_config, - init_checkpoint=os.environ.get("ML_MODEL_FILE_WITH_PATH", "build/data/bert_tf_v1_1_large_fp32_384_v2/model.ckpt-5474")) + init_checkpoint=os.environ.get( + "ML_MODEL_FILE_WITH_PATH", + "build/data/bert_tf_v1_1_large_fp32_384_v2/model.ckpt-5474", + ), + ) self.estimator = tf.estimator.Estimator(model_fn=model_fn) self.batch_size = batch_size @@ -55,7 +67,8 @@ def issue_queries(self, query_samples): input_mask = np.zeros((len(query_samples), 1, 384), dtype=np.int32) segment_ids = np.zeros((len(query_samples), 1, 384), dtype=np.int32) for sample_idx in range(len(query_samples)): - eval_features = self.qsl.get_features(query_samples[sample_idx].index) + eval_features = self.qsl.get_features( + query_samples[sample_idx].index) input_ids[sample_idx, ...] = np.array(eval_features.input_ids) input_mask[sample_idx, ...] = np.array(eval_features.input_mask) segment_ids[sample_idx, ...] 
= np.array(eval_features.segment_ids) @@ -64,15 +77,18 @@ def input_fn(): inputs = { "input_ids": input_ids, "input_mask": input_mask, - "segment_ids": segment_ids + "segment_ids": segment_ids, } return tf.data.Dataset.from_tensor_slices(inputs) for i, result in enumerate(self.estimator.predict(input_fn)): logits = [float(x) for x in result["logits"].flat] - response_array = array.array("B", np.array(logits).astype(np.float32).tobytes()) + response_array = array.array( + "B", np.array(logits).astype(np.float32).tobytes() + ) bi = response_array.buffer_info() - response = lg.QuerySampleResponse(query_samples[i].id, bi[0], bi[1]) + response = lg.QuerySampleResponse( + query_samples[i].id, bi[0], bi[1]) lg.QuerySamplesComplete([response]) def flush_queries(self): @@ -81,33 +97,51 @@ def flush_queries(self): def __del__(self): print("Finished destroying SUT.") - def create_model(self, bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): + def create_model( + self, + bert_config, + is_training, + input_ids, + input_mask, + segment_ids, + use_one_hot_embeddings, + ): """Creates a classification model.""" model = modeling.BertModel( - config=bert_config, - is_training=is_training, - input_ids=input_ids, - input_mask=input_mask, - token_type_ids=segment_ids, - use_one_hot_embeddings=use_one_hot_embeddings, - compute_type=tf.float32) + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings, + compute_type=tf.float32, + ) final_hidden = model.get_sequence_output() - final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) + final_hidden_shape = modeling.get_shape_list( + final_hidden, expected_rank=3) batch_size = final_hidden_shape[0] seq_length = final_hidden_shape[1] hidden_size = final_hidden_shape[2] output_weights = tf.get_variable( - "cls/squad/output_weights", [2, hidden_size], - initializer=tf.truncated_normal_initializer(stddev=0.02)) + "cls/squad/output_weights", + [2, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02), + ) output_bias = tf.get_variable( - "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) - - final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) - logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) + "cls/squad/output_bias", [2], initializer=tf.zeros_initializer() + ) + + final_hidden_matrix = tf.reshape( + final_hidden, [batch_size * seq_length, hidden_size] + ) + logits = tf.matmul( + final_hidden_matrix, + output_weights, + transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, 2]) @@ -121,7 +155,9 @@ def create_model(self, bert_config, is_training, input_ids, input_mask, segment_ # return (start_logits, end_logits) - def model_fn_builder(self, bert_config, init_checkpoint, use_one_hot_embeddings=False): + def model_fn_builder( + self, bert_config, init_checkpoint, use_one_hot_embeddings=False + ): """Returns `model_fn` closure for Estimator.""" def model_fn(features, labels): # pylint: disable=unused-argument @@ -135,20 +171,24 @@ def model_fn(features, labels): # pylint: disable=unused-argument input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, - use_one_hot_embeddings=use_one_hot_embeddings) + use_one_hot_embeddings=use_one_hot_embeddings, + ) tvars = tf.compat.v1.trainable_variables() initialized_variable_names = {} - 
(assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + (assignment_map, initialized_variable_names) = ( + modeling.get_assignment_map_from_checkpoint( + tvars, init_checkpoint) + ) - tf.compat.v1.train.init_from_checkpoint(init_checkpoint, assignment_map) + tf.compat.v1.train.init_from_checkpoint( + init_checkpoint, assignment_map) - predictions = { - "logits": logits - } + predictions = {"logits": logits} output_spec = tf.estimator.EstimatorSpec( - mode=tf.estimator.ModeKeys.PREDICT, predictions=predictions) + mode=tf.estimator.ModeKeys.PREDICT, predictions=predictions + ) return output_spec diff --git a/language/bert/tf_freeze_bert.py b/language/bert/tf_freeze_bert.py index 8a2e23bf5..f00d72dbe 100644 --- a/language/bert/tf_freeze_bert.py +++ b/language/bert/tf_freeze_bert.py @@ -14,63 +14,82 @@ # See the License for the specific language governing permissions and # limitations under the License. +from tensorflow.tools.graph_transforms import TransformGraph +from tensorflow.python.tools import optimize_for_inference_lib +from tensorflow.python.framework import graph_util +from tensorflow.python.framework import dtypes +import tensorflow as tf +import numpy as np +import modeling import array import json import os import sys -sys.path.insert(0, os.path.join(os.getcwd(), "DeepLearningExamples", "TensorFlow", "LanguageModeling", "BERT")) -sys.path.insert(0, os.getcwd()) -import modeling -import numpy as np -import tensorflow as tf +sys.path.insert( + 0, + os.path.join( + os.getcwd(), "DeepLearningExamples", "TensorFlow", "LanguageModeling", "BERT" + ), +) +sys.path.insert(0, os.getcwd()) -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import graph_util -from tensorflow.python.tools import optimize_for_inference_lib -from tensorflow.tools.graph_transforms import TransformGraph def save_model(fname, sess, graph=None): - def save(fname, graph_def): - pass - with tf.Graph().as_default() as g: - tf.import_graph_def(graph_def, name='') - graph_def = g.as_graph_def(add_shapes=True) - tf.train.write_graph(graph_def, ".", fname, as_text=False) - - if graph == None: - graph_def = sess.graph_def - else: - graph_def = graph.as_graph_def(add_shapes=True) - - input_nodes = ['IteratorGetNext:0', 'IteratorGetNext:1', 'IteratorGetNext:2'] - output_nodes = ['logits'] - - graph_def = graph_util.convert_variables_to_constants( - sess=sess, - input_graph_def=graph_def, - output_node_names=output_nodes) - graph_def = graph_util.remove_training_nodes(graph_def, protected_nodes=output_nodes) - graph_def = optimize_for_inference_lib.optimize_for_inference(graph_def, [], output_nodes, dtypes.float32.as_datatype_enum) - - transforms = [ - 'remove_nodes(op=Identity, op=StopGradient)', - 'fold_batch_norms', - 'fold_old_batch_norms', - ] - graph_def = TransformGraph(graph_def, input_nodes, output_nodes, transforms) - save("build/data/bert_tf_v1_1_large_fp32_384_v2/model.pb", graph_def) - -def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings): + def save(fname, graph_def): + pass + with tf.Graph().as_default() as g: + tf.import_graph_def(graph_def, name="") + graph_def = g.as_graph_def(add_shapes=True) + tf.train.write_graph(graph_def, ".", fname, as_text=False) + + if graph is None: + graph_def = sess.graph_def + else: + graph_def = graph.as_graph_def(add_shapes=True) + + input_nodes = [ + "IteratorGetNext:0", + "IteratorGetNext:1", + "IteratorGetNext:2"] + output_nodes = 
["logits"] + + graph_def = graph_util.convert_variables_to_constants( + sess=sess, input_graph_def=graph_def, output_node_names=output_nodes + ) + graph_def = graph_util.remove_training_nodes( + graph_def, protected_nodes=output_nodes + ) + graph_def = optimize_for_inference_lib.optimize_for_inference( + graph_def, [], output_nodes, dtypes.float32.as_datatype_enum + ) + + transforms = [ + "remove_nodes(op=Identity, op=StopGradient)", + "fold_batch_norms", + "fold_old_batch_norms", + ] + graph_def = TransformGraph( + graph_def, + input_nodes, + output_nodes, + transforms) + save("build/data/bert_tf_v1_1_large_fp32_384_v2/model.pb", graph_def) + + +def create_model( + bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings +): """Creates a classification model.""" model = modeling.BertModel( - config=bert_config, - is_training=is_training, - input_ids=input_ids, - input_mask=input_mask, - token_type_ids=segment_ids, - use_one_hot_embeddings=use_one_hot_embeddings, - compute_type=tf.float32) + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings, + compute_type=tf.float32, + ) final_hidden = model.get_sequence_output() @@ -80,26 +99,32 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, u hidden_size = final_hidden_shape[2] output_weights = tf.get_variable( - "cls/squad/output_weights", [2, hidden_size], - initializer=tf.truncated_normal_initializer(stddev=0.02)) + "cls/squad/output_weights", + [2, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02), + ) output_bias = tf.get_variable( - "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) + "cls/squad/output_bias", [2], initializer=tf.zeros_initializer() + ) - final_hidden_matrix = tf.reshape(final_hidden, [batch_size * seq_length, hidden_size]) + final_hidden_matrix = tf.reshape( + final_hidden, [batch_size * seq_length, hidden_size] + ) logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [batch_size, seq_length, 2], name="logits") return logits + def main(): bert_config = modeling.BertConfig.from_json_file("bert_config.json") - init_checkpoint="build/data/bert_tf_v1_1_large_fp32_384_v2/model.ckpt-5474" + init_checkpoint = "build/data/bert_tf_v1_1_large_fp32_384_v2/model.ckpt-5474" - input_ids = tf.placeholder(tf.int32, shape=(1,384), name="input_ids") - input_mask = tf.placeholder(tf.int32, shape=(1,384), name="input_mask") - segment_ids = tf.placeholder(tf.int32, shape=(1,384), name="segment_ids") + input_ids = tf.placeholder(tf.int32, shape=(1, 384), name="input_ids") + input_mask = tf.placeholder(tf.int32, shape=(1, 384), name="input_mask") + segment_ids = tf.placeholder(tf.int32, shape=(1, 384), name="segment_ids") logits = create_model( bert_config=bert_config, @@ -107,24 +132,27 @@ def main(): input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, - use_one_hot_embeddings=False) + use_one_hot_embeddings=False, + ) tvars = tf.compat.v1.trainable_variables() initialized_variable_names = {} - (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + (assignment_map, initialized_variable_names) = ( + modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + ) tf.compat.v1.train.init_from_checkpoint(init_checkpoint, assignment_map) - predictions = { - 
"logits": logits - } + predictions = {"logits": logits} output_spec = tf.estimator.EstimatorSpec( - mode=tf.estimator.ModeKeys.PREDICT, predictions=predictions) + mode=tf.estimator.ModeKeys.PREDICT, predictions=predictions + ) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) save_model("bert_large_nv.pb", sess) + if __name__ == "__main__": main() diff --git a/language/gpt-j/backend.py b/language/gpt-j/backend.py index 7a01ee8a5..cc748e146 100644 --- a/language/gpt-j/backend.py +++ b/language/gpt-j/backend.py @@ -14,12 +14,15 @@ "early_stopping": True, "max_new_tokens": 128, "min_new_tokens": 30, - "num_beams": int(os.environ.get("GPTJ_BEAM_SIZE", "4")), # only beam_size 4 is allowed for official submission + "num_beams": int( + os.environ.get("GPTJ_BEAM_SIZE", "4") + ), # only beam_size 4 is allowed for official submission } -class SUT_base(): - def __init__(self, model_path, dtype, dataset_path, max_examples, use_gpu=False): +class SUT_base: + def __init__(self, model_path, dtype, dataset_path, + max_examples, use_gpu=False): # TODO : Pass model file name to init instead of args print("Loading PyTorch model...") self.model_name = "EleutherAI/gpt-j-6B" @@ -27,11 +30,11 @@ def __init__(self, model_path, dtype, dataset_path, max_examples, use_gpu=False) self.model_path = model_path self.use_gpu = use_gpu # dtype - if dtype == 'bfloat16': + if dtype == "bfloat16": self.amp_enabled = True self.amp_dtype = torch.bfloat16 print("BF16 autocast") - elif dtype == 'float16': + elif dtype == "float16": self.amp_enabled = True self.amp_dtype = torch.float16 else: @@ -42,7 +45,7 @@ def __init__(self, model_path, dtype, dataset_path, max_examples, use_gpu=False) self.model_path, device_map="auto" if not self.use_gpu else None, low_cpu_mem_usage=True if not self.use_gpu else False, - torch_dtype=self.amp_dtype + torch_dtype=self.amp_dtype, ) # Cast the model to GPU if the flag is set. 
@@ -59,13 +62,19 @@ def __init__(self, model_path, dtype, dataset_path, max_examples, use_gpu=False) self.model_name, model_max_length=1919, padding_side="left", - use_fast=False,) + use_fast=False, + ) self.tokenizer.pad_token = self.tokenizer.eos_token self.data_object = Dataset( - self.dataset_path, total_count_override=max_examples) - self.qsl = lg.ConstructQSL(self.data_object.count, self.data_object.perf_count, - self.data_object.LoadSamplesToRam, self.data_object.UnloadSamplesFromRam) + self.dataset_path, + total_count_override=max_examples) + self.qsl = lg.ConstructQSL( + self.data_object.count, + self.data_object.perf_count, + self.data_object.LoadSamplesToRam, + self.data_object.UnloadSamplesFromRam, + ) self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) @@ -86,28 +95,39 @@ def issue_queries(self, query_samples): input_ids_tensor = input_ids_tensor.to(self.device) input_masks_tensor = input_masks_tensor.to(self.device) - pred_output_batch = self.inference_call( - input_ids_tensor, input_masks_tensor).cpu().numpy() + pred_output_batch = ( + self.inference_call( + input_ids_tensor, + input_masks_tensor).cpu().numpy() + ) response_array = array.array("B", pred_output_batch[0].tobytes()) bi = response_array.buffer_info() - response = [lg.QuerySampleResponse( - query_samples[i].id, bi[0], bi[1])] + response = [ + lg.QuerySampleResponse( + query_samples[i].id, + bi[0], + bi[1])] lg.QuerySamplesComplete(response) if i % 5 == 0: print("Completed : ", i) def inference_call(self, input_ids_tensor, input_masks_tensor): - ''' Common for all scenarios ''' - torch_device_type = 'cuda' if self.use_gpu else 'cpu' - - with torch.inference_mode(), torch.autocast(device_type=torch_device_type, enabled=self.amp_enabled, dtype=self.amp_dtype if self.amp_enabled else None): + """Common for all scenarios""" + torch_device_type = "cuda" if self.use_gpu else "cpu" + + with torch.inference_mode(), torch.autocast( + device_type=torch_device_type, + enabled=self.amp_enabled, + dtype=self.amp_dtype if self.amp_enabled else None, + ): input_batch = dict() - input_batch['input_ids'] = input_ids_tensor - input_batch['attention_mask'] = input_masks_tensor + input_batch["input_ids"] = input_ids_tensor + input_batch["attention_mask"] = input_masks_tensor output_batch = self.model.generate( - **input_batch, **gen_kwargs, pad_token_id=self.tokenizer.eos_token_id) + **input_batch, **gen_kwargs, pad_token_id=self.tokenizer.eos_token_id + ) input_batch_lengths = [x.shape[0] for x in input_batch["input_ids"]] @@ -131,14 +151,27 @@ def __del__(self): class SUT_Offline(SUT_base): def __init__(self, model_path, dtype, dataset_path, max_examples, use_gpu): - SUT_base.__init__(self, model_path, dtype, dataset_path, max_examples, use_gpu) - '''IssueQuery and inference methods implemented in Base class''' + SUT_base.__init__( + self, + model_path, + dtype, + dataset_path, + max_examples, + use_gpu) + + """IssueQuery and inference methods implemented in Base class""" class SUT_Server(SUT_base): def __init__(self, model_path, dtype, dataset_path, max_examples, use_gpu): - SUT_base.__init__(self, model_path, dtype, dataset_path, max_examples, use_gpu) + SUT_base.__init__( + self, + model_path, + dtype, + dataset_path, + max_examples, + use_gpu) self.total_samples_done = 0 self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) print("SUT Server") @@ -153,8 +186,11 @@ def issue_queries(self, query_samples): input_ids_tensor = input_ids_tensor.to(self.device) input_masks_tensor = 
input_masks_tensor.to(self.device) - pred_output_batch = self.inference_call( - input_ids_tensor, input_masks_tensor).cpu().numpy() + pred_output_batch = ( + self.inference_call( + input_ids_tensor, + input_masks_tensor).cpu().numpy() + ) response_array = array.array("B", pred_output_batch.tobytes()) bi = response_array.buffer_info() @@ -167,7 +203,13 @@ def issue_queries(self, query_samples): class SUT_SingleStream(SUT_base): def __init__(self, model_path, dtype, dataset_path, max_examples, use_gpu): - SUT_base.__init__(self, model_path, dtype, dataset_path, max_examples, use_gpu) + SUT_base.__init__( + self, + model_path, + dtype, + dataset_path, + max_examples, + use_gpu) self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) self.total_samples_done = 0 @@ -181,8 +223,11 @@ def issue_queries(self, query_samples): input_ids_tensor = input_ids_tensor.to(self.device) input_masks_tensor = input_masks_tensor.to(self.device) - pred_output_batch = self.inference_call( - input_ids_tensor, input_masks_tensor).cpu().numpy() + pred_output_batch = ( + self.inference_call( + input_ids_tensor, + input_masks_tensor).cpu().numpy() + ) response_array = array.array("B", pred_output_batch.tobytes()) bi = response_array.buffer_info() @@ -193,10 +238,14 @@ def issue_queries(self, query_samples): print("Completed : ", self.total_samples_done) -def get_SUT(model_path, scenario, dtype, dataset_path, max_examples, use_gpu=False): +def get_SUT(model_path, scenario, dtype, dataset_path, + max_examples, use_gpu=False): if scenario == "Offline": - return SUT_Offline(model_path, dtype, dataset_path, max_examples, use_gpu) + return SUT_Offline(model_path, dtype, dataset_path, + max_examples, use_gpu) elif scenario == "Server": - return SUT_Server(model_path, dtype, dataset_path, max_examples, use_gpu) + return SUT_Server(model_path, dtype, dataset_path, + max_examples, use_gpu) elif scenario == "SingleStream": - return SUT_SingleStream(model_path, dtype, dataset_path, max_examples, use_gpu) + return SUT_SingleStream( + model_path, dtype, dataset_path, max_examples, use_gpu) diff --git a/language/gpt-j/dataset.py b/language/gpt-j/dataset.py index 37d9cf354..36cff5989 100644 --- a/language/gpt-j/dataset.py +++ b/language/gpt-j/dataset.py @@ -25,8 +25,16 @@ } -class Dataset(): - def __init__(self, dataset_path, batch_size=1, pad_val=1, pad_max=196, total_count_override=None, perf_count_override=None): +class Dataset: + def __init__( + self, + dataset_path, + batch_size=1, + pad_val=1, + pad_max=196, + total_count_override=None, + perf_count_override=None, + ): print("Constructing QSL") self.dataset = "cnn_dailymail" @@ -40,18 +48,25 @@ def __init__(self, dataset_path, batch_size=1, pad_val=1, pad_max=196, total_cou self.model_name, model_max_length=2048, padding_side="left", - use_fast=False,) + use_fast=False, + ) self.tokenizer.pad_token = self.tokenizer.eos_token self.list_data_dict = utils.jload(self.dataset_path) - prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"] - self.sources = [prompt_input.format_map( - example) for example in self.list_data_dict] + prompt_input, prompt_no_input = ( + PROMPT_DICT["prompt_input"], + PROMPT_DICT["prompt_no_input"], + ) + self.sources = [ + prompt_input.format_map(example) for example in self.list_data_dict + ] self.targets = [ f"{example['output']}" for example in self.list_data_dict] - self.source_encoded_input_ids, self.source_encoded_attn_masks = self.encode_samples() + self.source_encoded_input_ids, 
self.source_encoded_attn_masks = ( + self.encode_samples() + ) self.count = total_count_override or len(self.sources) self.perf_count = perf_count_override or self.count @@ -65,9 +80,13 @@ def encode_samples(self): source_encoded_attn_masks = [] for i in range(total_samples): - source_encoded = self.tokenizer(self.sources[i], return_tensors="pt", - padding=True, truncation=True, - max_length=1919) + source_encoded = self.tokenizer( + self.sources[i], + return_tensors="pt", + padding=True, + truncation=True, + max_length=1919, + ) source_encoded_input_ids.append(source_encoded.input_ids) source_encoded_attn_masks.append(source_encoded.attention_mask) diff --git a/language/gpt-j/download_cnndm.py b/language/gpt-j/download_cnndm.py index fa47f1af8..240db3cdb 100644 --- a/language/gpt-j/download_cnndm.py +++ b/language/gpt-j/download_cnndm.py @@ -1,24 +1,24 @@ # experiment config +import sys +import simplejson as json +import os +import numpy as np +from transformers import AutoTokenizer +from datasets import load_dataset, concatenate_datasets model_id = "EleutherAI/gpt-j-6b" dataset_id = "cnn_dailymail" dataset_config = "3.0.0" text_column = "article" summary_column = "highlights" -from datasets import load_dataset, concatenate_datasets -from transformers import AutoTokenizer -import numpy as np -import os -import simplejson as json -import sys -save_dataset_path = os.environ.get('DATASET_CNNDM_PATH', "data") +save_dataset_path = os.environ.get("DATASET_CNNDM_PATH", "data") # Check whether the specified path exists or not isExist = os.path.exists(save_dataset_path) if not isExist: - # Create a new directory because it does not exist - os.makedirs(save_dataset_path) + # Create a new directory because it does not exist + os.makedirs(save_dataset_path) # Load dataset from the hub dataset = load_dataset(dataset_id, name=dataset_config) @@ -29,7 +29,6 @@ tokenizer.model_max_length = 2048 - instruction_template = "Summarize the following news article:" prompt_length = len(tokenizer(instruction_template)["input_ids"]) @@ -51,14 +50,18 @@ def preprocess_function(sample, padding="max_length"): return model_inputs + # process dataset -tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=list(dataset["train"].features)) +tokenized_dataset = dataset.map( + preprocess_function, batched=True, remove_columns=list(dataset["train"].features) +) # save dataset to disk -with open(os.path.join(save_dataset_path,"cnn_eval.json"), 'w') as write_f: - json.dump(tokenized_dataset["validation"]["text"], write_f, indent=4, ensure_ascii=False) - +with open(os.path.join(save_dataset_path, "cnn_eval.json"), "w") as write_f: + json.dump( + tokenized_dataset["validation"]["text"], write_f, indent=4, ensure_ascii=False + ) -print("Dataset saved in ",save_dataset_path) +print("Dataset saved in ", save_dataset_path) diff --git a/language/gpt-j/download_gptj.py b/language/gpt-j/download_gptj.py index 967862b41..ac9a5df13 100644 --- a/language/gpt-j/download_gptj.py +++ b/language/gpt-j/download_gptj.py @@ -8,22 +8,24 @@ # print("Please provide a valid path for downloaded model") # print("usage : python download_gptj.py ") # exit() -# else: +# else: # model_path = sys.argv[1] # if not os.path.exists(os.path.dirname(model_path)): # print("Error : Please provide a valid path") # exit() -model_path = os.path.join(os.getcwd(),"model") +model_path = os.path.join(os.getcwd(), "model") if not os.path.exists(os.path.dirname(model_path)): os.mkdir(model_path) model_name = "EleutherAI/gpt-j-6B" -model = 
AutoModelForCausalLM.from_pretrained(model_name,device_map="auto",torchscript=True) # torchscript will force `return_dict=False` to avoid jit errors +model = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", torchscript=True +) # torchscript will force `return_dict=False` to avoid jit errors print("Loaded model") model.save_pretrained(model_path) -print("Model downloaded and Saved in : ",model_path) \ No newline at end of file +print("Model downloaded and Saved in : ", model_path) diff --git a/language/gpt-j/evaluation.py b/language/gpt-j/evaluation.py index 1b37b3a3a..3203359d0 100644 --- a/language/gpt-j/evaluation.py +++ b/language/gpt-j/evaluation.py @@ -17,14 +17,23 @@ def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() - parser.add_argument("--mlperf-accuracy-file", required=True, - help="path to mlperf_log_accuracy.json") - parser.add_argument("--dataset-file", required=True, - help="path to cnn_eval.json") - parser.add_argument("--verbose", action="store_true", - help="verbose messages") - parser.add_argument("--dtype", default="int64", - help="dtype of the accuracy log", choices=["int32", "int64"]) + parser.add_argument( + "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" + ) + parser.add_argument( + "--dataset-file", + required=True, + help="path to cnn_eval.json") + parser.add_argument( + "--verbose", + action="store_true", + help="verbose messages") + parser.add_argument( + "--dtype", + default="int64", + help="dtype of the accuracy log", + choices=["int32", "int64"], + ) args = parser.parse_args() return args @@ -46,13 +55,14 @@ def main(): model_name = "EleutherAI/gpt-j-6B" dataset_path = args.dataset_file metric = evaluate.load("rouge") - nltk.download('punkt') + nltk.download("punkt") tokenizer = AutoTokenizer.from_pretrained( model_name, model_max_length=2048, padding_side="left", - use_fast=False,) + use_fast=False, + ) tokenizer.pad_token = tokenizer.eos_token data_object = Dataset(dataset_path) @@ -66,11 +76,11 @@ def main(): dedup_results = [] seen = set() for result in results: - item = result['qsl_idx'] + item = result["qsl_idx"] if item not in seen: seen.add(item) dedup_results.append(result) - results = dedup_results + results = dedup_results target_required = [] preds_token_ids = [] @@ -80,19 +90,24 @@ def main(): eval_dtype = np.int32 for pred in results: - qsl_idx = pred['qsl_idx'] + qsl_idx = pred["qsl_idx"] target = targets[qsl_idx] target_required.append(target) - preds_token_ids.append(np.frombuffer( - bytes.fromhex(pred['data']), eval_dtype)) + preds_token_ids.append( + np.frombuffer( + bytes.fromhex( + pred["data"]), + eval_dtype)) preds_decoded_text = tokenizer.batch_decode( - preds_token_ids, skip_special_tokens=True) + preds_token_ids, skip_special_tokens=True + ) preds, targets = postprocess_text(preds_decoded_text, target_required) result = metric.compute( - predictions=preds, references=targets, use_stemmer=True, use_aggregator=False) + predictions=preds, references=targets, use_stemmer=True, use_aggregator=False + ) result = {k: round(np.mean(v) * 100, 4) for k, v in result.items()} prediction_lens = [len(pred) for pred in preds] result["gen_len"] = np.sum(prediction_lens) diff --git a/language/gpt-j/main.py b/language/gpt-j/main.py index 367af5212..ec8e631f6 100644 --- a/language/gpt-j/main.py +++ b/language/gpt-j/main.py @@ -5,35 +5,67 @@ import sys from backend import get_SUT + sys.path.insert(0, os.getcwd()) def get_args(): parser = argparse.ArgumentParser() 
parser.add_argument( - "--backend", choices=["pytorch"], default="pytorch", help="Backend") - parser.add_argument("--scenario", choices=["SingleStream", "Offline", - "Server"], default="Offline", help="Scenario") + "--backend", choices=["pytorch"], default="pytorch", help="Backend" + ) + parser.add_argument( + "--scenario", + choices=["SingleStream", "Offline", "Server"], + default="Offline", + help="Scenario", + ) parser.add_argument("--model-path", default="EleutherAI/gpt-j-6B", help="") parser.add_argument( - "--dataset-path", default="./data/cnn_eval.json", help="") - parser.add_argument("--accuracy", action="store_true", - help="enable accuracy pass") - parser.add_argument("--dtype", default="float32", help="data type of the model, choose from float16, bfloat16 and float32") - parser.add_argument("--quantized", action="store_true", - help="use quantized model (only valid for onnxruntime backend)") - parser.add_argument("--profile", action="store_true", - help="enable profiling (only valid for onnxruntime backend)") - parser.add_argument("--gpu", action="store_true", - help="use GPU instead of CPU for the inference") - parser.add_argument("--audit_conf", default="audit.conf", - help="audit config for LoadGen settings during compliance runs") + "--dataset-path", + default="./data/cnn_eval.json", + help="") + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") + parser.add_argument( + "--dtype", + default="float32", + help="data type of the model, choose from float16, bfloat16 and float32", + ) + parser.add_argument( + "--quantized", + action="store_true", + help="use quantized model (only valid for onnxruntime backend)", + ) + parser.add_argument( + "--profile", + action="store_true", + help="enable profiling (only valid for onnxruntime backend)", + ) parser.add_argument( - "--mlperf_conf", default="mlperf.conf", help="mlperf rules config") - parser.add_argument("--user_conf", default="user.conf", - help="user config for user LoadGen settings such as target QPS") - parser.add_argument("--max_examples", type=int, default=13368, - help="Maximum number of examples to consider (not limited by default)") + "--gpu", action="store_true", help="use GPU instead of CPU for the inference" + ) + parser.add_argument( + "--audit_conf", + default="audit.conf", + help="audit config for LoadGen settings during compliance runs", + ) + parser.add_argument( + "--mlperf_conf", default="mlperf.conf", help="mlperf rules config" + ) + parser.add_argument( + "--user_conf", + default="user.conf", + help="user config for user LoadGen settings such as target QPS", + ) + parser.add_argument( + "--max_examples", + type=int, + default=13368, + help="Maximum number of examples to consider (not limited by default)", + ) args = parser.parse_args() return args @@ -42,7 +74,7 @@ def get_args(): "SingleStream": lg.TestScenario.SingleStream, "Offline": lg.TestScenario.Offline, "Server": lg.TestScenario.Server, - "MultiStream": lg.TestScenario.MultiStream + "MultiStream": lg.TestScenario.MultiStream, } @@ -80,7 +112,9 @@ def main(): log_settings.log_output = log_output_settings log_settings.enable_trace = True - lg.StartTestWithLogSettings(sut.sut, sut.qsl, settings, log_settings, args.audit_conf) + lg.StartTestWithLogSettings( + sut.sut, sut.qsl, settings, log_settings, args.audit_conf + ) print("Test Done!") print("Destroying SUT...") diff --git a/language/gpt-j/prepare-calibration.py b/language/gpt-j/prepare-calibration.py index 846e492d3..91bcb0826 100644 --- 
a/language/gpt-j/prepare-calibration.py +++ b/language/gpt-j/prepare-calibration.py @@ -4,32 +4,39 @@ from argparse import ArgumentParser from datasets import load_dataset + def get_args(): parser = ArgumentParser() - parser.add_argument("--calibration-list-file", required=True, help="Path to calibration list") - parser.add_argument("--output-dir", help="Output directory", default="calibration-data") + parser.add_argument( + "--calibration-list-file", required=True, help="Path to calibration list" + ) + parser.add_argument( + "--output-dir", help="Output directory", default="calibration-data" + ) return parser.parse_args() -dataset_id='cnn_dailymail' -version='3.0.0' -split='train' -instruction_template="Summarize the following news article:" +dataset_id = "cnn_dailymail" +version = "3.0.0" +split = "train" + +instruction_template = "Summarize the following news article:" + def check_path(path): return os.path.exists(path) + def prepare_calibration_data(calibration_list_file, output_dir): if not check_path(calibration_list_file): print("Calibration list file not found: {}".format(calibration_list_file)) sys.exit(1) - dataset = load_dataset("cnn_dailymail", name="3.0.0", split='train') - train = dict((x['id'], x) for x in dataset) + dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") + train = dict((x["id"], x) for x in dataset) - - with open(calibration_list_file, 'r') as fid: + with open(calibration_list_file, "r") as fid: calibration_ids = fid.read().splitlines() inputs = [] @@ -40,20 +47,22 @@ def prepare_calibration_data(calibration_list_file, output_dir): x["input"] = calibration_sample["article"] x["output"] = calibration_sample["highlights"] inputs.append(x) - + if not os.path.isdir(output_dir): os.makedirs(output_dir) - output_path = os.path.join(output_dir,"cnn_dailymail_calibration.json") - with open(output_path, 'w') as write_f: + output_path = os.path.join(output_dir, "cnn_dailymail_calibration.json") + with open(output_path, "w") as write_f: json.dump(inputs, write_f, indent=4, ensure_ascii=False) print("Calibration data saved at {}".format(output_path)) + def main(): args = get_args() prepare_calibration_data(args.calibration_list_file, args.output_dir) -if __name__=="__main__": + +if __name__ == "__main__": main() diff --git a/language/gpt-j/utils.py b/language/gpt-j/utils.py index 6e4e7e036..ef6c25b6b 100644 --- a/language/gpt-j/utils.py +++ b/language/gpt-j/utils.py @@ -2,11 +2,13 @@ import os import io + def _make_r_io_base(f, mode: str): if not isinstance(f, io.IOBase): f = open(f, mode=mode) return f + def jload(f, mode="r"): """Load a .json file into a dictionary.""" f = _make_r_io_base(f, mode) diff --git a/language/llama2-70b/SUT.py b/language/llama2-70b/SUT.py index 4ff78c38c..c148093b8 100644 --- a/language/llama2-70b/SUT.py +++ b/language/llama2-70b/SUT.py @@ -29,18 +29,19 @@ "max_new_tokens": 1024, "min_new_tokens": 1, "num_beams": 1, - "do_sample": False + "do_sample": False, } - class FirstTokenStreamer(BaseStreamer): - """ Streams first tokens to a 'holder' """ + """Streams first tokens to a 'holder'""" - def __init__(self, first_token, tokens_cache=[], is_first_token=True, response_ids=[] ): - """ Response ids added to 'sign' the first token""" + def __init__( + self, first_token, tokens_cache=[], is_first_token=True, response_ids=[] + ): + """Response ids added to 'sign' the first token""" - self.first_token = first_token # Queue for first token + self.first_token = first_token # Queue for first token self.is_first_token = is_first_token # 
Cache for subsequent generated tokens @@ -48,12 +49,15 @@ def __init__(self, first_token, tokens_cache=[], is_first_token=True, response_i self.response_ids = response_ids - self.is_prompt = True # The first tokens sent to the streamer are actually the input prompts + self.is_prompt = ( + True # The first tokens sent to the streamer are actually the input prompts + ) def put(self, value): - """ Caches the tokens as they're generated. Assumes bs=1 """ + """Caches the tokens as they're generated. Assumes bs=1""" - # Prompts are streamed first so we need to skip the first time value that arrives + # Prompts are streamed first so we need to skip the first time value + # that arrives if self.is_prompt: self.is_prompt = False return @@ -61,7 +65,8 @@ def put(self, value): value = value.item() if self.is_first_token: - # Add generated first token together with its query response_id to first tokens queue + # Add generated first token together with its query response_id to + # first tokens queue self.first_token.put((value, self.response_ids[0])) self.is_first_token = False @@ -69,7 +74,6 @@ def put(self, value): self.tokens_cache.append(value) - def end(self): pass @@ -77,16 +81,20 @@ def get_out_tokens(self): return self.tokens_cache -class SUT(): - def __init__(self, - model_path=None, - dtype="bfloat16", - device="cpu", - batch_size=None, - total_sample_count=24576, - dataset_path=None, - use_cached_outputs=False, # Set this to True *only for test accuracy runs* in case your prior session was killed partway through - workers=1): +class SUT: + def __init__( + self, + model_path=None, + dtype="bfloat16", + device="cpu", + batch_size=None, + total_sample_count=24576, + dataset_path=None, + use_cached_outputs=False, + # Set this to True *only for test accuracy runs* in case your prior + # session was killed partway through + workers=1, + ): self.model_path = model_path or "meta-llama/Llama-2-70b-chat-hf" self.device = device @@ -99,26 +107,32 @@ def __init__(self, self.batch_size = batch_size # dtype - if dtype == 'bfloat16': + if dtype == "bfloat16": self.amp_enabled = True self.amp_dtype = torch.bfloat16 - elif dtype == 'float16': + elif dtype == "float16": self.amp_enabled = True self.amp_dtype = torch.float16 else: self.amp_enabled = False self.amp_dtype = torch.float32 - if 'cuda' in self.device: + if "cuda" in self.device: assert torch.cuda.is_available(), "torch gpu is not available, exiting..." self.dataset_path = dataset_path - self.data_object = Dataset(self.model_path, - dataset_path=self.dataset_path, - total_sample_count=total_sample_count, - device=self.device) - self.qsl = lg.ConstructQSL(self.data_object.total_sample_count, self.data_object.perf_count, - self.data_object.LoadSamplesToRam, self.data_object.UnloadSamplesFromRam) + self.data_object = Dataset( + self.model_path, + dataset_path=self.dataset_path, + total_sample_count=total_sample_count, + device=self.device, + ) + self.qsl = lg.ConstructQSL( + self.data_object.total_sample_count, + self.data_object.perf_count, + self.data_object.LoadSamplesToRam, + self.data_object.UnloadSamplesFromRam, + ) self.load_model() @@ -130,7 +144,6 @@ def __init__(self, self.sample_counter = 0 self.sample_counter_lock = threading.Lock() - def start(self): # Create worker threads for j in range(self.num_workers): @@ -145,9 +158,8 @@ def stop(self): for worker in self.worker_threads: worker.join() - def process_queries(self): - """Processor of the queued queries. User may choose to add batching logic """ + """Processor of the queued queries. 
User may choose to add batching logic""" while True: qitem = self.query_queue.get() @@ -178,12 +190,32 @@ def process_queries(self): input_masks_tensor = [] input_len = [] for q in qitem: - input_ids_tensor.append(pad(self.data_object.input_ids[q.index], - (max_seq_len - self.data_object.input_lens[q.index], 0, 0, 0), - value=self.tokenizer.pad_token_id)) - input_masks_tensor.append(pad(self.data_object.attention_masks[q.index], - (max_seq_len - self.data_object.input_lens[q.index], 0, 0, 0), - value=0)) + input_ids_tensor.append( + pad( + self.data_object.input_ids[q.index], + ( + max_seq_len - + self.data_object.input_lens[q.index], + 0, + 0, + 0, + ), + value=self.tokenizer.pad_token_id, + ) + ) + input_masks_tensor.append( + pad( + self.data_object.attention_masks[q.index], + ( + max_seq_len - + self.data_object.input_lens[q.index], + 0, + 0, + 0, + ), + value=0, + ) + ) input_len.append(self.data_object.input_lens[q.index]) input_ids_tensor = torch.cat(input_ids_tensor) input_masks_tensor = torch.cat(input_masks_tensor) @@ -197,20 +229,28 @@ def process_queries(self): input_ids=input_ids_tensor, attention_mask=input_masks_tensor, pad_token_id=self.tokenizer.pad_token_id, - **gen_kwargs + **gen_kwargs, ) tik3 = time.time() - processed_output = self.data_object.postProcess(pred_output_tokens, - input_seq_lens=input_len, - query_id_list=query_ids) + processed_output = self.data_object.postProcess( + pred_output_tokens, + input_seq_lens=input_len, + query_id_list=query_ids, + ) for i in range(len(qitem)): n_tokens = processed_output[i].shape[0] - response_array = array.array("B", processed_output[i].tobytes()) + response_array = array.array( + "B", processed_output[i].tobytes()) bi = response_array.buffer_info() - response = [lg.QuerySampleResponse(qitem[i].id, bi[0], bi[1], n_tokens)] + response = [ + lg.QuerySampleResponse( + qitem[i].id, + bi[0], + bi[1], + n_tokens)] lg.QuerySamplesComplete(response) tok = time.time() @@ -226,19 +266,20 @@ def process_queries(self): else: print(f"\tLoaded from cache: {_p}") - def load_model(self): self.model = LlamaForCausalLM.from_pretrained( self.model_path, device_map="auto", low_cpu_mem_usage=True, - torch_dtype=self.amp_dtype + torch_dtype=self.amp_dtype, ) print("Loaded model") self.device = torch.device(self.device) if self.device == "cpu": - self.model = self.model.to(self.device) # Force CPU if your system has GPU and you specifically want CPU-only run + self.model = self.model.to( + self.device + ) # Force CPU if your system has GPU and you specifically want CPU-only run self.model.eval() self.model = self.model.to(memory_format=torch.channels_last) @@ -247,7 +288,8 @@ def load_model(self): self.model_path, model_max_length=1024, padding_side="left", - use_fast=False,) + use_fast=False, + ) self.tokenizer.pad_token = self.tokenizer.eos_token print("Loaded tokenizer") @@ -259,24 +301,21 @@ def get_sut(self): def get_qsl(self): return self.qsl - - def predict(self,**kwargs): + def predict(self, **kwargs): raise NotImplementedError - def issue_queries(self, query_samples): - """ Receives samples from loadgen and adds them to queue. Users may choose to batch here""" + """Receives samples from loadgen and adds them to queue. 
Users may choose to batch here""" list_prompts_tokens = [] list_prompts_attn_masks = [] print(f"IssueQuery started with {len(query_samples)} samples") while len(query_samples) > 0: - self.query_queue.put(query_samples[:self.batch_size]) + self.query_queue.put(query_samples[: self.batch_size]) query_samples = query_samples[self.batch_size:] print(f"IssueQuery done") - def flush_queries(self): pass @@ -285,9 +324,24 @@ def __del__(self): class SUTServer(SUT): - def __init__(self, model_path=None, dtype="bfloat16", device="cpu", total_sample_count=24576, dataset_path=None, workers=1): - - super().__init__(model_path=model_path, dtype=dtype, device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, workers=workers) + def __init__( + self, + model_path=None, + dtype="bfloat16", + device="cpu", + total_sample_count=24576, + dataset_path=None, + workers=1, + ): + + super().__init__( + model_path=model_path, + dtype=dtype, + device=device, + total_sample_count=total_sample_count, + dataset_path=dataset_path, + workers=workers, + ) self.first_token_queue = queue.Queue() @@ -300,10 +354,10 @@ def start(self): self.worker_threads[j] = worker # Create first token response thread - self.ft_response_thread = threading.Thread(target=self.process_first_tokens) + self.ft_response_thread = threading.Thread( + target=self.process_first_tokens) self.ft_response_thread.start() - def process_first_tokens(self): while True: @@ -315,13 +369,15 @@ def process_first_tokens(self): first_tokens, response_id = first_token_item - response_data = array.array("B", np.array(first_tokens, np.float32).tobytes()) + response_data = array.array( + "B", np.array(first_tokens, np.float32).tobytes() + ) bi = response_data.buffer_info() response = [lg.QuerySampleResponse(response_id, bi[0], bi[1])] lg.FirstTokenComplete(response) def process_queries(self): - """Processor of the queued queries. User may choose to add batching logic """ + """Processor of the queued queries. User may choose to add batching logic""" while True: qitem = self.query_queue.get() @@ -331,31 +387,42 @@ def process_queries(self): input_ids_tensor = self.data_object.input_ids[qitem.index] input_masks_tensor = self.data_object.attention_masks[qitem.index] - #TODO: This PoC is super slow with significant overhead. Best to create a patch to `generate` + # TODO: This PoC is super slow with significant overhead. 
Best to + # create a patch to `generate` tokens_cache = [] - tokens_streamer = FirstTokenStreamer(self.first_token_queue, tokens_cache=tokens_cache, is_first_token=True, response_ids=[qitem.id]) - - _ = self.model.generate( input_ids=input_ids_tensor, - attention_mask=input_masks_tensor, - pad_token_id=self.tokenizer.pad_token_id, - streamer = tokens_streamer, - **gen_kwargs - ) + tokens_streamer = FirstTokenStreamer( + self.first_token_queue, + tokens_cache=tokens_cache, + is_first_token=True, + response_ids=[qitem.id], + ) + + _ = self.model.generate( + input_ids=input_ids_tensor, + attention_mask=input_masks_tensor, + pad_token_id=self.tokenizer.pad_token_id, + streamer=tokens_streamer, + **gen_kwargs, + ) output_tokens = tokens_streamer.get_out_tokens() n_tokens = len(output_tokens) - response_array = array.array("B", np.array(output_tokens, np.int32).tobytes()) + response_array = array.array( + "B", np.array(output_tokens, np.int32).tobytes() + ) bi = response_array.buffer_info() - response = [lg.QuerySampleResponse( - qitem.id, bi[0], bi[1], n_tokens)] + response = [ + lg.QuerySampleResponse( + qitem.id, + bi[0], + bi[1], + n_tokens)] lg.QuerySamplesComplete(response) - def issue_queries(self, query_samples): self.query_queue.put(query_samples[0]) - def stop(self): for _ in range(self.num_workers): self.query_queue.put(None) diff --git a/language/llama2-70b/consolidate_results.py b/language/llama2-70b/consolidate_results.py index ad5fbe411..645b7bc57 100644 --- a/language/llama2-70b/consolidate_results.py +++ b/language/llama2-70b/consolidate_results.py @@ -14,10 +14,30 @@ def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--dataset-path", type=str, default=None, help="Path to .pkl generated by processorca.py") - parser.add_argument("--run-outputs", type=str, default="run_outputs", help="Output dir generated by accuracy run.") - parser.add_argument("--model-dir", type=str, default=None, help="Path to Llamav2 HuggingFace repo clone") - parser.add_argument("--output-pkl-path", type=str, default="full_output.pkl", help="Path to dump output to") + parser.add_argument( + "--dataset-path", + type=str, + default=None, + help="Path to .pkl generated by processorca.py", + ) + parser.add_argument( + "--run-outputs", + type=str, + default="run_outputs", + help="Output dir generated by accuracy run.", + ) + parser.add_argument( + "--model-dir", + type=str, + default=None, + help="Path to Llamav2 HuggingFace repo clone", + ) + parser.add_argument( + "--output-pkl-path", + type=str, + default="full_output.pkl", + help="Path to dump output to", + ) args = parser.parse_args() return args @@ -33,7 +53,7 @@ def load_run_outputs(p: os.PathLike): by_query_idx = dict() for pkl_file in g: print(f"Loading from {pkl_file}...") - with open(pkl_file, 'rb') as f: + with open(pkl_file, "rb") as f: d = pickle.load(f) assert len(d["query_ids"]) == len(d["outputs"]) @@ -78,17 +98,18 @@ def main(args): output_lens[qid] = len(L) # Decode tokens - output_text_col[qid] = tokenizer.decode(output_tok_ids_col[qid], skip_special_tokens=True) + output_text_col[qid] = tokenizer.decode( + output_tok_ids_col[qid], skip_special_tokens=True + ) print(f"Found {len(no_eos_ids)} samples with no EOS token") print("Calculating rouge scores...") - _preproc = lambda s: "\n".join(nltk.sent_tokenize(s.strip())) + def _preproc(s): return "\n".join(nltk.sent_tokenize(s.strip())) preds = list(map(_preproc, output_text_col)) targets = list(map(_preproc, list(df["output"]))) - rouge_scores = 
metric.compute(predictions=preds, - references=targets, - use_stemmer=True, - use_aggregator=False) + rouge_scores = metric.compute( + predictions=preds, references=targets, use_stemmer=True, use_aggregator=False + ) assert len(rouge_scores["rouge1"]) == 24576 assert len(rouge_scores["rouge2"]) == 24576 diff --git a/language/llama2-70b/dataset.py b/language/llama2-70b/dataset.py index 4b1b1bb91..0d8edc74c 100644 --- a/language/llama2-70b/dataset.py +++ b/language/llama2-70b/dataset.py @@ -1,3 +1,4 @@ +import random import os import time import numpy as np @@ -8,24 +9,32 @@ from torch.utils.data import DataLoader from typing import Optional, Dict, Sequence import io -#import utils + +# import utils import copy import pickle import logging + logging.basicConfig(level=logging.INFO) log = logging.getLogger("Llama-70B-Dataset") -import random -class Dataset(): - def __init__(self, model_name=None, total_sample_count=24576, perf_count_override=None, dataset_path=None, device="cpu"): +class Dataset: + def __init__( + self, + model_name=None, + total_sample_count=24576, + perf_count_override=None, + dataset_path=None, + device="cpu", + ): self.model_name = model_name or "meta-llama/Llama-2-70b-chat-hf" self.dataset_path = dataset_path self.max_length = 1024 self.device = device - #self.total_sample_count = total_sample_count + # self.total_sample_count = total_sample_count self.load_tokenizer() self.load_processed_dataset() @@ -34,42 +43,54 @@ def __init__(self, model_name=None, total_sample_count=24576, perf_count_overrid self.perf_count = perf_count_override or self.total_sample_count def load_tokenizer(self): - """ Returns tokenizer """ + """Returns tokenizer""" self.tokenizer = AutoTokenizer.from_pretrained( self.model_name, model_max_length=1024, padding_side="left", - use_fast=False,) + use_fast=False, + ) self.tokenizer.pad_token = self.tokenizer.eos_token def load_processed_dataset(self): if not os.path.isfile(self.dataset_path): - log.warn("Processed pickle file {} not found. Please check that the path is correct".format(self.dataset_path)) + log.warn( + "Processed pickle file {} not found. Please check that the path is correct".format( + self.dataset_path + ) + ) print("Loading dataset...") import pandas as pd + processed_data = pd.read_pickle(self.dataset_path) - input_tokens = processed_data['tok_input'] + input_tokens = processed_data["tok_input"] self.input_ids = [] self.input_lens = [] self.attention_masks = [] for ids in input_tokens: - input_ids = torch.tensor(ids, dtype=torch.int32).view(1,-1).to(self.device) + input_ids = torch.tensor(ids, dtype=torch.int32).view( + 1, -1).to(self.device) attn_mask = torch.ones_like(input_ids) self.input_ids.append(input_ids) self.attention_masks.append(attn_mask) self.input_lens.append(input_ids.shape[-1]) print("Finished loading dataset.") + def postProcess( + self, + out_tokens, + input_seq_lens=None, + query_id_list=None, + sample_index_list=None, + ): + """Postprocesses output prediction""" - def postProcess(self, out_tokens, input_seq_lens=None, query_id_list=None, sample_index_list=None): - """ Postprocesses output prediction """ - - #TODO: Create response object in postProcess(?) + # TODO: Create response object in postProcess(?) 
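The postProcess hunk below relies on every prompt having been left-padded to the Dataset's fixed max_length of 1024, so slicing the generate() output past that offset keeps only the newly generated tokens. A minimal sketch of that pruning step, with dummy tensors standing in for real model output:

    import torch

    # Sketch: generate() returns prompt + continuation per row; with all prompts
    # left-padded to MAX_LEN (1024 in the Dataset above), columns past MAX_LEN
    # hold only generated tokens.
    MAX_LEN = 1024

    def strip_prompt(out_tokens: torch.Tensor, max_len: int = MAX_LEN):
        """out_tokens has shape (batch, max_len + new_tokens)."""
        return [row.cpu().numpy() for row in out_tokens[:, max_len:]]

    # Dummy batch of 2 rows with 8 generated tokens each.
    fake_out = torch.randint(0, 32000, (2, MAX_LEN + 8))
    assert all(p.shape[0] == 8 for p in strip_prompt(fake_out))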
""" preds = [] for i in range(out_tokens.shape[0]): @@ -79,7 +100,8 @@ def postProcess(self, out_tokens, input_seq_lens=None, query_id_list=None, sampl pred = out_tokens[i, input_len:].reshape(-1).cpu().numpy() preds.append(pred) """ - # Everything is padded to max_len (1024), so prune the input and parse to numpy + # Everything is padded to max_len (1024), so prune the input and parse + # to numpy output_seq = out_tokens[:, 1024:].cpu().numpy() assert len(query_id_list) == output_seq.shape[0] @@ -88,9 +110,8 @@ def postProcess(self, out_tokens, input_seq_lens=None, query_id_list=None, sampl os.makedirs("run_outputs") fname = "q" + "_".join([str(i) for i in query_id_list]) fname = f"run_outputs/{fname}.pkl" - with open(fname, mode='wb') as f: - d = {"query_ids": query_id_list, - "outputs": output_seq} + with open(fname, mode="wb") as f: + d = {"query_ids": query_id_list, "outputs": output_seq} print(f"Saving outputs to {fname}") pickle.dump(d, f) diff --git a/language/llama2-70b/evaluate-accuracy.py b/language/llama2-70b/evaluate-accuracy.py index cf4cb2c05..593ea9501 100644 --- a/language/llama2-70b/evaluate-accuracy.py +++ b/language/llama2-70b/evaluate-accuracy.py @@ -6,29 +6,41 @@ import json - def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint-path", required=True, - help="Path to Llama2-70b-hf-chat checkpoint") - parser.add_argument("--mlperf-accuracy-file", required=True, - help="path to mlperf_log_accuracy.json") - parser.add_argument("--dataset-file", required=True, - help="path to processed openorca validation set") - parser.add_argument("--verbose", action="store_true", - help="verbose messages") - parser.add_argument("--dtype", default="int64", - help="dtype of the accuracy log", choices=["int32", "int64", "float"]) + parser.add_argument( + "--checkpoint-path", required=True, help="Path to Llama2-70b-hf-chat checkpoint" + ) + parser.add_argument( + "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" + ) + parser.add_argument( + "--dataset-file", + required=True, + help="path to processed openorca validation set", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="verbose messages") + parser.add_argument( + "--dtype", + default="int64", + help="dtype of the accuracy log", + choices=["int32", "int64", "float"], + ) args = parser.parse_args() return args def get_groundtruth(processed_dataset_file): import pandas as pd + data = pd.read_pickle(processed_dataset_file) - ground_truths = data['output'] + ground_truths = data["output"] return ground_truths + def postprocess_text(preds, targets): preds = [pred.strip() for pred in preds] targets = [target.strip() for target in targets] @@ -40,21 +52,20 @@ def postprocess_text(preds, targets): return preds, targets - def main(): args = get_args() dataset_path = args.dataset_file checkpoint_path = args.checkpoint_path metric = evaluate.load("rouge") - nltk.download('punkt') + nltk.download("punkt") tokenizer = AutoTokenizer.from_pretrained( checkpoint_path, model_max_length=2048, padding_side="left", - use_fast=False,) - + use_fast=False, + ) targets = get_groundtruth(args.dataset_file) @@ -73,35 +84,38 @@ def main(): seen = set() gen_tok_len = 0 for pred in results: - qsl_idx = pred['qsl_idx'] + qsl_idx = pred["qsl_idx"] if qsl_idx in seen: continue seen.add(qsl_idx) target = targets[qsl_idx] target_required.append(target) - pred = np.frombuffer( bytes.fromhex(pred['data']), eval_dtype) + pred = np.frombuffer(bytes.fromhex(pred["data"]), eval_dtype) 
gen_tok_len += len(pred) preds_token_ids.append(pred) preds_decoded_text = tokenizer.batch_decode( - preds_token_ids, skip_special_tokens=True) + preds_token_ids, skip_special_tokens=True + ) preds, targets = postprocess_text(preds_decoded_text, target_required) result = metric.compute( - predictions=preds, references=targets, use_stemmer=True, use_aggregator=False) + predictions=preds, references=targets, use_stemmer=True, use_aggregator=False + ) result = {k: round(np.mean(v) * 100, 4) for k, v in result.items()} prediction_lens = [len(pred) for pred in preds] gen_num = len(preds) - result = {**result, - 'gen_len': np.sum(prediction_lens), - 'gen_num': gen_num, - 'gen_tok_len': gen_tok_len, - 'tokens_per_sample': round(gen_tok_len / gen_num, 1) - } + result = { + **result, + "gen_len": np.sum(prediction_lens), + "gen_num": gen_num, + "gen_tok_len": gen_tok_len, + "tokens_per_sample": round(gen_tok_len / gen_num, 1), + } print("\nResults\n") print(result) diff --git a/language/llama2-70b/main.py b/language/llama2-70b/main.py index 468ad067e..733436970 100644 --- a/language/llama2-70b/main.py +++ b/language/llama2-70b/main.py @@ -11,22 +11,81 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger("Llama-70B-MAIN") + def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--scenario", type=str, choices=["Offline", "Server"], default="Offline", help="Scenario") - parser.add_argument("--model-path", type=str, default="meta-llama/Llama-2-70b-chat-hf", help="Model name") + parser.add_argument( + "--scenario", + type=str, + choices=["Offline", "Server"], + default="Offline", + help="Scenario", + ) + parser.add_argument( + "--model-path", + type=str, + default="meta-llama/Llama-2-70b-chat-hf", + help="Model name", + ) parser.add_argument("--dataset-path", type=str, default=None, help="") - parser.add_argument("--accuracy", action="store_true", help="Run accuracy mode") - parser.add_argument("--dtype", type=str, default="float32", help="data type of the model, choose from float16, bfloat16 and float32") - parser.add_argument("--device", type=str, choices=["cpu", "cuda:0"], default="cpu", help="device to use") - parser.add_argument("--audit-conf", type=str, default="audit.conf", help="audit config for LoadGen settings during compliance runs") - parser.add_argument("--mlperf-conf", type=str, default="mlperf.conf", help="mlperf rules config") - parser.add_argument("--user-conf", type=str, default="user.conf", help="user config for user LoadGen settings such as target QPS") - parser.add_argument("--total-sample-count", type=int, default=24576, help="Number of samples to use in benchmark.") # TODO: This interpretation of 'total-sample-count' is a little misleading. Fix it - parser.add_argument("--batch-size", type=int, default=1, help="Model batch-size to use in benchmark.") - parser.add_argument("--output-log-dir", type=str, default="output-logs", help="Where logs are saved") - parser.add_argument("--enable-log-trace", action="store_true", help="Enable log tracing. 
This file can become quite large") - parser.add_argument("--num-workers", type=int, default=1, help="Number of workers to process queries") + parser.add_argument( + "--accuracy", + action="store_true", + help="Run accuracy mode") + parser.add_argument( + "--dtype", + type=str, + default="float32", + help="data type of the model, choose from float16, bfloat16 and float32", + ) + parser.add_argument( + "--device", + type=str, + choices=["cpu", "cuda:0"], + default="cpu", + help="device to use", + ) + parser.add_argument( + "--audit-conf", + type=str, + default="audit.conf", + help="audit config for LoadGen settings during compliance runs", + ) + parser.add_argument( + "--mlperf-conf", type=str, default="mlperf.conf", help="mlperf rules config" + ) + parser.add_argument( + "--user-conf", + type=str, + default="user.conf", + help="user config for user LoadGen settings such as target QPS", + ) + parser.add_argument( + "--total-sample-count", + type=int, + default=24576, + help="Number of samples to use in benchmark.", + ) # TODO: This interpretation of 'total-sample-count' is a little misleading. Fix it + parser.add_argument( + "--batch-size", + type=int, + default=1, + help="Model batch-size to use in benchmark.", + ) + parser.add_argument( + "--output-log-dir", type=str, default="output-logs", help="Where logs are saved" + ) + parser.add_argument( + "--enable-log-trace", + action="store_true", + help="Enable log tracing. This file can become quite large", + ) + parser.add_argument( + "--num-workers", + type=int, + default=1, + help="Number of workers to process queries", + ) args = parser.parse_args() return args @@ -35,12 +94,10 @@ def get_args(): scenario_map = { "offline": lg.TestScenario.Offline, "server": lg.TestScenario.Server, - } +} + +sut_map = {"offline": SUT, "server": SUTServer} -sut_map = { - "offline": SUT, - "server": SUTServer - } def main(): args = get_args() @@ -53,7 +110,9 @@ def main(): if args.accuracy: settings.mode = lg.TestMode.AccuracyOnly - log.warning("Accuracy run will generate the accuracy logs, but the evaluation of the log is not completed yet") + log.warning( + "Accuracy run will generate the accuracy logs, but the evaluation of the log is not completed yet" + ) else: settings.mode = lg.TestMode.PerformanceOnly @@ -80,7 +139,12 @@ def main(): sut.start() lgSUT = lg.ConstructSUT(sut.issue_queries, sut.flush_queries) log.info("Starting Benchmark run") - lg.StartTestWithLogSettings(lgSUT, sut.qsl, settings, log_settings, args.audit_conf) + lg.StartTestWithLogSettings( + lgSUT, + sut.qsl, + settings, + log_settings, + args.audit_conf) # Stop sut after completion sut.stop() diff --git a/language/llama2-70b/processorca.py b/language/llama2-70b/processorca.py index c850f89bd..4c0df05fb 100644 --- a/language/llama2-70b/processorca.py +++ b/language/llama2-70b/processorca.py @@ -36,16 +36,21 @@ llama_prompt_system = "[INST] <>\n{}\n<>\n\n{} [/INST]" llama_prompt_no_system = "[INST] {} [/INST]" + def format_llama_input(row): - if row['system_prompt']: - return llama_prompt_system.format(row['system_prompt'], row['question']) + if row["system_prompt"]: + return llama_prompt_system.format( + row["system_prompt"], row["question"]) else: - return llama_prompt_no_system.format(row['question']) + return llama_prompt_no_system.format(row["question"]) + def is_english(s): for c in s: allowed = c.isascii() - allowed = allowed or (c in ['’', '–', '“', '”', '—']) # Taken from Habana: Unicode quotes and hyphens + allowed = allowed or ( + c in ["’", "–", "“", "”", "—"] + ) # Taken 
from Habana: Unicode quotes and hyphens if not allowed: return False return True @@ -59,7 +64,8 @@ def _tokenize_helper(x, llama_tokenizer=None, append_response_init_token=True): if append_response_init_token: # Workaround to enable cheat checking for first token: Llama always outputs token 29871 first - # It is possible for submitters to just immediately output this token to achieve a very fast TTFT. + # It is possible for submitters to just immediately output this token + # to achieve a very fast TTFT. tokens.append(29871) return tokens @@ -73,11 +79,13 @@ class Keyphrase: class OpenOrcaDatasetGenerator: - def __init__(self, - pq_path: os.PathLike, - model_dir: os.PathLike, - io_token_limit: int, - calibration_subset_size: int = 1000): + def __init__( + self, + pq_path: os.PathLike, + model_dir: os.PathLike, + io_token_limit: int, + calibration_subset_size: int = 1000, + ): self.pq_path = Path(pq_path) self.model_dir = Path(model_dir) self.io_token_limit = io_token_limit @@ -90,29 +98,37 @@ def load_parquet(self) -> pd.DataFrame: tik = time.time() df = pd.read_parquet(self.pq_path) print(f"Tokenizing input") - df.rename(columns={'response': 'output'}, inplace=True) - df['input'] = df.apply(format_llama_input, axis=1) - - input_tokenizer = partial(_tokenize_helper, llama_tokenizer=llama_tokenizer) - output_tokenizer = partial(_tokenize_helper, llama_tokenizer=llama_tokenizer, append_response_init_token=False) - df['tok_input'] = df['input'].apply(input_tokenizer) - df['tok_output'] = df['output'].apply(output_tokenizer) + df.rename(columns={"response": "output"}, inplace=True) + df["input"] = df.apply(format_llama_input, axis=1) + + input_tokenizer = partial( + _tokenize_helper, + llama_tokenizer=llama_tokenizer) + output_tokenizer = partial( + _tokenize_helper, + llama_tokenizer=llama_tokenizer, + append_response_init_token=False, + ) + df["tok_input"] = df["input"].apply(input_tokenizer) + df["tok_output"] = df["output"].apply(output_tokenizer) tok = time.time() print(f"Loaded parquet and tokenized in {tok-tik} sec.") return df def filter_english(self, df: pd.DataFrame) -> pd.DataFrame: - df['input_english'] = df['input'].apply(is_english) - df['output_english'] = df['output'].apply(is_english) - df['all_english'] = df['input_english'] & df['output_english'] + df["input_english"] = df["input"].apply(is_english) + df["output_english"] = df["output"].apply(is_english) + df["all_english"] = df["input_english"] & df["output_english"] # Filter based on english tokens - df = df[df['all_english']].drop(["input_english", "output_english", "all_english"], axis=1) + df = df[df["all_english"]].drop( + ["input_english", "output_english", "all_english"], axis=1 + ) return df.reset_index(drop=True) def filter_seqlen_oob(self, df: pd.DataFrame) -> pd.DataFrame: - df['tok_input_length'] = df['tok_input'].apply(lambda x: len(x)) - df['tok_output_length'] = df['tok_output'].apply(lambda x: len(x)) + df["tok_input_length"] = df["tok_input"].apply(lambda x: len(x)) + df["tok_output_length"] = df["tok_output"].apply(lambda x: len(x)) # Filter based on sequence length df = df[df["tok_input_length"] < self.io_token_limit] @@ -129,22 +145,28 @@ def filter_short_expected_response(self, df: pd.DataFrame) -> pd.DataFrame: df = df[df["tok_output_length"] >= 3] return df.reset_index(drop=True) - def filter_bad_prompts(self, df: pd.DataFrame, only_niv_t0: bool = True) -> pd.DataFrame: + def filter_bad_prompts( + self, df: pd.DataFrame, only_niv_t0: bool = True + ) -> pd.DataFrame: # Some prompts underperform and 
cause very bad Rouge scores for a significant percentage of samples with these # prompts. See Jupyter notebook for analysis. # These generally only affect NIV and t0 and do not exist in flan or cot. - # Set 'only_niv_t0' to True to explicitly only remove these prompts from niv and t0 samples. - bad_prompts = ['', - 'You are an AI assistant that follows instruction extremely well. Help as much as you can.', - 'You are an AI assistant. Provide a detailed answer so user don’t need to search outside to understand the answer.', - "You are an AI assistant. Provide a detailed answer so user don't need to search outside to understand the answer.", - 'User will you give you a task with some instruction. Your job is follow the instructions as faithfully as you can. While answering think step-by-step and justify your answer.', - 'Explain how you used the definition to come up with the answer.', - ] + # Set 'only_niv_t0' to True to explicitly only remove these prompts + # from niv and t0 samples. + bad_prompts = [ + "", + "You are an AI assistant that follows instruction extremely well. Help as much as you can.", + "You are an AI assistant. Provide a detailed answer so user don’t need to search outside to understand the answer.", + "You are an AI assistant. Provide a detailed answer so user don't need to search outside to understand the answer.", + "User will you give you a task with some instruction. Your job is follow the instructions as faithfully as you can. While answering think step-by-step and justify your answer.", + "Explain how you used the definition to come up with the answer.", + ] for prompt in bad_prompts: - criteria = (df.system_prompt == prompt) + criteria = df.system_prompt == prompt if only_niv_t0: - criteria = criteria & ((df.origin == "niv") | (df.origin == "t0")) + criteria = criteria & ( + (df.origin == "niv") | ( + df.origin == "t0")) df = df[~criteria] return df.reset_index(drop=True) @@ -153,7 +175,8 @@ def register_keyphrase(self, keyphrase: Keyphrase): self.keyphrases.append(keyphrase) def filter_keyphrases(self, df: pd.DataFrame) -> pd.DataFrame: - # Filter out registered keyphrases. This is unused for the final dataset as there are no registered keyphrases. + # Filter out registered keyphrases. This is unused for the final + # dataset as there are no registered keyphrases. for kp in self.keyphrases: if kp.startswith: selector = df[kp.col].str.startswith(kp.phrase) @@ -163,13 +186,13 @@ def filter_keyphrases(self, df: pd.DataFrame) -> pd.DataFrame: return df.reset_index(drop=True) def set_origins(self, df: pd.DataFrame) -> pd.DataFrame: - get_sample_origin = lambda x: x.split(".")[0] - df['origin'] = df['id'].apply(get_sample_origin) + def get_sample_origin(x): return x.split(".")[0] + df["origin"] = df["id"].apply(get_sample_origin) return df def _per_origin_split(self, df: pd.DataFrame) -> Dict[str, pd.DataFrame]: print(f"Unique sample origin datasets: {df.origin.unique()}") - dfs_by_origin = dict(tuple(df.groupby('origin'))) + dfs_by_origin = dict(tuple(df.groupby("origin"))) for origin, sub_df in dfs_by_origin.items(): sub_df = sub_df.reset_index(drop=True, inplace=True) return dfs_by_origin @@ -180,9 +203,13 @@ def _get_sampling(self, df, N, rng_seed: int = 1337): raise RuntimeError(f"Not enough samples. 
Requires {N - _N} more.") return df.sample(n=_N, random_state=rng_seed) - def sample(self, dfs_by_origin: Dict[str, pd.DataFrame], n_total, rng_seed: int = 1337) -> pd.DataFrame: + def sample( + self, dfs_by_origin: Dict[str, pd.DataFrame], n_total, rng_seed: int = 1337 + ) -> pd.DataFrame: nways = len(dfs_by_origin) - assert n_total % nways == 0, f"Total number of samples ({n_total}) must be divisible by n_origins ({nways})" + assert ( + n_total % nways == 0 + ), f"Total number of samples ({n_total}) must be divisible by n_origins ({nways})" split_size = n_total // nways samplings = [] @@ -195,17 +222,21 @@ def sample(self, dfs_by_origin: Dict[str, pd.DataFrame], n_total, rng_seed: int sampled_df = sampled_df.reset_index(drop=True) return sampled_df - def generate(self, - export_dir: os.PathLike, - n_samples: int = 24576, - use_cached: bool = True, - calib_rng_seed: int = 12345): + def generate( + self, + export_dir: os.PathLike, + n_samples: int = 24576, + use_cached: bool = True, + calib_rng_seed: int = 12345, + ): export_dir = Path(export_dir) if not export_dir.exists(): print(f"Creating {export_dir}") export_dir.mkdir(parents=True) if export_dir.is_file(): - raise ValueError(f"Cannot export to file {export_dir}. Must be a directory.") + raise ValueError( + f"Cannot export to file {export_dir}. Must be a directory." + ) full_fpath = export_dir / f"open_orca_gpt4_tokenized_llama.full.pkl" if full_fpath.exists() and use_cached: @@ -227,7 +258,8 @@ def generate(self, # Export base files for origin, sub_df in dfs_by_origin.items(): print(f"Subset '{origin}' has {sub_df.shape[0]} samples") - origin_fpath = export_dir / f"open_orca_gpt4_tokenized_llama.{origin}.pkl" + origin_fpath = export_dir / \ + f"open_orca_gpt4_tokenized_llama.{origin}.pkl" if not origin_fpath.exists() or not use_cached: sub_df.to_pickle(origin_fpath) @@ -237,29 +269,59 @@ def generate(self, # cot has a higher rouge score from a 100k sampling (of the whole dataset) than the rest, while niv has lower. # Sample from each dataset equally. 
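A compact sketch of the per-origin balanced sampling described in the comment above; the "origin" column and the 1337 seed follow the script, while the toy frame is purely illustrative:

    import pandas as pd

    def sample_equally(df: pd.DataFrame, n_total: int, seed: int = 1337) -> pd.DataFrame:
        """Draw n_total // n_origins rows from each origin group, reproducibly."""
        groups = dict(tuple(df.groupby("origin")))
        per_origin = n_total // len(groups)
        parts = [g.sample(n=per_origin, random_state=seed) for g in groups.values()]
        return pd.concat(parts).reset_index(drop=True)

    # Toy usage: 2 samples total over 2 origins -> 1 per origin.
    toy = pd.DataFrame({"origin": ["cot", "cot", "niv", "niv"], "id": [1, 2, 3, 4]})
    print(sample_equally(toy, 2))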
sampled_df = self.sample(dfs_by_origin, n_samples) - sampled_fpath = export_dir / f"open_orca_gpt4_tokenized_llama.sampled_{n_samples}.pkl" + sampled_fpath = ( + export_dir / + f"open_orca_gpt4_tokenized_llama.sampled_{n_samples}.pkl" + ) sampled_df.to_pickle(sampled_fpath) # Calibration dataset - calib_ds = sampled_df.sample(n=self.calibration_subset_size, - random_state=calib_rng_seed) + calib_ds = sampled_df.sample( + n=self.calibration_subset_size, random_state=calib_rng_seed + ) calib_ds = calib_ds.reset_index(drop=True) - calib_fpath = export_dir / f"open_orca_gpt4_tokenized_llama.calibration_{self.calibration_subset_size}.pkl" + calib_fpath = ( + export_dir + / f"open_orca_gpt4_tokenized_llama.calibration_{self.calibration_subset_size}.pkl" + ) calib_ds.to_pickle(calib_fpath) def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument('--dataset_pq_path', type=str, - default='/raid/data/mlperf-llm/OpenOrca/1M-GPT4-Augmented.parquet', - help="the path to the open_orca GPT4 parquet.") - parser.add_argument('--model_dir', type=str, default='/raid/data/mlperf-llm/Llama-2-70b-chat-hf') - parser.add_argument('--seqlen_limit', type=int, default=1024, help="Upper limit of the input/output sequence lengths") - parser.add_argument('--export_dir', type=str, - default="/raid/data/mlperf-llm/OpenOrca/llama/filtered", - help="Path to the output pkl file.") - parser.add_argument('--num_total_samples', type=int, default=24576, help="Number of samples to generate") - parser.add_argument('--calibration_subset_size', type=int, default=1000, help="Number of samples for calibration subset") + parser.add_argument( + "--dataset_pq_path", + type=str, + default="/raid/data/mlperf-llm/OpenOrca/1M-GPT4-Augmented.parquet", + help="the path to the open_orca GPT4 parquet.", + ) + parser.add_argument( + "--model_dir", type=str, default="/raid/data/mlperf-llm/Llama-2-70b-chat-hf" + ) + parser.add_argument( + "--seqlen_limit", + type=int, + default=1024, + help="Upper limit of the input/output sequence lengths", + ) + parser.add_argument( + "--export_dir", + type=str, + default="/raid/data/mlperf-llm/OpenOrca/llama/filtered", + help="Path to the output pkl file.", + ) + parser.add_argument( + "--num_total_samples", + type=int, + default=24576, + help="Number of samples to generate", + ) + parser.add_argument( + "--calibration_subset_size", + type=int, + default=1000, + help="Number of samples for calibration subset", + ) return parser.parse_args() @@ -277,4 +339,9 @@ def parse_arguments(): ) # Sample command to run: - # python3 processorca.py --dataset_pq_path=/raid/data/mlperf-llm/OpenOrca/1M-GPT4-Augmented.parquet --model_dir=/raid/data/mlperf-llm/Llama-2-70b-chat-hf --seqlen_limit=1024 --export_dir=/raid/data/mlperf-llm/OpenOrca/llama/filtered --num_total_samples=24576 + # python3 processorca.py + # --dataset_pq_path=/raid/data/mlperf-llm/OpenOrca/1M-GPT4-Augmented.parquet + # --model_dir=/raid/data/mlperf-llm/Llama-2-70b-chat-hf + # --seqlen_limit=1024 + # --export_dir=/raid/data/mlperf-llm/OpenOrca/llama/filtered + # --num_total_samples=24576 diff --git a/language/mixtral-8x7b/SUT.py b/language/mixtral-8x7b/SUT.py index 9ed44dbf5..7aeab024e 100644 --- a/language/mixtral-8x7b/SUT.py +++ b/language/mixtral-8x7b/SUT.py @@ -5,7 +5,12 @@ import torch from torch.nn.functional import pad from torch.utils.data import DataLoader -from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessor, LogitsProcessorList +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + 
LogitsProcessor, + LogitsProcessorList, +) from transformers.generation.streamers import BaseStreamer import pickle @@ -29,48 +34,61 @@ "max_new_tokens": 1024, "min_new_tokens": 1, "num_beams": 1, - "do_sample": False + "do_sample": False, } + class StopAfterSequence(LogitsProcessor): - """Logits processor (to use with HuggingFace `generate()` method : - https://huggingface.co/docs/transformers/v4.24.0/en/main_classes/ - text_generation#transformers.generation_utils.GenerationMixin). - - This logits processor makes that when the model generates a specified - stopping sequence, it stops generating new tokens - - Args: - stop_seq (List[int]): ID of the space token. - eos_token_id (int): ID of the EOS token. - device (str): Device that the model is running - """ - def __init__(self, eos_token_id: int, stop_seq: List[int] = [13, 13940, 28832, 13], device="cpu"): - super().__init__() - assert(len(stop_seq) >= 1) - self.device = device - self.stop_seq = torch.tensor(stop_seq, dtype=torch.long).to(device) - self.stop_seq_length = len(stop_seq) - self.eos_token_id = eos_token_id - - def check_stop_condition(self, input_ids: torch.LongTensor): - stop_condition_met = (input_ids[:, -self.stop_seq_length:] == self.stop_seq).all(dim=1) - return stop_condition_met - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - if input_ids.size(1) > self.stop_seq_length: - forced_eos = torch.full((scores.size(1),), -float("inf")).to(self.device) - forced_eos[self.eos_token_id] = 0 - scores[self.check_stop_condition(input_ids)] = forced_eos - return scores + """Logits processor (to use with HuggingFace `generate()` method : + https://huggingface.co/docs/transformers/v4.24.0/en/main_classes/ + text_generation#transformers.generation_utils.GenerationMixin). + + This logits processor makes that when the model generates a specified + stopping sequence, it stops generating new tokens + + Args: + stop_seq (List[int]): ID of the space token. + eos_token_id (int): ID of the EOS token. 
+ device (str): Device that the model is running + """ + + def __init__( + self, + eos_token_id: int, + stop_seq: List[int] = [13, 13940, 28832, 13], + device="cpu", + ): + super().__init__() + assert len(stop_seq) >= 1 + self.device = device + self.stop_seq = torch.tensor(stop_seq, dtype=torch.long).to(device) + self.stop_seq_length = len(stop_seq) + self.eos_token_id = eos_token_id + + def check_stop_condition(self, input_ids: torch.LongTensor): + stop_condition_met = ( + input_ids[:, -self.stop_seq_length:] == self.stop_seq + ).all(dim=1) + return stop_condition_met + + def __call__( + self, input_ids: torch.LongTensor, scores: torch.FloatTensor + ) -> torch.FloatTensor: + if input_ids.size(1) > self.stop_seq_length: + forced_eos = torch.full( + (scores.size(1),), -float("inf")).to(self.device) + forced_eos[self.eos_token_id] = 0 + scores[self.check_stop_condition(input_ids)] = forced_eos + return scores class FirstTokenStreamer(BaseStreamer): - """ Streams first tokens to a 'holder' """ + """Streams first tokens to a 'holder'""" - def __init__(self, first_token, tokens_cache=[], - is_first_token=True, response_ids=[]): - """ Response ids added to 'sign' the first token""" + def __init__( + self, first_token, tokens_cache=[], is_first_token=True, response_ids=[] + ): + """Response ids added to 'sign' the first token""" self.first_token = first_token # Queue for first token self.is_first_token = is_first_token @@ -84,7 +102,7 @@ def __init__(self, first_token, tokens_cache=[], self.is_prompt = True def put(self, value): - """ Caches the tokens as they're generated. Assumes bs=1 """ + """Caches the tokens as they're generated. Assumes bs=1""" # Prompts are streamed first so we need to skip the first time value # that arrives @@ -111,18 +129,20 @@ def get_out_tokens(self): return self.tokens_cache -class SUT(): - def __init__(self, - model_path=None, - dtype="bfloat16", - device="cpu", - batch_size=None, - total_sample_count=24576, - dataset_path=None, - use_cached_outputs=False, - # Set this to True *only for test accuracy runs* in case your - # prior session was killed partway through - workers=1): +class SUT: + def __init__( + self, + model_path=None, + dtype="bfloat16", + device="cpu", + batch_size=None, + total_sample_count=24576, + dataset_path=None, + use_cached_outputs=False, + # Set this to True *only for test accuracy runs* in case your + # prior session was killed partway through + workers=1, + ): self.model_path = model_path or "mistralai/Mixtral-8x7B-Instruct-v0.1" self.device = device @@ -135,26 +155,32 @@ def __init__(self, self.batch_size = batch_size # dtype - if dtype == 'bfloat16': + if dtype == "bfloat16": self.amp_enabled = True self.amp_dtype = torch.bfloat16 - elif dtype == 'float16': + elif dtype == "float16": self.amp_enabled = True self.amp_dtype = torch.float16 else: self.amp_enabled = False self.amp_dtype = torch.float32 - if 'cuda' in self.device: + if "cuda" in self.device: assert torch.cuda.is_available(), "torch gpu is not available, exiting..." 
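The StopAfterSequence processor above forces the next token to be EOS once the tail of the generated sequence matches the stop sequence (used for MBXP code generation further down). A toy check of that masking logic, with made-up token IDs and logits:

    import torch

    eos_id = 2
    stop_seq = torch.tensor([13, 13940, 28832, 13])            # the class's default stop_seq
    input_ids = torch.tensor([[5, 7, 13, 13940, 28832, 13]])   # generation ends with stop_seq
    scores = torch.zeros(1, 32000)                              # fake next-token logits

    # Same masking as StopAfterSequence.__call__: rows whose tail equals the stop
    # sequence get every logit set to -inf except EOS, so greedy decoding emits EOS.
    hit = (input_ids[:, -len(stop_seq):] == stop_seq).all(dim=1)
    forced = torch.full((scores.size(1),), -float("inf"))
    forced[eos_id] = 0.0
    scores[hit] = forced
    assert scores[0].argmax().item() == eos_id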
self.dataset_path = dataset_path - self.data_object = Dataset(self.model_path, - dataset_path=self.dataset_path, - total_sample_count=total_sample_count, - device=self.device) - self.qsl = lg.ConstructQSL(self.data_object.total_sample_count, self.data_object.perf_count, - self.data_object.LoadSamplesToRam, self.data_object.UnloadSamplesFromRam) + self.data_object = Dataset( + self.model_path, + dataset_path=self.dataset_path, + total_sample_count=total_sample_count, + device=self.device, + ) + self.qsl = lg.ConstructQSL( + self.data_object.total_sample_count, + self.data_object.perf_count, + self.data_object.LoadSamplesToRam, + self.data_object.UnloadSamplesFromRam, + ) self.load_model() @@ -181,7 +207,7 @@ def stop(self): worker.join() def process_queries(self): - """Processor of the queued queries. User may choose to add batching logic """ + """Processor of the queued queries. User may choose to add batching logic""" while True: qitem = self.query_queue.get() @@ -213,18 +239,38 @@ def process_queries(self): input_len = [] input_dataset = [] for q in qitem: - input_ids_tensor.append(pad(self.data_object.input_ids[q.index], - (max_seq_len - - self.data_object.input_lens[q.index], 0, 0, 0), - value=self.tokenizer.pad_token_id)) - input_masks_tensor.append(pad(self.data_object.attention_masks[q.index], - (max_seq_len - - self.data_object.input_lens[q.index], 0, 0, 0), - value=0)) + input_ids_tensor.append( + pad( + self.data_object.input_ids[q.index], + ( + max_seq_len - + self.data_object.input_lens[q.index], + 0, + 0, + 0, + ), + value=self.tokenizer.pad_token_id, + ) + ) + input_masks_tensor.append( + pad( + self.data_object.attention_masks[q.index], + ( + max_seq_len - + self.data_object.input_lens[q.index], + 0, + 0, + 0, + ), + value=0, + ) + ) input_len.append(self.data_object.input_lens[q.index]) - # In case we predict code generation, we can specify an additional stop sequence - input_dataset.append(self.data_object.dataset_names[q.index]) + # In case we predict code generation, we can specify an + # additional stop sequence + input_dataset.append( + self.data_object.dataset_names[q.index]) input_ids_tensor = torch.cat(input_ids_tensor) input_masks_tensor = torch.cat(input_masks_tensor) @@ -232,9 +278,16 @@ def process_queries(self): assert input_ids_tensor.shape[0] <= self.batch_size tik2 = time.time() - logits_processor = LogitsProcessorList([StopAfterSequence(self.tokenizer.eos_token_id, device=self.device)]) + logits_processor = LogitsProcessorList( + [StopAfterSequence( + self.tokenizer.eos_token_id, device=self.device)] + ) for i in range(len(input_ids_tensor)): - ids, masks, dataset = input_ids_tensor[i:i+1], input_masks_tensor[i:i+1], input_dataset[i] + ids, masks, dataset = ( + input_ids_tensor[i: i + 1], + input_masks_tensor[i: i + 1], + input_dataset[i], + ) pred_output_tokens = [] if dataset == "MBXP": out = self.model.generate( @@ -242,22 +295,24 @@ def process_queries(self): attention_mask=masks, pad_token_id=self.tokenizer.pad_token_id, logits_processor=logits_processor, - **gen_kwargs + **gen_kwargs, ) else: out = self.model.generate( input_ids=ids, attention_mask=masks, pad_token_id=self.tokenizer.pad_token_id, - **gen_kwargs + **gen_kwargs, ) pred_output_tokens.append(out) pred_output_tokens = torch.cat(pred_output_tokens) tik3 = time.time() - processed_output = self.data_object.postProcess(pred_output_tokens, - input_seq_lens=input_len, - query_id_list=query_ids) + processed_output = self.data_object.postProcess( + pred_output_tokens, + input_seq_lens=input_len, + 
query_id_list=query_ids, + ) for i in range(len(qitem)): n_tokens = processed_output[i].shape[0] @@ -290,7 +345,7 @@ def load_model(self): self.model_path, device_map="auto", low_cpu_mem_usage=True, - torch_dtype=self.amp_dtype + torch_dtype=self.amp_dtype, ) print("Loaded model") @@ -307,7 +362,8 @@ def load_model(self): self.model_path, model_max_length=1024, padding_side="left", - use_fast=False,) + use_fast=False, + ) self.tokenizer.pad_token = self.tokenizer.eos_token print("Loaded tokenizer") @@ -323,14 +379,14 @@ def predict(self, **kwargs): raise NotImplementedError def issue_queries(self, query_samples): - """ Receives samples from loadgen and adds them to queue. Users may choose to batch here""" + """Receives samples from loadgen and adds them to queue. Users may choose to batch here""" list_prompts_tokens = [] list_prompts_attn_masks = [] print(f"IssueQuery started with {len(query_samples)} samples") while len(query_samples) > 0: - self.query_queue.put(query_samples[:self.batch_size]) + self.query_queue.put(query_samples[: self.batch_size]) query_samples = query_samples[self.batch_size:] print(f"IssueQuery done") @@ -342,8 +398,15 @@ def __del__(self): class SUTServer(SUT): - def __init__(self, model_path=None, dtype="bfloat16", device="cpu", - total_sample_count=24576, dataset_path=None, workers=1): + def __init__( + self, + model_path=None, + dtype="bfloat16", + device="cpu", + total_sample_count=24576, + dataset_path=None, + workers=1, + ): super().__init__( model_path=model_path, @@ -351,7 +414,8 @@ def __init__(self, model_path=None, dtype="bfloat16", device="cpu", device=device, total_sample_count=total_sample_count, dataset_path=dataset_path, - workers=workers) + workers=workers, + ) self.first_token_queue = queue.Queue() @@ -379,14 +443,15 @@ def process_first_tokens(self): first_tokens, response_id = first_token_item - response_data = array.array("B", np.array( - first_tokens, np.float32).tobytes()) + response_data = array.array( + "B", np.array(first_tokens, np.float32).tobytes() + ) bi = response_data.buffer_info() response = [lg.QuerySampleResponse(response_id, bi[0], bi[1])] lg.FirstTokenComplete(response) def process_queries(self): - """Processor of the queued queries. User may choose to add batching logic """ + """Processor of the queued queries. 
User may choose to add batching logic""" while True: qitem = self.query_queue.get() @@ -404,34 +469,43 @@ def process_queries(self): self.first_token_queue, tokens_cache=tokens_cache, is_first_token=True, - response_ids=[ - qitem.id]) - - logits_processor = LogitsProcessorList([StopAfterSequence(self.tokenizer.eos_token_id, device=self.device)]) + response_ids=[qitem.id], + ) + + logits_processor = LogitsProcessorList( + [StopAfterSequence( + self.tokenizer.eos_token_id, device=self.device)] + ) if dataset == "MBXP": - _ = self.model.generate(input_ids=input_ids_tensor, - attention_mask=input_masks_tensor, - pad_token_id=self.tokenizer.pad_token_id, - streamer=tokens_streamer, - logits_processor=logits_processor, - **gen_kwargs - ) + _ = self.model.generate( + input_ids=input_ids_tensor, + attention_mask=input_masks_tensor, + pad_token_id=self.tokenizer.pad_token_id, + streamer=tokens_streamer, + logits_processor=logits_processor, + **gen_kwargs, + ) else: - _ = self.model.generate(input_ids=input_ids_tensor, - attention_mask=input_masks_tensor, - pad_token_id=self.tokenizer.pad_token_id, - streamer=tokens_streamer, - **gen_kwargs - ) + _ = self.model.generate( + input_ids=input_ids_tensor, + attention_mask=input_masks_tensor, + pad_token_id=self.tokenizer.pad_token_id, + streamer=tokens_streamer, + **gen_kwargs, + ) output_tokens = tokens_streamer.get_out_tokens() n_tokens = len(output_tokens) response_array = array.array( - "B", np.array( - output_tokens, np.int32).tobytes()) + "B", np.array(output_tokens, np.int32).tobytes() + ) bi = response_array.buffer_info() - response = [lg.QuerySampleResponse( - qitem.id, bi[0], bi[1], n_tokens)] + response = [ + lg.QuerySampleResponse( + qitem.id, + bi[0], + bi[1], + n_tokens)] lg.QuerySamplesComplete(response) def issue_queries(self, query_samples): diff --git a/language/mixtral-8x7b/dataset.py b/language/mixtral-8x7b/dataset.py index d2cafac63..34211cab7 100644 --- a/language/mixtral-8x7b/dataset.py +++ b/language/mixtral-8x7b/dataset.py @@ -9,18 +9,26 @@ from torch.utils.data import DataLoader from typing import Optional, Dict, Sequence import io + # import utils import copy import pickle import logging + logging.basicConfig(level=logging.INFO) log = logging.getLogger("Llama-70B-Dataset") -class Dataset(): - def __init__(self, model_name=None, total_sample_count=15000, - perf_count_override=None, dataset_path=None, device="cpu"): +class Dataset: + def __init__( + self, + model_name=None, + total_sample_count=15000, + perf_count_override=None, + dataset_path=None, + device="cpu", + ): self.model_name = model_name or "mistralai/Mixtral-8x7B-v0.1" self.dataset_path = dataset_path self.max_length = 1024 @@ -35,12 +43,13 @@ def __init__(self, model_name=None, total_sample_count=15000, self.perf_count = perf_count_override or self.total_sample_count def load_tokenizer(self): - """ Returns tokenizer """ + """Returns tokenizer""" self.tokenizer = AutoTokenizer.from_pretrained( self.model_name, model_max_length=1024, padding_side="left", - use_fast=False,) + use_fast=False, + ) self.tokenizer.pad_token = self.tokenizer.eos_token @@ -48,13 +57,16 @@ def load_processed_dataset(self): if not os.path.isfile(self.dataset_path): log.warn( "Processed pickle file {} not found. 
Please check that the path is correct".format( - self.dataset_path)) + self.dataset_path + ) + ) print("Loading dataset...") import pandas as pd + processed_data = pd.read_pickle(self.dataset_path) - input_tokens = processed_data['tok_input'] + input_tokens = processed_data["tok_input"] self.input_ids = [] self.input_lens = [] @@ -69,13 +81,18 @@ def load_processed_dataset(self): self.attention_masks.append(attn_mask) self.input_lens.append(input_ids.shape[-1]) - for dataset in processed_data['dataset']: + for dataset in processed_data["dataset"]: self.dataset_names.append(dataset) print("Finished loading dataset.") - def postProcess(self, out_tokens, input_seq_lens=None, - query_id_list=None, sample_index_list=None): - """ Postprocesses output prediction """ + def postProcess( + self, + out_tokens, + input_seq_lens=None, + query_id_list=None, + sample_index_list=None, + ): + """Postprocesses output prediction""" # TODO: Create response object in postProcess(?) """ @@ -97,9 +114,8 @@ def postProcess(self, out_tokens, input_seq_lens=None, os.makedirs("run_outputs") fname = "q" + "_".join([str(i) for i in query_id_list]) fname = f"run_outputs/{fname}.pkl" - with open(fname, mode='wb') as f: - d = {"query_ids": query_id_list, - "outputs": output_seq} + with open(fname, mode="wb") as f: + d = {"query_ids": query_id_list, "outputs": output_seq} print(f"Saving outputs to {fname}") pickle.dump(d, f) diff --git a/language/mixtral-8x7b/evaluate-accuracy.py b/language/mixtral-8x7b/evaluate-accuracy.py index e20834c41..0d99188e9 100644 --- a/language/mixtral-8x7b/evaluate-accuracy.py +++ b/language/mixtral-8x7b/evaluate-accuracy.py @@ -10,18 +10,31 @@ def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint-path", required=True, - help="Path to Llama2-70b-hf-chat checkpoint") - parser.add_argument("--mlperf-accuracy-file", required=True, - help="path to mlperf_log_accuracy.json") - parser.add_argument("--dataset-file", required=True, - help="path to processed validation dataset") - parser.add_argument("--n_workers", default=2, type=int, - help="Number of workers used for the MBXP evaluation") - parser.add_argument("--verbose", action="store_true", - help="verbose messages") - parser.add_argument("--dtype", default="int64", - help="dtype of the accuracy log", choices=["int32", "int64", "float"]) + parser.add_argument( + "--checkpoint-path", required=True, help="Path to Llama2-70b-hf-chat checkpoint" + ) + parser.add_argument( + "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" + ) + parser.add_argument( + "--dataset-file", required=True, help="path to processed validation dataset" + ) + parser.add_argument( + "--n_workers", + default=2, + type=int, + help="Number of workers used for the MBXP evaluation", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="verbose messages") + parser.add_argument( + "--dtype", + default="int64", + help="dtype of the accuracy log", + choices=["int32", "int64", "float"], + ) args = parser.parse_args() return args @@ -37,14 +50,13 @@ def find_numbers(x: str) -> list[str]: # Search for number, possibly negative (hyphen), with thousand separators # (comma), and with a decimal point (period inbetween digits). 
numbers = re.compile( - r'-?[\d,]*\.?\d+', + r"-?[\d,]*\.?\d+", re.MULTILINE | re.DOTALL | re.IGNORECASE, ).findall(x) return numbers -def find_number(x: str, - answer_delimiter: str = 'The answer is') -> str: +def find_number(x: str, answer_delimiter: str = "The answer is") -> str: """Finds the most relevant number in a string.""" # If model uses the answer delimiter, then select the first number following # that format. @@ -58,12 +70,12 @@ def find_number(x: str, numbers = find_numbers(x) if numbers: return numbers[-1] - return '' + return "" def maybe_remove_comma(x: str) -> str: # Example: 5,600 -> 5600 - return x.replace(',', '') + return x.replace(",", "") def try_float(x: str): @@ -73,6 +85,7 @@ def try_float(x: str): ret = None return ret + # Functions for evaluating OpenOrca @@ -86,6 +99,7 @@ def postprocess_text(preds, targets): return preds, targets + # Functions for MBXP @@ -96,7 +110,7 @@ def create_mbxp_dict(row, response): "prompt": row["input"], "test_code": row["gt_output"], "entry_point": entry_point, - "response": response + "response": response, } @@ -106,13 +120,14 @@ def main(): dataset_path = args.dataset_file checkpoint_path = args.checkpoint_path metric = evaluate.load("rouge") - nltk.download('punkt') + nltk.download("punkt") tokenizer = AutoTokenizer.from_pretrained( checkpoint_path, model_max_length=2048, padding_side="left", - use_fast=False,) + use_fast=False, + ) data = get_groundtruth(args.dataset_file) query_types, gt_outputs = data["dataset"], data["gt_output"] @@ -138,7 +153,7 @@ def main(): gen_num = 0 for pred in results: gen_num += 1 - qsl_idx = pred['qsl_idx'] + qsl_idx = pred["qsl_idx"] if qsl_idx in seen: continue @@ -148,20 +163,20 @@ def main(): if query_type == "GSM8K": target = gt_outputs.iloc[qsl_idx] target_required_GSM8K.append(target) - pred = np.frombuffer(bytes.fromhex(pred['data']), eval_dtype) + pred = np.frombuffer(bytes.fromhex(pred["data"]), eval_dtype) gen_tok_len += len(pred) preds_token_GSM8K.append(pred) elif query_type == "OpenOrca": target = gt_outputs.iloc[qsl_idx] target_required_OpenOrca.append(target) - pred = np.frombuffer(bytes.fromhex(pred['data']), eval_dtype) + pred = np.frombuffer(bytes.fromhex(pred["data"]), eval_dtype) gen_tok_len += len(pred) preds_token_OpenOrca.append(pred) else: target = data.iloc[qsl_idx] - pred = np.frombuffer(bytes.fromhex(pred['data']), eval_dtype) + pred = np.frombuffer(bytes.fromhex(pred["data"]), eval_dtype) pred_str = tokenizer.decode(pred, skip_special_tokens=True) results_MBXP.append(create_mbxp_dict(target, pred_str)) @@ -169,21 +184,24 @@ def main(): # OpenOrca metric preds_decoded_text = tokenizer.batch_decode( - preds_token_OpenOrca, skip_special_tokens=True) + preds_token_OpenOrca, skip_special_tokens=True + ) preds, targets = postprocess_text( preds_decoded_text, target_required_OpenOrca) result = metric.compute( - predictions=preds, references=targets, use_stemmer=True, use_aggregator=False) + predictions=preds, references=targets, use_stemmer=True, use_aggregator=False + ) result = {k: round(np.mean(v) * 100, 4) for k, v in result.items()} prediction_lens = [len(pred) for pred in preds] # GSM8K metric preds_decoded_text = tokenizer.batch_decode( - preds_token_GSM8K, skip_special_tokens=True) + preds_token_GSM8K, skip_special_tokens=True + ) pred_nums = [ - maybe_remove_comma( - find_number( - pred_text.split("\nQ:")[0])) for pred_text in preds_decoded_text] + maybe_remove_comma(find_number(pred_text.split("\nQ:")[0])) + for pred_text in preds_decoded_text + ] gsm8k_total = 
len(target_required_GSM8K) correct = 0 for idx in range(len(target_required_GSM8K)): @@ -191,22 +209,23 @@ def main(): tgt = try_float(pred_nums[idx]) if tgt is None: continue - correct += (ref == tgt) + correct += ref == tgt gsm8k_accuracy = 100.0 * correct / gsm8k_total # MBXP metric from evaluate_mbxp import evaluate_mbxp + mbxp_accuracy = evaluate_mbxp(results_MBXP, args.n_workers) result = { **result, - 'gen_len': np.sum(prediction_lens), - 'gen_num': gen_num, - 'gen_tok_len': gen_tok_len, - 'tokens_per_sample': round(gen_tok_len / gen_num, 1), - 'gsm8k_accuracy': gsm8k_accuracy, - 'mbxp_accuracy': mbxp_accuracy + "gen_len": np.sum(prediction_lens), + "gen_num": gen_num, + "gen_tok_len": gen_tok_len, + "tokens_per_sample": round(gen_tok_len / gen_num, 1), + "gsm8k_accuracy": gsm8k_accuracy, + "mbxp_accuracy": mbxp_accuracy, } print("\nResults\n") diff --git a/language/mixtral-8x7b/evaluate_mbxp.py b/language/mixtral-8x7b/evaluate_mbxp.py index e7d55d169..a230587fb 100644 --- a/language/mixtral-8x7b/evaluate_mbxp.py +++ b/language/mixtral-8x7b/evaluate_mbxp.py @@ -66,7 +66,7 @@ def worker(inp_queue, out_queue): solution = problem["response"] try: - solution = solution[:solution.index("```")] + solution = solution[: solution.index("```")] except ValueError: # Happens when a code block isn't closed properly pass @@ -89,11 +89,14 @@ def worker(inp_queue, out_queue): try: result = checker(problem, solution, timeout=20.0) out_queue.put( - (key, - problem["lang"], + ( + key, + problem["lang"], result["passed"], result["result"], - problem["response"])) + problem["response"], + ) + ) except Exception as e: print(e) out_queue.put( diff --git a/language/mixtral-8x7b/main.py b/language/mixtral-8x7b/main.py index 396948ba0..56447c778 100644 --- a/language/mixtral-8x7b/main.py +++ b/language/mixtral-8x7b/main.py @@ -17,21 +17,22 @@ def get_args(): parser.add_argument( "--scenario", type=str, - choices=[ - "Offline", - "Server"], + choices=["Offline", "Server"], default="Offline", - help="Scenario") + help="Scenario", + ) parser.add_argument( "--model-path", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1", - help="Model name") + help="Model name", + ) parser.add_argument( "--dataset-path", type=str, default=None, - help="path to processed validation dataset") + help="path to processed validation dataset", + ) parser.add_argument( "--accuracy", action="store_true", @@ -40,56 +41,58 @@ def get_args(): "--dtype", type=str, default="float32", - help="data type of the model, choose from float16, bfloat16 and float32") + help="data type of the model, choose from float16, bfloat16 and float32", + ) parser.add_argument( "--device", type=str, - choices=[ - "cpu", - "cuda:0"], + choices=["cpu", "cuda:0"], default="cpu", - help="device to use") + help="device to use", + ) parser.add_argument( "--audit-conf", type=str, default="audit.conf", - help="audit config for LoadGen settings during compliance runs") + help="audit config for LoadGen settings during compliance runs", + ) parser.add_argument( - "--mlperf-conf", - type=str, - default="mlperf.conf", - help="mlperf rules config") + "--mlperf-conf", type=str, default="mlperf.conf", help="mlperf rules config" + ) parser.add_argument( "--user-conf", type=str, default="user.conf", - help="user config for user LoadGen settings such as target QPS") + help="user config for user LoadGen settings such as target QPS", + ) # TODO: This interpretation of 'total-sample-count' is a little # misleading. 
Fix it parser.add_argument( "--total-sample-count", type=int, default=24576, - help="Number of samples to use in benchmark.") + help="Number of samples to use in benchmark.", + ) parser.add_argument( "--batch-size", type=int, default=1, - help="Model batch-size to use in benchmark.") + help="Model batch-size to use in benchmark.", + ) parser.add_argument( - "--output-log-dir", - type=str, - default="output-logs", - help="Where logs are saved") + "--output-log-dir", type=str, default="output-logs", help="Where logs are saved" + ) parser.add_argument( "--enable-log-trace", action="store_true", - help="Enable log tracing. This file can become quite large") + help="Enable log tracing. This file can become quite large", + ) parser.add_argument( "--num-workers", type=int, default=1, - help="Number of workers to process queries") + help="Number of workers to process queries", + ) args = parser.parse_args() return args @@ -100,10 +103,7 @@ def get_args(): "server": lg.TestScenario.Server, } -sut_map = { - "offline": SUT, - "server": SUTServer -} +sut_map = {"offline": SUT, "server": SUTServer} def main(): @@ -118,7 +118,8 @@ def main(): if args.accuracy: settings.mode = lg.TestMode.AccuracyOnly log.warning( - "Accuracy run will generate the accuracy logs, but the evaluation of the log is not completed yet") + "Accuracy run will generate the accuracy logs, but the evaluation of the log is not completed yet" + ) else: settings.mode = lg.TestMode.PerformanceOnly diff --git a/loadgen/bindings/c_api.cc b/loadgen/bindings/c_api.cc index 4101c7b96..0248a1c16 100644 --- a/loadgen/bindings/c_api.cc +++ b/loadgen/bindings/c_api.cc @@ -155,15 +155,14 @@ void QuerySamplesCompleteResponseCb(QuerySampleResponse* responses, }); } -void FirstTokenComplete(QuerySampleResponse* responses, - size_t response_count) { +void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count) { mlperf::FirstTokenComplete(responses, response_count); } void FirstTokenCompleteResponseCb(QuerySampleResponse* responses, - size_t response_count, - ResponseCallback response_cb, - ClientData client_data) { + size_t response_count, + ResponseCallback response_cb, + ClientData client_data) { mlperf::FirstTokenComplete( responses, response_count, [client_data, response_cb](QuerySampleResponse* response) { diff --git a/loadgen/bindings/c_api.h b/loadgen/bindings/c_api.h index 8c730e96a..0ee44fb71 100644 --- a/loadgen/bindings/c_api.h +++ b/loadgen/bindings/c_api.h @@ -49,13 +49,12 @@ void QuerySamplesCompleteResponseCb(QuerySampleResponse* responses, ResponseCallback response_cb, ClientData client_data); -void FirstTokenComplete(QuerySampleResponse* responses, - size_t response_count); - +void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count); + void FirstTokenCompleteResponseCb(QuerySampleResponse* responses, - size_t response_count, - ResponseCallback response_cb, - ClientData client_data); + size_t response_count, + ResponseCallback response_cb, + ClientData client_data); /// \brief Create an opaque SUT pointer based on C callbacks. void* ConstructSUT(ClientData client_data, const char* name, size_t name_length, diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc index 83e14e0ec..f14e5b319 100644 --- a/loadgen/bindings/python_api.cc +++ b/loadgen/bindings/python_api.cc @@ -19,9 +19,9 @@ limitations under the License. 
#include #include "../loadgen.h" +#include "../query_dispatch_library.h" #include "../query_sample.h" #include "../query_sample_library.h" -#include "../query_dispatch_library.h" #include "../system_under_test.h" #include "../test_settings.h" #include "pybind11/functional.h" @@ -133,34 +133,36 @@ class QuerySampleLibraryTrampoline : public QuerySampleLibrary { // A QDL that allows defining callbacks for // IssueQuery, FlushQueries, and Name methods. class QueryDispatchLibraryTrampoline : public QueryDispatchLibrary { - public: - QueryDispatchLibraryTrampoline(IssueQueryCallback issue_query_callback, + public: + QueryDispatchLibraryTrampoline(IssueQueryCallback issue_query_callback, FlushQueriesCallback flush_queries_callback, NameCallback name_callback) - : issue_query_callback_(issue_query_callback), - flush_queries_callback_(flush_queries_callback), - name_callback_(name_callback) {} - - // Returns the name of the SUT. Name shall be returned over the network - // TODO: other bindings should also be fixed eventually to be used over the network - const std::string& Name() override { - static std::string name; // HACK: avoid returning a reference to temporary. - pybind11::gil_scoped_acquire gil_acquirer; - name = name_callback_(); // name_callback_() shall returned name over the network. - return name; - } + : issue_query_callback_(issue_query_callback), + flush_queries_callback_(flush_queries_callback), + name_callback_(name_callback) {} + + // Returns the name of the SUT. Name shall be returned over the network + // TODO: other bindings should also be fixed eventually to be used over the + // network + const std::string& Name() override { + static std::string name; // HACK: avoid returning a reference to temporary. + pybind11::gil_scoped_acquire gil_acquirer; + name = name_callback_(); // name_callback_() shall return the name over the + // network. 
+ return name; + } - void IssueQuery(const std::vector& samples) override { - pybind11::gil_scoped_acquire gil_acquirer; - issue_query_callback_(samples); - } + void IssueQuery(const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + issue_query_callback_(samples); + } - void FlushQueries() override { flush_queries_callback_(); } + void FlushQueries() override { flush_queries_callback_(); } - protected: - IssueQueryCallback issue_query_callback_; - FlushQueriesCallback flush_queries_callback_; - NameCallback name_callback_; + protected: + IssueQueryCallback issue_query_callback_; + FlushQueriesCallback flush_queries_callback_; + NameCallback name_callback_; }; } // namespace @@ -213,8 +215,8 @@ void DestroyQSL(uintptr_t qsl) { uintptr_t ConstructQDL(IssueQueryCallback issue_cb, FlushQueriesCallback flush_queries_cb, NameCallback name_callback) { - QueryDispatchLibraryTrampoline* qdl = - new QueryDispatchLibraryTrampoline(issue_cb, flush_queries_cb, name_callback); + QueryDispatchLibraryTrampoline* qdl = new QueryDispatchLibraryTrampoline( + issue_cb, flush_queries_cb, name_callback); return reinterpret_cast(qdl); } @@ -223,7 +225,7 @@ void DestroyQDL(uintptr_t qdl) { reinterpret_cast(qdl); delete qdl_cast; } - + void StartTest(uintptr_t sut, uintptr_t qsl, mlperf::TestSettings test_settings, const std::string& audit_config_filename) { pybind11::gil_scoped_release gil_releaser; @@ -259,7 +261,7 @@ void QuerySamplesComplete(std::vector responses, } void FirstTokenComplete(std::vector responses, - ResponseCallback response_cb = {}) { + ResponseCallback response_cb = {}) { pybind11::gil_scoped_release gil_releaser; mlperf::FirstTokenComplete(responses.data(), responses.size(), response_cb); } @@ -329,17 +331,19 @@ PYBIND11_MODULE(mlperf_loadgen, m) { &TestSettings::performance_issue_same_index) .def_readwrite("performance_sample_count_override", &TestSettings::performance_sample_count_override) - .def_readwrite("test05", - &TestSettings::test05) + .def_readwrite("test05", &TestSettings::test05) .def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed) .def_readwrite("test05_sample_index_rng_seed", &TestSettings::test05_sample_index_rng_seed) - .def_readwrite("test05_schedule_rng_seed", &TestSettings::test05_schedule_rng_seed) + .def_readwrite("test05_schedule_rng_seed", + &TestSettings::test05_schedule_rng_seed) .def_readwrite("use_token_latencies", &TestSettings::use_token_latencies) .def_readwrite("ttft_latency", &TestSettings::server_ttft_latency) .def_readwrite("tpot_latency", &TestSettings::server_tpot_latency) - .def_readwrite("infer_token_latencies", &TestSettings::infer_token_latencies) - .def_readwrite("token_latency_scaling_factor", &TestSettings::token_latency_scaling_factor) + .def_readwrite("infer_token_latencies", + &TestSettings::infer_token_latencies) + .def_readwrite("token_latency_scaling_factor", + &TestSettings::token_latency_scaling_factor) .def("FromConfig", &TestSettings::FromConfig, "FromConfig."); pybind11::enum_(m, "LoggingMode") @@ -408,10 +412,9 @@ PYBIND11_MODULE(mlperf_loadgen, m) { q.id = t[0].cast(); q.data = t[1].cast(); q.size = t[2].cast(); - if (t.size() == 4){ + if (t.size() == 4) { q.n_tokens = t[3].cast(); - } - else{ + } else { q.n_tokens = 0; } return q; @@ -436,10 +439,11 @@ PYBIND11_MODULE(mlperf_loadgen, m) { m.def("DestroyQSL", &py::DestroyQSL, "Destroy the object created by ConstructQSL."); - m.def("ConstructQDL", &py::ConstructQDL, - "Construct the query sample library, communicating with the SUT over 
the network."); + m.def("ConstructQDL", &py::ConstructQDL, + "Construct the query sample library, communicating with the SUT over " + "the network."); m.def("DestroyQDL", &py::DestroyQDL, - "Destroy the object created by ConstructQDL."); + "Destroy the object created by ConstructQDL."); m.def("StartTest", &py::StartTest, "Run tests on a SUT created by ConstructSUT() with the provided QSL. " diff --git a/loadgen/demos/lon/py_demo_server_lon.py b/loadgen/demos/lon/py_demo_server_lon.py index bee0b2f8b..1248215db 100644 --- a/loadgen/demos/lon/py_demo_server_lon.py +++ b/loadgen/demos/lon/py_demo_server_lon.py @@ -32,8 +32,9 @@ FLAGS = flags.FLAGS -flags.DEFINE_list('sut_server', 'http://localhost:8000', - 'Address of the server(s) under test.') +flags.DEFINE_list( + "sut_server", "http://localhost:8000", "Address of the server(s) under test." +) class QSL: @@ -41,9 +42,14 @@ class QSL: def __init__(self, total_sample_count, performance_sample_count): self.eval_features = { - i: f"what_is_my_dummy_feature_{i}?" for i in range(total_sample_count)} + i: f"what_is_my_dummy_feature_{i}?" for i in range(total_sample_count) + } self.qsl = mlperf_loadgen.ConstructQSL( - total_sample_count, performance_sample_count, self.load_samples_to_ram, self.unload_samples_from_ram) + total_sample_count, + performance_sample_count, + self.load_samples_to_ram, + self.unload_samples_from_ram, + ) def get_features(self, sample_id): """Returns the feature for a given sample id.""" @@ -81,10 +87,11 @@ def __init__(self, qsl: QSL, sut_server_addr: list): sut_server_addr: A list of addresses of the SUT. """ self.qsl = qsl - + # Construct QDL from the python binding self.qdl = mlperf_loadgen.ConstructQDL( - self.issue_query, self.flush_queries, self.client_get_name) + self.issue_query, self.flush_queries, self.client_get_name + ) self.sut_server_addr = sut_server_addr self.num_nodes = len(sut_server_addr) @@ -94,8 +101,9 @@ def __init__(self, qsl: QSL, sut_server_addr: list): def issue_query(self, query_samples): """Process the query to send to the SUT""" - threading.Thread(target=self.process_query_async, - args=[query_samples]).start() + threading.Thread( + target=self.process_query_async, + args=[query_samples]).start() def flush_queries(self): """Flush the queries. 
Dummy implementation.""" @@ -120,15 +128,16 @@ def process_query_async(self, query_samples): # Read features from the QSL features = self.qsl.get_features(s.index) - time.sleep(.001) # Ensure a maximal rate of queries to the SUT + time.sleep(0.001) # Ensure a maximal rate of queries to the SUT # Send the query to SUT in round robin # Wait for a response sut_result = self.client_predict(features, s.index) - response_array = array.array('B', sut_result.encode('utf-8')) + response_array = array.array("B", sut_result.encode("utf-8")) bi = response_array.buffer_info() - responses.append(mlperf_loadgen.QuerySampleResponse( - s.id, bi[0], bi[1])) + responses.append( + mlperf_loadgen.QuerySampleResponse( + s.id, bi[0], bi[1])) mlperf_loadgen.QuerySamplesComplete(responses) def get_sut_id_round_robin(self): @@ -140,17 +149,22 @@ def get_sut_id_round_robin(self): def client_predict(self, query, id): """Serialize the query, send it to the SUT in round robin, and return the deserialized response.""" - url = '{}/predict/'.format(self.sut_server_addr[self.get_sut_id_round_robin()]) - response = requests.post(url, json={'query': query, id: id}) - return response.json()['result'] + url = "{}/predict/".format( + self.sut_server_addr[self.get_sut_id_round_robin()]) + response = requests.post(url, json={"query": query, id: id}) + return response.json()["result"] def client_get_name(self): """Get the name of the SUT from ALL the SUTS.""" if len(self.sut_server_addr) == 1: - return requests.post(f'{self.sut_server_addr[0]}/getname/').json()['name'] - - sut_names = [requests.post(f'{addr}/getname/').json()['name'] for addr in self.sut_server_addr] - return "Multi-node SUT: " + ', '.join(sut_names) + return requests.post( + f"{self.sut_server_addr[0]}/getname/").json()["name"] + + sut_names = [ + requests.post(f"{addr}/getname/").json()["name"] + for addr in self.sut_server_addr + ] + return "Multi-node SUT: " + ", ".join(sut_names) def __del__(self): mlperf_loadgen.DestroyQDL(self.qdl) diff --git a/loadgen/demos/lon/sut_over_network_demo.py b/loadgen/demos/lon/sut_over_network_demo.py index 9922fc765..55e5e038d 100644 --- a/loadgen/demos/lon/sut_over_network_demo.py +++ b/loadgen/demos/lon/sut_over_network_demo.py @@ -24,7 +24,7 @@ - /predict/ : Receives a query (e.g., a text) runs inference, and returns a prediction. - /getname/ : Get the name of the SUT. -The current implementation is a dummy implementation, which does not use +The current implementation is a dummy implementation, which does not use a real DNN model, batching, or pre/postprocessing code, but rather just returns subset of the input query as a response, Yet, it illustrates the basic structure of a SUT server. @@ -38,45 +38,51 @@ node = "" + + def preprocess(query): """[SUT Node] A dummy preprocess.""" - # Here may come for example batching, tokenization, resizing, normalization, etc. + # Here may come for example batching, tokenization, resizing, + # normalization, etc. response = query return response def dnn_model(query): """[SUT Node] A dummy DNN model.""" - # Here may come for example a call to a dnn model such as resnet, bert, etc. + # Here may come for example a call to a dnn model such as resnet, bert, + # etc. response = query return response def postprocess(query): """[SUT Node] A dummy postprocess.""" - # Here may come for example a postprocessing call, e.g., NMS, detokenization, etc. + # Here may come for example a postprocessing call, e.g., NMS, + # detokenization, etc. 
response = query return response -@app.route('/predict/', methods=['POST']) +@app.route("/predict/", methods=["POST"]) def predict(): """Receives a query (e.g., a text) runs inference, and returns a prediction.""" - query = request.get_json(force=True)['query'] + query = request.get_json(force=True)["query"] result = postprocess(dnn_model(preprocess(query))) return jsonify(result=result) -@app.route('/getname/', methods=['POST', 'GET']) +@app.route("/getname/", methods=["POST", "GET"]) def getname(): """Returns the name of the SUT.""" - return jsonify(name=f'Demo SUT (Network SUT) node' + (' ' + node) if node else '') + return jsonify(name=f"Demo SUT (Network SUT) node" + + (" " + node) if node else "") -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--port', type=int, default=8000) - parser.add_argument('--node', type=str, default="") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--node", type=str, default="") args = parser.parse_args() node = args.node app.run(debug=False, port=args.port) diff --git a/loadgen/demos/py_demo_multi_stream.py b/loadgen/demos/py_demo_multi_stream.py index cd0addd4a..f6082cad6 100644 --- a/loadgen/demos/py_demo_multi_stream.py +++ b/loadgen/demos/py_demo_multi_stream.py @@ -28,7 +28,8 @@ # Global var NUM_AGENTS = 8 -LOOPBACK_LATENCY_S = .001 +LOOPBACK_LATENCY_S = 0.001 + def load_samples_to_ram(query_samples): del query_samples @@ -44,7 +45,8 @@ def unload_samples_from_ram(query_samples): def process_query_async(query_samples, i_slice): time.sleep(LOOPBACK_LATENCY_S * (i_slice + 1)) responses = [] - samples_to_complete = query_samples[i_slice:len(query_samples):NUM_AGENTS] + samples_to_complete = query_samples[i_slice: len( + query_samples): NUM_AGENTS] for j, s in enumerate(samples_to_complete): responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)) mlperf_loadgen.QuerySamplesComplete(responses) @@ -52,8 +54,9 @@ def process_query_async(query_samples, i_slice): def issue_query(query_samples): for i in range(8): - threading.Thread(target=process_query_async, - args=(query_samples, i)).start() + threading.Thread( + target=process_query_async, args=( + query_samples, i)).start() def flush_queries(): @@ -72,7 +75,8 @@ def main(argv): sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) qsl = mlperf_loadgen.ConstructQSL( - 1024, 128, load_samples_to_ram, unload_samples_from_ram) + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) mlperf_loadgen.StartTest(sut, qsl, settings) mlperf_loadgen.DestroyQSL(qsl) mlperf_loadgen.DestroySUT(sut) diff --git a/loadgen/demos/py_demo_offline.py b/loadgen/demos/py_demo_offline.py index 46c606219..909585edc 100644 --- a/loadgen/demos/py_demo_offline.py +++ b/loadgen/demos/py_demo_offline.py @@ -39,19 +39,22 @@ def unload_samples_from_ram(query_samples): def process_query_async(query_samples, i_slice): time.sleep(3 * (i_slice + 1)) responses = [] - samples_to_complete = query_samples[i_slice:len(query_samples):3] + samples_to_complete = query_samples[i_slice: len(query_samples): 3] for s in samples_to_complete: responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)) mlperf_loadgen.QuerySamplesComplete(responses) def issue_query(query_samples): - threading.Thread(target=process_query_async, - args=(query_samples, 0)).start() - threading.Thread(target=process_query_async, - args=(query_samples, 1)).start() - threading.Thread(target=process_query_async, - args=(query_samples, 2)).start() + 
threading.Thread( + target=process_query_async, args=( + query_samples, 0)).start() + threading.Thread( + target=process_query_async, args=( + query_samples, 1)).start() + threading.Thread( + target=process_query_async, args=( + query_samples, 2)).start() def flush_queries(): @@ -67,7 +70,8 @@ def main(argv): sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) qsl = mlperf_loadgen.ConstructQSL( - 1024, 128, load_samples_to_ram, unload_samples_from_ram) + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) mlperf_loadgen.StartTest(sut, qsl, settings) mlperf_loadgen.DestroyQSL(qsl) mlperf_loadgen.DestroySUT(sut) diff --git a/loadgen/demos/py_demo_server.py b/loadgen/demos/py_demo_server.py index 3806a1fd0..8b6f2b826 100644 --- a/loadgen/demos/py_demo_server.py +++ b/loadgen/demos/py_demo_server.py @@ -36,7 +36,7 @@ def unload_samples_from_ram(query_samples): def process_query_async(query_samples): - time.sleep(.001) + time.sleep(0.001) responses = [] for s in query_samples: responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)) @@ -44,8 +44,7 @@ def process_query_async(query_samples): def issue_query(query_samples): - threading.Thread(target=process_query_async, - args=[query_samples]).start() + threading.Thread(target=process_query_async, args=[query_samples]).start() def flush_queries(): @@ -64,7 +63,8 @@ def main(argv): sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) qsl = mlperf_loadgen.ConstructQSL( - 1024, 128, load_samples_to_ram, unload_samples_from_ram) + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) mlperf_loadgen.StartTest(sut, qsl, settings) mlperf_loadgen.DestroyQSL(qsl) mlperf_loadgen.DestroySUT(sut) diff --git a/loadgen/demos/py_demo_single_stream.py b/loadgen/demos/py_demo_single_stream.py index a8ab45eb2..8806271bd 100644 --- a/loadgen/demos/py_demo_single_stream.py +++ b/loadgen/demos/py_demo_single_stream.py @@ -38,23 +38,24 @@ def unload_samples_from_ram(query_samples): def process_query_async(query_samples): """Processes the list of queries.""" - time.sleep(.001) + time.sleep(0.001) responses = [] response_array = array.array( - "f", [0, 1, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 254, 255]) + "f", [0, 1, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 254, 255] + ) response_info = response_array.buffer_info() response_data = response_info[0] response_size = response_info[1] * response_array.itemsize for s in query_samples: responses.append( mlperf_loadgen.QuerySampleResponse( - s.id, response_data, response_size)) + s.id, response_data, response_size) + ) mlperf_loadgen.QuerySamplesComplete(responses) def issue_query(query_samples): - threading.Thread(target=process_query_async, - args=[query_samples]).start() + threading.Thread(target=process_query_async, args=[query_samples]).start() def flush_queries(): @@ -72,7 +73,8 @@ def main(argv): sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) qsl = mlperf_loadgen.ConstructQSL( - 1024, 128, load_samples_to_ram, unload_samples_from_ram) + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) mlperf_loadgen.StartTest(sut, qsl, settings) mlperf_loadgen.DestroyQSL(qsl) mlperf_loadgen.DestroySUT(sut) diff --git a/loadgen/demos/token_metrics/py_demo_multi_stream.py b/loadgen/demos/token_metrics/py_demo_multi_stream.py index a6350e53d..e4b083853 100644 --- a/loadgen/demos/token_metrics/py_demo_multi_stream.py +++ b/loadgen/demos/token_metrics/py_demo_multi_stream.py @@ -30,18 +30,23 @@ # Global var NUM_AGENTS = 8 -LOOPBACK_LATENCY_S = .001 +LOOPBACK_LATENCY_S = 0.001 
+ def f(x, y): - return (4 + 3*x*y + x**3 + y**2) + return 4 + 3 * x * y + x**3 + y**2 + -def create_responses(n, m, mod = 4): +def create_responses(n, m, mod=4): r = [] for i in range(n): - r.append([f(i,j) for j in range(m + (i%mod))]) + r.append([f(i, j) for j in range(m + (i % mod))]) return r + + responses = create_responses(1024, 20) + def load_samples_to_ram(query_samples): del query_samples return @@ -56,17 +61,24 @@ def unload_samples_from_ram(query_samples): def process_query_async(query_samples, i_slice): time.sleep(LOOPBACK_LATENCY_S * (i_slice + 1)) query_responses = [] - samples_to_complete = query_samples[i_slice:len(query_samples):NUM_AGENTS] + samples_to_complete = query_samples[i_slice: len( + query_samples): NUM_AGENTS] for j, s in enumerate(samples_to_complete): response_array = np.array(responses[s.index], np.int32) token = response_array[0] - time.sleep(.0002) + time.sleep(0.0002) response_token = array.array("B", token.tobytes()) response_token_info = response_token.buffer_info() response_token_data = response_token_info[0] response_token_size = response_token_info[1] * response_token.itemsize - mlperf_loadgen.FirstTokenComplete([mlperf_loadgen.QuerySampleResponse(s.id, response_token_data, response_token_size)]) - time.sleep(.02) + mlperf_loadgen.FirstTokenComplete( + [ + mlperf_loadgen.QuerySampleResponse( + s.id, response_token_data, response_token_size + ) + ] + ) + time.sleep(0.02) n_tokens = len(response_array) response_array = array.array("B", response_array.tobytes()) response_info = response_array.buffer_info() @@ -74,14 +86,17 @@ def process_query_async(query_samples, i_slice): response_size = response_info[1] * response_array.itemsize query_responses.append( mlperf_loadgen.QuerySampleResponse( - s.id, response_data, response_size, n_tokens)) + s.id, response_data, response_size, n_tokens + ) + ) mlperf_loadgen.QuerySamplesComplete(query_responses) def issue_query(query_samples): for i in range(8): - threading.Thread(target=process_query_async, - args=(query_samples, i)).start() + threading.Thread( + target=process_query_async, args=( + query_samples, i)).start() def flush_queries(): @@ -90,7 +105,9 @@ def flush_queries(): def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--mode", choices=["performance", "accuracy"], default="performance") + parser.add_argument( + "--mode", choices=["performance", "accuracy"], default="performance" + ) parser.add_argument("--expected-latency", type=int, default=8000000) parser.add_argument("--samples-per-query", type=int, default=8) parser.add_argument("--min-query-count", type=int, default=100) @@ -114,7 +131,8 @@ def main(): sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) qsl = mlperf_loadgen.ConstructQSL( - 1024, 128, load_samples_to_ram, unload_samples_from_ram) + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) mlperf_loadgen.StartTest(sut, qsl, settings) mlperf_loadgen.DestroyQSL(qsl) mlperf_loadgen.DestroySUT(sut) diff --git a/loadgen/demos/token_metrics/py_demo_offline.py b/loadgen/demos/token_metrics/py_demo_offline.py index d0bbd2c49..2e190cdd5 100644 --- a/loadgen/demos/token_metrics/py_demo_offline.py +++ b/loadgen/demos/token_metrics/py_demo_offline.py @@ -28,15 +28,19 @@ def f(x, y): - return (4 + 3*x*y + x**3 + y**2) + return 4 + 3 * x * y + x**3 + y**2 -def create_responses(n, m, mod = 4): + +def create_responses(n, m, mod=4): r = [] for i in range(n): - r.append([f(i,j) for j in range(m + (i%mod))]) + r.append([f(i, j) for j in range(m + (i % mod))]) return r 
+ + responses = create_responses(1024, 20) + def load_samples_to_ram(query_samples): del query_samples return @@ -51,17 +55,17 @@ def unload_samples_from_ram(query_samples): def process_query_async(query_samples, i_slice): time.sleep(3 * (i_slice + 1)) query_responses = [] - samples_to_complete = query_samples[i_slice:len(query_samples):3] + samples_to_complete = query_samples[i_slice: len(query_samples): 3] for s in samples_to_complete: response_array = np.array(responses[s.index], np.int32) token = response_array[0] - time.sleep(.0002) + time.sleep(0.0002) response_token = array.array("B", token.tobytes()) response_token_info = response_token.buffer_info() response_token_data = response_token_info[0] response_token_size = response_token_info[1] * response_token.itemsize # mlperf_loadgen.FirstTokenComplete([mlperf_loadgen.QuerySampleResponse(s.id, response_token_data, response_token_size)]) - time.sleep(.02) + time.sleep(0.02) n_tokens = len(response_array) response_array = array.array("B", response_array.tobytes()) response_info = response_array.buffer_info() @@ -69,17 +73,22 @@ def process_query_async(query_samples, i_slice): response_size = response_info[1] * response_array.itemsize query_responses.append( mlperf_loadgen.QuerySampleResponse( - s.id, response_data, response_size, n_tokens)) + s.id, response_data, response_size, n_tokens + ) + ) mlperf_loadgen.QuerySamplesComplete(query_responses) def issue_query(query_samples): - threading.Thread(target=process_query_async, - args=(query_samples, 0)).start() - threading.Thread(target=process_query_async, - args=(query_samples, 1)).start() - threading.Thread(target=process_query_async, - args=(query_samples, 2)).start() + threading.Thread( + target=process_query_async, args=( + query_samples, 0)).start() + threading.Thread( + target=process_query_async, args=( + query_samples, 1)).start() + threading.Thread( + target=process_query_async, args=( + query_samples, 2)).start() def flush_queries(): @@ -88,7 +97,9 @@ def flush_queries(): def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--mode", choices=["performance", "accuracy"], default="performance") + parser.add_argument( + "--mode", choices=["performance", "accuracy"], default="performance" + ) parser.add_argument("--expected-qps", type=int, default=1000) parser.add_argument("--min-duration-ms", type=int, default=30000) return parser.parse_args() @@ -108,7 +119,8 @@ def main(): sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) qsl = mlperf_loadgen.ConstructQSL( - 1024, 128, load_samples_to_ram, unload_samples_from_ram) + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) mlperf_loadgen.StartTest(sut, qsl, settings) mlperf_loadgen.DestroyQSL(qsl) mlperf_loadgen.DestroySUT(sut) diff --git a/loadgen/demos/token_metrics/py_demo_offline_inferred.py b/loadgen/demos/token_metrics/py_demo_offline_inferred.py index 79390e1b8..9325b8410 100644 --- a/loadgen/demos/token_metrics/py_demo_offline_inferred.py +++ b/loadgen/demos/token_metrics/py_demo_offline_inferred.py @@ -28,14 +28,18 @@ def f(x, y): - return (4 + 3*x*y + x**3 + y**2) + return 4 + 3 * x * y + x**3 + y**2 -def create_responses(n, m, mod = 4): + +def create_responses(n, m, mod=4): r = [] for i in range(n): - r.append([f(i,j) for j in range(m + (i%mod))]) + r.append([f(i, j) for j in range(m + (i % mod))]) return r -responses = create_responses(1024, 20, mod = 3) + + +responses = create_responses(1024, 20, mod=3) + def load_samples_to_ram(query_samples): del query_samples @@ -51,17 +55,17 @@ 
def unload_samples_from_ram(query_samples): def process_query_async(query_samples, i_slice): time.sleep(3 * (i_slice + 1)) query_responses = [] - samples_to_complete = query_samples[i_slice:len(query_samples):3] + samples_to_complete = query_samples[i_slice: len(query_samples): 3] for s in samples_to_complete: response_array = np.array(responses[s.index], np.int32) token = response_array[0] - time.sleep(.0002) + time.sleep(0.0002) response_token = array.array("B", token.tobytes()) response_token_info = response_token.buffer_info() response_token_data = response_token_info[0] response_token_size = response_token_info[1] * response_token.itemsize # mlperf_loadgen.FirstTokenComplete([mlperf_loadgen.QuerySampleResponse(s.id, response_token_data, response_token_size)]) - time.sleep(.02) + time.sleep(0.02) n_tokens = len(response_array) response_array = array.array("B", response_array.tobytes()) response_info = response_array.buffer_info() @@ -69,17 +73,21 @@ def process_query_async(query_samples, i_slice): response_size = response_info[1] * response_array.itemsize query_responses.append( mlperf_loadgen.QuerySampleResponse( - s.id, response_data, response_size)) + s.id, response_data, response_size) + ) mlperf_loadgen.QuerySamplesComplete(query_responses) def issue_query(query_samples): - threading.Thread(target=process_query_async, - args=(query_samples, 0)).start() - threading.Thread(target=process_query_async, - args=(query_samples, 1)).start() - threading.Thread(target=process_query_async, - args=(query_samples, 2)).start() + threading.Thread( + target=process_query_async, args=( + query_samples, 0)).start() + threading.Thread( + target=process_query_async, args=( + query_samples, 1)).start() + threading.Thread( + target=process_query_async, args=( + query_samples, 2)).start() def flush_queries(): @@ -88,7 +96,9 @@ def flush_queries(): def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--mode", choices=["performance", "accuracy"], default="performance") + parser.add_argument( + "--mode", choices=["performance", "accuracy"], default="performance" + ) parser.add_argument("--expected-qps", type=int, default=1000) parser.add_argument("--min-duration-ms", type=int, default=30000) return parser.parse_args() @@ -109,7 +119,8 @@ def main(): sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) qsl = mlperf_loadgen.ConstructQSL( - 1024, 128, load_samples_to_ram, unload_samples_from_ram) + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) mlperf_loadgen.StartTest(sut, qsl, settings) mlperf_loadgen.DestroyQSL(qsl) mlperf_loadgen.DestroySUT(sut) diff --git a/loadgen/demos/token_metrics/py_demo_server.py b/loadgen/demos/token_metrics/py_demo_server.py index 8879863e0..b564543cd 100644 --- a/loadgen/demos/token_metrics/py_demo_server.py +++ b/loadgen/demos/token_metrics/py_demo_server.py @@ -27,16 +27,21 @@ from absl import app import mlperf_loadgen + def f(x, y): - return (4 + 3*x*y + x**3 + y**2) + return 4 + 3 * x * y + x**3 + y**2 + -def create_responses(n, m, mod = 4): +def create_responses(n, m, mod=4): r = [] for i in range(n): - r.append([f(i,j) for j in range(m + (i%mod))]) + r.append([f(i, j) for j in range(m + (i % mod))]) return r + + responses = create_responses(1024, 20) + def load_samples_to_ram(query_samples): del query_samples return @@ -53,13 +58,19 @@ def process_query_async(query_samples): for s in query_samples: response_array = np.array(responses[s.index], np.int32) token = response_array[0] - time.sleep(.0002) + time.sleep(0.0002) 
response_token = array.array("B", token.tobytes()) response_token_info = response_token.buffer_info() response_token_data = response_token_info[0] response_token_size = response_token_info[1] * response_token.itemsize - mlperf_loadgen.FirstTokenComplete([mlperf_loadgen.QuerySampleResponse(s.id, response_token_data, response_token_size)]) - time.sleep(.02) + mlperf_loadgen.FirstTokenComplete( + [ + mlperf_loadgen.QuerySampleResponse( + s.id, response_token_data, response_token_size + ) + ] + ) + time.sleep(0.02) n_tokens = len(response_array) response_array = array.array("B", response_array.tobytes()) response_info = response_array.buffer_info() @@ -68,27 +79,32 @@ def process_query_async(query_samples): # print(f"Reported size python: {n_tokens}") query_responses.append( mlperf_loadgen.QuerySampleResponse( - s.id, response_data, response_size, n_tokens)) + s.id, response_data, response_size, n_tokens + ) + ) mlperf_loadgen.QuerySamplesComplete(query_responses) def issue_query(query_samples): - threading.Thread(target=process_query_async, - args=[query_samples]).start() + threading.Thread(target=process_query_async, args=[query_samples]).start() def flush_queries(): pass + def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--mode", choices=["performance", "accuracy"], default="performance") + parser.add_argument( + "--mode", choices=["performance", "accuracy"], default="performance" + ) parser.add_argument("--target-qps", type=int, default=100) parser.add_argument("--target-latency-ns", type=int, default=100000000) parser.add_argument("--min-query-count", type=int, default=100) parser.add_argument("--min-duration-ms", type=int, default=30000) return parser.parse_args() + def main(): args = get_args() settings = mlperf_loadgen.TestSettings() @@ -105,7 +121,8 @@ def main(): sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) qsl = mlperf_loadgen.ConstructQSL( - 1024, 128, load_samples_to_ram, unload_samples_from_ram) + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) mlperf_loadgen.StartTest(sut, qsl, settings) mlperf_loadgen.DestroyQSL(qsl) mlperf_loadgen.DestroySUT(sut) @@ -113,4 +130,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/loadgen/demos/token_metrics/py_demo_server_inferred.py b/loadgen/demos/token_metrics/py_demo_server_inferred.py index b4431ec9c..76461a75d 100644 --- a/loadgen/demos/token_metrics/py_demo_server_inferred.py +++ b/loadgen/demos/token_metrics/py_demo_server_inferred.py @@ -27,15 +27,20 @@ from absl import app import mlperf_loadgen + def f(x, y): - return (4 + 3*x*y + x**3 + y**2) + return 4 + 3 * x * y + x**3 + y**2 + -def create_responses(n, m, mod = 4): +def create_responses(n, m, mod=4): r = [] for i in range(n): - r.append([f(i,j) for j in range(m + (i%mod))]) + r.append([f(i, j) for j in range(m + (i % mod))]) return r -responses = create_responses(1024, 20, mod = 3) + + +responses = create_responses(1024, 20, mod=3) + def load_samples_to_ram(query_samples): del query_samples @@ -53,12 +58,12 @@ def process_query_async(query_samples): for s in query_samples: response_array = np.array(responses[s.index], np.int32) token = response_array[0] - time.sleep(.0002) + time.sleep(0.0002) response_token = array.array("B", token.tobytes()) response_token_info = response_token.buffer_info() response_token_data = response_token_info[0] response_token_size = response_token_info[1] * response_token.itemsize - time.sleep(.02) + time.sleep(0.02) n_tokens = len(response_array) response_array = array.array("B", 
response_array.tobytes()) response_info = response_array.buffer_info() @@ -67,27 +72,31 @@ def process_query_async(query_samples): # print(f"Reported size python: {n_tokens}") query_responses.append( mlperf_loadgen.QuerySampleResponse( - s.id, response_data, response_size)) + s.id, response_data, response_size) + ) mlperf_loadgen.QuerySamplesComplete(query_responses) def issue_query(query_samples): - threading.Thread(target=process_query_async, - args=[query_samples]).start() + threading.Thread(target=process_query_async, args=[query_samples]).start() def flush_queries(): pass + def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--mode", choices=["performance", "accuracy"], default="performance") + parser.add_argument( + "--mode", choices=["performance", "accuracy"], default="performance" + ) parser.add_argument("--target-qps", type=int, default=100) parser.add_argument("--target-latency-ns", type=int, default=100000000) parser.add_argument("--min-query-count", type=int, default=100) parser.add_argument("--min-duration-ms", type=int, default=30000) return parser.parse_args() + def main(): args = get_args() settings = mlperf_loadgen.TestSettings() @@ -105,7 +114,8 @@ def main(): sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) qsl = mlperf_loadgen.ConstructQSL( - 1024, 128, load_samples_to_ram, unload_samples_from_ram) + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) mlperf_loadgen.StartTest(sut, qsl, settings) mlperf_loadgen.DestroyQSL(qsl) mlperf_loadgen.DestroySUT(sut) @@ -113,4 +123,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/loadgen/demos/token_metrics/py_demo_single_stream.py b/loadgen/demos/token_metrics/py_demo_single_stream.py index e4aa4226f..ca8d84591 100644 --- a/loadgen/demos/token_metrics/py_demo_single_stream.py +++ b/loadgen/demos/token_metrics/py_demo_single_stream.py @@ -27,16 +27,21 @@ from absl import app import mlperf_loadgen + def f(x, y): - return (4 + 3*x*y + x**3 + y**2) + return 4 + 3 * x * y + x**3 + y**2 + -def create_responses(n, m, mod = 4): +def create_responses(n, m, mod=4): r = [] for i in range(n): - r.append([f(i,j) for j in range(m + (i%mod))]) + r.append([f(i, j) for j in range(m + (i % mod))]) return r + + responses = create_responses(1024, 20) + def load_samples_to_ram(query_samples): del query_samples return @@ -52,14 +57,20 @@ def process_query_async(query_samples): query_responses = [] for s in query_samples: response_array = np.array(responses[s.index], np.int32) - time.sleep(.0002) + time.sleep(0.0002) token = response_array[:1] response_token = array.array("B", token.tobytes()) response_token_info = response_token.buffer_info() response_token_data = response_token_info[0] response_token_size = response_token_info[1] * response_token.itemsize - mlperf_loadgen.FirstTokenComplete([mlperf_loadgen.QuerySampleResponse(s.id, response_token_data, response_token_size)]) - time.sleep(.02) + mlperf_loadgen.FirstTokenComplete( + [ + mlperf_loadgen.QuerySampleResponse( + s.id, response_token_data, response_token_size + ) + ] + ) + time.sleep(0.02) n_tokens = len(response_array) response_array = array.array("B", response_array.tobytes()) response_info = response_array.buffer_info() @@ -67,21 +78,25 @@ def process_query_async(query_samples): response_size = response_info[1] * response_array.itemsize query_responses.append( mlperf_loadgen.QuerySampleResponse( - s.id, response_data, response_size, n_tokens)) + s.id, response_data, response_size, n_tokens + ) + ) 
mlperf_loadgen.QuerySamplesComplete(query_responses) def issue_query(query_samples): - threading.Thread(target=process_query_async, - args=[query_samples]).start() + threading.Thread(target=process_query_async, args=[query_samples]).start() def flush_queries(): pass + def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--mode", choices=["performance", "accuracy"], default="performance") + parser.add_argument( + "--mode", choices=["performance", "accuracy"], default="performance" + ) parser.add_argument("--expected-latency", type=int, default=2050000) parser.add_argument("--min-query-count", type=int, default=100) parser.add_argument("--min-duration-ms", type=int, default=30000) @@ -103,7 +118,8 @@ def main(): sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) qsl = mlperf_loadgen.ConstructQSL( - 1024, 128, load_samples_to_ram, unload_samples_from_ram) + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) mlperf_loadgen.StartTest(sut, qsl, settings) mlperf_loadgen.DestroyQSL(qsl) mlperf_loadgen.DestroySUT(sut) diff --git a/loadgen/docs/src/doxygen_html_generator.py b/loadgen/docs/src/doxygen_html_generator.py index 405ac1e55..4065d7bd0 100644 --- a/loadgen/docs/src/doxygen_html_generator.py +++ b/loadgen/docs/src/doxygen_html_generator.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================= -## \file +# \file # \brief A script that sets the environment variables expected by doxygen.cfg. # \details This can be run manually without any arguments, but also allows a # build system to customize the output directory. diff --git a/loadgen/issue_query_controller.cc b/loadgen/issue_query_controller.cc index 8e1e80f1a..c1abea9d1 100644 --- a/loadgen/issue_query_controller.cc +++ b/loadgen/issue_query_controller.cc @@ -323,10 +323,10 @@ void IssueQueryController::StartIssueQueries(IssueQueryState* s) { template void IssueQueryController::StartIssueQueries< TestScenario::MultiStream>(IssueQueryState* s); -template void IssueQueryController::StartIssueQueries< - TestScenario::Offline>(IssueQueryState* s); -template void IssueQueryController::StartIssueQueries< - TestScenario::Server>(IssueQueryState* s); +template void IssueQueryController::StartIssueQueries( + IssueQueryState* s); +template void IssueQueryController::StartIssueQueries( + IssueQueryState* s); template void IssueQueryController::StartIssueQueries< TestScenario::SingleStream>(IssueQueryState* s); @@ -459,8 +459,8 @@ void IssueQueryController::IssueQueriesInternal(size_t query_stride, #if USE_NEW_LOGGING_FORMAT std::stringstream ss; ss << "IssueQueryThread " << thread_idx - << " Ending early: Too many outstanding queries." - << " issued " << queries_issued_total << " outstanding " + << " Ending early: Too many outstanding queries." << " issued " + << queries_issued_total << " outstanding " << queries_outstanding; MLPERF_LOG_ERROR(detail, "error_runtime", ss.str()); #else @@ -499,8 +499,8 @@ void IssueQueryController::IssueQueriesInternal(size_t query_stride, #if USE_NEW_LOGGING_FORMAT std::stringstream ss; ss << "IssueQueryThread " << thread_idx - << " Ending early: Max query count reached." - << " query_count " << queries_issued; + << " Ending early: Max query count reached." 
<< " query_count " + << queries_issued; MLPERF_LOG_ERROR(detail, "error_runtime", ss.str()); #else detail.Error("IssueQueryThread ", std::to_string(thread_idx), @@ -519,8 +519,8 @@ void IssueQueryController::IssueQueriesInternal(size_t query_stride, #if USE_NEW_LOGGING_FORMAT std::stringstream ss; ss << "IssueQueryThread " << thread_idx - << " Ending early: Max test duration reached." - << " duration_ns " << duration.count(); + << " Ending early: Max test duration reached." << " duration_ns " + << duration.count(); MLPERF_LOG_ERROR(detail, "error_runtime", ss.str()); #else detail.Error("IssueQueryThread ", std::to_string(thread_idx), diff --git a/loadgen/issue_query_controller.h b/loadgen/issue_query_controller.h index e723234cd..5668c574e 100644 --- a/loadgen/issue_query_controller.h +++ b/loadgen/issue_query_controller.h @@ -71,8 +71,8 @@ struct ResponseDelegate { PerfClock::time_point, const ResponseCallback&) = 0; virtual void TokenComplete(SampleMetadata*, QuerySampleResponse*, - PerfClock::time_point, - const ResponseCallback&) = 0; + PerfClock::time_point, + const ResponseCallback&) = 0; virtual void QueryComplete() = 0; std::atomic queries_completed{0}; }; diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc index 76e59151b..beda3a6c4 100644 --- a/loadgen/loadgen.cc +++ b/loadgen/loadgen.cc @@ -36,12 +36,12 @@ limitations under the License. #include "logging.h" #include "query_sample.h" #include "query_sample_library.h" +#include "results.h" #include "system_under_test.h" #include "test_settings.h" #include "test_settings_internal.h" #include "utils.h" #include "version.h" -#include "results.h" namespace mlperf { @@ -97,7 +97,8 @@ struct ResponseDelegateDetailed : public ResponseDelegate { sample_data_copy = new std::vector(src_begin, src_end); } int64_t n_tokens = response->n_tokens; - Log([sample, complete_begin_time, sample_data_copy, n_tokens](AsyncLog& log) { + Log([sample, complete_begin_time, sample_data_copy, + n_tokens](AsyncLog& log) { QueryMetadata* query = sample->query_metadata; DurationGeneratorNs sched{query->scheduled_time}; if (scenario == TestScenario::Server) { @@ -132,9 +133,9 @@ struct ResponseDelegateDetailed : public ResponseDelegate { }); } - void TokenComplete(SampleMetadata* sample, QuerySampleResponse* response, - PerfClock::time_point complete_begin_time, - const ResponseCallback& response_cb) override { + void TokenComplete(SampleMetadata* sample, QuerySampleResponse* response, + PerfClock::time_point complete_begin_time, + const ResponseCallback& response_cb) override { // Using a raw pointer here should help us hit the std::function // small buffer optimization code path when we aren't copying data. 
// For some reason, using std::unique_ptr wasn't moving @@ -155,24 +156,25 @@ struct ResponseDelegateDetailed : public ResponseDelegate { DurationGeneratorNs sched{query->scheduled_time}; if (scenario == TestScenario::Server) { DurationGeneratorNs issued{query->issued_start_time}; - log.TraceCounterEvent("Token_Latency", query->scheduled_time, "issue_delay", - sched.delta(query->issued_start_time), - "issue_to_done", - issued.delta(complete_begin_time)); - }else{ + log.TraceCounterEvent( + "Token_Latency", query->scheduled_time, "issue_delay", + sched.delta(query->issued_start_time), "issue_to_done", + issued.delta(complete_begin_time)); + } else { log.TraceSample("Token", sample->sequence_id, query->scheduled_time, - complete_begin_time, "sample_seq", sample->sequence_id, - "query_seq", query->sequence_id, "sample_idx", - sample->sample_index, "issue_start_ns", - sched.delta(query->issued_start_time), "complete_ns", - sched.delta(complete_begin_time)); + complete_begin_time, "sample_seq", sample->sequence_id, + "query_seq", query->sequence_id, "sample_idx", + sample->sample_index, "issue_start_ns", + sched.delta(query->issued_start_time), "complete_ns", + sched.delta(complete_begin_time)); } if (token_data_copy) { - log.CacheToken(sample->sequence_id, LogBinaryAsHexString{token_data_copy}); + log.CacheToken(sample->sequence_id, + LogBinaryAsHexString{token_data_copy}); } QuerySampleLatency latency = sched.delta(complete_begin_time); log.RecordTokenCompletion(sample->sequence_id, complete_begin_time, - latency); + latency); }); } @@ -227,7 +229,8 @@ auto SampleDistribution(size_t sample_count, auto& gen) mutable { return dist(gen); }; } -/// \brief Sample across the dataset, and ensure coverage of each of the samples. +/// \brief Sample across the dataset, and ensure coverage of each of the +/// samples. // Useful for non-uniform dataset (e.g. Llama2, GPTJ, 3d-unet) auto SampleDistributionEqualIssue(size_t sample_count, size_t set_size, std::mt19937* rng) { @@ -304,10 +307,8 @@ std::vector GenerateQueries( auto sample_distribution_unique = SampleDistribution( loaded_sample_set.sample_distribution_end, sample_stride, &sample_rng); - auto sample_distribution_equal_issue = - SampleDistributionEqualIssue(min_queries, - loaded_samples.size(), - &sample_rng); + auto sample_distribution_equal_issue = SampleDistributionEqualIssue( + min_queries, loaded_samples.size(), &sample_rng); auto schedule_distribution = ScheduleDistribution(settings.target_qps); @@ -315,20 +316,18 @@ std::vector GenerateQueries( // When sample_concatenate_permutation is turned on, pad to a multiple of the // complete dataset to ensure fairness. 
auto enable_equal_issue = settings.sample_concatenate_permutation; - if (mode != TestMode::AccuracyOnly && enable_equal_issue) - { + if (mode != TestMode::AccuracyOnly && enable_equal_issue) { if (scenario == TestScenario::Offline && - samples_per_query % loaded_samples.size() != 0) - { + samples_per_query % loaded_samples.size() != 0) { // In offline mode, we pad samples_per_query size_t pad_size = - (loaded_samples.size() - samples_per_query % loaded_samples.size()); + (loaded_samples.size() - samples_per_query % loaded_samples.size()); samples_per_query += pad_size; - } - else if (min_queries % loaded_samples.size() != 0) - { - // In Server, SingleStream, MultiStream mode, the min_queries should be padded - size_t pad_size = (loaded_samples.size() - min_queries % loaded_samples.size()); + } else if (min_queries % loaded_samples.size() != 0) { + // In Server, SingleStream, MultiStream mode, the min_queries should be + // padded + size_t pad_size = + (loaded_samples.size() - min_queries % loaded_samples.size()); min_queries += pad_size; } } @@ -388,10 +387,9 @@ std::vector GenerateQueries( } else { for (auto& s : samples) { s = loaded_samples[settings.performance_issue_unique - ? sample_distribution_unique(sample_rng) - : settings.performance_issue_same - ? same_sample - : enable_equal_issue + ? sample_distribution_unique(sample_rng) + : settings.performance_issue_same ? same_sample + : enable_equal_issue ? sample_distribution_equal_issue(sample_rng) : sample_distribution(sample_rng)]; } @@ -399,11 +397,11 @@ std::vector GenerateQueries( queries.emplace_back(samples, timestamp, response_delegate, sequence_gen); prev_timestamp = timestamp; timestamp += schedule_distribution(schedule_rng); - // In equal_issue mode, the min_queries will be bumped up by a multiple of the dataset size - // if the test time has not met the threshold. + // In equal_issue mode, the min_queries will be bumped up by a multiple of + // the dataset size if the test time has not met the threshold. if (enable_equal_issue && (queries.size() >= min_queries) && - (prev_timestamp < gen_duration) && (scenario != TestScenario::Offline)) - { + (prev_timestamp < gen_duration) && + (scenario != TestScenario::Offline)) { min_queries += loaded_samples.size(); } } @@ -536,14 +534,13 @@ PerformanceResult IssueQueries(SystemUnderTest* sut, GlobalLogger().GetLatenciesBlocking(expected_latencies)); std::vector first_token_latencies( - GlobalLogger().GetTokenLatencies(expected_latencies)); + GlobalLogger().GetTokenLatencies(expected_latencies)); std::vector time_per_output_token_arr( - GlobalLogger().GetTimePerOutputToken(expected_latencies)); + GlobalLogger().GetTimePerOutputToken(expected_latencies)); std::vector tokens_per_sample( - GlobalLogger().GetTokensPerSample(expected_latencies)); - + GlobalLogger().GetTokensPerSample(expected_latencies)); // Log contention counters after every test as a sanity check. 
GlobalLogger().LogContentionAndAllocations(); @@ -591,19 +588,16 @@ PerformanceResult IssueQueries(SystemUnderTest* sut, } } - return PerformanceResult{std::move(sample_latencies), - std::move(query_latencies), - queries_issued, - max_latency, - final_query_scheduled_time, - final_query_issued_time, - final_query_all_samples_done_time, - TokenPerformanceResults{ - first_token_latencies, - time_per_output_token_arr, - tokens_per_sample - } - }; + return PerformanceResult{ + std::move(sample_latencies), + std::move(query_latencies), + queries_issued, + max_latency, + final_query_scheduled_time, + final_query_issued_time, + final_query_all_samples_done_time, + TokenPerformanceResults{first_token_latencies, time_per_output_token_arr, + tokens_per_sample}}; } void LoadSamplesToRam(QuerySampleLibrary* qsl, @@ -1172,9 +1166,10 @@ void StartTest(SystemUnderTest* sut, QuerySampleLibrary* qsl, &log_outputs.accuracy_out, log_settings.log_output.copy_detail_to_stdout, log_settings.log_output.copy_summary_to_stdout); - + GlobalLogger().SetUseTokens(requested_settings.use_token_latencies); - bool needs_first_token = (requested_settings.scenario != TestScenario::Offline); + bool needs_first_token = + (requested_settings.scenario != TestScenario::Offline); GlobalLogger().SetNeedsFirstToken(needs_first_token); if (log_settings.enable_trace) { @@ -1235,7 +1230,7 @@ void StartTest(SystemUnderTest* sut, QuerySampleLibrary* qsl, test_settings.FromConfig(audit_config_filename, generic_model, audit_scenario); } - if(test_settings.test05){ + if (test_settings.test05) { // If the configuration indicates we are running test05, // random seeds LogDetail([](AsyncDetail& detail) { @@ -1251,8 +1246,10 @@ void StartTest(SystemUnderTest* sut, QuerySampleLibrary* qsl, }); test_settings.mode = TestMode::PerformanceOnly; test_settings.qsl_rng_seed = requested_settings.test05_qsl_rng_seed; - test_settings.sample_index_rng_seed = requested_settings.test05_sample_index_rng_seed; - test_settings.schedule_rng_seed = requested_settings.test05_schedule_rng_seed; + test_settings.sample_index_rng_seed = + requested_settings.test05_sample_index_rng_seed; + test_settings.schedule_rng_seed = + requested_settings.test05_schedule_rng_seed; } loadgen::TestSettingsInternal sanitized_settings( @@ -1324,11 +1321,11 @@ void QuerySamplesComplete(QuerySampleResponse* responses, size_t response_count, } void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count, - const ResponseCallback& response_cb) { + const ResponseCallback& response_cb) { PerfClock::time_point timestamp = PerfClock::now(); - auto tracer = MakeScopedTracer( - [](AsyncTrace& trace) { trace("FirstTokenComplete"); }); + auto tracer = + MakeScopedTracer([](AsyncTrace& trace) { trace("FirstTokenComplete"); }); const QuerySampleResponse* end = responses + response_count; @@ -1338,7 +1335,7 @@ void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count, reinterpret_cast(response->id); loadgen::QueryMetadata* query = sample->query_metadata; query->response_delegate->TokenComplete(sample, response, timestamp, - response_cb); + response_cb); } // PerfClock::time_point end_timestamp = PerfClock::now(); // mlperf::tokens_overhead_acum += (end_timestamp - timestamp).count(); diff --git a/loadgen/loadgen.h b/loadgen/loadgen.h index f0e655490..84e02656c 100644 --- a/loadgen/loadgen.h +++ b/loadgen/loadgen.h @@ -19,9 +19,8 @@ limitations under the License. #include #include -#include #include - +#include /// \brief Contains the loadgen API. 
namespace mlperf { @@ -66,7 +65,7 @@ void QuerySamplesComplete(QuerySampleResponse* responses, size_t response_count, const ResponseCallback& response_cb = {}); void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count, - const ResponseCallback& response_cb = {}); + const ResponseCallback& response_cb = {}); /// /// \brief Starts the test against SUT with the specified settings. diff --git a/loadgen/logging.cc b/loadgen/logging.cc index d33074c01..807c1954a 100644 --- a/loadgen/logging.cc +++ b/loadgen/logging.cc @@ -286,25 +286,24 @@ void AsyncLog::LogAccuracy(uint64_t seq_id, const QuerySampleIndex qsl_idx, return; } *accuracy_out_ << (accuracy_needs_comma_ ? ",\n{ " : "\n{ "); - if (!use_tokens_){ + if (!use_tokens_) { LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data", - response); - } else if (!needs_first_token_) - { + response); + } else if (!needs_first_token_) { LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data", - response, "token_count", n_tokens); - } - else { + response, "token_count", n_tokens); + } else { const size_t i = seq_id - latencies_first_sample_sequence_id_; LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data", - response, "token_data", token_records_[i], "token_count", n_tokens); + response, "token_data", token_records_[i], "token_count", n_tokens); } - + *accuracy_out_ << " }"; accuracy_needs_comma_ = true; } -void AsyncLog::CacheToken(uint64_t seq_id, const LogBinaryAsHexString& response){ +void AsyncLog::CacheToken(uint64_t seq_id, + const LogBinaryAsHexString& response) { std::unique_lock lock(token_record_mutex_); const size_t i = seq_id - latencies_first_sample_sequence_id_; if (token_records_.size() <= i) { @@ -408,14 +407,16 @@ void AsyncLog::RecordSampleCompletion(uint64_t sample_sequence_id, // the error above. return; } - - if (use_tokens_){ - if(needs_first_token_ && (token_latencies_.size() <= i)){ + + if (use_tokens_) { + if (needs_first_token_ && (token_latencies_.size() <= i)) { MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", - "Attempted to record a sample latency before it's first token latency"); - }else if (needs_first_token_ && (token_latencies_[i] == kInvalidLatency)){ + "Attempted to record a sample latency before its " + "first token latency"); + } else if (needs_first_token_ && (token_latencies_[i] == kInvalidLatency)) { MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", - "Attempted to record a sample latency before it's first token latency"); + "Attempted to record a sample latency before its " + "first token latency"); } if (tokens_per_sample_.size() <= i) { @@ -424,12 +425,12 @@ void AsyncLog::RecordSampleCompletion(uint64_t sample_sequence_id, } else if (tokens_per_sample_[i] != nTokenInvalid) { // Call LogErrorSync here since this kind of error could result in a // segfault in the near future. - #if USE_NEW_LOGGING_FORMAT +#if USE_NEW_LOGGING_FORMAT MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", "Attempted to complete a sample twice."); - #else +#else GlobalLogger().LogErrorSync("Attempted to complete a sample twice."); - #endif +#endif // Return without recording the latency again to avoid potentially // ending the test before the SUT is actually done, which could result @@ -437,30 +438,31 @@ void AsyncLog::RecordSampleCompletion(uint64_t sample_sequence_id, // If the SUT recorded the wrong sample, the test will hang and see // the error above. 
return; - } - if (n_tokens == 0){ + } + if (n_tokens == 0) { MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", - "n_tokens argument missing or attempted to record 0 as number of tokens"); - } else if (n_tokens < 0){ + "n_tokens argument missing or attempted to record " + "0 as number of tokens"); + } else if (n_tokens < 0) { MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", "Attempted to record a negative number of tokens"); n_tokens = 0; - } else if (n_tokens == 1){ + } else if (n_tokens == 1) { MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", "Number of tokens need to be greater than 1"); n_tokens = 0; } - if (time_per_output_token_.size() <= i){ + if (time_per_output_token_.size() <= i) { time_per_output_token_.resize(i + 1, kInvalidLatency); } else if (time_per_output_token_[i] != kInvalidLatency) { // Call LogErrorSync here since this kind of error could result in a // segfault in the near future. - #if USE_NEW_LOGGING_FORMAT +#if USE_NEW_LOGGING_FORMAT MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", "Attempted to complete a sample twice."); - #else +#else GlobalLogger().LogErrorSync("Attempted to complete a sample twice."); - #endif +#endif // Return without recording the latency again to avoid potentially // ending the test before the SUT is actually done, which could result @@ -470,7 +472,8 @@ void AsyncLog::RecordSampleCompletion(uint64_t sample_sequence_id, return; } tokens_per_sample_[i] = n_tokens; - time_per_output_token_[i] = (latency - token_latencies_[i]) / (n_tokens - 1); + time_per_output_token_[i] = + (latency - token_latencies_[i]) / (n_tokens - 1); } latencies_[i] = latency; latencies_recorded_++; @@ -480,14 +483,14 @@ void AsyncLog::RecordSampleCompletion(uint64_t sample_sequence_id, } void AsyncLog::RecordTokenCompletion(uint64_t sample_sequence_id, - PerfClock::time_point completion_time, - QuerySampleLatency latency) { + PerfClock::time_point completion_time, + QuerySampleLatency latency) { std::unique_lock lock(token_latencies_mutex_); - //std::unique_lock lock(latencies_mutex_); - //max_latency_ = std::max(max_latency_, latency); + // std::unique_lock lock(latencies_mutex_); + // max_latency_ = std::max(max_latency_, latency); - //max_completion_timstamp_ = - // std::max(max_completion_timstamp_, completion_time); + // max_completion_timstamp_ = + // std::max(max_completion_timstamp_, completion_time); if (sample_sequence_id < latencies_first_sample_sequence_id_) { // Call LogErrorSync here since this kind of error could result in a @@ -507,22 +510,24 @@ void AsyncLog::RecordTokenCompletion(uint64_t sample_sequence_id, } const size_t i = sample_sequence_id - latencies_first_sample_sequence_id_; - - if (latencies_.size() > i){ - if (latencies_[i] != kInvalidLatency){ + + if (latencies_.size() > i) { + if (latencies_[i] != kInvalidLatency) { #if USE_NEW_LOGGING_FORMAT - MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", - "Attempted to record token latency after sample was completed"); + MLPERF_LOG_ERROR_SYNC( + GlobalLogger(), "error_runtime", + "Attempted to record token latency after sample was completed"); #else - GlobalLogger().LogErrorSync("Attempted to record token latency after sample was completed"); + GlobalLogger().LogErrorSync( + "Attempted to record token latency after sample was completed"); #endif - // Return without recording the latency again to avoid potentially - // ending the test before the SUT is actually done, which could result - // in a segfault. 
- // If the SUT recorded the wrong sample, the test will hang and see - // the error above. - return; + // Return without recording the latency again to avoid potentially + // ending the test before the SUT is actually done, which could result + // in a segfault. + // If the SUT recorded the wrong sample, the test will hang and see + // the error above. + return; } } if (token_latencies_.size() <= i) { @@ -598,13 +603,15 @@ std::vector AsyncLog::GetLatenciesBlocking( return latencies; } -std::vector AsyncLog::GetTokenLatencies(size_t expected_count) { +std::vector AsyncLog::GetTokenLatencies( + size_t expected_count) { std::vector token_latencies; token_latencies.swap(token_latencies_); return token_latencies; } -std::vector AsyncLog::GetTimePerOutputToken(size_t expected_count){ +std::vector AsyncLog::GetTimePerOutputToken( + size_t expected_count) { std::vector tpot_latencies; tpot_latencies.swap(time_per_output_token_); return tpot_latencies; @@ -625,11 +632,9 @@ QuerySampleLatency AsyncLog::GetMaxLatencySoFar() { return max_latency_; } -void AsyncLog::SetUseTokens(bool use_tokens){ - use_tokens_ = use_tokens; -} +void AsyncLog::SetUseTokens(bool use_tokens) { use_tokens_ = use_tokens; } -void AsyncLog::SetNeedsFirstToken(bool needs_first_token){ +void AsyncLog::SetNeedsFirstToken(bool needs_first_token) { needs_first_token_ = needs_first_token; } @@ -807,8 +812,7 @@ void Logger::CollectTlsLoggerStats(TlsLogger* tls_logger) { if (max_entry_vector_size > kTlsLogReservedEntryCount) { #if USE_NEW_LOGGING_FORMAT std::stringstream msg; - msg << "Logging allocation detected:" - << " tid: " << tls_logger->Tid() + msg << "Logging allocation detected:" << " tid: " << tls_logger->Tid() << " reserved_entries: " << kTlsLogReservedEntryCount << " max_entries: " << max_entry_vector_size; MLPERF_LOG_WARNING((*this), "warning_generic_message", msg.str()); @@ -963,11 +967,11 @@ QuerySampleLatency Logger::GetMaxLatencySoFar() { return async_logger_.GetMaxLatencySoFar(); } -void Logger::SetUseTokens(bool use_tokens){ +void Logger::SetUseTokens(bool use_tokens) { async_logger_.SetUseTokens(use_tokens); } -void Logger::SetNeedsFirstToken(bool needs_first_token){ +void Logger::SetNeedsFirstToken(bool needs_first_token) { async_logger_.SetNeedsFirstToken(needs_first_token); } diff --git a/loadgen/logging.h b/loadgen/logging.h index e62825859..c5514562a 100644 --- a/loadgen/logging.h +++ b/loadgen/logging.h @@ -119,13 +119,10 @@ class ChromeTracer { void AddCompleteEvent(const std::string& name, uint64_t pid, uint64_t tid, PerfClock::time_point start, PerfClock::time_point end, const Args... args) { - *out_ << "{\"name\":\"" << name << "\"," - << "\"ph\":\"X\"," - << "\"pid\":" << pid << "," - << "\"tid\":" << tid << "," + *out_ << "{\"name\":\"" << name << "\"," << "\"ph\":\"X\"," + << "\"pid\":" << pid << "," << "\"tid\":" << tid << "," << "\"ts\":" << Micros(start - origin_).count() << "," - << "\"dur\":" << Micros(end - start).count() << "," - << "\"args\":{"; + << "\"dur\":" << Micros(end - start).count() << "," << "\"args\":{"; AddArgs(args...); *out_ << "}},\n"; } @@ -133,12 +130,9 @@ class ChromeTracer { template void AddAsyncBeginEvent(const std::string& name, uint64_t pid, uint64_t id, PerfClock::time_point time, const Args... 
args) { - *out_ << "{\"name\":\"" << name << "\"," - << "\"cat\":\"default\"," - << "\"ph\":\"b\"," - << "\"pid\":" << pid << "," - << "\"id\":" << id << "," - << "\"ts\":" << Micros(time - origin_).count() << "," + *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\"," + << "\"ph\":\"b\"," << "\"pid\":" << pid << "," << "\"id\":" << id + << "," << "\"ts\":" << Micros(time - origin_).count() << "," << "\"args\":{"; AddArgs(args...); *out_ << "}},\n"; @@ -147,12 +141,9 @@ class ChromeTracer { template void AddAsyncInstantEvent(const std::string& name, uint64_t pid, uint64_t id, PerfClock::time_point time, const Args... args) { - *out_ << "{\"name\":\"" << name << "\"," - << "\"cat\":\"default\"," - << "\"ph\":\"n\"," - << "\"pid\":" << pid << "," - << "\"id\":" << id << "," - << "\"ts\":" << Micros(time - origin_).count() << "," + *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\"," + << "\"ph\":\"n\"," << "\"pid\":" << pid << "," << "\"id\":" << id + << "," << "\"ts\":" << Micros(time - origin_).count() << "," << "\"args\":{"; AddArgs(args...); *out_ << "}},\n"; @@ -161,19 +152,15 @@ class ChromeTracer { template void AddAsyncEndEvent(const std::string& name, uint64_t pid, uint64_t id, PerfClock::time_point time) { - *out_ << "{\"name\":\"" << name << "\"," - << "\"cat\":\"default\"," - << "\"ph\":\"e\", " - << "\"pid\":" << pid << "," - << "\"id\":" << id << "," - << "\"ts\":" << Micros(time - origin_).count() << "},\n"; + *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\"," + << "\"ph\":\"e\", " << "\"pid\":" << pid << "," << "\"id\":" << id + << "," << "\"ts\":" << Micros(time - origin_).count() << "},\n"; } template void AddCounterEvent(const std::string& name, uint64_t pid, PerfClock::time_point time, const Args... args) { - *out_ << "{\"name\":\"" << name << "\"," - << "\"ph\": \"C\"," + *out_ << "{\"name\":\"" << name << "\"," << "\"ph\": \"C\"," << "\"pid\":" << pid << "," << "\"ts\":" << Micros(time - origin_).count() << "," << "\"args\":{ "; @@ -315,11 +302,10 @@ class AsyncLog { size_t latencies_to_reserve); void RecordSampleCompletion(uint64_t sample_sequence_id, PerfClock::time_point completion_time, - QuerySampleLatency latency, - int64_t n_tokens); + QuerySampleLatency latency, int64_t n_tokens); void RecordTokenCompletion(uint64_t sample_sequence_id, - PerfClock::time_point completion_time, - QuerySampleLatency latency); + PerfClock::time_point completion_time, + QuerySampleLatency latency); std::vector GetLatenciesBlocking(size_t expected_count); std::vector GetTokenLatencies(size_t expected_count); std::vector GetTimePerOutputToken(size_t expected_count); @@ -734,15 +720,13 @@ void AsyncLog::LogDetail(const std::string& key, const T& value, } auto time_ns = (log_detail_time_ - log_origin_).count(); for (auto os : detail_streams) { - *os << ":::MLLOG {" - << "\"key\": " << ArgValueTransform(key) << ", " + *os << ":::MLLOG {" << "\"key\": " << ArgValueTransform(key) << ", " << "\"value\": " << ArgValueTransform(value) << ", " << "\"time_ms\": " << ArgValueTransform(time_ns / 1000000ULL) << "." 
<< std::setfill('0') << std::setw(6) << ArgValueTransform(time_ns % 1000000ULL) << ", " << "\"namespace\": \"mlperf::logging\", " - << "\"event_type\": \"POINT_IN_TIME\", " - << "\"metadata\": {" + << "\"event_type\": \"POINT_IN_TIME\", " << "\"metadata\": {" << "\"is_error\": " << ArgValueTransform(error_flagged_) << ", " << "\"is_warning\": " << ArgValueTransform(warning_flagged_) << ", " << "\"file\": \"" << file_name << "\", " @@ -771,9 +755,9 @@ void AsyncLog::LogDetail(const std::string& message, const Args... args) { detail_streams.pop_back(); } for (auto os : detail_streams) { - *os << "\"pid\": " << current_pid_ << ", " - << "\"tid\": " << current_tid_ << ", " - << "\"ts\": " << (log_detail_time_ - log_origin_).count() << "ns : "; + *os << "\"pid\": " << current_pid_ << ", " << "\"tid\": " << current_tid_ + << ", " << "\"ts\": " << (log_detail_time_ - log_origin_).count() + << "ns : "; if (error_flagged_) { *os << "ERROR : "; } else if (warning_flagged_) { diff --git a/loadgen/query_dispatch_library.h b/loadgen/query_dispatch_library.h index 5f4f518c3..6c594efe0 100644 --- a/loadgen/query_dispatch_library.h +++ b/loadgen/query_dispatch_library.h @@ -25,8 +25,10 @@ namespace mlperf { /// \addtogroup LoadgenAPI /// @{ -/// \brief The interface a client implements for the LoadGen over the network to test. The API inherits the System_under_test.h API -/// When working in LON mode the QueryDispatchLibrary class is used and natively Upcasted to the QueryDispatchLibrary class. +/// \brief The interface a client implements for the LoadGen over the network to +/// test. The API inherits the System_under_test.h API When working in LON mode +/// the QueryDispatchLibrary class is used and natively Upcasted to the +/// QueryDispatchLibrary class. class QueryDispatchLibrary : public SystemUnderTest { public: diff --git a/loadgen/query_sample.h b/loadgen/query_sample.h index 371b49e97..e740be99e 100644 --- a/loadgen/query_sample.h +++ b/loadgen/query_sample.h @@ -21,6 +21,7 @@ limitations under the License. #include #include + #include namespace mlperf { @@ -51,28 +52,33 @@ struct QuerySampleResponse { uintptr_t data; size_t size; ///< Size in bytes. int64_t n_tokens; - public: - QuerySampleResponse(ResponseId id, uintptr_t data, size_t size, int64_t n_tokens) - : id(id), - data(data), - size(size), - n_tokens(n_tokens){ - //std::cout << "Initialized with 4 arguments, n_tokens: " << n_tokens <<"\n"; - }; - QuerySampleResponse(ResponseId id, uintptr_t data, size_t size) - : id(id), - data(data), - size(size), - n_tokens(0){ - //std::cout << "Initialized with 3 arguments, n_tokens: " << n_tokens <<"\n"; - }; - QuerySampleResponse() - : id(0), - data(0), - size(0), - n_tokens(0){ - //std::cout << "Initialized with 0 arguments, n_tokens: " << n_tokens <<"\n"; - }; + + public: + QuerySampleResponse(ResponseId id, uintptr_t data, size_t size, + int64_t n_tokens) + : id(id), + data(data), + size(size), + n_tokens(n_tokens){ + // std::cout << "Initialized with 4 arguments, n_tokens: " << + // n_tokens <<"\n"; + }; + QuerySampleResponse(ResponseId id, uintptr_t data, size_t size) + : id(id), + data(data), + size(size), + n_tokens(0){ + // std::cout << "Initialized with 3 arguments, n_tokens: " << + // n_tokens <<"\n"; + }; + QuerySampleResponse() + : id(0), + data(0), + size(0), + n_tokens(0){ + // std::cout << "Initialized with 0 arguments, n_tokens: " << + // n_tokens <<"\n"; + }; }; /// \brief A latency in nanoseconds, as recorded by the loadgen. 
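A minimal illustrative sketch of how an SUT loop might use the token-latency additions above: the loadgen-side pieces (QuerySampleResponse's n_tokens field and four-argument constructor, FirstTokenComplete, QuerySamplesComplete) are taken from this diff, while the SUT-side names RunOneSample, GenerateToken, and kEos are hypothetical placeholders.

#include <cstdint>
#include <vector>

#include "loadgen.h"       // QuerySamplesComplete(), FirstTokenComplete()
#include "query_sample.h"  // QuerySampleResponse with the new n_tokens field

// Hypothetical model call and end-of-sequence token, stand-ins for a real SUT.
int64_t GenerateToken(const mlperf::QuerySample& sample);
constexpr int64_t kEos = 2;

void RunOneSample(const mlperf::QuerySample& sample) {
  std::vector<int64_t> tokens;
  tokens.push_back(GenerateToken(sample));
  // Report the first token as soon as it exists so the loadgen records TTFT.
  mlperf::QuerySampleResponse first{sample.id,
                                    reinterpret_cast<uintptr_t>(tokens.data()),
                                    sizeof(int64_t)};
  mlperf::FirstTokenComplete(&first, 1);
  while (tokens.back() != kEos) tokens.push_back(GenerateToken(sample));
  // The final completion carries the total token count via the new
  // four-argument constructor so the loadgen can derive per-token latencies.
  mlperf::QuerySampleResponse done{sample.id,
                                   reinterpret_cast<uintptr_t>(tokens.data()),
                                   tokens.size() * sizeof(int64_t),
                                   static_cast<int64_t>(tokens.size())};
  mlperf::QuerySamplesComplete(&done, 1);
}

With use_token_latencies enabled, the loadgen then derives time per output token for each sample as (sample latency - first-token latency) / (n_tokens - 1), which is why RecordSampleCompletion in logging.cc rejects n_tokens values of 0 and 1.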
diff --git a/loadgen/results.cc b/loadgen/results.cc index 97f7c6b9a..05db43d78 100644 --- a/loadgen/results.cc +++ b/loadgen/results.cc @@ -11,6 +11,7 @@ limitations under the License. ==============================================================================*/ #include "results.h" + #include "early_stopping.h" #include "utils.h" @@ -55,7 +56,7 @@ void PerformanceSummary::ProcessLatencies() { max_latency); } - if (settings.use_token_latencies){ + if (settings.use_token_latencies) { ProcessTokenLatencies(); } @@ -85,9 +86,8 @@ void PerformanceSummary::ProcessLatencies() { void PerformanceSummary::ProcessTokenLatencies() { constexpr auto nTokenInvalid = std::numeric_limits::min(); token_count = 0; - for (auto n_tokens: pr.token_results.tokens_per_sample){ - if (n_tokens != nTokenInvalid) - token_count += n_tokens; + for (auto n_tokens : pr.token_results.tokens_per_sample) { + if (n_tokens != nTokenInvalid) token_count += n_tokens; } if (pr.token_results.first_token_latencies.empty()) { return; @@ -102,29 +102,37 @@ void PerformanceSummary::ProcessTokenLatencies() { accumulated_tpot += latency; } time_per_output_token_mean = accumulated_tpot / sample_count; - std::sort(pr.token_results.first_token_latencies.begin(), - pr.token_results.first_token_latencies.end()); + std::sort(pr.token_results.first_token_latencies.begin(), + pr.token_results.first_token_latencies.end()); std::sort(pr.token_results.time_per_output_token_arr.begin(), - pr.token_results.time_per_output_token_arr.end()); - + pr.token_results.time_per_output_token_arr.end()); + token_target_latency_percentile.sample_latency = - pr.token_results.first_token_latencies[sample_count * token_target_latency_percentile.percentile]; + pr.token_results + .first_token_latencies[sample_count * + token_target_latency_percentile.percentile]; first_token_latency_min = pr.token_results.first_token_latencies.front(); first_token_latency_max = pr.token_results.first_token_latencies.back(); for (auto& lp : token_latency_percentiles) { assert(lp.percentile >= 0.0); assert(lp.percentile < 1.0); - lp.sample_latency = pr.token_results.first_token_latencies[sample_count * lp.percentile]; + lp.sample_latency = + pr.token_results.first_token_latencies[sample_count * lp.percentile]; } target_tpot_percentile.sample_latency = - pr.token_results.time_per_output_token_arr[sample_count * target_tpot_percentile.percentile]; - time_per_output_token_min = pr.token_results.time_per_output_token_arr.front(); + pr.token_results + .time_per_output_token_arr[sample_count * + target_tpot_percentile.percentile]; + time_per_output_token_min = + pr.token_results.time_per_output_token_arr.front(); time_per_output_token_max = pr.token_results.time_per_output_token_arr.back(); for (auto& lp : tpot_percentiles) { assert(lp.percentile >= 0.0); assert(lp.percentile < 1.0); - lp.sample_latency = pr.token_results.time_per_output_token_arr[sample_count * lp.percentile]; + lp.sample_latency = + pr.token_results + .time_per_output_token_arr[sample_count * lp.percentile]; } if (settings.scenario == TestScenario::Server) { @@ -132,16 +140,17 @@ void PerformanceSummary::ProcessTokenLatencies() { QuerySampleLatency max_latency = settings.target_latency.count() + 1; overlatency_first_token_count = pr.token_results.first_token_latencies.end() - - std::lower_bound(pr.token_results.first_token_latencies.begin(), pr.token_results.first_token_latencies.end(), + std::lower_bound(pr.token_results.first_token_latencies.begin(), + pr.token_results.first_token_latencies.end(), max_latency); } 
- } -bool PerformanceSummary::EarlyStopping(std::string* recommendation, int64_t queries_issued, - std::vector* sample_latencies, - std::vector* query_latencies, - std::chrono::nanoseconds target_latency) { +bool PerformanceSummary::EarlyStopping( + std::string* recommendation, int64_t queries_issued, + std::vector* sample_latencies, + std::vector* query_latencies, + std::chrono::nanoseconds target_latency) { recommendation->clear(); MinPassingQueriesFinder find_min_passing; @@ -336,25 +345,29 @@ bool PerformanceSummary::PerfConstraintsMet(std::string* recommendation) { break; case TestScenario::Server: ProcessLatencies(); - if (!settings.use_token_latencies){ + if (!settings.use_token_latencies) { if (target_latency_percentile.sample_latency > settings.target_latency.count()) { *recommendation = "Reduce target QPS to improve latency."; perf_constraints_met = false; } } else { - if ( token_target_latency_percentile.sample_latency > + if (token_target_latency_percentile.sample_latency > settings.server_ttft_latency) { - *recommendation = "TTFT constrain not met: Reduce target QPS to improve latency."; + *recommendation = + "TTFT constraint not met: Reduce target QPS to improve latency."; perf_constraints_met = false; } - if ( target_tpot_percentile.sample_latency > + if (target_tpot_percentile.sample_latency > settings.server_tpot_latency) { - if (recommendation->empty()){ - *recommendation = "TPOT constrain not met: Reduce target QPS to improve latency."; + if (recommendation->empty()) { + *recommendation = + "TPOT constraint not met: Reduce target QPS to improve latency."; } else { - recommendation->append("\n * TPOT constrain not met: Reduce target QPS to improve latency."); + recommendation->append( + "\n * TPOT constraint not met: Reduce target QPS to improve " + "latency."); } perf_constraints_met = false; } @@ -379,10 +392,10 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { switch (settings.scenario) { case TestScenario::SingleStream: { - summary(DoubleToString(target_latency_percentile.percentile * 100, 0) + - "th percentile latency (ns) : ", - target_latency_percentile.sample_latency); - break; + summary(DoubleToString(target_latency_percentile.percentile * 100, 0) + + "th percentile latency (ns) : ", + target_latency_percentile.sample_latency); + break; } case TestScenario::MultiStream: { summary(DoubleToString(target_latency_percentile.percentile * 100, 0) + @@ -401,7 +414,7 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { // 1000 queries / 1 second.
// TODO: make a more permanent solution double qps_as_completed = - (sample_count - 1) / pr.final_query_all_samples_done_time; + (sample_count - 1) / pr.final_query_all_samples_done_time; summary("Completed samples per second : ", DoubleToString(qps_as_completed)); break; @@ -413,16 +426,18 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { } } - if (settings.use_token_latencies){ + if (settings.use_token_latencies) { switch (settings.scenario) { case TestScenario::SingleStream: { - summary(DoubleToString(token_target_latency_percentile.percentile * 100, 0) + - "th first token percentile latency (ns) : ", - token_target_latency_percentile.sample_latency); - break; + summary(DoubleToString(token_target_latency_percentile.percentile * 100, + 0) + + "th first token percentile latency (ns) : ", + token_target_latency_percentile.sample_latency); + break; } case TestScenario::MultiStream: { - summary(DoubleToString(token_target_latency_percentile.percentile * 100, 0) + + summary(DoubleToString(token_target_latency_percentile.percentile * 100, + 0) + "th first token percentile latency (ns) : ", token_target_latency_percentile.sample_latency); break; @@ -441,7 +456,7 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { } } - if (settings.infer_token_latencies){ + if (settings.infer_token_latencies) { switch (settings.scenario) { case TestScenario::SingleStream: { break; @@ -450,13 +465,15 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { break; } case TestScenario::Offline: { - double tokens_per_second = settings.token_latency_scaling_factor * sample_count / pr.max_latency; + double tokens_per_second = settings.token_latency_scaling_factor * + sample_count / pr.max_latency; summary("Tokens per second (inferred): ", tokens_per_second); break; } case TestScenario::Server: - double tps_as_completed = - settings.token_latency_scaling_factor * (sample_count - 1) / pr.final_query_all_samples_done_time; + double tps_as_completed = settings.token_latency_scaling_factor * + (sample_count - 1) / + pr.final_query_all_samples_done_time; summary("Completed tokens per second (inferred): ", DoubleToString(tps_as_completed)); break; @@ -472,23 +489,20 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { bool min_duration_met = MinDurationMet(&min_duration_recommendation); bool min_queries_met = MinQueriesMet() && MinSamplesMet(); bool early_stopping_met = true; - if (!settings.use_token_latencies){ - early_stopping_met = EarlyStopping(&early_stopping_recommendation, - pr.queries_issued, - &pr.sample_latencies, - &pr.query_latencies, - settings.target_latency); + if (!settings.use_token_latencies) { + early_stopping_met = EarlyStopping( + &early_stopping_recommendation, pr.queries_issued, &pr.sample_latencies, + &pr.query_latencies, settings.target_latency); } else { - early_stopping_met = EarlyStopping(&early_stopping_tpot_recommendation, - pr.queries_issued, - &pr.token_results.time_per_output_token_arr, - &pr.query_latencies, - std::chrono::nanoseconds(settings.server_tpot_latency)) && - EarlyStopping(&early_stopping_ttft_recommendation, - pr.queries_issued, - &pr.token_results.first_token_latencies, - &pr.query_latencies, - std::chrono::nanoseconds(settings.server_ttft_latency)); + early_stopping_met = + EarlyStopping(&early_stopping_tpot_recommendation, pr.queries_issued, + &pr.token_results.time_per_output_token_arr, + &pr.query_latencies, + std::chrono::nanoseconds(settings.server_tpot_latency)) && + EarlyStopping(&early_stopping_ttft_recommendation, 
pr.queries_issued, + &pr.token_results.first_token_latencies, + &pr.query_latencies, + std::chrono::nanoseconds(settings.server_ttft_latency)); } bool perf_constraints_met = PerfConstraintsMet(&perf_constraints_recommendation); @@ -521,7 +535,7 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { if (settings.scenario == TestScenario::SingleStream || settings.scenario == TestScenario::Server || settings.scenario == TestScenario::MultiStream) { - if (!settings.use_token_latencies){ + if (!settings.use_token_latencies) { summary("Early Stopping Result:"); summary(early_stopping_recommendation); } else { @@ -547,9 +561,9 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { } else if (settings.scenario == TestScenario::Server) { // Scheduled samples per second as an additional stat double qps_as_scheduled = - (sample_count - 1) / pr.final_query_scheduled_time; - summary("Scheduled samples per second : ", - DoubleToString(qps_as_scheduled)); + (sample_count - 1) / pr.final_query_scheduled_time; + summary("Scheduled samples per second : ", + DoubleToString(qps_as_scheduled)); } else if (settings.scenario == TestScenario::MultiStream) { summary("Per-query latency: "); summary("Min latency (ns) : ", query_latency_min); @@ -572,14 +586,16 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { lp.sample_latency); } } - if (settings.use_token_latencies){ + if (settings.use_token_latencies) { summary(""); if (settings.scenario == TestScenario::SingleStream) { - double tps_w_lg = token_count / pr.final_query_issued_time; - double tps_wo_lg = ((double)token_count) / (QuerySampleLatencyToSeconds(sample_latency_mean) * sample_count); - summary("TPS w/ loadgen overhead : " + DoubleToString(tps_w_lg)); - summary("TPS w/o loadgen overhead : " + DoubleToString(tps_wo_lg)); - + double tps_w_lg = token_count / pr.final_query_issued_time; + double tps_wo_lg = + ((double)token_count) / + (QuerySampleLatencyToSeconds(sample_latency_mean) * sample_count); + summary("TPS w/ loadgen overhead : " + DoubleToString(tps_w_lg)); + summary("TPS w/o loadgen overhead : " + DoubleToString(tps_wo_lg)); + } else if (settings.scenario == TestScenario::Server) { double tps_as_completed = token_count / pr.final_query_all_samples_done_time; @@ -588,22 +604,28 @@ void PerformanceSummary::LogSummary(AsyncSummary& summary) { } if (settings.scenario != TestScenario::Offline) { - summary("Min First Token latency (ns) : ", first_token_latency_min); - summary("Max First Token latency (ns) : ", first_token_latency_max); - summary("Mean First Token latency (ns) : ", first_token_latency_mean); + summary("Min First Token latency (ns) : ", + first_token_latency_min); + summary("Max First Token latency (ns) : ", + first_token_latency_max); + summary("Mean First Token latency (ns) : ", + first_token_latency_mean); for (auto& lp : token_latency_percentiles) { - summary( - DoubleToString(lp.percentile * 100) + " percentile first token latency (ns) : ", - lp.sample_latency); + summary(DoubleToString(lp.percentile * 100) + + " percentile first token latency (ns) : ", + lp.sample_latency); } summary(""); - summary("Min Time to Output Token (ns) : ", time_per_output_token_min); - summary("Max Time to Output Token (ns) : ", time_per_output_token_max); - summary("Mean Time to Output Token (ns) : ", time_per_output_token_mean); + summary("Min Time to Output Token (ns) : ", + time_per_output_token_min); + summary("Max Time to Output Token (ns) : ", + time_per_output_token_max); + summary("Mean Time to Output Token (ns) : ", 
+ time_per_output_token_mean); for (auto& lp : tpot_percentiles) { - summary( - DoubleToString(lp.percentile * 100) + " percentile time to output token (ns) : ", - lp.sample_latency); + summary(DoubleToString(lp.percentile * 100) + + " percentile time to output token (ns) : ", + lp.sample_latency); } } } @@ -631,23 +653,20 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) { bool perf_constraints_met = PerfConstraintsMet(&perf_constraints_recommendation); bool early_stopping_met = true; - if (!settings.use_token_latencies){ - early_stopping_met = EarlyStopping(&early_stopping_recommendation, - pr.queries_issued, - &pr.sample_latencies, - &pr.query_latencies, - settings.target_latency); + if (!settings.use_token_latencies) { + early_stopping_met = EarlyStopping( + &early_stopping_recommendation, pr.queries_issued, &pr.sample_latencies, + &pr.query_latencies, settings.target_latency); } else { - early_stopping_met = EarlyStopping(&early_stopping_tpot_recommendation, - pr.queries_issued, - &pr.token_results.time_per_output_token_arr, - &pr.query_latencies, - std::chrono::nanoseconds(settings.server_tpot_latency)) && - EarlyStopping(&early_stopping_ttft_recommendation, - pr.queries_issued, - &pr.token_results.first_token_latencies, - &pr.query_latencies, - std::chrono::nanoseconds(settings.server_ttft_latency)); + early_stopping_met = + EarlyStopping(&early_stopping_tpot_recommendation, pr.queries_issued, + &pr.token_results.time_per_output_token_arr, + &pr.query_latencies, + std::chrono::nanoseconds(settings.server_tpot_latency)) && + EarlyStopping(&early_stopping_ttft_recommendation, pr.queries_issued, + &pr.token_results.first_token_latencies, + &pr.query_latencies, + std::chrono::nanoseconds(settings.server_ttft_latency)); } bool all_constraints_met = min_duration_met && min_queries_met && perf_constraints_met && early_stopping_met; @@ -676,15 +695,17 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) { } std::replace(early_stopping_recommendation.begin(), early_stopping_recommendation.end(), '\n', ' '); - if (!settings.use_token_latencies){ + if (!settings.use_token_latencies) { MLPERF_LOG(detail, "early_stopping_result", early_stopping_recommendation); - } else{ + } else { std::replace(early_stopping_ttft_recommendation.begin(), - early_stopping_ttft_recommendation.end(), '\n', ' '); + early_stopping_ttft_recommendation.end(), '\n', ' '); std::replace(early_stopping_tpot_recommendation.begin(), - early_stopping_tpot_recommendation.end(), '\n', ' '); - MLPERF_LOG(detail, "early_stopping_ttft_result", early_stopping_ttft_recommendation); - MLPERF_LOG(detail, "early_stopping_tpot_result", early_stopping_tpot_recommendation); + early_stopping_tpot_recommendation.end(), '\n', ' '); + MLPERF_LOG(detail, "early_stopping_ttft_result", + early_stopping_ttft_recommendation); + MLPERF_LOG(detail, "early_stopping_tpot_result", + early_stopping_tpot_recommendation); } // Report number of queries MLPERF_LOG(detail, "result_query_count", query_count); @@ -759,49 +780,64 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) { lp.sample_latency); } // Detailed first token latencies - if (settings.use_token_latencies){ + if (settings.use_token_latencies) { if (settings.scenario != TestScenario::Offline) { - MLPERF_LOG(detail, "result_first_token_min_latency_ns", first_token_latency_min); - MLPERF_LOG(detail, "result_first_token_max_latency_ns", first_token_latency_max); - MLPERF_LOG(detail, "result_first_token_mean_latency_ns", first_token_latency_mean); + MLPERF_LOG(detail, 
"result_first_token_min_latency_ns", + first_token_latency_min); + MLPERF_LOG(detail, "result_first_token_max_latency_ns", + first_token_latency_max); + MLPERF_LOG(detail, "result_first_token_mean_latency_ns", + first_token_latency_mean); for (auto& lp : token_latency_percentiles) { MLPERF_LOG(detail, - "result_first_token_" + DoubleToString(lp.percentile * 100) + - "_percentile_latency_ns", - lp.sample_latency); + "result_first_token_" + DoubleToString(lp.percentile * 100) + + "_percentile_latency_ns", + lp.sample_latency); } double tps_w_lg = ((double)token_count) / pr.final_query_issued_time; - double tps_wo_lg= ((double)token_count) / (sample_latency_mean * sample_count); - MLPERF_LOG(detail, "result_token_throughput_with_loadgen_overhead", tps_w_lg); + double tps_wo_lg = + ((double)token_count) / (sample_latency_mean * sample_count); + MLPERF_LOG(detail, "result_token_throughput_with_loadgen_overhead", + tps_w_lg); MLPERF_LOG(detail, "result_token_throughput", tps_wo_lg); for (auto& lp : tpot_percentiles) { MLPERF_LOG(detail, - "result_time_per_output_token_" + DoubleToString(lp.percentile * 100) + - "_percentile_ns", - lp.sample_latency); + "result_time_per_output_token_" + + DoubleToString(lp.percentile * 100) + "_percentile_ns", + lp.sample_latency); } - MLPERF_LOG(detail, "result_time_to_output_token_min", time_per_output_token_min); - MLPERF_LOG(detail, "result_time_to_output_token_max", time_per_output_token_max); - MLPERF_LOG(detail, "result_time_to_output_token_mean", time_per_output_token_mean); + MLPERF_LOG(detail, "result_time_to_output_token_min", + time_per_output_token_min); + MLPERF_LOG(detail, "result_time_to_output_token_max", + time_per_output_token_max); + MLPERF_LOG(detail, "result_time_to_output_token_mean", + time_per_output_token_mean); double tps_as_completed = - token_count / pr.final_query_all_samples_done_time; - MLPERF_LOG(detail, "result_completed_tokens_per_second", tps_as_completed); + token_count / pr.final_query_all_samples_done_time; + MLPERF_LOG(detail, "result_completed_tokens_per_second", + tps_as_completed); } else { double tokens_per_second = token_count / pr.max_latency; MLPERF_LOG(detail, "result_tokens_per_second", tokens_per_second); } } - if (settings.infer_token_latencies){ + if (settings.infer_token_latencies) { switch (settings.scenario) { case TestScenario::Server: { - double completed_tokens_per_second = (sample_count - 1) * settings.token_latency_scaling_factor / pr.final_query_all_samples_done_time; - MLPERF_LOG(detail, "result_inferred_completed_tokens_per_second", completed_tokens_per_second); + double completed_tokens_per_second = + (sample_count - 1) * settings.token_latency_scaling_factor / + pr.final_query_all_samples_done_time; + MLPERF_LOG(detail, "result_inferred_completed_tokens_per_second", + completed_tokens_per_second); break; } case TestScenario::Offline: { - double tokens_per_second = sample_count * settings.token_latency_scaling_factor / pr.max_latency; - MLPERF_LOG(detail, "result_inferred_tokens_per_second", tokens_per_second); + double tokens_per_second = sample_count * + settings.token_latency_scaling_factor / + pr.max_latency; + MLPERF_LOG(detail, "result_inferred_tokens_per_second", + tokens_per_second); break; } case TestScenario::SingleStream: { @@ -810,9 +846,9 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) { case TestScenario::MultiStream: { break; } - } + } #endif -} + } } } // namespace loadgen -} // namespace mlperf +} // namespace mlperf diff --git a/loadgen/results.h b/loadgen/results.h index 
38bbe32d4..b53abfa3d 100644 --- a/loadgen/results.h +++ b/loadgen/results.h @@ -16,12 +16,12 @@ limitations under the License. #ifndef MLPERF_LOADGEN_RESULTS_H_ #define MLPERF_LOADGEN_RESULTS_H_ +#include +#include + #include "query_sample.h" #include "test_settings_internal.h" -#include -#include - namespace mlperf { namespace loadgen { @@ -46,7 +46,6 @@ struct PerformanceResult { TokenPerformanceResults token_results; }; - /// \brief Wraps PerformanceResult with relevant context to change how /// it's interpreted and reported. struct PerformanceSummary { @@ -92,12 +91,13 @@ struct PerformanceSummary { QuerySampleLatency time_per_output_token_mean; // Latency token target percentile - PercentileEntry token_target_latency_percentile{settings.target_latency_percentile}; + PercentileEntry token_target_latency_percentile{ + settings.target_latency_percentile}; PercentileEntry token_latency_percentiles[6] = {{.50}, {.90}, {.95}, {.97}, {.99}, {.999}}; PercentileEntry target_tpot_percentile{settings.target_latency_percentile}; PercentileEntry tpot_percentiles[6] = {{.50}, {.90}, {.95}, - {.97}, {.99}, {.999}}; + {.97}, {.99}, {.999}}; #if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64) // MSVC complains if there is no explicit constructor. @@ -111,10 +111,10 @@ struct PerformanceSummary { void ProcessTokenLatencies(); bool MinDurationMet(std::string* recommendation); - bool EarlyStopping(std::string* recommendation, int64_t queries_issued, - std::vector* sample_latencies, - std::vector* query_latencies, - std::chrono::nanoseconds target_latency); + bool EarlyStopping(std::string* recommendation, int64_t queries_issued, + std::vector* sample_latencies, + std::vector* query_latencies, + std::chrono::nanoseconds target_latency); bool MinQueriesMet(); bool MinSamplesMet(); bool HasPerfConstraints(); @@ -125,5 +125,4 @@ struct PerformanceSummary { } // namespace loadgen } // namespace mlperf - #endif diff --git a/loadgen/setup.py b/loadgen/setup.py index d7b8224de..81952dff7 100644 --- a/loadgen/setup.py +++ b/loadgen/setup.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================= -## \file +# \file # \brief MLPerf Inference LoadGen python module setup. # \details Creates a module that python can import. 
# All source files are compiled by python"s C++ toolchain without depending @@ -69,23 +69,30 @@ this_directory = Path(__file__).parent mlperf_loadgen_headers = public_headers + lib_headers mlperf_loadgen_sources_no_gen = lib_sources + lib_bindings -mlperf_loadgen_sources = (mlperf_loadgen_sources_no_gen + - [generated_version_source_filename]) -mlperf_long_description = (this_directory / "README.md").read_text(encoding="utf-8") +mlperf_loadgen_sources = mlperf_loadgen_sources_no_gen + [ + generated_version_source_filename +] +mlperf_long_description = ( + this_directory / + "README.md").read_text( + encoding="utf-8") mlperf_loadgen_module = Pybind11Extension( - "mlperf_loadgen", - define_macros=[("MAJOR_VERSION", "4"), ("MINOR_VERSION", "0")], - include_dirs=[".", get_include()], - sources=mlperf_loadgen_sources, - depends=mlperf_loadgen_headers) + "mlperf_loadgen", + define_macros=[("MAJOR_VERSION", "4"), ("MINOR_VERSION", "0")], + include_dirs=[".", get_include()], + sources=mlperf_loadgen_sources, + depends=mlperf_loadgen_headers, +) -setup(name="mlperf_loadgen", - version="4.0", - description="MLPerf Inference LoadGen python bindings", - url="https://mlcommons.org/", - cmdclass={"build_ext": build_ext}, - ext_modules=[mlperf_loadgen_module], - long_description=mlperf_long_description, - long_description_content_type='text/markdown') +setup( + name="mlperf_loadgen", + version="4.0", + description="MLPerf Inference LoadGen python bindings", + url="https://mlcommons.org/", + cmdclass={"build_ext": build_ext}, + ext_modules=[mlperf_loadgen_module], + long_description=mlperf_long_description, + long_description_content_type="text/markdown", +) diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h index 8b209035c..0c6fad431 100644 --- a/loadgen/test_settings.h +++ b/loadgen/test_settings.h @@ -180,10 +180,10 @@ struct TestSettings { double offline_expected_qps = 1; /// \brief Affects the order in which the samples of the dataset are chosen. /// If false it concatenates a single permutation of the dataset (or part - /// of it depending on QSL->PerformanceSampleCount()) several times up to the + /// of it depending on QSL->PerformanceSampleCount()) several times up to the /// number of samples requested. - /// If true it concatenates a multiple permutation of the dataset (or a - /// part of it depending on QSL->PerformanceSampleCount()) several times + /// If true it concatenates multiple permutations of the dataset (or a + /// part of it depending on QSL->PerformanceSampleCount()) several times /// up to the number of samples requested. bool sample_concatenate_permutation = false; /**@}*/ @@ -228,7 +228,8 @@ struct TestSettings { uint64_t accuracy_log_sampling_target = 0; /// \brief Variables for running test05 from native config.
A boolean that - /// determines whether or not to run test05 and three random seed to run the test + /// determines whether or not to run test05 and three random seeds to run the + /// test bool test05 = false; uint64_t test05_qsl_rng_seed = 0; uint64_t test05_sample_index_rng_seed = 0; diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc index 5a18c32f9..ac993e3e2 100644 --- a/loadgen/test_settings_internal.cc +++ b/loadgen/test_settings_internal.cc @@ -53,7 +53,7 @@ TestSettingsInternal::TestSettingsInternal( server_ttft_latency(requested.server_ttft_latency), server_tpot_latency(requested.server_tpot_latency), infer_token_latencies(requested.infer_token_latencies), - token_latency_scaling_factor(requested.token_latency_scaling_factor){ + token_latency_scaling_factor(requested.token_latency_scaling_factor) { // Target QPS, target latency, and max_async_queries. switch (requested.scenario) { case TestScenario::SingleStream: @@ -158,14 +158,15 @@ TestSettingsInternal::TestSettingsInternal( // performance_sample_count == 0 makes it to be equal to loaded_samples.size() if (sample_concatenate_permutation && requested.scenario == TestScenario::SingleStream) { - // set slack larger for 3D-UNet KiTS19 distribution, i.e. 50% latency << 90% latency + // set slack larger for 3D-UNet KiTS19 distribution, i.e. 50% latency << 90% + // latency constexpr double kSlack = 2.0; - uint64_t expected_queries = kSlack * DurationToSeconds(target_duration) * target_qps; - min_query_count = min_query_count > expected_queries - ? min_query_count - : expected_queries; - min_query_count += - qsl_performance_sample_count - (min_query_count % qsl_performance_sample_count); + uint64_t expected_queries = + kSlack * DurationToSeconds(target_duration) * target_qps; + min_query_count = + min_query_count > expected_queries ?
min_query_count : expected_queries; + min_query_count += qsl_performance_sample_count - + (min_query_count % qsl_performance_sample_count); } min_sample_count = min_query_count * samples_per_query; @@ -335,11 +336,14 @@ void LogRequestedTestSettings(const TestSettings &s) { MLPERF_LOG(detail, "requested_performance_sample_count_override", s.performance_sample_count_override); // Token latencies specific values - if (s.use_token_latencies){ - MLPERF_LOG(detail, "requested_use_token_latencies", s.use_token_latencies); - if (s.scenario != TestScenario::Offline){ - MLPERF_LOG(detail, "requested_server_ttft_latency", s.server_ttft_latency); - MLPERF_LOG(detail, "requested_server_tpot_latency", s.server_tpot_latency); + if (s.use_token_latencies) { + MLPERF_LOG(detail, "requested_use_token_latencies", + s.use_token_latencies); + if (s.scenario != TestScenario::Offline) { + MLPERF_LOG(detail, "requested_server_ttft_latency", + s.server_ttft_latency); + MLPERF_LOG(detail, "requested_server_tpot_latency", + s.server_tpot_latency); } } #else @@ -484,7 +488,7 @@ void TestSettingsInternal::LogAllSettings() const { void TestSettingsInternal::LogSummary(AsyncSummary &summary) const { summary("samples_per_query : ", samples_per_query); summary("target_qps : ", target_qps); - if (!use_token_latencies){ + if (!use_token_latencies) { summary("target_latency (ns): ", target_latency.count()); } else { summary("ttft_latency (ns): ", server_ttft_latency); @@ -682,29 +686,35 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model, &performance_issue_same_index, nullptr); lookupkv(model, scenario, "performance_sample_count_override", &performance_sample_count_override, nullptr); - if (lookupkv(model, scenario, "sample_concatenate_permutation", &val, nullptr)) + if (lookupkv(model, scenario, "sample_concatenate_permutation", &val, + nullptr)) sample_concatenate_permutation = (val == 1) ? true : false; if (lookupkv(model, scenario, "test05", &val, nullptr)) test05 = (val == 1) ? true : false; - lookupkv(model, scenario, "test05_qsl_rng_seed", &test05_qsl_rng_seed, nullptr); - lookupkv(model, scenario, "test05_sample_index_rng_seed", &test05_sample_index_rng_seed, + lookupkv(model, scenario, "test05_qsl_rng_seed", &test05_qsl_rng_seed, nullptr); - lookupkv(model, scenario, "test05_schedule_rng_seed", &test05_schedule_rng_seed, nullptr); + lookupkv(model, scenario, "test05_sample_index_rng_seed", + &test05_sample_index_rng_seed, nullptr); + lookupkv(model, scenario, "test05_schedule_rng_seed", + &test05_schedule_rng_seed, nullptr); // keys to measure token metrics - if (lookupkv(model, scenario, "use_token_latencies", &val, nullptr)){ + if (lookupkv(model, scenario, "use_token_latencies", &val, nullptr)) { use_token_latencies = (val == 1) ? true : false; - if (use_token_latencies){ - lookupkv(model, "Server", "ttft_latency", &server_ttft_latency, nullptr, 1000 * 1000); - lookupkv(model, "Server", "tpot_latency", &server_tpot_latency, nullptr, 1000 * 1000); + if (use_token_latencies) { + lookupkv(model, "Server", "ttft_latency", &server_ttft_latency, nullptr, + 1000 * 1000); + lookupkv(model, "Server", "tpot_latency", &server_tpot_latency, nullptr, + 1000 * 1000); } } // keys to infer token metrics - if (lookupkv(model, scenario, "infer_token_latencies", &val, nullptr)){ + if (lookupkv(model, scenario, "infer_token_latencies", &val, nullptr)) { infer_token_latencies = (val == 1) ? 
true : false; - if (infer_token_latencies){ - lookupkv(model, scenario, "token_latency_scaling_factor", &token_latency_scaling_factor, nullptr, 1); + if (infer_token_latencies) { + lookupkv(model, scenario, "token_latency_scaling_factor", + &token_latency_scaling_factor, nullptr, 1); } } // keys that apply to SingleStream diff --git a/loadgen/test_settings_internal.h b/loadgen/test_settings_internal.h index d557e9706..ab2773bd1 100644 --- a/loadgen/test_settings_internal.h +++ b/loadgen/test_settings_internal.h @@ -102,7 +102,7 @@ TestSettingsInternal MidOfBoundaries( const TestSettingsInternal &lower_bound_settings, const TestSettingsInternal &upper_bound_settings) { TestSettingsInternal mid_settings = lower_bound_settings; -if (scenario == TestScenario::Server) { + if (scenario == TestScenario::Server) { assert(lower_bound_settings.target_qps < upper_bound_settings.target_qps); mid_settings.target_qps = lower_bound_settings.target_qps + @@ -122,7 +122,7 @@ if (scenario == TestScenario::Server) { template bool IsFinished(const TestSettingsInternal &lower_bound_settings, const TestSettingsInternal &upper_bound_settings) { -if (scenario == TestScenario::Server) { + if (scenario == TestScenario::Server) { uint8_t precision = lower_bound_settings.requested .server_find_peak_qps_decimals_of_precision; double l = diff --git a/loadgen/tests/loadgen_test.h b/loadgen/tests/loadgen_test.h index 00afcb63a..777029b99 100644 --- a/loadgen/tests/loadgen_test.h +++ b/loadgen/tests/loadgen_test.h @@ -30,10 +30,10 @@ limitations under the License. static Test::StaticRegistrant t##name##scenario( \ #name "_" #scenario, test, __VA_ARGS__, mlperf::TestScenario::scenario) -#define REGISTER_TEST_ALL_SCENARIOS(name, test, ...) \ - REGISTER_TEST_SCENARIO(name, SingleStream, test, __VA_ARGS__); \ - REGISTER_TEST_SCENARIO(name, MultiStream, test, __VA_ARGS__); \ - REGISTER_TEST_SCENARIO(name, Server, test, __VA_ARGS__); \ +#define REGISTER_TEST_ALL_SCENARIOS(name, test, ...) \ + REGISTER_TEST_SCENARIO(name, SingleStream, test, __VA_ARGS__); \ + REGISTER_TEST_SCENARIO(name, MultiStream, test, __VA_ARGS__); \ + REGISTER_TEST_SCENARIO(name, Server, test, __VA_ARGS__); \ REGISTER_TEST_SCENARIO(name, Offline, test, __VA_ARGS__); #define FAIL_IF(exp) \ diff --git a/loadgen/tests/perftests_null_sut.py b/loadgen/tests/perftests_null_sut.py index 5364c6365..115372e18 100644 --- a/loadgen/tests/perftests_null_sut.py +++ b/loadgen/tests/perftests_null_sut.py @@ -50,7 +50,8 @@ def main(argv): sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) qsl = mlperf_loadgen.ConstructQSL( - 1024 * 1024, 1024, load_samples_to_ram, unload_samples_from_ram) + 1024 * 1024, 1024, load_samples_to_ram, unload_samples_from_ram + ) mlperf_loadgen.StartTest(sut, qsl, settings) mlperf_loadgen.DestroyQSL(qsl) mlperf_loadgen.DestroySUT(sut) diff --git a/loadgen/version_generator.py b/loadgen/version_generator.py index 4de930a63..810c6be28 100644 --- a/loadgen/version_generator.py +++ b/loadgen/version_generator.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================= -## \file +# \file # \brief A script run by the build to generate the version definitions # expected at link time. @@ -27,27 +27,39 @@ # Creates a C++ raw string literal using a delimiter that is very # unlikely to show up in a git stats. 
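# A quick illustration of the helper below (example input is hypothetical):
# with the "LGVG_RSLD" delimiter it uses,
#
#     make_raw_string("M  loadgen/logging.cc")
#
# returns the C++ raw string literal
#
#     R"LGVG_RSLD(M  loadgen/logging.cc)LGVG_RSLD"
#
# so git output containing quotes or backslashes can be embedded in the
# generated version source without any escaping.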
-def make_raw_string(str) : +def make_raw_string(str): delimeter = "LGVG_RSLD" - return "R\"" + delimeter + "(" + str + ")" + delimeter + "\"" + return 'R"' + delimeter + "(" + str + ")" + delimeter + '"' + def func_def(name, string): - return ("const std::string& Loadgen" + name + "() {\n" + - " static const std::string str = " + string + ";\n" + - " return str;\n" + - "}\n\n") + return ( + "const std::string& Loadgen" + + name + + "() {\n" + + " static const std::string str = " + + string + + ";\n" + + " return str;\n" + + "}\n\n" + ) # For clients that build the loadgen from the git respository without # any modifications. def generate_loadgen_version_definitions_git(ofile, git_command): git_rev = os.popen(git_command + "rev-parse --short=10 HEAD").read() - git_commit_date = os.popen(git_command + "log --format=\"%cI\" -n 1").read() + git_commit_date = os.popen(git_command + 'log --format="%cI" -n 1').read() git_status = os.popen(git_command + "status -s -uno .").read() git_log = subprocess.Popen( - git_command + "log --pretty=oneline -n 16 --no-decorate", stdout=subprocess.PIPE, shell=True, encoding='ascii', errors="ignore" ).stdout.read() - ofile.write(func_def("GitRevision", "\"" + git_rev[0:-1] + "\"")) - ofile.write(func_def("GitCommitDate", "\"" + git_commit_date[0:-1] + "\"")) + git_command + "log --pretty=oneline -n 16 --no-decorate", + stdout=subprocess.PIPE, + shell=True, + encoding="ascii", + errors="ignore", + ).stdout.read() + ofile.write(func_def("GitRevision", '"' + git_rev[0:-1] + '"')) + ofile.write(func_def("GitCommitDate", '"' + git_commit_date[0:-1] + '"')) ofile.write(func_def("GitStatus", make_raw_string(git_status[0:-1]))) ofile.write(func_def("GitLog", make_raw_string(git_log[0:-1]))) @@ -55,7 +67,7 @@ def generate_loadgen_version_definitions_git(ofile, git_command): # For clients that might not import the loadgen code as the original git # repository. 
def generate_loadgen_verstion_definitions_git_stubs(ofile): - na = "\"NA\"" + na = '"NA"' ofile.write(func_def("GitRevision", na)) ofile.write(func_def("GitCommitDate", na)) ofile.write(func_def("GitStatus", na)) @@ -68,9 +80,10 @@ def generate_loadgen_version_definitions_sha1(ofile, loadgen_root): """Writes definition for Sha1OfFiles.""" sha1s = "" loadgen_files = ( - ["/bindings/" + s for s in os.listdir(loadgen_root + "/bindings")] + - ["/demos/" + s for s in os.listdir(loadgen_root + "/demos")] + - ["/" + s for s in os.listdir(loadgen_root)]) + ["/bindings/" + s for s in os.listdir(loadgen_root + "/bindings")] + + ["/demos/" + s for s in os.listdir(loadgen_root + "/demos")] + + ["/" + s for s in os.listdir(loadgen_root)] + ) for fn in sorted(loadgen_files): full_fn = loadgen_root + fn if not os.path.isfile(full_fn): @@ -94,15 +107,15 @@ def generate_loadgen_version_definitions(cc_filename, loadgen_root): ofile.write("// DO NOT EDIT: Autogenerated by version_generator.py.\n\n") ofile.write("#include \n\n") ofile.write("namespace mlperf {\n\n") - ofile.write(func_def("Version", "\"4.0\"")) + ofile.write(func_def("Version", '"4.0"')) date_time_now_local = datetime.datetime.now().isoformat() date_time_now_utc = datetime.datetime.utcnow().isoformat() - ofile.write(func_def("BuildDateLocal", "\"" + date_time_now_local + "\"")) - ofile.write(func_def("BuildDateUtc", "\"" + date_time_now_utc + "\"")) + ofile.write(func_def("BuildDateLocal", '"' + date_time_now_local + '"')) + ofile.write(func_def("BuildDateUtc", '"' + date_time_now_utc + '"')) - git_dir = "--git-dir=\"" + loadgen_root + "/../.git\" " - git_work_tree = "--work-tree=\"" + loadgen_root + "/..\" " + git_dir = '--git-dir="' + loadgen_root + '/../.git" ' + git_work_tree = '--work-tree="' + loadgen_root + '/.." ' git_command = "git " + git_dir + git_work_tree git_status = os.popen(git_command + "status") git_status.read() diff --git a/lon/network_SUT.py b/lon/network_SUT.py index 193515149..4cbb31eca 100644 --- a/lon/network_SUT.py +++ b/lon/network_SUT.py @@ -13,15 +13,14 @@ # limitations under the License. # ============================================================================= +from flask import Flask, request, jsonify +import argparse import array import json import os import sys -sys.path.insert(0, os.getcwd()) - -import argparse -from flask import Flask, request, jsonify +sys.path.insert(0, os.getcwd()) app = Flask(__name__) @@ -29,39 +28,45 @@ node = "" + def set_backend(b): global backend backend = b + def preprocess(query): """[SUT Node] A dummy preprocess.""" - # Here may come for example batching, tokenization, resizing, normalization, etc. + # Here may come for example batching, tokenization, resizing, + # normalization, etc. response = query return response def dnn_model(query): - # Here may come for example a call to a dnn model such as resnet, bert, etc. + # Here may come for example a call to a dnn model such as resnet, bert, + # etc. response = backend.process_sample(query) return response def postprocess(query): """[SUT Node] A dummy postprocess.""" - # Here may come for example a postprocessing call, e.g., NMS, detokenization, etc. + # Here may come for example a postprocessing call, e.g., NMS, + # detokenization, etc. 
response = query return response -@app.route('/predict/', methods=['POST']) +@app.route("/predict/", methods=["POST"]) def predict(): """Receives a query (e.g., a text) runs inference, and returns a prediction.""" - query = request.get_json(force=True)['query'] + query = request.get_json(force=True)["query"] result = postprocess(dnn_model(preprocess(query))) return jsonify(result=result) -@app.route('/getname/', methods=['POST', 'GET']) +@app.route("/getname/", methods=["POST", "GET"]) def getname(): """Returns the name of the SUT.""" - return jsonify(name=f'Demo SUT (Network SUT) node' + (' ' + node) if node else '') + return jsonify(name=f"Demo SUT (Network SUT) node" + + (" " + node) if node else "") diff --git a/main.py b/main.py index f73357d4e..c91edb41b 100644 --- a/main.py +++ b/main.py @@ -4,126 +4,161 @@ def define_env(env): def mlperf_inference_implementation_readme(spaces, model, implementation): pre_space = "" - for i in range(1,spaces): - pre_space = pre_space + " " + for i in range(1, spaces): + pre_space = pre_space + " " f_pre_space = pre_space pre_space += " " - content="" + content = "" scenarios = [] - execution_envs = ["Docker","Native"] + execution_envs = ["Docker", "Native"] if implementation == "reference": - devices = [ "CPU", "CUDA", "ROCm" ] + devices = ["CPU", "CUDA", "ROCm"] if model.lower() == "resnet50": - frameworks = [ "Onnxruntime", "Tensorflow", "Deepsparse" ] + frameworks = ["Onnxruntime", "Tensorflow", "Deepsparse"] elif model.lower() == "retinanet": - frameworks = [ "Onnxruntime", "Pytorch" ] + frameworks = ["Onnxruntime", "Pytorch"] elif "bert" in model.lower(): - frameworks = [ "Onnxruntime", "Pytorch", "Tensorflow" ] + frameworks = ["Onnxruntime", "Pytorch", "Tensorflow"] else: - frameworks = [ "Pytorch" ] + frameworks = ["Pytorch"] elif implementation == "nvidia": - if model in [ "sdxl", "llama2-70b-99", "llama2-70b-99.9" ]: - return pre_space+" WIP" - devices = [ "CUDA" ] - frameworks = [ "TensorRT" ] + if model in ["sdxl", "llama2-70b-99", "llama2-70b-99.9"]: + return pre_space + " WIP" + devices = ["CUDA"] + frameworks = ["TensorRT"] elif implementation == "intel": - if model not in [ "bert-99", "bert-99.9", "gptj-99", "gptj-99.9" ]: - return pre_space+" WIP" - devices = [ "CPU" ] - frameworks = [ "Pytorch" ] + if model not in ["bert-99", "bert-99.9", "gptj-99", "gptj-99.9"]: + return pre_space + " WIP" + devices = ["CPU"] + frameworks = ["Pytorch"] elif implementation == "qualcomm": - if model not in [ "resnet50", "retinanet", "bert-99", "bert-99.9" ]: - return pre_space+" WIP" + if model not in ["resnet50", "retinanet", "bert-99", "bert-99.9"]: + return pre_space + " WIP" - devices = [ "QAIC" ] - frameworks = [ "Glow" ] + devices = ["QAIC"] + frameworks = ["Glow"] elif implementation == "cpp": - devices = [ "CPU", "CUDA" ] - frameworks = [ "Onnxruntime" ] + devices = ["CPU", "CUDA"] + frameworks = ["Onnxruntime"] elif implementation == "ctuning-cpp": - scenarios = [ "SingleStream" ] - devices = [ "CPU" ] + scenarios = ["SingleStream"] + devices = ["CPU"] if model.lower() == "resnet50": - frameworks = [ "TFLite" ] + frameworks = ["TFLite"] else: - frameworks = [] + frameworks = [] if model.lower() == "bert-99.9": - categories = [ "Datacenter" ] + categories = ["Datacenter"] elif "dlrm" in model.lower() or "llama2" in model.lower(): - categories = [ "Datacenter" ] + categories = ["Datacenter"] else: - categories = [ "Edge", "Datacenter" ] + categories = ["Edge", "Datacenter"] for category in categories: if category == "Edge" and not scenarios: - 
scenarios = [ "Offline", "SingleStream" ] - if model.lower() in [ "resnet50", "retinanet" ] and not "MultiStream" in scenarios:#MultiStream was duplicating - scenarios.append("MultiStream") + scenarios = ["Offline", "SingleStream"] + if ( + model.lower() in ["resnet50", "retinanet"] + and not "MultiStream" in scenarios + ): # MultiStream was duplicating + scenarios.append("MultiStream") elif category == "Datacenter": - scenarios = [ "Offline", "Server" ] + scenarios = ["Offline", "Server"] - content += f"{pre_space}=== \"{category.lower()}\"\n\n" + content += f'{pre_space}=== "{category.lower()}"\n\n' cur_space = pre_space + " " scenarios_string = ", ".join(scenarios) content += f"{cur_space}### {category} category \n\n{cur_space} In the {category.lower()} category, {model} has {scenarios_string} scenarios and all the scenarios are mandatory for a closed division submission.\n\n" - for framework in frameworks: cur_space1 = cur_space + " " - content += f"{cur_space}=== \"{framework}\"\n" + content += f'{cur_space}=== "{framework}"\n' content += f"{cur_space1}#### {framework} framework\n\n" for device in devices: if framework.lower() == "deepsparse": if device.lower() != "cpu": - continue + continue cur_space2 = cur_space1 + " " cur_space3 = cur_space2 + " " cur_space4 = cur_space3 + " " - - content += f"{cur_space1}=== \"{device}\"\n" + + content += f'{cur_space1}=== "{device}"\n' content += f"{cur_space2}##### {device} device\n\n" - # to select the execution environments(currently Docker and Native) + # to select the execution environments(currently Docker and + # Native) for execution_env in execution_envs: - if (device == "ROCm" or implementation == "qualcomm") and execution_env == "Docker": + if ( + device == "ROCm" or implementation == "qualcomm" + ) and execution_env == "Docker": continue # docker not currently supported for Qualcomm implementation and ROCm device if implementation == "nvidia" and execution_env == "Native": continue # Nvidia implementation only supports execution through docker - content += f"{cur_space2}=== \"{execution_env}\"\n" + content += f'{cur_space2}=== "{execution_env}"\n' content += f"{cur_space3}###### {execution_env} Environment\n\n" - test_query_count=get_test_query_count(model, implementation, device) - - if "99.9" not in model: #not showing docker command as it is already done for the 99% variant - if execution_env == "Native": # Native implementation steps through virtual environment + test_query_count = get_test_query_count( + model, implementation, device + ) + + if ( + "99.9" not in model + ): # not showing docker command as it is already done for the 99% variant + if ( + execution_env == "Native" + ): # Native implementation steps through virtual environment content += f"{cur_space3}####### Setup a virtual environment for Python\n" - content += get_venv_command(spaces+16) + content += get_venv_command(spaces + 16) content += f"{cur_space3}####### Performance Estimation for Offline Scenario\n" - content += mlperf_inference_run_command(spaces+17, model, implementation, framework.lower(), category.lower(), "Offline", device.lower(), "test", test_query_count, True).replace("--docker ","") + content += mlperf_inference_run_command( + spaces + 17, + model, + implementation, + framework.lower(), + category.lower(), + "Offline", + device.lower(), + "test", + test_query_count, + True, + ).replace("--docker ", "") content += f"{cur_space3}The above command should do a test run of Offline scenario and record the estimated offline_target_qps.\n\n" - else: # 
Docker implementation steps + else: # Docker implementation steps content += f"{cur_space3}####### Docker Container Build and Performance Estimation for Offline Scenario\n" - docker_info = get_docker_info(spaces+16, model, implementation, device) + docker_info = get_docker_info( + spaces + 16, model, implementation, device + ) content += docker_info - content += mlperf_inference_run_command(spaces+17, model, implementation, framework.lower(), category.lower(), "Offline", device.lower(), "test", test_query_count, True) + content += mlperf_inference_run_command( + spaces + 17, + model, + implementation, + framework.lower(), + category.lower(), + "Offline", + device.lower(), + "test", + test_query_count, + True, + ) content += f"{cur_space3}The above command should get you to an interactive shell inside the docker container and do a quick test run for the Offline scenario. Once inside the docker container please do the below commands to do the accuracy + performance runs for each scenario.\n\n" content += f"{cur_space3}
\n" content += f"{cur_space3} Please click here to see more options for the docker launch \n\n" content += f"{cur_space3}* `--docker_cm_repo `: to use a custom fork of cm4mlops repository inside the docker image\n\n" content += f"{cur_space3}* `--docker_cache=no`: to not use docker cache during the image build\n" - if device.lower() not in [ "cuda" ]: + if device.lower() not in ["cuda"]: content += f"{cur_space3}* `--docker_os=ubuntu`: ubuntu and rhel are supported. \n" content += f"{cur_space3}* `--docker_os_version=20.04`: [20.04, 22.04] are supported for Ubuntu and [8, 9] for RHEL\n" @@ -131,30 +166,58 @@ def mlperf_inference_implementation_readme(spaces, model, implementation): else: content += f"{cur_space3} You can reuse the same environment as described for {model.split('.')[0]}.\n" content += f"{cur_space3}###### Performance Estimation for Offline Scenario\n" - content += mlperf_inference_run_command(spaces+17, model, implementation, framework.lower(), category.lower(), "Offline", device.lower(), "test", test_query_count, True).replace("--docker ","") + content += mlperf_inference_run_command( + spaces + 17, + model, + implementation, + framework.lower(), + category.lower(), + "Offline", + device.lower(), + "test", + test_query_count, + True, + ).replace("--docker ", "") content += f"{cur_space3}The above command should do a test run of Offline scenario and record the estimated offline_target_qps.\n\n" - run_suffix = "" run_suffix += f"{cur_space3}
\n" run_suffix += f"{cur_space3} Please click here to see more options for the RUN command\n\n" run_suffix += f"{cur_space3}* Use `--division=closed` to do a closed division submission which includes compliance runs\n\n" - run_suffix += f"{cur_space3}* Use `--rerun` to do a rerun even when a valid run exists\n" + run_suffix += f"{cur_space3}* Use `--rerun` to do a rerun even when a valid run exists\n" run_suffix += f"{cur_space3}
\n" for scenario in scenarios: - content += f"{cur_space3}=== \"{scenario}\"\n{cur_space4}###### {scenario}\n\n" - run_cmd = mlperf_inference_run_command(spaces+21, model, implementation, framework.lower(), category.lower(), scenario, device.lower(), "valid", scenarios) + content += f'{cur_space3}=== "{scenario}"\n{cur_space4}###### {scenario}\n\n' + run_cmd = mlperf_inference_run_command( + spaces + 21, + model, + implementation, + framework.lower(), + category.lower(), + scenario, + device.lower(), + "valid", + scenarios, + ) content += run_cmd - #content += run_suffix - - content += f"{cur_space3}=== \"All Scenarios\"\n{cur_space4}###### All Scenarios\n\n" - run_cmd = mlperf_inference_run_command(spaces+21, model, implementation, framework.lower(), category.lower(), "All Scenarios", device.lower(), "valid", scenarios) + # content += run_suffix + + content += f'{cur_space3}=== "All Scenarios"\n{cur_space4}###### All Scenarios\n\n' + run_cmd = mlperf_inference_run_command( + spaces + 21, + model, + implementation, + framework.lower(), + category.lower(), + "All Scenarios", + device.lower(), + "valid", + scenarios, + ) content += run_cmd content += run_suffix - - readme_prefix = get_readme_prefix(spaces, model, implementation) readme_suffix = get_readme_suffix(spaces, model, implementation) @@ -164,11 +227,11 @@ def mlperf_inference_implementation_readme(spaces, model, implementation): def get_test_query_count(model, implementation, device, num_devices=1): if model == "resnet50": - p_range = 1000 - elif model in [ "retinanet", "bert-99", "bert-99.9" ]: - p_range = 100 + p_range = 1000 + elif model in ["retinanet", "bert-99", "bert-99.9"]: + p_range = 100 else: - p_range = 50 + p_range = 50 if device == "cuda": p_range *= 40 @@ -178,52 +241,56 @@ def get_test_query_count(model, implementation, device, num_devices=1): def get_readme_prefix(spaces, model, implementation): readme_prefix = "" - pre_space=" " - #for i in range(1,spaces): + pre_space = " " + # for i in range(1,spaces): # pre_space = pre_space + " " - #pre_space += " " + # pre_space += " " return readme_prefix - + def get_venv_command(spaces): - pre_space = " "*spaces - return f"""\n + pre_space = " " * spaces + return f"""\n {pre_space}```bash {pre_space}cm run script --tags=install,python-venv --name=mlperf {pre_space}export CM_SCRIPT_EXTRA_CMD=\"--adr.python.name=mlperf\" -{pre_space}```\n""" +{pre_space}```\n""" def get_docker_info(spaces, model, implementation, device): info = "" - pre_space="" - for i in range(1,spaces): - pre_space = pre_space + " " + pre_space = "" + for i in range(1, spaces): + pre_space = pre_space + " " pre_space += " " - #pre_space = " " + # pre_space = " " if implementation == "nvidia": info += f"\n{pre_space}!!! tip\n\n" - info+= f"{pre_space} All the Nvidia benchmarks, except GPT-J and LLAMA2-70B, use the same Docker container. Therefore, if you have already executed the Docker setup command for any benchmark, you can skip the Docker setup command below and run the commands inside the existing Docker container. The Docker container for GPT-J and LLAMA2-70B is the same and can be used for the other benchmarks, but not vice versa. This is because TensorRT-LLM is built specifically for the LLM benchmarks. If you are already inside a Docker container, execute the below Docker setup command without the --docker option for performance estimation.\n\n" + info += f"{pre_space} All the Nvidia benchmarks, except GPT-J and LLAMA2-70B, use the same Docker container. 
Therefore, if you have already executed the Docker setup command for any benchmark, you can skip the Docker setup command below and run the commands inside the existing Docker container. The Docker container for GPT-J and LLAMA2-70B is the same and can be used for the other benchmarks, but not vice versa. This is because TensorRT-LLM is built specifically for the LLM benchmarks. If you are already inside a Docker container, execute the below Docker setup command without the --docker option for performance estimation.\n\n" return info def get_readme_suffix(spaces, model, implementation): readme_suffix = "" - pre_space="" - for i in range(1,spaces): - pre_space = pre_space + " " + pre_space = "" + for i in range(1, spaces): + pre_space = pre_space + " " pre_space += " " if implementation == "reference": if not model.endswith("-99"): - model_base_name = model.replace("-99.9","").replace("-99","") - readme_suffix+= f"{pre_space}* If you want to download the official MLPerf model and dataset for {model} you can follow [this README](get-{model_base_name}-data.md).\n" + model_base_name = model.replace("-99.9", "").replace("-99", "") + readme_suffix += f"{pre_space}* If you want to download the official MLPerf model and dataset for {model} you can follow [this README](get-{model_base_name}-data.md).\n" if model == "resnet50": - readme_suffix += f"{pre_space}* Please see [mobilenets.md](mobilenets.md) for running mobilenet models for Image Classification." + readme_suffix += f"{pre_space}* Please see [mobilenets.md](mobilenets.md) for running mobilenet models for Image Classification." return readme_suffix - def get_run_cmd_extra(f_pre_space, model, implementation, device, scenario, scenarios = []): + def get_run_cmd_extra( + f_pre_space, model, implementation, device, scenario, scenarios=[] + ): extra_content = "" f_pre_space += "" - if scenario == "Server" or (scenario == "All Scenarios" and "Server" in scenarios): + if scenario == "Server" or ( + scenario == "All Scenarios" and "Server" in scenarios + ): extra_content += f"{f_pre_space} * `` must be determined manually. It is usually around 80% of the Offline QPS, but on some systems, it can drop below 50%. 
If a higher value is specified, the latency constraint will not be met, and the run will be considered invalid.\n" if "gptj" in model and device == "cuda" and implementation == "reference": @@ -235,28 +302,46 @@ def get_run_cmd_extra(f_pre_space, model, implementation, device, scenario, scen return extra_content @env.macro - def mlperf_inference_run_command(spaces, model, implementation, framework, category, scenario, device="cpu", execution_mode="test", test_query_count="20", docker=False, scenarios = []): + def mlperf_inference_run_command( + spaces, + model, + implementation, + framework, + category, + scenario, + device="cpu", + execution_mode="test", + test_query_count="20", + docker=False, + scenarios=[], + ): pre_space = "" - for i in range(1,spaces): - pre_space = pre_space + " " + for i in range(1, spaces): + pre_space = pre_space + " " f_pre_space = pre_space pre_space += " " if scenario == "All Scenarios": - scenario_variation_tag = ",_all-scenarios" - scenario_option = "" + scenario_variation_tag = ",_all-scenarios" + scenario_option = "" else: scenario_variation_tag = "" scenario_option = f"\\\n{pre_space} --scenario={scenario}" - if scenario == "Server" or (scenario == "All Scenarios" and "Server" in scenarios): + if scenario == "Server" or ( + scenario == "All Scenarios" and "Server" in scenarios + ): scenario_option = f"\\\n{pre_space} --server_target_qps=" - run_cmd_extra = get_run_cmd_extra(f_pre_space, model, implementation, device, scenario, scenarios) + run_cmd_extra = get_run_cmd_extra( + f_pre_space, model, implementation, device, scenario, scenarios + ) if docker: docker_cmd_suffix = f" \\\n{pre_space} --docker --quiet" - docker_cmd_suffix += f" \\\n{pre_space} --test_query_count={test_query_count}" + docker_cmd_suffix += ( + f" \\\n{pre_space} --test_query_count={test_query_count}" + ) docker_setup_cmd = f"""\n {f_pre_space}```bash diff --git a/recommendation/dlrm_v2/pytorch/python/backend.py b/recommendation/dlrm_v2/pytorch/python/backend.py index 955eddb88..6fc13454a 100755 --- a/recommendation/dlrm_v2/pytorch/python/backend.py +++ b/recommendation/dlrm_v2/pytorch/python/backend.py @@ -2,10 +2,10 @@ abstract backend class """ - # pylint: disable=unused-argument,missing-docstring -class Backend(): + +class Backend: def __init__(self): self.inputs = [] self.outputs = [] diff --git a/recommendation/dlrm_v2/pytorch/python/backend_dist_pytorch_native.py b/recommendation/dlrm_v2/pytorch/python/backend_dist_pytorch_native.py index e988c3d1e..5ff11b2bf 100644 --- a/recommendation/dlrm_v2/pytorch/python/backend_dist_pytorch_native.py +++ b/recommendation/dlrm_v2/pytorch/python/backend_dist_pytorch_native.py @@ -1,6 +1,7 @@ """ pytoch native backend for dlrm """ + import os import torch import backend @@ -61,7 +62,6 @@ def __init__( else: print("Using CPU...") - # assert ngpus == 8, "Reference implementation only supports ngpus = 8" os.environ["RANK"] = "0" os.environ["MASTER_ADDR"] = "localhost" @@ -80,7 +80,7 @@ def version(self): def name(self): return "pytorch-native-dlrm" - + def load(self, model_path, inputs=None, outputs=None): # debug prints # print(model_path, inputs, outputs) @@ -93,7 +93,6 @@ def load(self, model_path, inputs=None, outputs=None): self.dataset_cache = manager.dict() self.predictions_cache = [manager.dict() for _ in range(world_size)] self.main_lock = manager.Event() - # Create processes to load model ctx = mp.get_context("spawn") @@ -102,7 +101,9 @@ def load(self, model_path, inputs=None, outputs=None): p = ctx.Process( 
target=self.distributed_setup, args=( - rank, world_size, model_path, + rank, + world_size, + model_path, ), ) p.start() @@ -110,13 +111,15 @@ def load(self, model_path, inputs=None, outputs=None): self.main_lock.wait() return self - + def distributed_setup(self, rank, world_size, model_path): print("Initializing process...") if self.use_gpu: self.device = torch.device(f"cuda:{rank}") torch.cuda.set_device(f"cuda:{rank}") - dist.init_process_group(backend=self.dist_backend, rank=rank, world_size=world_size) + dist.init_process_group( + backend=self.dist_backend, rank=rank, world_size=world_size + ) pg = dist.group.WORLD print("Initializing embeddings...") eb_configs = [ @@ -155,16 +158,20 @@ def distributed_setup(self, rank, world_size, model_path): model, get_default_sharders(), dist.GroupMember.WORLD ) dist_model = DistributedModelParallel( - module=model, device=self.device, plan=plan, env=ShardingEnv.from_process_group(pg), + module=model, + device=self.device, + plan=plan, + env=ShardingEnv.from_process_group(pg), ) self.model = dist_model if not self.debug: print("Loading model weights...") from torchsnapshot import Snapshot + snapshot = Snapshot(path=model_path) snapshot.restore(app_state={"model": self.model}) - ### To understand the keys in snapshot, you can look at following code snippet. + # To understand the keys in snapshot, you can look at following code snippet. # d = snapshot.get_manifest() # for k, v in d.items(): # print(k, v) @@ -173,7 +180,7 @@ def distributed_setup(self, rank, world_size, model_path): self.main_lock.set() # Main prediction loop - while(True): + while True: item = self.samples_q[rank].get() # If -1 is received terminate all subprocesses if item == -1: @@ -199,7 +206,6 @@ def capture_output(self, id): self.dataset_cache.pop(id) return out - def predict(self, samples, ids): outputs = [] # If none is received terminate all subprocesses diff --git a/recommendation/dlrm_v2/pytorch/python/backend_pytorch_native.py b/recommendation/dlrm_v2/pytorch/python/backend_pytorch_native.py index aadbdc8f2..19f2b44af 100755 --- a/recommendation/dlrm_v2/pytorch/python/backend_pytorch_native.py +++ b/recommendation/dlrm_v2/pytorch/python/backend_pytorch_native.py @@ -1,6 +1,7 @@ """ pytoch native backend for dlrm """ + import os import torch import backend @@ -68,7 +69,7 @@ def __init__( self.dist_backend = "nccl" # torch.cuda.set_device(self.device) else: - #os.environ["WORLD_SIZE"] = "8" + # os.environ["WORLD_SIZE"] = "8" self.device: torch.device = torch.device("cpu") self.dist_backend = "gloo" @@ -82,9 +83,12 @@ def load(self, model_path, inputs=None, outputs=None): # debug prints # print(model_path, inputs, outputs) print(f"Loading model from {model_path}") - + print("Initializing embeddings...") - dist.init_process_group(backend=self.dist_backend, rank=0, world_size=1) + dist.init_process_group( + backend=self.dist_backend, + rank=0, + world_size=1) eb_configs = [ EmbeddingBagConfig( name=f"t_{feature_name}", @@ -121,9 +125,7 @@ def load(self, model_path, inputs=None, outputs=None): model, get_default_sharders(), dist.GroupMember.WORLD ) self.model = DistributedModelParallel( - module=model, - device=self.device, - plan=plan + module=model, device=self.device, plan=plan ) # path_to_sharded_weights should have 2 subdirectories - batched and sharded # If we need to load the weights on different device or world size, we would need to change the process @@ -133,26 +135,24 @@ def load(self, model_path, inputs=None, outputs=None): if not self.debug: print("Loading model 
weights...") from torchsnapshot import Snapshot + snapshot = Snapshot(path=model_path) snapshot.restore(app_state={"model": self.model}) - ### To understand the keys in snapshot, you can look at following code snippet. + # To understand the keys in snapshot, you can look at following code snippet. # d = snapshot.get_manifest() # for k, v in d.items(): # print(k, v) self.model.eval() return self - def predict(self, samples, ids = None): + def predict(self, samples, ids=None): outputs = [] for batch in samples: batch_in = batch.to(self.device) with torch.no_grad(): - _, (_, out, _) = self.model( - batch_in - ) + _, (_, out, _) = self.model(batch_in) out = torch.sigmoid(out) out = torch.reshape(out, (-1,)) outputs.append(out) return outputs - diff --git a/recommendation/dlrm_v2/pytorch/python/dataset.py b/recommendation/dlrm_v2/pytorch/python/dataset.py index c19483739..422ed4636 100755 --- a/recommendation/dlrm_v2/pytorch/python/dataset.py +++ b/recommendation/dlrm_v2/pytorch/python/dataset.py @@ -14,7 +14,8 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger("dataset") -class Item(): + +class Item: def __init__(self, label, img, idx): self.label = label self.img = img @@ -23,19 +24,25 @@ def __init__(self, label, img, idx): def usleep(sec): - if sys.platform == 'win32': + if sys.platform == "win32": # on windows time.sleep() doesn't work to well import ctypes + kernel32 = ctypes.windll.kernel32 - timer = kernel32.CreateWaitableTimerA(ctypes.c_void_p(), True, ctypes.c_void_p()) + timer = kernel32.CreateWaitableTimerA( + ctypes.c_void_p(), True, ctypes.c_void_p() + ) delay = ctypes.c_longlong(int(-1 * (10 * 1000000 * sec))) - kernel32.SetWaitableTimer(timer, ctypes.byref(delay), 0, ctypes.c_void_p(), ctypes.c_void_p(), False) - kernel32.WaitForSingleObject(timer, 0xffffffff) + kernel32.SetWaitableTimer( + timer, ctypes.byref( + delay), 0, ctypes.c_void_p(), ctypes.c_void_p(), False + ) + kernel32.WaitForSingleObject(timer, 0xFFFFFFFF) else: time.sleep(sec) -class Dataset(): +class Dataset: def __init__(self): self.arrival = None self.image_list = [] @@ -92,7 +99,7 @@ def start(self): self.good = 0 self.total = 0 - def finalize(self, results, ds=False, output_dir=None): + def finalize(self, results, ds=False, output_dir=None): results["good"] = self.good results["total"] = self.total @@ -125,4 +132,3 @@ def start(self): def finalize(self, results, ds=False, output_dir=None): results["good"] = self.good results["total"] = self.total - diff --git a/recommendation/dlrm_v2/pytorch/python/main.py b/recommendation/dlrm_v2/pytorch/python/main.py index 53b8d9237..662b5ad48 100755 --- a/recommendation/dlrm_v2/pytorch/python/main.py +++ b/recommendation/dlrm_v2/pytorch/python/main.py @@ -34,15 +34,24 @@ # the datasets we support SUPPORTED_DATASETS = { - "debug": - (multihot_criteo.MultihotCriteo, multihot_criteo.pre_process_criteo_dlrm, multihot_criteo.DlrmPostProcess(), - {"randomize": 'total', "memory_map": True}), - "multihot-criteo-sample": - (multihot_criteo.MultihotCriteo, multihot_criteo.pre_process_criteo_dlrm, multihot_criteo.DlrmPostProcess(), - {"randomize": 'total', "memory_map": True}), - "multihot-criteo": - (multihot_criteo.MultihotCriteo, multihot_criteo.pre_process_criteo_dlrm, multihot_criteo.DlrmPostProcess(), - {"randomize": 'total', "memory_map": True}), + "debug": ( + multihot_criteo.MultihotCriteo, + multihot_criteo.pre_process_criteo_dlrm, + multihot_criteo.DlrmPostProcess(), + {"randomize": "total", "memory_map": True}, + ), + "multihot-criteo-sample": ( + 
multihot_criteo.MultihotCriteo, + multihot_criteo.pre_process_criteo_dlrm, + multihot_criteo.DlrmPostProcess(), + {"randomize": "total", "memory_map": True}, + ), + "multihot-criteo": ( + multihot_criteo.MultihotCriteo, + multihot_criteo.pre_process_criteo_dlrm, + multihot_criteo.DlrmPostProcess(), + {"randomize": "total", "memory_map": True}, + ), } # pre-defined command line options so simplify things. They are used as defaults and can be @@ -80,7 +89,7 @@ "backend": "pytorch-native", "model": "dlrm", "max-batchsize": 2048, - } + }, } SCENARIO_MAP = { @@ -97,42 +106,118 @@ def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() parser.add_argument("--model", help="name of the mlperf model, ie. dlrm") - parser.add_argument("--model-path", required=True, help="path to the model file") - parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset") - parser.add_argument("--dataset-path", required=True, help="path to the dataset") - parser.add_argument("--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles") - parser.add_argument("--scenario", default="SingleStream", - help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys()))) + parser.add_argument( + "--model-path", + required=True, + help="path to the model file") + parser.add_argument( + "--dataset", + choices=SUPPORTED_DATASETS.keys(), + help="dataset") + parser.add_argument( + "--dataset-path", + required=True, + help="path to the dataset") + parser.add_argument( + "--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles" + ) + parser.add_argument( + "--scenario", + default="SingleStream", + help="mlperf benchmark scenario, one of " + + str(list(SCENARIO_MAP.keys())), + ) parser.add_argument("--max-ind-range", type=int, default=-1) - parser.add_argument("--max-batchsize", type=int, help="max batch size in a single inference") + parser.add_argument( + "--max-batchsize", type=int, help="max batch size in a single inference" + ) parser.add_argument("--output", help="test results") parser.add_argument("--inputs", help="model inputs (currently not used)") parser.add_argument("--outputs", help="model outputs (currently not used)") parser.add_argument("--backend", help="runtime to use") parser.add_argument("--use-gpu", action="store_true", default=False) - parser.add_argument("--threads", default=os.cpu_count(), type=int, help="threads") - parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") - parser.add_argument("--find-peak-performance", action="store_true", help="enable finding peak performance pass") + parser.add_argument( + "--threads", + default=os.cpu_count(), + type=int, + help="threads") + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") + parser.add_argument( + "--find-peak-performance", + action="store_true", + help="enable finding peak performance pass", + ) # file to use mlperf rules compliant parameters - parser.add_argument("--mlperf_conf", default="mlperf.conf", help="mlperf rules config") + parser.add_argument( + "--mlperf_conf", default="mlperf.conf", help="mlperf rules config" + ) # file for user LoadGen settings such as target QPS - parser.add_argument("--user_conf", default="user.conf", help="user config for user LoadGen settings such as target QPS") - - # below will override mlperf rules compliant settings - don't use for official submission - parser.add_argument("--duration", type=int, help="duration in milliseconds (ms)") + parser.add_argument( + 
"--user_conf", + default="user.conf", + help="user config for user LoadGen settings such as target QPS", + ) + + # below will override mlperf rules compliant settings - don't use for + # official submission + parser.add_argument( + "--duration", + type=int, + help="duration in milliseconds (ms)") parser.add_argument("--target-qps", type=int, help="target/expected qps") - parser.add_argument("--max-latency", type=float, help="mlperf max latency in pct tile") - parser.add_argument("--count-samples", type=int, help="dataset items to use") - parser.add_argument("--count-queries", type=int, help="number of queries to use") - parser.add_argument("--samples-per-query-multistream", default=8, type=int, help="query length for multi-stream scenario (in terms of aggregated samples)") + parser.add_argument( + "--max-latency", type=float, help="mlperf max latency in pct tile" + ) + parser.add_argument( + "--count-samples", + type=int, + help="dataset items to use") + parser.add_argument( + "--count-queries", + type=int, + help="number of queries to use") + parser.add_argument( + "--samples-per-query-multistream", + default=8, + type=int, + help="query length for multi-stream scenario (in terms of aggregated samples)", + ) # --samples-per-query-offline is equivalent to perf_sample_count - parser.add_argument("--samples-per-query-offline", type=int, default=2048, help="query length for offline scenario (in terms of aggregated samples)") - parser.add_argument("--samples-to-aggregate-fix", type=int, help="number of samples to be treated as one") - parser.add_argument("--samples-to-aggregate-min", type=int, help="min number of samples to be treated as one in random query size") - parser.add_argument("--samples-to-aggregate-max", type=int, help="max number of samples to be treated as one in random query size") - parser.add_argument("--samples-to-aggregate-quantile-file", type=str, help="distribution quantile used to generate number of samples to be treated as one in random query size") - parser.add_argument("--samples-to-aggregate-trace-file", type=str, default="dlrm_trace_of_aggregated_samples.txt") + parser.add_argument( + "--samples-per-query-offline", + type=int, + default=2048, + help="query length for offline scenario (in terms of aggregated samples)", + ) + parser.add_argument( + "--samples-to-aggregate-fix", + type=int, + help="number of samples to be treated as one", + ) + parser.add_argument( + "--samples-to-aggregate-min", + type=int, + help="min number of samples to be treated as one in random query size", + ) + parser.add_argument( + "--samples-to-aggregate-max", + type=int, + help="max number of samples to be treated as one in random query size", + ) + parser.add_argument( + "--samples-to-aggregate-quantile-file", + type=str, + help="distribution quantile used to generate number of samples to be treated as one in random query size", + ) + parser.add_argument( + "--samples-to-aggregate-trace-file", + type=str, + default="dlrm_trace_of_aggregated_samples.txt", + ) parser.add_argument("--numpy-rand-seed", type=int, default=123) parser.add_argument("--debug", action="store_true", default=False) args = parser.parse_args() @@ -165,85 +250,198 @@ def get_backend(backend, dataset, use_gpu, debug): if backend == "pytorch-native": from backend_pytorch_native import BackendPytorchNative from backend_dist_pytorch_native import BackendDistPytorchNative + n_cores = int(os.environ.get("WORLD_SIZE", 1)) if n_cores > 1: if dataset == "debug": # 1. 
Syntetic debug dataset backend = BackendDistPytorchNative( - num_embeddings_per_feature = [2 for _ in range(26)], + num_embeddings_per_feature=[2 for _ in range(26)], embedding_dim=128, dcn_num_layers=3, dcn_low_rank_dim=512, dense_arch_layer_sizes=[512, 256, 128], over_arch_layer_sizes=[1024, 1024, 512, 256, 1], use_gpu=use_gpu, - debug=True + debug=True, ) elif dataset == "multihot-criteo-sample": # 2. Syntetic multihot criteo sample backend = BackendDistPytorchNative( - num_embeddings_per_feature = [40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,36], + num_embeddings_per_feature=[ + 40000000, + 39060, + 17295, + 7424, + 20265, + 3, + 7122, + 1543, + 63, + 40000000, + 3067956, + 405282, + 10, + 2209, + 11938, + 155, + 4, + 976, + 14, + 40000000, + 40000000, + 40000000, + 590152, + 12973, + 108, + 36, + ], embedding_dim=128, dcn_num_layers=3, dcn_low_rank_dim=512, dense_arch_layer_sizes=[512, 256, 128], over_arch_layer_sizes=[1024, 1024, 512, 256, 1], use_gpu=use_gpu, - debug=debug + debug=debug, ) elif dataset == "multihot-criteo": # 3. Syntetic multihot criteo backend = BackendDistPytorchNative( - num_embeddings_per_feature = [40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,36], + num_embeddings_per_feature=[ + 40000000, + 39060, + 17295, + 7424, + 20265, + 3, + 7122, + 1543, + 63, + 40000000, + 3067956, + 405282, + 10, + 2209, + 11938, + 155, + 4, + 976, + 14, + 40000000, + 40000000, + 40000000, + 590152, + 12973, + 108, + 36, + ], embedding_dim=128, dcn_num_layers=3, dcn_low_rank_dim=512, dense_arch_layer_sizes=[512, 256, 128], over_arch_layer_sizes=[1024, 1024, 512, 256, 1], use_gpu=use_gpu, - debug=debug + debug=debug, ) else: - raise ValueError("only debug|multihot-criteo-sample|multihot-criteo dataset options are supported") + raise ValueError( + "only debug|multihot-criteo-sample|multihot-criteo dataset options are supported" + ) else: if dataset == "debug": # 1. Syntetic debug dataset backend = BackendPytorchNative( - num_embeddings_per_feature = [2 for _ in range(26)], + num_embeddings_per_feature=[2 for _ in range(26)], embedding_dim=128, dcn_num_layers=3, dcn_low_rank_dim=512, dense_arch_layer_sizes=[512, 256, 128], over_arch_layer_sizes=[1024, 1024, 512, 256, 1], use_gpu=use_gpu, - debug=True + debug=True, ) elif dataset == "multihot-criteo-sample": # 2. Syntetic multihot criteo sample backend = BackendPytorchNative( - num_embeddings_per_feature = [40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,36], + num_embeddings_per_feature=[ + 40000000, + 39060, + 17295, + 7424, + 20265, + 3, + 7122, + 1543, + 63, + 40000000, + 3067956, + 405282, + 10, + 2209, + 11938, + 155, + 4, + 976, + 14, + 40000000, + 40000000, + 40000000, + 590152, + 12973, + 108, + 36, + ], embedding_dim=128, dcn_num_layers=3, dcn_low_rank_dim=512, dense_arch_layer_sizes=[512, 256, 128], over_arch_layer_sizes=[1024, 1024, 512, 256, 1], use_gpu=use_gpu, - debug=debug + debug=debug, ) elif dataset == "multihot-criteo": # 3. 
Syntetic multihot criteo backend = BackendPytorchNative( - num_embeddings_per_feature = [40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,36], + num_embeddings_per_feature=[ + 40000000, + 39060, + 17295, + 7424, + 20265, + 3, + 7122, + 1543, + 63, + 40000000, + 3067956, + 405282, + 10, + 2209, + 11938, + 155, + 4, + 976, + 14, + 40000000, + 40000000, + 40000000, + 590152, + 12973, + 108, + 36, + ], embedding_dim=128, dcn_num_layers=3, dcn_low_rank_dim=512, dense_arch_layer_sizes=[512, 256, 128], over_arch_layer_sizes=[1024, 1024, 512, 256, 1], use_gpu=use_gpu, - debug=debug + debug=debug, ) else: - raise ValueError("only debug|multihot-criteo-sample|multihot-criteo dataset options are supported") + raise ValueError( + "only debug|multihot-criteo-sample|multihot-criteo dataset options are supported" + ) else: raise ValueError("unknown backend: " + backend) @@ -253,7 +451,8 @@ def get_backend(backend, dataset, use_gpu, debug): class Item: """An item that we queue for processing by the thread pool.""" - def __init__(self, query_id, content_id, features, batch_T=None, idx_offsets = None): + def __init__(self, query_id, content_id, features, + batch_T=None, idx_offsets=None): self.query_id = query_id self.content_id = content_id self.features = features @@ -261,6 +460,7 @@ def __init__(self, query_id, content_id, features, batch_T=None, idx_offsets = N self.idx_offsets = idx_offsets self.start = time.time() + class RunnerBase: def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): self.take_accuracy = False @@ -285,7 +485,9 @@ def run_one_item(self, qitem): processed_results = [] try: results = self.model.predict(qitem.features, qitem.content_id) - processed_results = self.post_process(results, qitem.batch_T, self.result_dict) + processed_results = self.post_process( + results, qitem.batch_T, self.result_dict + ) if self.take_accuracy: self.post_process.add_results(processed_results) self.result_timing.append(time.time() - qitem.start) @@ -305,7 +507,10 @@ def run_one_item(self, qitem): # print("s,e:",s_idx,e_idx, len(processed_results)) s_idx = qitem.idx_offsets[idx] e_idx = qitem.idx_offsets[idx + 1] - response_array = array.array("B", np.array(processed_results[s_idx:e_idx], np.float32).tobytes()) + response_array = array.array( + "B", np.array( + processed_results[s_idx:e_idx], np.float32).tobytes() + ) response_array_refs.append(response_array) bi = response_array.buffer_info() response.append(lg.QuerySampleResponse(query_id, bi[0], bi[1])) @@ -314,20 +519,29 @@ def run_one_item(self, qitem): def enqueue(self, query_samples): idx = [q.index for q in query_samples] query_id = [q.id for q in query_samples] - #print(idx) + # print(idx) query_len = len(query_samples) if query_len < self.max_batchsize: samples, idx_offsets = self.ds.get_samples(idx) batch_T = [self.ds.get_labels(sample) for sample in samples] - self.run_one_item(Item(query_id, idx, samples, batch_T, idx_offsets)) + self.run_one_item( + Item( + query_id, + idx, + samples, + batch_T, + idx_offsets)) else: bs = self.max_batchsize for i in range(0, query_len, bs): ie = min(i + bs, query_len) samples, idx_offsets = self.ds.get_samples(idx[i:ie]) batch_T = [self.ds.get_labels(sample) for sample in samples] - self.run_one_item(Item(query_id[i:ie], idx[i:ie], samples, batch_T, idx_offsets)) + self.run_one_item( + Item(query_id[i:ie], idx[i:ie], + samples, batch_T, idx_offsets) + ) def finish(self): pass @@ -336,13 +550,17 @@ 
def finish(self): class QueueRunner(RunnerBase): def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): super().__init__(model, ds, threads, post_proc, max_batchsize) - queue_size_multiplier = 4 #(args.samples_per_query_offline + max_batchsize - 1) // max_batchsize) + queue_size_multiplier = ( + 4 # (args.samples_per_query_offline + max_batchsize - 1) // max_batchsize) + ) self.tasks = JoinableQueue(maxsize=threads * queue_size_multiplier) self.workers = [] self.result_dict = {} for _ in range(self.threads): - worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,)) + worker = threading.Thread( + target=self.handle_tasks, args=( + self.tasks,)) worker.daemon = True self.workers.append(worker) worker.start() @@ -362,7 +580,7 @@ def enqueue(self, query_samples): idx = [q.index for q in query_samples] query_id = [q.id for q in query_samples] query_len = len(query_samples) - #print(idx) + # print(idx) if query_len < self.max_batchsize: samples, idx_offsets = self.ds.get_samples(idx) batch_T = [self.ds.get_labels(sample) for sample in samples] @@ -373,7 +591,10 @@ def enqueue(self, query_samples): ie = min(i + bs, query_len) samples, idx_offsets = self.ds.get_samples(idx) batch_T = [self.ds.get_labels(sample) for sample in samples] - self.tasks.put(Item(query_id[i:ie], idx[i:ie], samples, batch_T, idx_offsets)) + self.tasks.put( + Item(query_id[i:ie], idx[i:ie], + samples, batch_T, idx_offsets) + ) def finish(self): # exit all threads @@ -383,11 +604,14 @@ def finish(self): worker.join() - -def add_results(final_results, name, result_dict, result_list, took, show_accuracy=False): - percentiles = [50., 80., 90., 95., 99., 99.9] +def add_results( + final_results, name, result_dict, result_list, took, show_accuracy=False +): + percentiles = [50.0, 80.0, 90.0, 95.0, 99.0, 99.9] buckets = np.percentile(result_list, percentiles).tolist() - buckets_str = ",".join(["{}:{:.4f}".format(p, b) for p, b in zip(percentiles, buckets)]) + buckets_str = ",".join( + ["{}:{:.4f}".format(p, b) for p, b in zip(percentiles, buckets)] + ) if result_dict["total"] == 0: result_dict["total"] = len(result_list) @@ -404,19 +628,27 @@ def add_results(final_results, name, result_dict, result_list, took, show_accura } acc_str = "" if show_accuracy: - result["accuracy"] = 100. * result_dict["good"] / result_dict["total"] + result["accuracy"] = 100.0 * result_dict["good"] / result_dict["total"] acc_str = ", acc={:.3f}%".format(result["accuracy"]) if "roc_auc" in result_dict: - result["roc_auc"] = 100. 
* result_dict["roc_auc"] + result["roc_auc"] = 100.0 * result_dict["roc_auc"] acc_str += ", auc={:.3f}%".format(result["roc_auc"]) # add the result to the result dict final_results[name] = result # to stdout - print("{} qps={:.2f}, mean={:.4f}, time={:.3f}{}, queries={}, tiles={}".format( - name, result["qps"], result["mean"], took, acc_str, - len(result_list), buckets_str)) + print( + "{} qps={:.2f}, mean={:.4f}, time={:.3f}{}, queries={}, tiles={}".format( + name, + result["qps"], + result["mean"], + took, + acc_str, + len(result_list), + buckets_str, + ) + ) def main(): @@ -426,26 +658,62 @@ def main(): log.info(args) # find backend - backend = get_backend(args.backend, args.dataset, args.use_gpu, debug=args.debug) + backend = get_backend( + args.backend, + args.dataset, + args.use_gpu, + debug=args.debug) # dataset to use wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset] # --count-samples can be used to limit the number of samples used for testing - ds = wanted_dataset(num_embeddings_per_feature=[40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,3], - data_path=args.dataset_path, - name=args.dataset, - pre_process=pre_proc, # currently an identity function - count=args.count_samples, - samples_to_aggregate_fix=args.samples_to_aggregate_fix, - samples_to_aggregate_min=args.samples_to_aggregate_min, - samples_to_aggregate_max=args.samples_to_aggregate_max, - samples_to_aggregate_quantile_file=args.samples_to_aggregate_quantile_file, - samples_to_aggregate_trace_file=args.samples_to_aggregate_trace_file, - max_ind_range=args.max_ind_range, - **kwargs) + ds = wanted_dataset( + num_embeddings_per_feature=[ + 40000000, + 39060, + 17295, + 7424, + 20265, + 3, + 7122, + 1543, + 63, + 40000000, + 3067956, + 405282, + 10, + 2209, + 11938, + 155, + 4, + 976, + 14, + 40000000, + 40000000, + 40000000, + 590152, + 12973, + 108, + 3, + ], + data_path=args.dataset_path, + name=args.dataset, + pre_process=pre_proc, # currently an identity function + count=args.count_samples, + samples_to_aggregate_fix=args.samples_to_aggregate_fix, + samples_to_aggregate_min=args.samples_to_aggregate_min, + samples_to_aggregate_max=args.samples_to_aggregate_max, + samples_to_aggregate_quantile_file=args.samples_to_aggregate_quantile_file, + samples_to_aggregate_trace_file=args.samples_to_aggregate_trace_file, + max_ind_range=args.max_ind_range, + **kwargs + ) # load model to backend - model = backend.load(args.model_path, inputs=args.inputs, outputs=args.outputs) + model = backend.load( + args.model_path, + inputs=args.inputs, + outputs=args.outputs) final_results = { "runtime": model.name(), "version": model.version(), @@ -486,10 +754,12 @@ def main(): lg.TestScenario.SingleStream: RunnerBase, lg.TestScenario.MultiStream: QueueRunner, lg.TestScenario.Server: QueueRunner, - lg.TestScenario.Offline: QueueRunner + lg.TestScenario.Offline: QueueRunner, } - runner = runner_map[scenario](model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize) + runner = runner_map[scenario]( + model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize + ) def issue_queries(query_samples): runner.enqueue(query_samples) @@ -526,13 +796,23 @@ def flush_queries(): if args.max_latency: settings.server_target_latency_ns = int(args.max_latency * NANO_SEC) - settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC) + settings.multi_stream_expected_latency_ns = int( + 
args.max_latency * NANO_SEC) sut = lg.ConstructSUT(issue_queries, flush_queries) - qsl = lg.ConstructQSL(count, min(count, args.samples_per_query_offline), ds.load_query_samples, ds.unload_query_samples) + qsl = lg.ConstructQSL( + count, + min(count, args.samples_per_query_offline), + ds.load_query_samples, + ds.unload_query_samples, + ) log.info("starting {}".format(scenario)) - result_dict = {"good": 0, "total": 0, "roc_auc": 0, "scenario": str(scenario)} + result_dict = { + "good": 0, + "total": 0, + "roc_auc": 0, + "scenario": str(scenario)} runner.start_run(result_dict, args.accuracy) lg.StartTest(sut, qsl, settings) @@ -544,14 +824,20 @@ def flush_queries(): if args.accuracy: post_proc.finalize(result_dict, ds, output_dir=args.output) - add_results(final_results, "{}".format(scenario), - result_dict, last_timeing, time.time() - ds.last_loaded, args.accuracy) + add_results( + final_results, + "{}".format(scenario), + result_dict, + last_timeing, + time.time() - ds.last_loaded, + args.accuracy, + ) runner.finish() lg.DestroyQSL(qsl) lg.DestroySUT(sut) # If multiple subprocesses are running the model send a signal to stop them - if (int(os.environ.get("WORLD_SIZE", 1)) > 1): + if int(os.environ.get("WORLD_SIZE", 1)) > 1: backend.predict(None, None) # diff --git a/recommendation/dlrm_v2/pytorch/python/multihot_criteo.py b/recommendation/dlrm_v2/pytorch/python/multihot_criteo.py index efbea8195..c0f7ab3ad 100755 --- a/recommendation/dlrm_v2/pytorch/python/multihot_criteo.py +++ b/recommendation/dlrm_v2/pytorch/python/multihot_criteo.py @@ -4,6 +4,13 @@ # pylint: disable=unused-argument,missing-docstring +from torchrec.datasets.criteo import ( + CAT_FEATURE_COUNT, + DAYS, + DEFAULT_CAT_NAMES, + DEFAULT_INT_NAMES, +) +from dataset import Dataset import logging import os import sys @@ -24,14 +31,6 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger("criteo") -from dataset import Dataset -from torchrec.datasets.criteo import ( - CAT_FEATURE_COUNT, - DAYS, - DEFAULT_CAT_NAMES, - DEFAULT_INT_NAMES, -) - class MultihotCriteo(Dataset): def __init__( @@ -74,19 +73,22 @@ def __init__( if name == "debug": stage_files = [ [os.path.join(data_path, f"day_{DAYS-1}_dense_debug.npy")], - [os.path.join(data_path, f"day_{DAYS-1}_sparse_multi_hot_debug.npz")], + [os.path.join(data_path, + f"day_{DAYS-1}_sparse_multi_hot_debug.npz")], [os.path.join(data_path, f"day_{DAYS-1}_labels_debug.npy")], ] elif name == "multihot-criteo-sample": stage_files = [ [os.path.join(data_path, f"day_{DAYS-1}_dense_sample.npy")], - [os.path.join(data_path, f"day_{DAYS-1}_sparse_multi_hot_sample.npz")], + [os.path.join(data_path, + f"day_{DAYS-1}_sparse_multi_hot_sample.npz")], [os.path.join(data_path, f"day_{DAYS-1}_labels_sample.npy")], ] elif name == "multihot-criteo": stage_files = [ [os.path.join(data_path, f"day_{DAYS-1}_dense.npy")], - [os.path.join(data_path, f"day_{DAYS-1}_sparse_multi_hot.npz")], + [os.path.join(data_path, + f"day_{DAYS-1}_sparse_multi_hot.npz")], [os.path.join(data_path, f"day_{DAYS-1}_labels.npy")], ] else: @@ -112,7 +114,8 @@ def __init__( # of size samples_to_aggregate as an item we need to adjust the original dataset item_count. # On the other hand, data loader always returns number of batches. 
if self.use_fixed_size: - # the offsets for fixed query size will be generated on-the-fly later on + # the offsets for fixed query size will be generated on-the-fly + # later on print("Using fixed query size: " + str(self.samples_to_aggregate)) self.num_aggregated_samples = ( self.num_individual_samples + self.samples_to_aggregate - 1 @@ -121,7 +124,8 @@ def __init__( else: # the offsets for variable query sizes will be pre-generated here if self.samples_to_aggregate_quantile_file is None: - # generate number of samples in a query from a uniform(min,max) distribution + # generate number of samples in a query from a uniform(min,max) + # distribution print( "Using variable query size: uniform distribution (" + str(self.samples_to_aggregate_min) @@ -160,7 +164,8 @@ def __init__( # The inverse of its cdf with granularity of 0.05 can be written as # quantile_p = [.05, .10, .15, .20, .25, .30, .35, .40, .45, .50, .55, .60, .65, .70, .75, .80, .85, .90, .95, 1.0] # p # quantile_x = [100, 100, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 300, 300, 400, 500, 600, 700] # q(p) = x, such that f(x) >= p - # Notice that once we have quantile, we can apply inverse transform sampling method. + # Notice that once we have quantile, we can apply inverse + # transform sampling method. print( "Using variable query size: custom distribution (file " + str(samples_to_aggregate_quantile_file) @@ -186,7 +191,8 @@ def __init__( self.random_offsets.append(int(qo)) # compute min and max number of samples - nas_max = (self.num_individual_samples + quantile[0] - 1) // quantile[0] + nas_max = (self.num_individual_samples + + quantile[0] - 1) // quantile[0] nas_min = (self.num_individual_samples + quantile[-1] - 1) // quantile[ -1 ] @@ -203,7 +209,8 @@ def __init__( # limit number of items to count if needed if self.count is not None: - self.num_aggregated_samples = min(self.count, self.num_aggregated_samples) + self.num_aggregated_samples = min( + self.count, self.num_aggregated_samples) # dump the trace of aggregated samples if samples_to_aggregate_trace_file is not None: @@ -268,7 +275,7 @@ def get_samples(self, id_list): for item in id_list: idx_offsets.append(idx_offsets[-1] + self.item_sizes[item]) return [self.items_in_memory[item] for item in id_list], idx_offsets - + def get_labels(self, sample): if isinstance(sample, list): labels = [s.labels for s in sample] @@ -298,7 +305,7 @@ def __init__( self.batch_size = batch_size self.rank = rank self.world_size = world_size - self.split = (self.world_size > 1) + self.split = self.world_size > 1 # Load arrays m = "r" if mmap_mode else None @@ -320,7 +327,7 @@ def __init__( len_d0 = len(self.dense_arrs[0]) second_half_start_index = int(len_d0 // 2 + len_d0 % 2) - if (stage == "val" and name == "multihot-criteo"): + if stage == "val" and name == "multihot-criteo": self.dense_arrs[0] = self.dense_arrs[0][:second_half_start_index, :] self.labels_arrs[0] = self.labels_arrs[0][:second_half_start_index, :] self.sparse_arrs[0] = [ @@ -364,7 +371,8 @@ def _load_from_npz(self, fname, npy_name): # read .npy header zf.open(npy_name, "r") version = np.lib.format.read_magic(zf.fp) - shape, fortran_order, dtype = np.lib.format._read_array_header(zf.fp, version) + shape, fortran_order, dtype = np.lib.format._read_array_header( + zf.fp, version) assert ( dtype == "int32" ), f"sparse multi-hot dtype is {dtype} but should be int32" @@ -380,20 +388,27 @@ def _load_from_npz(self, fname, npy_name): ) def _np_arrays_to_batch( - self, dense: np.ndarray, sparse: List[np.ndarray], 
labels: np.ndarray, + self, + dense: np.ndarray, + sparse: List[np.ndarray], + labels: np.ndarray, ) -> Batch: batch_size = len(dense) - lengths = torch.ones((CAT_FEATURE_COUNT * batch_size), dtype=torch.int32) + lengths = torch.ones( + (CAT_FEATURE_COUNT * batch_size), + dtype=torch.int32) for k, multi_hot_size in enumerate(self.multi_hot_sizes): - lengths[k * batch_size : (k + 1) * batch_size] = multi_hot_size - offsets = torch.cumsum(torch.concat((torch.tensor([0]), lengths)), dim=0) + lengths[k * batch_size: (k + 1) * batch_size] = multi_hot_size + offsets = torch.cumsum(torch.concat( + (torch.tensor([0]), lengths)), dim=0) length_per_key = [ batch_size * multi_hot_size for multi_hot_size in self.multi_hot_sizes ] offset_per_key = torch.cumsum( torch.concat((torch.tensor([0]), torch.tensor(length_per_key))), dim=0 ) - values = torch.concat([torch.from_numpy(feat).flatten() for feat in sparse]) + values = torch.concat([torch.from_numpy(feat).flatten() + for feat in sparse]) return Batch( dense_features=torch.from_numpy(dense.copy()), sparse_features=KeyedJaggedTensor( @@ -413,11 +428,16 @@ def load_batch(self, sample_list) -> Union[Batch, List[Batch]]: if self.split: batch = [] n_samples = len(sample_list) - limits = [i*n_samples//self.world_size for i in range(self.world_size + 1)] + limits = [ + i * n_samples // self.world_size for i in range(self.world_size + 1) + ] for i in range(self.world_size): - dense = self.dense_arrs[0][sample_list[limits[i]:limits[i+1]], :] - sparse = [arr[sample_list[limits[i]:limits[i+1]], :] for arr in self.sparse_arrs[0]] - labels = self.labels_arrs[0][sample_list[limits[i]:limits[i+1]], :] + dense = self.dense_arrs[0][sample_list[limits[i]: limits[i + 1]], :] + sparse = [ + arr[sample_list[limits[i]: limits[i + 1]], :] + for arr in self.sparse_arrs[0] + ] + labels = self.labels_arrs[0][sample_list[limits[i]: limits[i + 1]], :] batch.append(self._np_arrays_to_batch(dense, sparse, labels)) return batch else: @@ -445,7 +465,8 @@ def __call__(self, results, expected=None, result_dict=None): n = len(results) for idx in range(0, n): # NOTE: copy from GPU to CPU while post processing, if needed. 
Alternatively, - # we could do this on the output of predict function in backend_pytorch_native.py + # we could do this on the output of predict function in + # backend_pytorch_native.py result = results[idx].detach().cpu() target = expected[idx] diff --git a/recommendation/dlrm_v2/pytorch/python/version.py b/recommendation/dlrm_v2/pytorch/python/version.py index 1152dbb41..570348596 100644 --- a/recommendation/dlrm_v2/pytorch/python/version.py +++ b/recommendation/dlrm_v2/pytorch/python/version.py @@ -1,3 +1,2 @@ - -version = '0.1.0' -git_version = '05df3bae82ef9fc933277385eb778e3f22cd0c6a' +version = "0.1.0" +git_version = "05df3bae82ef9fc933277385eb778e3f22cd0c6a" diff --git a/recommendation/dlrm_v2/pytorch/setup.py b/recommendation/dlrm_v2/pytorch/setup.py index c1e2fbcf0..758d874fb 100644 --- a/recommendation/dlrm_v2/pytorch/setup.py +++ b/recommendation/dlrm_v2/pytorch/setup.py @@ -13,17 +13,20 @@ from setuptools import setup, find_packages, Command TOP_DIR = os.path.realpath(os.path.dirname(__file__)) -SRC_DIR = os.path.join(TOP_DIR, 'python') +SRC_DIR = os.path.join(TOP_DIR, "python") try: - git_version = subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=TOP_DIR).decode('ascii').strip() + git_version = ( + subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=TOP_DIR) + .decode("ascii") + .strip() + ) except (OSError, subprocess.CalledProcessError): git_version = None -with open(os.path.join(TOP_DIR, 'VERSION_NUMBER')) as version_file: - VersionInfo = namedtuple('VersionInfo', ['version', 'git_version'])( - version=version_file.read().strip(), - git_version=git_version +with open(os.path.join(TOP_DIR, "VERSION_NUMBER")) as version_file: + VersionInfo = namedtuple("VersionInfo", ["version", "git_version"])( + version=version_file.read().strip(), git_version=git_version ) @@ -37,49 +40,67 @@ def finalize_options(self): pass def run(self): - with open(os.path.join(SRC_DIR, 'version.py'), 'w') as f: - f.write(dedent(''' + with open(os.path.join(SRC_DIR, "version.py"), "w") as f: + f.write( + dedent( + """ version = '{version}' git_version = '{git_version}' - '''.format(**dict(VersionInfo._asdict())))) + """.format( + **dict(VersionInfo._asdict()) + ) + ) + ) class build_py(setuptools.command.build_py.build_py): def run(self): - self.run_command('create_version') + self.run_command("create_version") setuptools.command.build_py.build_py.run(self) class build(distutils.command.build.build): def run(self): - self.run_command('build_py') + self.run_command("build_py") class develop(setuptools.command.develop.develop): def run(self): - self.run_command('create_version') - self.run_command('build') + self.run_command("create_version") + self.run_command("build") setuptools.command.develop.develop.run(self) cmdclass = { - 'create_version': create_version, - 'build_py': build_py, - 'build': build, - 'develop': develop, + "create_version": create_version, + "build_py": build_py, + "build": build, + "develop": develop, } setup( name="mlperf-inference", version=VersionInfo.version, - description='mlperf inference benchmark', - setup_requires=['pytest-runner'], - tests_require=['graphviz', 'parameterized', 'pytest', 'pytest-cov', 'pyyaml'], + description="mlperf inference benchmark", + setup_requires=["pytest-runner"], + tests_require=[ + "graphviz", + "parameterized", + "pytest", + "pytest-cov", + "pyyaml"], cmdclass=cmdclass, packages=find_packages(), - author='guschmue@microsoft.com', - author_email='guschmue@microsoft.com', - url='https://github.com/mlperf/inference', - 
install_requires=['numpy>=1.14.1', 'onnx>=1.5', 'pybind11', 'Cython', - 'pycocotools', 'mlperf_loadgen', 'opencv-python-headless'] + author="guschmue@microsoft.com", + author_email="guschmue@microsoft.com", + url="https://github.com/mlperf/inference", + install_requires=[ + "numpy>=1.14.1", + "onnx>=1.5", + "pybind11", + "Cython", + "pycocotools", + "mlperf_loadgen", + "opencv-python-headless", + ], ) diff --git a/recommendation/dlrm_v2/pytorch/tools/accuracy-dlrm.py b/recommendation/dlrm_v2/pytorch/tools/accuracy-dlrm.py index ce662071f..2420dab30 100644 --- a/recommendation/dlrm_v2/pytorch/tools/accuracy-dlrm.py +++ b/recommendation/dlrm_v2/pytorch/tools/accuracy-dlrm.py @@ -16,25 +16,39 @@ # pylint: disable=missing-docstring + def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() - parser.add_argument("--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json") - parser.add_argument("--day-23-file", default=None, - help="path to day_23 file. If present, it is assumed that the accuracy log contains only the prediction, not the ground truth label.") - parser.add_argument("--aggregation-trace-file", default=None, - help="path to dlrm_trace_of_aggregated_samples.txt. Only needed if --day-23-file is specified") - parser.add_argument("--verbose", action="store_true", help="verbose messages") - parser.add_argument("--dtype", default="float32", choices=["float32", "int32", "int64"], help="data type of the label") + parser.add_argument( + "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" + ) + parser.add_argument( + "--day-23-file", + default=None, + help="path to day_23 file. If present, it is assumed that the accuracy log contains only the prediction, not the ground truth label.", + ) + parser.add_argument( + "--aggregation-trace-file", + default=None, + help="path to dlrm_trace_of_aggregated_samples.txt. 
Only needed if --day-23-file is specified", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="verbose messages") + parser.add_argument( + "--dtype", + default="float32", + choices=["float32", "int32", "int64"], + help="data type of the label", + ) args = parser.parse_args() return args -dtype_map = { - "float32": np.float32, - "int32": np.int32, - "int64": np.int64 -} +dtype_map = {"float32": np.float32, "int32": np.int32, "int64": np.int64} + def get_targets(args, qsl_indices): # Parse aggregation trace file to know the sample -> user-item pair mapping @@ -42,10 +56,14 @@ def get_targets(args, qsl_indices): sample_boundaries = [0] with open(args.aggregation_trace_file) as f: for line in f: - sample_boundaries.append(sample_boundaries[-1] + int(line.split(", ")[2])) + sample_boundaries.append( + sample_boundaries[-1] + int(line.split(", ")[2])) if len(sample_boundaries) != len(qsl_indices) + 1: - print("Warning: number of samples in trace file ({}) does not match number of samples ({}) in " - "loadgen accuracy log!".format(len(sample_boundaries)-1, len(qsl_indices))) + print( + "Warning: number of samples in trace file ({}) does not match number of samples ({}) in " + "loadgen accuracy log!".format( + len(sample_boundaries) - 1, len(qsl_indices)) + ) # Get all the ground truth labels in the original order in day_23 print("Parsing ground truth labels from day_23 file...") ground_truths = [] @@ -54,18 +72,22 @@ def get_targets(args, qsl_indices): if line_idx >= sample_boundaries[-1]: break ground_truths.append(int(line.split("\t")[0])) - # Re-order the ground truth labels according to the qsl indices in the loadgen log. + # Re-order the ground truth labels according to the qsl indices in the + # loadgen log. print("Re-ordering ground truth labels...") targets = [] for qsl_idx in qsl_indices: - for i in range(sample_boundaries[qsl_idx], sample_boundaries[qsl_idx + 1]): + for i in range(sample_boundaries[qsl_idx], + sample_boundaries[qsl_idx + 1]): targets.append(ground_truths[i]) return targets + def main(): args = get_args() - # If "--day-23-file" is specified, assume that the accuracy log contains only the prediction, not the ground truth label. + # If "--day-23-file" is specified, assume that the accuracy log contains + # only the prediction, not the ground truth label. log_contains_gt = args.day_23_file is None if log_contains_gt: @@ -79,12 +101,12 @@ def main(): seen = set() good = 0 - total= 0 + total = 0 all_results = [] all_targets = [] qsl_indices = [] for j in results: - idx = j['qsl_idx'] + idx = j["qsl_idx"] # de-dupe in case loadgen sends the same sample multiple times if idx in seen: @@ -93,7 +115,7 @@ def main(): qsl_indices.append(idx) # reconstruct label from mlperf accuracy log - data = np.frombuffer(bytes.fromhex(j['data']), dtype_map[args.dtype]) + data = np.frombuffer(bytes.fromhex(j["data"]), dtype_map[args.dtype]) # data stores both predictions and targets output_count = 2 if log_contains_gt else 1 @@ -116,7 +138,11 @@ def main(): good += 1 else: if args.verbose: - print("{}:{}, expected: {}, found {}".format(idx, k, target, result.round())) + print( + "{}:{}, expected: {}, found {}".format( + idx, k, target, result.round() + ) + ) if not log_contains_gt: all_targets = get_targets(args, qsl_indices) @@ -131,9 +157,16 @@ def main(): roc_auc = sklearn.metrics.roc_auc_score(all_targets, all_results) # compute accuracy metric acc = good / total - print("AUC={:.3f}%, accuracy={:.3f}%, good={}, total={}, queries={}".format(100. * roc_auc, 100. 
* acc, good, total, len(seen))) + print( + "AUC={:.3f}%, accuracy={:.3f}%, good={}, total={}, queries={}".format( + 100.0 * roc_auc, 100.0 * acc, good, total, len(seen) + ) + ) if args.verbose: - print("found and ignored {} query dupes".format(len(results) - len(seen))) + print( + "found and ignored {} query dupes".format( + len(results) - + len(seen))) if __name__ == "__main__": diff --git a/recommendation/dlrm_v2/pytorch/tools/quickgen.py b/recommendation/dlrm_v2/pytorch/tools/quickgen.py index 895a7cede..c9cf4cbd3 100644 --- a/recommendation/dlrm_v2/pytorch/tools/quickgen.py +++ b/recommendation/dlrm_v2/pytorch/tools/quickgen.py @@ -1,21 +1,26 @@ -''' +""" quick generator of random samples for debugging -''' +""" import sys import argparse import numpy as np + def quickgen(num_samples, num_t, num_d, multihot_sizes, text_file=None): # generate place holder random array, including dense features - dense_features = np.random.uniform(0., 9., size = (num_samples, num_d)).astype(np.float32) + dense_features = np.random.uniform(0.0, 9.0, size=(num_samples, num_d)).astype( + np.float32 + ) # generate targets labels = np.random.randint(0, 2, (num_samples, num_t), dtype=np.int32) # generate sparse features sparse_features = {} limit = 2 for k, size in enumerate(multihot_sizes): - sparse_features[str(k)] = np.random.randint(0, limit, (num_samples, size), dtype=np.int32) + sparse_features[str(k)] = np.random.randint( + 0, limit, (num_samples, size), dtype=np.int32 + ) # generate print format if text_file is not None: np.save(text_file + "_dense_debug.npy", dense_features) @@ -24,28 +29,30 @@ def quickgen(num_samples, num_t, num_d, multihot_sizes, text_file=None): return dense_features, sparse_features, labels + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Quick generator of random samples for debugging." 
) - parser.add_argument("--num-samples", type=int, default=4096) - parser.add_argument("--num-dense-features", type=int, default=13) - parser.add_argument("--num-multihot-features", type=str, default="4,3,2") - parser.add_argument("--num-targets", type=int, default=1) - parser.add_argument("--day", type=int, default=23) - parser.add_argument("--numpy-rand-seed", type=int, default=123) - parser.add_argument("--output-name", type=str, default="day_") - parser.add_argument("--output-dir", type=str, default="./") + parser.add_argument("--num-samples", type=int, default=4096) + parser.add_argument("--num-dense-features", type=int, default=13) + parser.add_argument("--num-multihot-features", type=str, default="4,3,2") + parser.add_argument("--num-targets", type=int, default=1) + parser.add_argument("--day", type=int, default=23) + parser.add_argument("--numpy-rand-seed", type=int, default=123) + parser.add_argument("--output-name", type=str, default="day_") + parser.add_argument("--output-dir", type=str, default="./") args = parser.parse_args() np.random.seed(args.numpy_rand_seed) out_name = args.output_name - multihot_sizes = np.fromstring(args.num_multihot_features, dtype=int, sep=",") + multihot_sizes = np.fromstring( + args.num_multihot_features, dtype=int, sep=",") - num_d = args.num_dense_features - num_t = args.num_targets + num_d = args.num_dense_features + num_t = args.num_targets out_dir = args.output_dir - text_file = out_dir + out_name + str(args.day) + text_file = out_dir + out_name + str(args.day) print(text_file) quickgen(args.num_samples, num_t, num_d, multihot_sizes, text_file) diff --git a/retired_benchmarks/never_adopted/language/gpt3/megatron/backend.py b/retired_benchmarks/never_adopted/language/gpt3/megatron/backend.py index 39605cd7b..828d26d55 100644 --- a/retired_benchmarks/never_adopted/language/gpt3/megatron/backend.py +++ b/retired_benchmarks/never_adopted/language/gpt3/megatron/backend.py @@ -54,15 +54,23 @@ def issue_queries(self, query_samples): response_array = array.array("B", pred_output_batch[0].tobytes()) bi = response_array.buffer_info() - response = [lg.QuerySampleResponse(query_samples[i].id, bi[0], bi[1])] + response = [ + lg.QuerySampleResponse( + query_samples[i].id, + bi[0], + bi[1])] lg.QuerySamplesComplete(response) if i % 5 == 0: print("Completed : ", i) def inference_call(self, input_ids_tensor, input_length_tensor): """Common for all scenarios""" - data = {"input_ids": input_ids_tensor, "input_length": input_length_tensor} - response = requests.put(self.url, data=json.dumps(data), headers=self.headers) + data = {"input_ids": input_ids_tensor, + "input_length": input_length_tensor} + response = requests.put( + self.url, + data=json.dumps(data), + headers=self.headers) if response.status_code != 200: # TODO: Manage exeption return None @@ -80,7 +88,10 @@ def __del__(self): class SUT_Offline(SUT_base): def __init__( - self, dataset_path, max_examples, args, + self, + dataset_path, + max_examples, + args, ): SUT_base.__init__( self, @@ -94,7 +105,10 @@ def __init__( class SUT_Server(SUT_base): def __init__( - self, dataset_path, max_examples, args, + self, + dataset_path, + max_examples, + args, ): SUT_base.__init__( @@ -114,9 +128,8 @@ def issue_queries(self, query_samples): # input_masks_tensor = self.data_object.source_encoded_attn_masks[index] input_length_tensor = self.data_object.source_encoded_input_id_lengths[index] - pred_output_batch = ( - self.inference_call(input_ids_tensor, input_length_tensor) - ) + pred_output_batch = 
self.inference_call( + input_ids_tensor, input_length_tensor) response_array = array.array("B", pred_output_batch[0].tobytes()) bi = response_array.buffer_info() @@ -129,7 +142,10 @@ def issue_queries(self, query_samples): class SUT_SingleStream(SUT_base): def __init__( - self, dataset_path, max_examples, args, + self, + dataset_path, + max_examples, + args, ): SUT_base.__init__( self, @@ -147,9 +163,8 @@ def issue_queries(self, query_samples): # input_masks_tensor = self.data_object.source_encoded_attn_masks[index] input_length_tensor = self.data_object.source_encoded_input_id_lengths[index] - pred_output_batch = ( - self.inference_call(input_ids_tensor, input_length_tensor) - ) + pred_output_batch = self.inference_call( + input_ids_tensor, input_length_tensor) response_array = array.array("B", pred_output_batch[0].tobytes()) bi = response_array.buffer_info() diff --git a/retired_benchmarks/never_adopted/language/gpt3/megatron/dataset.py b/retired_benchmarks/never_adopted/language/gpt3/megatron/dataset.py index f8bf44388..5d67dd411 100644 --- a/retired_benchmarks/never_adopted/language/gpt3/megatron/dataset.py +++ b/retired_benchmarks/never_adopted/language/gpt3/megatron/dataset.py @@ -1,22 +1,17 @@ +from argparse import Namespace +import json +import utils +import argparse +import torch +from megatron.utils import get_ltor_masks_and_position_ids +from megatron.tokenizer import build_tokenizer import os import sys sys.path.append(os.environ["MEGATRON_PATH"]) -from megatron.tokenizer import build_tokenizer -from megatron.utils import get_ltor_masks_and_position_ids -import torch -import argparse -import utils -import json - -from argparse import Namespace -PROMPT_DICT = { - "prompt_input": ( - "{instruction}{input}" - ) -} +PROMPT_DICT = {"prompt_input": ("{instruction}{input}")} class Dataset: @@ -38,18 +33,17 @@ def __init__( self.debug = debug self.gen_kwards = gen_kwards - ## TODO: provide arguments in command line + # TODO: provide arguments in command line args.rank = 0 args.tokenizer_type = "SentencePieceTokenizer" args.vocab_extra_ids = 0 - if 'make_vocab_size_divisible_by' not in vars(args): + if "make_vocab_size_divisible_by" not in vars(args): args.make_vocab_size_divisible_by = 128 - if 'tensor_model_parallel_size' not in vars(args): + if "tensor_model_parallel_size" not in vars(args): args.tensor_model_parallel_size = 8 - if 'tokenizer_model' not in vars(args): + if "tokenizer_model" not in vars(args): args.tokenizer_model = "./data/c4_en_301_5Mexp2_spm.model" - - + self.tokenizer = build_tokenizer(args) self.list_data_dict = utils.jload(self.dataset_path) @@ -59,7 +53,8 @@ def __init__( self.sources = [ prompt_input.format_map(example) for example in self.list_data_dict ] - self.targets = [f"{example['output']}" for example in self.list_data_dict] + self.targets = [ + f"{example['output']}" for example in self.list_data_dict] ( self.source_encoded_input_ids, @@ -99,20 +94,25 @@ def tokenize_prompts(self, prompts, tokens_to_generate, add_BOS): for prompt in prompts ] else: - prompts_tokens = [self.tokenizer.tokenize(prompt)[:self.max_input_tokens] for prompt in prompts] + prompts_tokens = [ + self.tokenizer.tokenize(prompt)[: self.max_input_tokens] + for prompt in prompts + ] # Now we have a list of list of tokens which each list has a different # size. We want to extend this list to: # - incorporate the tokens that need to be generated # - make all the sequences equal length. # Get the prompts length. 
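The prompt-padding step described in the comments above (extend every tokenized prompt so all rows share the length max_prompt_len + tokens_to_generate) can be restated as a small standalone helper. This is only an illustrative sketch of the same logic; pad_id stands in for self.tokenizer.pad and the function name is hypothetical.

def pad_prompts(prompts_tokens, tokens_to_generate, pad_id):
    # Length of each tokenized prompt before padding.
    prompts_length = [len(tokens) for tokens in prompts_tokens]
    # Every row is extended to the longest prompt plus the generation budget.
    samples_length = max(prompts_length) + tokens_to_generate
    for tokens, length in zip(prompts_tokens, prompts_length):
        tokens.extend([pad_id] * (samples_length - length))
    return prompts_tokens, prompts_length

# example: pad_prompts([[5, 6, 7], [8]], tokens_to_generate=2, pad_id=0)
# -> ([[5, 6, 7, 0, 0], [8, 0, 0, 0, 0]], [3, 1])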
- prompts_length = [len(prompt_tokens) for prompt_tokens in prompts_tokens] + prompts_length = [len(prompt_tokens) + for prompt_tokens in prompts_tokens] # Get the max prompts length. max_prompt_len = max(prompts_length) # Number of tokens in the each sample of the batch. samples_length = max_prompt_len + tokens_to_generate # Now update the list of list to be of the same size: samples_length. - for prompt_tokens, prompt_length in zip(prompts_tokens, prompts_length): + for prompt_tokens, prompt_length in zip( + prompts_tokens, prompts_length): padding_size = samples_length - prompt_length prompt_tokens.extend([self.tokenizer.pad] * padding_size) @@ -139,7 +139,8 @@ def encode_samples(self): "--------------------------------------------------------------------------------" ) tokens, length = self.tokenize_prompts( - [self.sources[i]], self.gen_kwards.get("max_new_tokens", 128), None + [self.sources[i]], self.gen_kwards.get( + "max_new_tokens", 128), None ) # attn_mask = self._build_attention_mask(tokens) source_encoded_input_ids.append(tokens) @@ -164,4 +165,3 @@ def UnloadSamplesFromRam(self, sample_list): def __del__(self): print("Finished destroying QSL.") - diff --git a/retired_benchmarks/never_adopted/language/gpt3/megatron/download_cnndm.py b/retired_benchmarks/never_adopted/language/gpt3/megatron/download_cnndm.py index ec47912ea..1c2303a97 100644 --- a/retired_benchmarks/never_adopted/language/gpt3/megatron/download_cnndm.py +++ b/retired_benchmarks/never_adopted/language/gpt3/megatron/download_cnndm.py @@ -1,15 +1,14 @@ # experiment config +import sys +import simplejson as json +import os +import numpy as np +from datasets import load_dataset dataset_id = "cnn_dailymail" dataset_config = "3.0.0" text_column = "article" summary_column = "highlights" -from datasets import load_dataset - -import numpy as np -import os -import simplejson as json -import sys save_dataset_path = os.environ.get("DATASET_CNNDM_PATH", "data") diff --git a/retired_benchmarks/never_adopted/language/gpt3/megatron/evaluation.py b/retired_benchmarks/never_adopted/language/gpt3/megatron/evaluation.py index 4e32289fc..a1ad32b49 100644 --- a/retired_benchmarks/never_adopted/language/gpt3/megatron/evaluation.py +++ b/retired_benchmarks/never_adopted/language/gpt3/megatron/evaluation.py @@ -1,4 +1,4 @@ -from dataset import Dataset +from dataset import Dataset import numpy as np import json import nltk @@ -11,17 +11,26 @@ def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() - parser.add_argument("--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json") - parser.add_argument("--dataset-file", required=True, help="path to cnn_eval.json") + parser.add_argument( + "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" + ) + parser.add_argument( + "--dataset-file", + required=True, + help="path to cnn_eval.json") parser.add_argument( "--tokenizer-model", default="./data/c4_en_301_5Mexp2_spm.model", help="Path to tokenizer model", ) - parser.add_argument("--verbose", action="store_true", help="verbose messages") + parser.add_argument( + "--verbose", + action="store_true", + help="verbose messages") args = parser.parse_args() return args + def postprocess_text(preds, targets): preds = [pred.strip() for pred in preds] targets = [target.strip() for target in targets] @@ -37,9 +46,9 @@ def main(): args = get_args() dataset_path = args.dataset_file metric = evaluate.load("rouge") - nltk.download('punkt') - - dataset_args = Namespace(tokenizer_model = 
args.tokenizer_model) + nltk.download("punkt") + + dataset_args = Namespace(tokenizer_model=args.tokenizer_model) data_object = Dataset(dataset_path, args=dataset_args) targets = data_object.targets @@ -47,24 +56,25 @@ def main(): with open(args.mlperf_accuracy_file, "r") as f: results = json.load(f) - target_required = [] preds_token_ids = [] for pred in results: - qsl_idx = pred['qsl_idx'] + qsl_idx = pred["qsl_idx"] target = targets[qsl_idx] target_required.append(target) - preds = np.frombuffer(bytes.fromhex(pred['data']), np.int64).tolist() + preds = np.frombuffer(bytes.fromhex(pred["data"]), np.int64).tolist() preds = [int(p) for p in preds] preds_token_ids.append(preds) - - preds_decoded_text = [data_object.tokenizer.detokenize(ids) for ids in preds_token_ids] + preds_decoded_text = [ + data_object.tokenizer.detokenize(ids) for ids in preds_token_ids + ] preds, targets = postprocess_text(preds_decoded_text, target_required) - - result = metric.compute(predictions=preds, references=targets, use_stemmer=True,use_aggregator=False) + result = metric.compute( + predictions=preds, references=targets, use_stemmer=True, use_aggregator=False + ) result = {k: round(np.mean(v) * 100, 4) for k, v in result.items()} prediction_lens = [len(pred) for pred in preds] result["gen_len"] = np.sum(prediction_lens) @@ -72,5 +82,6 @@ def main(): print("\nResults\n") print(result) + if __name__ == "__main__": main() diff --git a/retired_benchmarks/never_adopted/language/gpt3/megatron/main.py b/retired_benchmarks/never_adopted/language/gpt3/megatron/main.py index 183278e52..2da5ea949 100644 --- a/retired_benchmarks/never_adopted/language/gpt3/megatron/main.py +++ b/retired_benchmarks/never_adopted/language/gpt3/megatron/main.py @@ -18,8 +18,14 @@ def get_args(): default="Offline", help="Scenario", ) - parser.add_argument("--dataset-path", default="./data/cnn_eval.json", help="") - parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") + parser.add_argument( + "--dataset-path", + default="./data/cnn_eval.json", + help="") + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") parser.add_argument( "--mlperf_conf", default="mlperf.conf", help="mlperf rules config" ) diff --git a/retired_benchmarks/never_adopted/language/gpt3/megatron/prepare-calibration.py b/retired_benchmarks/never_adopted/language/gpt3/megatron/prepare-calibration.py index 846e492d3..91bcb0826 100644 --- a/retired_benchmarks/never_adopted/language/gpt3/megatron/prepare-calibration.py +++ b/retired_benchmarks/never_adopted/language/gpt3/megatron/prepare-calibration.py @@ -4,32 +4,39 @@ from argparse import ArgumentParser from datasets import load_dataset + def get_args(): parser = ArgumentParser() - parser.add_argument("--calibration-list-file", required=True, help="Path to calibration list") - parser.add_argument("--output-dir", help="Output directory", default="calibration-data") + parser.add_argument( + "--calibration-list-file", required=True, help="Path to calibration list" + ) + parser.add_argument( + "--output-dir", help="Output directory", default="calibration-data" + ) return parser.parse_args() -dataset_id='cnn_dailymail' -version='3.0.0' -split='train' -instruction_template="Summarize the following news article:" +dataset_id = "cnn_dailymail" +version = "3.0.0" +split = "train" + +instruction_template = "Summarize the following news article:" + def check_path(path): return os.path.exists(path) + def prepare_calibration_data(calibration_list_file, output_dir): if not 
check_path(calibration_list_file): print("Calibration list file not found: {}".format(calibration_list_file)) sys.exit(1) - dataset = load_dataset("cnn_dailymail", name="3.0.0", split='train') - train = dict((x['id'], x) for x in dataset) + dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") + train = dict((x["id"], x) for x in dataset) - - with open(calibration_list_file, 'r') as fid: + with open(calibration_list_file, "r") as fid: calibration_ids = fid.read().splitlines() inputs = [] @@ -40,20 +47,22 @@ def prepare_calibration_data(calibration_list_file, output_dir): x["input"] = calibration_sample["article"] x["output"] = calibration_sample["highlights"] inputs.append(x) - + if not os.path.isdir(output_dir): os.makedirs(output_dir) - output_path = os.path.join(output_dir,"cnn_dailymail_calibration.json") - with open(output_path, 'w') as write_f: + output_path = os.path.join(output_dir, "cnn_dailymail_calibration.json") + with open(output_path, "w") as write_f: json.dump(inputs, write_f, indent=4, ensure_ascii=False) print("Calibration data saved at {}".format(output_path)) + def main(): args = get_args() prepare_calibration_data(args.calibration_list_file, args.output_dir) -if __name__=="__main__": + +if __name__ == "__main__": main() diff --git a/retired_benchmarks/never_adopted/language/gpt3/megatron/text_generation_server.py b/retired_benchmarks/never_adopted/language/gpt3/megatron/text_generation_server.py index 963938435..d4a259100 100644 --- a/retired_benchmarks/never_adopted/language/gpt3/megatron/text_generation_server.py +++ b/retired_benchmarks/never_adopted/language/gpt3/megatron/text_generation_server.py @@ -1,29 +1,24 @@ -import os -import sys - -sys.path.append(os.environ["MEGATRON_PATH"]) - -import datetime -import torch -import json -import threading -from flask import Flask, request, jsonify, current_app -from flask_restful import Resource, Api -from megatron import get_args +from megatron.training import get_model +from megatron.model import GPTModel +from megatron.initialize import initialize_megatron +from megatron.checkpointing import load_checkpoint +from megatron import mpu +from megatron import print_rank_0 from megatron.text_generation.generation import ( generate_tokens_probs_and_return_on_first_stage, - beam_search_and_return_on_first_stage + beam_search_and_return_on_first_stage, ) - - from megatron import get_args -from megatron import print_rank_0 -from megatron import mpu -from megatron.checkpointing import load_checkpoint -from megatron.initialize import initialize_megatron -from megatron.model import GPTModel -from megatron.training import get_model +from flask_restful import Resource, Api +from flask import Flask, request, jsonify, current_app +import threading +import json import torch +import datetime +import os +import sys + +sys.path.append(os.environ["MEGATRON_PATH"]) GENERATE_NUM = 0 @@ -76,9 +71,11 @@ def put(self): try: if self.use_beam_search: try: - MegatronGenerate.send_do_beam_search() # Tell other ranks we're doing beam_search - input_ids_tensor, input_length_tensor = MegatronGenerate.sync_input( - input_ids, input_length + # Tell other ranks we're doing beam_search + MegatronGenerate.send_do_beam_search() + input_ids_tensor, input_length_tensor = ( + MegatronGenerate.sync_input( + input_ids, input_length) ) ( output_tokens, @@ -88,13 +85,20 @@ def put(self): input_ids_tensor, input_length_tensor, beam_size=self.gen_kwargs.get("beam_size", 4), - stop_token = self.gen_kwargs.get("beam_stop_token", 1), - num_return_gen = 
self.gen_kwargs.get("beam_num_return_gen", 1), - length_penalty = self.gen_kwargs.get("beam_length_penalty", 1), - min_length = self.gen_kwargs.get("min_new_tokens", 30), + stop_token=self.gen_kwargs.get( + "beam_stop_token", 1), + num_return_gen=self.gen_kwargs.get( + "beam_num_return_gen", 1 + ), + length_penalty=self.gen_kwargs.get( + "beam_length_penalty", 1 + ), + min_length=self.gen_kwargs.get( + "min_new_tokens", 30), ) output_batch_truncated = [] - for data, source_len in zip(output_tokens, input_length_tensor): + for data, source_len in zip( + output_tokens, input_length_tensor): output_batch_truncated.append( data[source_len:].cpu().numpy().tolist() ) @@ -108,8 +112,9 @@ def put(self): else: try: MegatronGenerate.send_do_generate() # Tell other ranks we're doing generate - input_ids_tensor, input_length_tensor = MegatronGenerate.sync_input( - input_ids, input_length + input_ids_tensor, input_length_tensor = ( + MegatronGenerate.sync_input( + input_ids, input_length) ) ( output_tokens, @@ -120,11 +125,13 @@ def put(self): input_ids_tensor, input_length_tensor, top_k=self.gen_kwargs.get("top_k", 4), - temperature=self.gen_kwargs.get("temperature", 0.0), - min_length = gen_kwargs.get("min_new_tokens", 30), + temperature=self.gen_kwargs.get( + "temperature", 0.0), + min_length=gen_kwargs.get("min_new_tokens", 30), ) output_batch_truncated = [] - for data, source_len in zip(output_tokens, input_length_tensor): + for data, source_len in zip( + output_tokens, input_length_tensor): output_batch_truncated.append( data[source_len:].cpu().numpy().tolist() ) @@ -167,8 +174,8 @@ def model_provider(pre_process=True, post_process=True): def add_text_generate_args(parser): - group = parser.add_argument_group(title='text generation') - group.add_argument("--use-beam-search", action = "store_true") + group = parser.add_argument_group(title="text generation") + group.add_argument("--use-beam-search", action="store_true") return parser @@ -179,7 +186,7 @@ def add_text_generate_args(parser): "tokenizer_type": "SentencePieceTokenizer", "no_load_rng": True, "no_load_optim": True, - } + }, ) args = get_args() @@ -193,7 +200,7 @@ def add_text_generate_args(parser): "beam_size": 4, "beam_stop_token": 1, "beam_num_return_gen": 1, - "beam_length_penalty": 1 + "beam_length_penalty": 1, } if args.num_layers_per_virtual_pipeline_stage is not None: print("Interleaved pipeline schedule is not yet supported for text generation.") @@ -236,7 +243,7 @@ def add_text_generate_args(parser): input_length_tensor, top_k=gen_kwargs.get("top_k", 4), temperature=gen_kwargs.get("temperature", 1.0), - min_length = gen_kwargs.get("min_new_tokens", 30), + min_length=gen_kwargs.get("min_new_tokens", 30), ) except ValueError as ve: pass @@ -261,10 +268,10 @@ def add_text_generate_args(parser): input_ids_tensor, input_length_tensor, beam_size=gen_kwargs.get("beam_size", 4), - stop_token = gen_kwargs.get("beam_stop_token", 1), - num_return_gen = gen_kwargs.get("beam_num_return_gen", 1), - length_penalty = gen_kwargs.get("beam_length_penalty", 1), - min_length = gen_kwargs.get("min_new_tokens", 30), + stop_token=gen_kwargs.get("beam_stop_token", 1), + num_return_gen=gen_kwargs.get("beam_num_return_gen", 1), + length_penalty=gen_kwargs.get("beam_length_penalty", 1), + min_length=gen_kwargs.get("min_new_tokens", 30), ) except ValueError as ve: pass diff --git a/retired_benchmarks/never_adopted/language/gpt3/megatron/utils.py b/retired_benchmarks/never_adopted/language/gpt3/megatron/utils.py index 0fb2fbcd3..b0ceeaebc 100644 --- 
a/retired_benchmarks/never_adopted/language/gpt3/megatron/utils.py +++ b/retired_benchmarks/never_adopted/language/gpt3/megatron/utils.py @@ -2,11 +2,13 @@ import os import io + def _make_r_io_base(f, mode: str): if not isinstance(f, io.IOBase): - f = open(f, mode=mode, encoding='utf-8') + f = open(f, mode=mode, encoding="utf-8") return f + def jload(f, mode="r"): """Load a .json file into a dictionary.""" f = _make_r_io_base(f, mode) diff --git a/retired_benchmarks/recommendation/dlrm/pytorch/python/backend.py b/retired_benchmarks/recommendation/dlrm/pytorch/python/backend.py index 955eddb88..6fc13454a 100755 --- a/retired_benchmarks/recommendation/dlrm/pytorch/python/backend.py +++ b/retired_benchmarks/recommendation/dlrm/pytorch/python/backend.py @@ -2,10 +2,10 @@ abstract backend class """ - # pylint: disable=unused-argument,missing-docstring -class Backend(): + +class Backend: def __init__(self): self.inputs = [] self.outputs = [] diff --git a/retired_benchmarks/recommendation/dlrm/pytorch/python/backend_onnxruntime.py b/retired_benchmarks/recommendation/dlrm/pytorch/python/backend_onnxruntime.py index 79ed48842..e58de806a 100755 --- a/retired_benchmarks/recommendation/dlrm/pytorch/python/backend_onnxruntime.py +++ b/retired_benchmarks/recommendation/dlrm/pytorch/python/backend_onnxruntime.py @@ -11,7 +11,8 @@ class BackendOnnxruntime(backend.Backend): - def __init__(self, m_spa, ln_emb, ln_bot, ln_top, use_gpu=False, mini_batch_size=1): + def __init__(self, m_spa, ln_emb, ln_bot, ln_top, + use_gpu=False, mini_batch_size=1): super(BackendOnnxruntime, self).__init__() def version(self): @@ -30,11 +31,11 @@ def load(self, model_path, inputs=None, outputs=None): # print("onnx load", model_path, inputs, outputs) self.sess = rt.InferenceSession(model_path, opt) # get input and output names - if True: #not inputs: + if True: # not inputs: self.inputs = [meta.name for meta in self.sess.get_inputs()] else: self.inputs = inputs - if True: #not outputs: + if True: # not outputs: self.outputs = [meta.name for meta in self.sess.get_outputs()] else: self.outputs = outputs @@ -63,16 +64,22 @@ def predict(self, batch_dense_X, batch_lS_o, batch_lS_i): dict_inputs["offsets"] = batch_lS_o.numpy().astype(np.int64) else: # list for i in range(len(batch_lS_o)): - dict_inputs["offsets_"+str(i)] = batch_lS_o[i].numpy().astype(np.int64) + dict_inputs["offsets_" + str(i)] = ( + batch_lS_o[i].numpy().astype(np.int64) + ) if torch.is_tensor(batch_lS_i): dict_inputs["indices"] = batch_lS_i.numpy().astype(np.int64) else: # list for i in range(len(batch_lS_i)): - dict_inputs["indices_"+str(i)] = batch_lS_i[i].numpy().astype(np.int64) + dict_inputs["indices_" + str(i)] = ( + batch_lS_i[i].numpy().astype(np.int64) + ) # predict and return output # print("dict_inputs", dict_inputs) - output = self.sess.run(output_names=self.outputs, input_feed=dict_inputs) + output = self.sess.run( + output_names=self.outputs, + input_feed=dict_inputs) output = torch.tensor(output, requires_grad=False).view(-1, 1) # print("output", output) # print("output.shape", output.shape) diff --git a/retired_benchmarks/recommendation/dlrm/pytorch/python/backend_pytorch_native.py b/retired_benchmarks/recommendation/dlrm/pytorch/python/backend_pytorch_native.py index a0ae91d65..d861c44dc 100755 --- a/retired_benchmarks/recommendation/dlrm/pytorch/python/backend_pytorch_native.py +++ b/retired_benchmarks/recommendation/dlrm/pytorch/python/backend_pytorch_native.py @@ -1,14 +1,17 @@ """ pytoch native backend for dlrm """ + # pylint: 
disable=unused-argument,missing-docstring import torch # currently supports pytorch1.0 import backend from dlrm_s_pytorch import DLRM_Net import numpy as np + class BackendPytorchNative(backend.Backend): - def __init__(self, m_spa, ln_emb, ln_bot, ln_top, use_gpu=False, mini_batch_size=1): + def __init__(self, m_spa, ln_emb, ln_bot, ln_top, + use_gpu=False, mini_batch_size=1): super(BackendPytorchNative, self).__init__() self.sess = None self.model = None @@ -73,12 +76,12 @@ def load(self, model_path, inputs=None, outputs=None): # note that the call to .to(device) has already happened ld_model = torch.load( model_path, - map_location=torch.device('cuda') + map_location=torch.device("cuda"), # map_location=lambda storage, loc: storage.cuda(0) ) else: # when targeting inference on CPU - ld_model = torch.load(model_path, map_location=torch.device('cpu')) + ld_model = torch.load(model_path, map_location=torch.device("cpu")) # debug print # print(ld_model) dlrm.load_state_dict(ld_model["state_dict"]) @@ -114,13 +117,21 @@ def predict(self, batch_dense_X, batch_lS_o, batch_lS_i): if self.use_gpu: batch_dense_X = batch_dense_X.to(self.device) - batch_lS_i = [S_i.to(self.device) for S_i in batch_lS_i] if isinstance(batch_lS_i, list) \ + batch_lS_i = ( + [S_i.to(self.device) for S_i in batch_lS_i] + if isinstance(batch_lS_i, list) else batch_lS_i.to(self.device) + ) - - batch_lS_o = [S_o.to(self.device) for S_o in batch_lS_o] if isinstance(batch_lS_o, list) \ + batch_lS_o = ( + [S_o.to(self.device) for S_o in batch_lS_o] + if isinstance(batch_lS_o, list) else batch_lS_o.to(self.device) + ) with torch.no_grad(): - output = self.model(dense_x=batch_dense_X, lS_o=batch_lS_o, lS_i=batch_lS_i) + output = self.model( + dense_x=batch_dense_X, + lS_o=batch_lS_o, + lS_i=batch_lS_i) return output diff --git a/retired_benchmarks/recommendation/dlrm/pytorch/python/backend_tf.py b/retired_benchmarks/recommendation/dlrm/pytorch/python/backend_tf.py index 3b3bcf619..9a699f65b 100644 --- a/retired_benchmarks/recommendation/dlrm/pytorch/python/backend_tf.py +++ b/retired_benchmarks/recommendation/dlrm/pytorch/python/backend_tf.py @@ -1,9 +1,11 @@ """ pytoch native backend for dlrm """ + # pylint: disable=unused-argument,missing-docstring import torch # currently supports pytorch1.0 import backend + # from dlrm_s_pytorch import DLRM_Net import tensorflow as tf from tf_dlrm import logits_fn, rand_features_np @@ -12,6 +14,7 @@ from typing import Dict, Any import sys + class BackendTF(backend.Backend): def __init__(self, dim_embed, vocab_sizes, mlp_bottom, mlp_top): super(BackendTF, self).__init__() @@ -55,9 +58,12 @@ def load(self, model_path, inputs=None, outputs=None): with self.graph.as_default(): - features_int_np, features_cat_np = rand_features_np(1, num_d, num_s, minsize) + features_int_np, features_cat_np = rand_features_np( + 1, num_d, num_s, minsize + ) - features_int = tf.placeholder(tf.float32, [None, num_d], name="ph_1") + features_int = tf.placeholder( + tf.float32, [None, num_d], name="ph_1") features_cat = tf.placeholder(tf.int32, [None, num_s], name="ph_2") preds = logits_fn(features_int, features_cat, self.params) @@ -68,7 +74,13 @@ def load(self, model_path, inputs=None, outputs=None): self.sess = tf.compat.v1.Session(graph=self.graph) self.sess.run(init_op) - self.sess.run(preds, feed_dict = {features_int : features_int_np, features_cat : features_cat_np} ) + self.sess.run( + preds, + feed_dict={ + features_int: features_int_np, + features_cat: features_cat_np, + }, + ) self.params["is_training"] = 
False @@ -92,12 +104,15 @@ def predict(self, batch_dense_X, batch_lS_o, batch_lS_i): # print_op_preds = tf.print(estim.predictions, output_stream=sys.stdout) - out_operation = self.graph.get_operation_by_name('preds') + out_operation = self.graph.get_operation_by_name("preds") - ph_1 = self.graph.get_tensor_by_name('ph_1:0') - ph_2 = self.graph.get_tensor_by_name('ph_2:0') + ph_1 = self.graph.get_tensor_by_name("ph_1:0") + ph_2 = self.graph.get_tensor_by_name("ph_2:0") - np_tensor_out = out_operation.outputs[0].eval(session=self.sess, feed_dict = {ph_1 : np_tensor_int, ph_2 : np_tensor_cat}) + np_tensor_out = out_operation.outputs[0].eval( + session=self.sess, feed_dict={ + ph_1: np_tensor_int, ph_2: np_tensor_cat} + ) # print("1st output element: ", np_tensor_out[:1]) output = torch.from_numpy(np_tensor_out) diff --git a/retired_benchmarks/recommendation/dlrm/pytorch/python/criteo.py b/retired_benchmarks/recommendation/dlrm/pytorch/python/criteo.py index 3ea73b0ae..139edb848 100755 --- a/retired_benchmarks/recommendation/dlrm/pytorch/python/criteo.py +++ b/retired_benchmarks/recommendation/dlrm/pytorch/python/criteo.py @@ -4,6 +4,8 @@ # pylint: disable=unused-argument,missing-docstring +import data_loader_terabyte +import dlrm_data_pytorch as dp import logging import os import sys @@ -14,6 +16,7 @@ import numpy as np import sklearn.metrics import inspect + # pytorch import torch from torch.utils.data import Dataset, RandomSampler @@ -23,44 +26,47 @@ # add dlrm code path try: - dlrm_dir_path = os.environ['DLRM_DIR'] + dlrm_dir_path = os.environ["DLRM_DIR"] sys.path.append(dlrm_dir_path) except KeyError: print("ERROR: Please set DLRM_DIR environment variable to the dlrm code location") sys.exit(0) -#import dataset -import dlrm_data_pytorch as dp -import data_loader_terabyte +# import dataset class Criteo(Dataset): - def __init__(self, - data_path, - name, - pre_process, - use_cache, - count=None, - samples_to_aggregate_fix=None, - samples_to_aggregate_min=None, - samples_to_aggregate_max=None, - samples_to_aggregate_quantile_file=None, - samples_to_aggregate_trace_file=None, - test_num_workers=0, - max_ind_range=-1, - sub_sample_rate=0.0, - mlperf_bin_loader=False, - randomize="total", - memory_map=False): + def __init__( + self, + data_path, + name, + pre_process, + use_cache, + count=None, + samples_to_aggregate_fix=None, + samples_to_aggregate_min=None, + samples_to_aggregate_max=None, + samples_to_aggregate_quantile_file=None, + samples_to_aggregate_trace_file=None, + test_num_workers=0, + max_ind_range=-1, + sub_sample_rate=0.0, + mlperf_bin_loader=False, + randomize="total", + memory_map=False, + ): super().__init__() self.count = count self.random_offsets = [] - self.use_fixed_size = ((samples_to_aggregate_quantile_file is None) and - (samples_to_aggregate_min is None or samples_to_aggregate_max is None)) + self.use_fixed_size = (samples_to_aggregate_quantile_file is None) and ( + samples_to_aggregate_min is None or samples_to_aggregate_max is None + ) if self.use_fixed_size: # fixed size queries - self.samples_to_aggregate = 1 if samples_to_aggregate_fix is None else samples_to_aggregate_fix + self.samples_to_aggregate = ( + 1 if samples_to_aggregate_fix is None else samples_to_aggregate_fix + ) self.samples_to_aggregate_min = None self.samples_to_aggregate_max = None else: @@ -77,8 +83,11 @@ def __init__(self, raw_data_file = data_path + "/day" processed_data_file = data_path + "/terabyte_processed.npz" else: - raise ValueError("only kaggle|terabyte dataset options are supported") 
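The query-size handling in Criteo.__init__ above reduces to a three-way choice driven by the constructor arguments. The sketch below restates that predicate under illustrative names; the original keeps the same logic inline rather than in a helper.

def query_size_mode(fix=None, size_min=None, size_max=None, quantile_file=None):
    # Fixed-size queries unless a quantile file or a complete (min, max)
    # range for random query sizes is supplied.
    use_fixed_size = quantile_file is None and (size_min is None or size_max is None)
    if use_fixed_size:
        return "fixed", (fix if fix is not None else 1)
    if quantile_file is not None:
        return "quantile", quantile_file
    return "uniform", (size_min, size_max)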
- self.use_mlperf_bin_loader = mlperf_bin_loader and memory_map and name == "terabyte" + raise ValueError( + "only kaggle|terabyte dataset options are supported") + self.use_mlperf_bin_loader = ( + mlperf_bin_loader and memory_map and name == "terabyte" + ) # debug prints # print("dataset filenames", raw_data_file, processed_data_file) @@ -90,25 +99,26 @@ def __init__(self, split="test", raw_path=raw_data_file, pro_data=processed_data_file, - memory_map=memory_map + memory_map=memory_map, ) self.num_individual_samples = len(self.test_data) if self.use_mlperf_bin_loader: test_file = data_path + "/terabyte_processed_test.bin" - counts_file = raw_data_file + '_fea_count.npz' + counts_file = raw_data_file + "_fea_count.npz" data_loader_terabyte.numpy_to_binary( - input_files=[raw_data_file + '_23_reordered.npz'], + input_files=[raw_data_file + "_23_reordered.npz"], output_file_path=data_path + "/terabyte_processed_test.bin", - split="test") + split="test", + ) self.test_data = data_loader_terabyte.CriteoBinDataset( data_file=test_file, counts_file=counts_file, batch_size=self.samples_to_aggregate, - max_ind_range=max_ind_range + max_ind_range=max_ind_range, ) self.test_loader = torch.utils.data.DataLoader( @@ -138,32 +148,48 @@ def __init__(self, # of size samples_to_aggregate as an item we need to adjust the original dataset item_count. # On the other hand, data loader always returns number of batches. if self.use_fixed_size: - # the offsets for fixed query size will be generated on-the-fly later on + # the offsets for fixed query size will be generated on-the-fly + # later on print("Using fixed query size: " + str(self.samples_to_aggregate)) if self.use_mlperf_bin_loader: self.num_aggregated_samples = len(self.test_data) # self.num_aggregated_samples2 = len(self.test_loader) else: - self.num_aggregated_samples = (self.num_individual_samples + self.samples_to_aggregate - 1) // self.samples_to_aggregate + self.num_aggregated_samples = ( + self.num_individual_samples + self.samples_to_aggregate - 1 + ) // self.samples_to_aggregate # self.num_aggregated_samples2 = len(self.test_loader) else: # the offsets for variable query sizes will be pre-generated here if self.samples_to_aggregate_quantile_file is None: - # generate number of samples in a query from a uniform(min,max) distribution - print("Using variable query size: uniform distribution (" + str(self.samples_to_aggregate_min) + "," + str(self.samples_to_aggregate_max) + ")") + # generate number of samples in a query from a uniform(min,max) + # distribution + print( + "Using variable query size: uniform distribution (" + + str(self.samples_to_aggregate_min) + + "," + + str(self.samples_to_aggregate_max) + + ")" + ) done = False qo = 0 while done == False: self.random_offsets.append(int(qo)) - qs = random.randint(self.samples_to_aggregate_min, self.samples_to_aggregate_max) + qs = random.randint( + self.samples_to_aggregate_min, self.samples_to_aggregate_max + ) qo = min(qo + qs, self.num_individual_samples) if qo >= self.num_individual_samples: done = True self.random_offsets.append(int(qo)) # compute min and max number of samples - nas_max = (self.num_individual_samples + self.samples_to_aggregate_min - 1) // self.samples_to_aggregate_min - nas_min = (self.num_individual_samples + self.samples_to_aggregate_max - 1) // self.samples_to_aggregate_max + nas_max = ( + self.num_individual_samples + self.samples_to_aggregate_min - 1 + ) // self.samples_to_aggregate_min + nas_min = ( + self.num_individual_samples + self.samples_to_aggregate_max - 1 
+ ) // self.samples_to_aggregate_max else: # generate number of samples in a query from a custom distribution, # with quantile (inverse of its cdf) given in the file. Note that @@ -176,9 +202,14 @@ def __init__(self, # The inverse of its cdf with granularity of 0.05 can be written as # quantile_p = [.05, .10, .15, .20, .25, .30, .35, .40, .45, .50, .55, .60, .65, .70, .75, .80, .85, .90, .95, 1.0] # p # quantile_x = [100, 100, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 300, 300, 400, 500, 600, 700] # q(p) = x, such that f(x) >= p - # Notice that once we have quantile, we can apply inverse transform sampling method. - print("Using variable query size: custom distribution (file " + str(samples_to_aggregate_quantile_file) + ")") - with open(self.samples_to_aggregate_quantile_file, 'r') as f: + # Notice that once we have quantile, we can apply inverse + # transform sampling method. + print( + "Using variable query size: custom distribution (file " + + str(samples_to_aggregate_quantile_file) + + ")" + ) + with open(self.samples_to_aggregate_quantile_file, "r") as f: line = f.readline() quantile = np.fromstring(line, dtype=int, sep=", ") # debug prints @@ -198,41 +229,53 @@ def __init__(self, self.random_offsets.append(int(qo)) # compute min and max number of samples - nas_max = (self.num_individual_samples + quantile[0] - 1) // quantile[0] - nas_min = (self.num_individual_samples + quantile[-1]- 1) // quantile[-1] + nas_max = (self.num_individual_samples + + quantile[0] - 1) // quantile[0] + nas_min = (self.num_individual_samples + quantile[-1] - 1) // quantile[ + -1 + ] # reset num_aggregated_samples self.num_aggregated_samples = len(self.random_offsets) - 1 # check num_aggregated_samples - if self.num_aggregated_samples < nas_min or nas_max < self.num_aggregated_samples: + if ( + self.num_aggregated_samples < nas_min + or nas_max < self.num_aggregated_samples + ): raise ValueError("Sannity check failed") # limit number of items to count if needed if self.count is not None: - self.num_aggregated_samples = min(self.count, self.num_aggregated_samples) + self.num_aggregated_samples = min( + self.count, self.num_aggregated_samples) # dump the trace of aggregated samples if samples_to_aggregate_trace_file is not None: - with open(samples_to_aggregate_trace_file, 'w') as f: + with open(samples_to_aggregate_trace_file, "w") as f: for l in range(self.num_aggregated_samples): if self.use_fixed_size: s = l * self.samples_to_aggregate - e = min((l + 1) * self.samples_to_aggregate, self.num_individual_samples) + e = min( + (l + 1) * self.samples_to_aggregate, + self.num_individual_samples, + ) else: s = self.random_offsets[l] - e = self.random_offsets[l+1] - f.write(str(s) + ", " + str(e) + ", " + str(e-s) + "\n") + e = self.random_offsets[l + 1] + f.write(str(s) + ", " + str(e) + ", " + str(e - s) + "\n") def get_item_count(self): # get number of items in the dataset return self.num_aggregated_samples - ''' lg compatibilty routine ''' + """ lg compatibilty routine """ + def unload_query_samples(self, sample_list): self.items_in_memory = {} - ''' lg compatibilty routine ''' + """ lg compatibilty routine """ + def load_query_samples(self, sample_list): self.items_in_memory = {} @@ -242,16 +285,18 @@ def load_query_samples(self, sample_list): # while we can index into the dataset itself. 
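The custom-distribution query sizing explained in the comments above is plain inverse-transform sampling from a tabulated quantile function. A minimal sketch follows, using the example values from that comment; the helper name is hypothetical and not part of the original module.

import numpy as np

def sample_query_size(quantile, rng=np.random):
    # quantile[i] tabulates q(p) on a uniform grid of p, i.e. the inverse CDF;
    # drawing a uniform index and reading the table off is inverse-transform
    # sampling of the query size.
    return int(quantile[rng.randint(0, len(quantile))])

quantile = np.fromstring(
    "100, 100, 200, 200, 200, 200, 200, 200, 200, 200, "
    "200, 200, 200, 200, 300, 300, 400, 500, 600, 700",
    dtype=int, sep=", ",
)
print(sample_query_size(quantile))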
for l in sample_list: # approach 1: single sample as an item - ''' + """ self.items_in_memory[l] = self.test_data[l] - ''' + """ # approach 2: multiple samples as an item if self.use_fixed_size: s = l * self.samples_to_aggregate - e = min((l + 1) * self.samples_to_aggregate, self.num_individual_samples) + e = min( + (l + 1) * self.samples_to_aggregate, self.num_individual_samples + ) else: s = self.random_offsets[l] - e = self.random_offsets[l+1] + e = self.random_offsets[l + 1] if self.use_mlperf_bin_loader and self.samples_to_aggregate > 1: ls = [self.test_data[l]] @@ -262,7 +307,8 @@ def load_query_samples(self, sample_list): ls_t = list(zip(*ls)) X = torch.cat(ls_t[0]) (num_s, len_ls) = torch.cat(ls_t[1], dim=1).size() - lS_o = torch.stack([torch.tensor(range(len_ls)) for _ in range(num_s)]) + lS_o = torch.stack([torch.tensor(range(len_ls)) + for _ in range(num_s)]) lS_i = torch.cat(ls_t[2], dim=1) T = torch.cat(ls_t[3]) self.items_in_memory[l] = (X, lS_o, lS_i, T) @@ -273,7 +319,8 @@ def load_query_samples(self, sample_list): self.last_loaded = time.time() - ''' lg compatibilty routine ''' + """ lg compatibilty routine """ + def get_samples(self, id_list): # build list tuples as need by the batch conversion routine @@ -289,7 +336,7 @@ def get_samples(self, id_list): # print(idx_offsets) # approach 1: collate a mini-batch of single samples - ''' + """ if self.use_mlperf_bin_loader: # NOTE: in binary dataset the values are transformed ls_t = list(zip(*ls)) @@ -302,7 +349,7 @@ def get_samples(self, id_list): else: # NOTE: in original dataset the values are not transformed and collate besides stacking transforms them X, lS_o, lS_i, T = self.test_loader.collate_fn(ls) - ''' + """ # approach 2: collate a mini-batch of multiple samples # NOTE: recall that the samples have already been transformed for both datasets # (by earlier calls in load_query_samples), therefore we just need to stack them @@ -336,7 +383,8 @@ def __call__(self, results, expected=None, result_dict=None): n = len(results) for idx in range(0, n): # NOTE: copy from GPU to CPU while post processing, if needed. 
Alternatively, - # we could do this on the output of predict function in backend_pytorch_native.py + # we could do this on the output of predict function in + # backend_pytorch_native.py result = results[idx].detach().cpu() target = expected[idx] processed_results.append([result, target]) @@ -359,7 +407,7 @@ def start(self): self.roc_auc = 0 self.results = [] - def finalize(self, result_dict, ds=False, output_dir=None): + def finalize(self, result_dict, ds=False, output_dir=None): # AUC metric self.results = np.concatenate(self.results, axis=0) results, targets = list(zip(*self.results)) diff --git a/retired_benchmarks/recommendation/dlrm/pytorch/python/dataset.py b/retired_benchmarks/recommendation/dlrm/pytorch/python/dataset.py index 597c7519f..15dd2707b 100755 --- a/retired_benchmarks/recommendation/dlrm/pytorch/python/dataset.py +++ b/retired_benchmarks/recommendation/dlrm/pytorch/python/dataset.py @@ -15,7 +15,8 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger("dataset") -class Item(): + +class Item: def __init__(self, label, img, idx): self.label = label self.img = img @@ -24,19 +25,25 @@ def __init__(self, label, img, idx): def usleep(sec): - if sys.platform == 'win32': + if sys.platform == "win32": # on windows time.sleep() doesn't work to well import ctypes + kernel32 = ctypes.windll.kernel32 - timer = kernel32.CreateWaitableTimerA(ctypes.c_void_p(), True, ctypes.c_void_p()) + timer = kernel32.CreateWaitableTimerA( + ctypes.c_void_p(), True, ctypes.c_void_p() + ) delay = ctypes.c_longlong(int(-1 * (10 * 1000000 * sec))) - kernel32.SetWaitableTimer(timer, ctypes.byref(delay), 0, ctypes.c_void_p(), ctypes.c_void_p(), False) - kernel32.WaitForSingleObject(timer, 0xffffffff) + kernel32.SetWaitableTimer( + timer, ctypes.byref( + delay), 0, ctypes.c_void_p(), ctypes.c_void_p(), False + ) + kernel32.WaitForSingleObject(timer, 0xFFFFFFFF) else: time.sleep(sec) -class Dataset(): +class Dataset: def __init__(self): self.arrival = None self.image_list = [] @@ -62,7 +69,7 @@ def load_query_samples(self, sample_list): def unload_query_samples(self, sample_list): if sample_list: for sample in sample_list: - if sample in self.image_list_inmemory : + if sample in self.image_list_inmemory: del self.image_list_inmemory[sample] else: self.image_list_inmemory = {} @@ -102,7 +109,7 @@ def start(self): self.good = 0 self.total = 0 - def finalize(self, results, ds=False, output_dir=None): + def finalize(self, results, ds=False, output_dir=None): results["good"] = self.good results["total"] = self.total @@ -141,6 +148,7 @@ def finalize(self, results, ds=False, output_dir=None): # pre-processing # + def center_crop(img, out_height, out_width): height, width, _ = img.shape left = int((width - out_width) / 2) @@ -151,10 +159,12 @@ def center_crop(img, out_height, out_width): return img -def resize_with_aspectratio(img, out_height, out_width, scale=87.5, inter_pol=cv2.INTER_LINEAR): +def resize_with_aspectratio( + img, out_height, out_width, scale=87.5, inter_pol=cv2.INTER_LINEAR +): height, width, _ = img.shape - new_height = int(100. * out_height / scale) - new_width = int(100. 
* out_width / scale) + new_height = int(100.0 * out_height / scale) + new_width = int(100.0 * out_width / scale) if height > width: w = new_width h = int(new_height * height / width) @@ -170,9 +180,11 @@ def pre_process_vgg(img, dims=None, need_transpose=False): output_height, output_width, _ = dims cv2_interpol = cv2.INTER_AREA - img = resize_with_aspectratio(img, output_height, output_width, inter_pol=cv2_interpol) + img = resize_with_aspectratio( + img, output_height, output_width, inter_pol=cv2_interpol + ) img = center_crop(img, output_height, output_width) - img = np.asarray(img, dtype='float32') + img = np.asarray(img, dtype="float32") # normalize image means = np.array([123.68, 116.78, 103.94], dtype=np.float32) @@ -188,9 +200,11 @@ def pre_process_mobilenet(img, dims=None, need_transpose=False): img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) output_height, output_width, _ = dims - img = resize_with_aspectratio(img, output_height, output_width, inter_pol=cv2.INTER_LINEAR) + img = resize_with_aspectratio( + img, output_height, output_width, inter_pol=cv2.INTER_LINEAR + ) img = center_crop(img, output_height, output_width) - img = np.asarray(img, dtype='float32') + img = np.asarray(img, dtype="float32") img /= 255.0 img -= 0.5 @@ -208,9 +222,10 @@ def maybe_resize(img, dims): # some images might be grayscale img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - if dims != None: + if dims is not None: im_height, im_width, _ = dims - img = cv2.resize(img, (im_width, im_height), interpolation=cv2.INTER_LINEAR) + img = cv2.resize(img, (im_width, im_height), + interpolation=cv2.INTER_LINEAR) return img @@ -238,7 +253,7 @@ def pre_process_coco_resnet34(img, dims=None, need_transpose=False): mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) std = np.array([0.229, 0.224, 0.225], dtype=np.float32) - img = img / 255. - mean + img = img / 255.0 - mean img = img / std if need_transpose: diff --git a/retired_benchmarks/recommendation/dlrm/pytorch/python/main.py b/retired_benchmarks/recommendation/dlrm/pytorch/python/main.py index fae913888..68e86b429 100755 --- a/retired_benchmarks/recommendation/dlrm/pytorch/python/main.py +++ b/retired_benchmarks/recommendation/dlrm/pytorch/python/main.py @@ -25,7 +25,7 @@ # add dlrm code path try: - dlrm_dir_path = os.environ['DLRM_DIR'] + dlrm_dir_path = os.environ["DLRM_DIR"] sys.path.append(dlrm_dir_path) except KeyError: print("ERROR: Please set DLRM_DIR environment variable to the dlrm code location") @@ -41,12 +41,18 @@ # the datasets we support SUPPORTED_DATASETS = { - "kaggle": - (criteo.Criteo, criteo.pre_process_criteo_dlrm, criteo.DlrmPostProcess(), - {"randomize": 'total', "memory_map": True}), - "terabyte": - (criteo.Criteo, criteo.pre_process_criteo_dlrm, criteo.DlrmPostProcess(), - {"randomize": 'total', "memory_map": True}), + "kaggle": ( + criteo.Criteo, + criteo.pre_process_criteo_dlrm, + criteo.DlrmPostProcess(), + {"randomize": "total", "memory_map": True}, + ), + "terabyte": ( + criteo.Criteo, + criteo.pre_process_criteo_dlrm, + criteo.DlrmPostProcess(), + {"randomize": "total", "memory_map": True}, + ), } # pre-defined command line options so simplify things. They are used as defaults and can be @@ -125,46 +131,129 @@ def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() parser.add_argument("--model", help="name of the mlperf model, ie. 
dlrm") - parser.add_argument("--model-path", required=True, help="path to the model file") - parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset") - parser.add_argument("--dataset-path", required=True, help="path to the dataset") - parser.add_argument("--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles") - parser.add_argument("--scenario", default="SingleStream", - help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys()))) - parser.add_argument("--test-num-workers", type=int, default=0, help='# of workers reading the data') + parser.add_argument( + "--model-path", + required=True, + help="path to the model file") + parser.add_argument( + "--dataset", + choices=SUPPORTED_DATASETS.keys(), + help="dataset") + parser.add_argument( + "--dataset-path", + required=True, + help="path to the dataset") + parser.add_argument( + "--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles" + ) + parser.add_argument( + "--scenario", + default="SingleStream", + help="mlperf benchmark scenario, one of " + + str(list(SCENARIO_MAP.keys())), + ) + parser.add_argument( + "--test-num-workers", type=int, default=0, help="# of workers reading the data" + ) parser.add_argument("--max-ind-range", type=int, default=-1) parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) - parser.add_argument("--mlperf-bin-loader", action='store_true', default=False) - parser.add_argument("--max-batchsize", type=int, help="max batch size in a single inference") + parser.add_argument( + "--mlperf-bin-loader", + action="store_true", + default=False) + parser.add_argument( + "--max-batchsize", type=int, help="max batch size in a single inference" + ) parser.add_argument("--output", help="test results") parser.add_argument("--inputs", help="model inputs (currently not used)") parser.add_argument("--outputs", help="model outputs (currently not used)") parser.add_argument("--backend", help="runtime to use") parser.add_argument("--use-gpu", action="store_true", default=False) - parser.add_argument("--threads", default=os.cpu_count(), type=int, help="threads") - parser.add_argument("--cache", type=int, default=0, help="use cache (currently not used)") - parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") - parser.add_argument("--find-peak-performance", action="store_true", help="enable finding peak performance pass") + parser.add_argument( + "--threads", + default=os.cpu_count(), + type=int, + help="threads") + parser.add_argument( + "--cache", type=int, default=0, help="use cache (currently not used)" + ) + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") + parser.add_argument( + "--find-peak-performance", + action="store_true", + help="enable finding peak performance pass", + ) # file to use mlperf rules compliant parameters - parser.add_argument("--mlperf_conf", default="mlperf.conf", help="mlperf rules config") + parser.add_argument( + "--mlperf_conf", default="mlperf.conf", help="mlperf rules config" + ) # file for user LoadGen settings such as target QPS - parser.add_argument("--user_conf", default="user.conf", help="user config for user LoadGen settings such as target QPS") - - # below will override mlperf rules compliant settings - don't use for official submission - parser.add_argument("--duration", type=int, help="duration in milliseconds (ms)") + parser.add_argument( + "--user_conf", + default="user.conf", + help="user config for user LoadGen settings such as 
target QPS", + ) + + # below will override mlperf rules compliant settings - don't use for + # official submission + parser.add_argument( + "--duration", + type=int, + help="duration in milliseconds (ms)") parser.add_argument("--target-qps", type=int, help="target/expected qps") - parser.add_argument("--max-latency", type=float, help="mlperf max latency in pct tile") - parser.add_argument("--count-samples", type=int, help="dataset items to use") - parser.add_argument("--count-queries", type=int, help="number of queries to use") - parser.add_argument("--samples-per-query-multistream", default=8, type=int, help="query length for multi-stream scenario (in terms of aggregated samples)") + parser.add_argument( + "--max-latency", type=float, help="mlperf max latency in pct tile" + ) + parser.add_argument( + "--count-samples", + type=int, + help="dataset items to use") + parser.add_argument( + "--count-queries", + type=int, + help="number of queries to use") + parser.add_argument( + "--samples-per-query-multistream", + default=8, + type=int, + help="query length for multi-stream scenario (in terms of aggregated samples)", + ) # --samples-per-query-offline is equivalent to perf_sample_count - parser.add_argument("--samples-per-query-offline", type=int, default=2048, help="query length for offline scenario (in terms of aggregated samples)") - parser.add_argument("--samples-to-aggregate-fix", type=int, help="number of samples to be treated as one") - parser.add_argument("--samples-to-aggregate-min", type=int, help="min number of samples to be treated as one in random query size") - parser.add_argument("--samples-to-aggregate-max", type=int, help="max number of samples to be treated as one in random query size") - parser.add_argument("--samples-to-aggregate-quantile-file", type=str, help="distribution quantile used to generate number of samples to be treated as one in random query size") - parser.add_argument("--samples-to-aggregate-trace-file", type=str, default="dlrm_trace_of_aggregated_samples.txt") + parser.add_argument( + "--samples-per-query-offline", + type=int, + default=2048, + help="query length for offline scenario (in terms of aggregated samples)", + ) + parser.add_argument( + "--samples-to-aggregate-fix", + type=int, + help="number of samples to be treated as one", + ) + parser.add_argument( + "--samples-to-aggregate-min", + type=int, + help="min number of samples to be treated as one in random query size", + ) + parser.add_argument( + "--samples-to-aggregate-max", + type=int, + help="max number of samples to be treated as one in random query size", + ) + parser.add_argument( + "--samples-to-aggregate-quantile-file", + type=str, + help="distribution quantile used to generate number of samples to be treated as one in random query size", + ) + parser.add_argument( + "--samples-to-aggregate-trace-file", + type=str, + default="dlrm_trace_of_aggregated_samples.txt", + ) parser.add_argument("--numpy-rand-seed", type=int, default=123) args = parser.parse_args() @@ -192,113 +281,390 @@ def get_args(): return args -def get_backend(backend, dataset, max_ind_range, data_sub_sample_rate, use_gpu): +def get_backend(backend, dataset, max_ind_range, + data_sub_sample_rate, use_gpu): if backend == "pytorch-native": from backend_pytorch_native import BackendPytorchNative + # NOTE: pass model parameters here, the following options are available if dataset == "kaggle": - # 1. Criteo Kaggle Display Advertisement Challenge Dataset (see ./bench/dlrm_s_criteo_kaggle.sh) + # 1. 
Criteo Kaggle Display Advertisement Challenge Dataset (see + # ./bench/dlrm_s_criteo_kaggle.sh) backend = BackendPytorchNative( m_spa=16, - ln_emb=np.array([1460,583,10131227,2202608,305,24,12517,633,3,93145,5683,8351593,3194,27,14992,5461306,10,5652,2173,4,7046547,18,15,286181,105,142572]), - ln_bot=np.array([13,512,256,64,16]), - ln_top=np.array([367,512,256,1]), - use_gpu=use_gpu + ln_emb=np.array( + [ + 1460, + 583, + 10131227, + 2202608, + 305, + 24, + 12517, + 633, + 3, + 93145, + 5683, + 8351593, + 3194, + 27, + 14992, + 5461306, + 10, + 5652, + 2173, + 4, + 7046547, + 18, + 15, + 286181, + 105, + 142572, + ] + ), + ln_bot=np.array([13, 512, 256, 64, 16]), + ln_top=np.array([367, 512, 256, 1]), + use_gpu=use_gpu, ) elif dataset == "terabyte": if max_ind_range == 10000000: - # 2. Criteo Terabyte (see ./bench/dlrm_s_criteo_terabyte.sh [--sub-sample=0.875] --max-in-range=10000000) + # 2. Criteo Terabyte (see ./bench/dlrm_s_criteo_terabyte.sh + # [--sub-sample=0.875] --max-in-range=10000000) backend = BackendPytorchNative( m_spa=64, - ln_emb=np.array([9980333,36084,17217,7378,20134,3,7112,1442,61, 9758201,1333352,313829,10,2208,11156,122,4,970,14, 9994222, 7267859, 9946608,415421,12420,101, 36]), - ln_bot=np.array([13,512,256,64]), - ln_top=np.array([415,512,512,256,1]), - use_gpu=use_gpu + ln_emb=np.array( + [ + 9980333, + 36084, + 17217, + 7378, + 20134, + 3, + 7112, + 1442, + 61, + 9758201, + 1333352, + 313829, + 10, + 2208, + 11156, + 122, + 4, + 970, + 14, + 9994222, + 7267859, + 9946608, + 415421, + 12420, + 101, + 36, + ] + ), + ln_bot=np.array([13, 512, 256, 64]), + ln_top=np.array([415, 512, 512, 256, 1]), + use_gpu=use_gpu, ) elif max_ind_range == 40000000: - # 3. Criteo Terabyte MLPerf training (see ./bench/run_and_time.sh --max-in-range=40000000) + # 3. Criteo Terabyte MLPerf training (see + # ./bench/run_and_time.sh --max-in-range=40000000) backend = BackendPytorchNative( m_spa=128, - ln_emb=np.array([39884406,39043,17289,7420,20263,3,7120,1543,63,38532951,2953546,403346,10,2208,11938,155,4,976,14,39979771,25641295,39664984,585935,12972,108,36]), - ln_bot=np.array([13,512,256,128]), - ln_top=np.array([479,1024,1024,512,256,1]), - use_gpu=use_gpu + ln_emb=np.array( + [ + 39884406, + 39043, + 17289, + 7420, + 20263, + 3, + 7120, + 1543, + 63, + 38532951, + 2953546, + 403346, + 10, + 2208, + 11938, + 155, + 4, + 976, + 14, + 39979771, + 25641295, + 39664984, + 585935, + 12972, + 108, + 36, + ] + ), + ln_bot=np.array([13, 512, 256, 128]), + ln_top=np.array([479, 1024, 1024, 512, 256, 1]), + use_gpu=use_gpu, ) else: - raise ValueError("only --max-ind-range 10M or 40M is supported") + raise ValueError( + "only --max-ind-range 10M or 40M is supported") else: - raise ValueError("only kaggle|terabyte dataset options are supported") + raise ValueError( + "only kaggle|terabyte dataset options are supported") elif backend == "onnxruntime": from backend_onnxruntime import BackendOnnxruntime # NOTE: pass model parameters here, the following options are available if dataset == "kaggle": - # 1. Criteo Kaggle Display Advertisement Challenge Dataset (see ./bench/dlrm_s_criteo_kaggle.sh) + # 1. 
Criteo Kaggle Display Advertisement Challenge Dataset (see + # ./bench/dlrm_s_criteo_kaggle.sh) backend = BackendOnnxruntime( m_spa=16, - ln_emb=np.array([1460,583,10131227,2202608,305,24,12517,633,3,93145,5683,8351593,3194,27,14992,5461306,10,5652,2173,4,7046547,18,15,286181,105,142572]), - ln_bot=np.array([13,512,256,64,16]), - ln_top=np.array([367,512,256,1]), - use_gpu=use_gpu + ln_emb=np.array( + [ + 1460, + 583, + 10131227, + 2202608, + 305, + 24, + 12517, + 633, + 3, + 93145, + 5683, + 8351593, + 3194, + 27, + 14992, + 5461306, + 10, + 5652, + 2173, + 4, + 7046547, + 18, + 15, + 286181, + 105, + 142572, + ] + ), + ln_bot=np.array([13, 512, 256, 64, 16]), + ln_top=np.array([367, 512, 256, 1]), + use_gpu=use_gpu, ) elif dataset == "terabyte": if max_ind_range == 10000000: - # 2. Criteo Terabyte (see ./bench/dlrm_s_criteo_terabyte.sh [--sub-sample=0.875] --max-in-range=10000000) + # 2. Criteo Terabyte (see ./bench/dlrm_s_criteo_terabyte.sh + # [--sub-sample=0.875] --max-in-range=10000000) backend = BackendOnnxruntime( m_spa=64, - ln_emb=np.array([9980333,36084,17217,7378,20134,3,7112,1442,61, 9758201,1333352,313829,10,2208,11156,122,4,970,14, 9994222, 7267859, 9946608,415421,12420,101, 36]), - ln_bot=np.array([13,512,256,64]), - ln_top=np.array([415,512,512,256,1]), - use_gpu=use_gpu + ln_emb=np.array( + [ + 9980333, + 36084, + 17217, + 7378, + 20134, + 3, + 7112, + 1442, + 61, + 9758201, + 1333352, + 313829, + 10, + 2208, + 11156, + 122, + 4, + 970, + 14, + 9994222, + 7267859, + 9946608, + 415421, + 12420, + 101, + 36, + ] + ), + ln_bot=np.array([13, 512, 256, 64]), + ln_top=np.array([415, 512, 512, 256, 1]), + use_gpu=use_gpu, ) elif max_ind_range == 40000000: - # 3. Criteo Terabyte MLPerf training (see ./bench/run_and_time.sh --max-in-range=40000000) + # 3. Criteo Terabyte MLPerf training (see + # ./bench/run_and_time.sh --max-in-range=40000000) backend = BackendOnnxruntime( m_spa=128, - ln_emb=np.array([39884406,39043,17289,7420,20263,3,7120,1543,63,38532951,2953546,403346,10,2208,11938,155,4,976,14,39979771,25641295,39664984,585935,12972,108,36]), - ln_bot=np.array([13,512,256,128]), - ln_top=np.array([479,1024,1024,512,256,1]), - use_gpu=use_gpu + ln_emb=np.array( + [ + 39884406, + 39043, + 17289, + 7420, + 20263, + 3, + 7120, + 1543, + 63, + 38532951, + 2953546, + 403346, + 10, + 2208, + 11938, + 155, + 4, + 976, + 14, + 39979771, + 25641295, + 39664984, + 585935, + 12972, + 108, + 36, + ] + ), + ln_bot=np.array([13, 512, 256, 128]), + ln_top=np.array([479, 1024, 1024, 512, 256, 1]), + use_gpu=use_gpu, ) else: raise ValueError("only --max-in-range 10M or 40M is supported") else: - raise ValueError("only kaggle|terabyte dataset options are supported") + raise ValueError( + "only kaggle|terabyte dataset options are supported") elif backend == "tensorflow": from backend_tf import BackendTF + # NOTE: pass model parameters here, the following options are available if dataset == "kaggle": - # 1. Criteo Kaggle Display Advertisement Challenge Dataset (see ./bench/dlrm_s_criteo_kaggle.sh) + # 1. 
Criteo Kaggle Display Advertisement Challenge Dataset (see + # ./bench/dlrm_s_criteo_kaggle.sh) backend = BackendTF( dim_embed=16, - vocab_sizes=np.array([1460,583,10131227,2202608,305,24,12517,633,3,93145,5683,8351593,3194,27,14992,5461306,10,5652,2173,4,7046547,18,15,286181,105,142572]), - mlp_bottom=np.array([13,512,256,64,16]), - mlp_top=np.array([367,512,256,1]), + vocab_sizes=np.array( + [ + 1460, + 583, + 10131227, + 2202608, + 305, + 24, + 12517, + 633, + 3, + 93145, + 5683, + 8351593, + 3194, + 27, + 14992, + 5461306, + 10, + 5652, + 2173, + 4, + 7046547, + 18, + 15, + 286181, + 105, + 142572, + ] + ), + mlp_bottom=np.array([13, 512, 256, 64, 16]), + mlp_top=np.array([367, 512, 256, 1]), ) elif dataset == "terabyte": if max_ind_range == 10000000: - # 2. Criteo Terabyte (see ./bench/dlrm_s_criteo_terabyte.sh [--sub-sample=0.875] --max-in-range=10000000) + # 2. Criteo Terabyte (see ./bench/dlrm_s_criteo_terabyte.sh + # [--sub-sample=0.875] --max-in-range=10000000) backend = BackendTF( dim_embed=64, - vocab_sizes=np.array([9980333,36084,17217,7378,20134,3,7112,1442,61, 9758201,1333352,313829,10,2208,11156,122,4,970,14, 9994222, 7267859, 9946608,415421,12420,101, 36]), - mlp_bottom=np.array([13,512,256,64]), - mlp_top=np.array([415,512,512,256,1]), + vocab_sizes=np.array( + [ + 9980333, + 36084, + 17217, + 7378, + 20134, + 3, + 7112, + 1442, + 61, + 9758201, + 1333352, + 313829, + 10, + 2208, + 11156, + 122, + 4, + 970, + 14, + 9994222, + 7267859, + 9946608, + 415421, + 12420, + 101, + 36, + ] + ), + mlp_bottom=np.array([13, 512, 256, 64]), + mlp_top=np.array([415, 512, 512, 256, 1]), ) elif max_ind_range == 40000000: - # 3. Criteo Terabyte MLPerf training (see ./bench/run_and_time.sh --max-in-range=40000000) + # 3. Criteo Terabyte MLPerf training (see + # ./bench/run_and_time.sh --max-in-range=40000000) backend = BackendTF( dim_embed=128, - vocab_sizes=np.array([39884406,39043,17289,7420,20263,3,7120,1543,63,38532951,2953546,403346,10,2208,11938,155,4,976,14,39979771,25641295,39664984,585935,12972,108,36]), - mlp_bottom=np.array([13,512,256,128]), - mlp_top=np.array([479,1024,1024,512,256,1]), + vocab_sizes=np.array( + [ + 39884406, + 39043, + 17289, + 7420, + 20263, + 3, + 7120, + 1543, + 63, + 38532951, + 2953546, + 403346, + 10, + 2208, + 11938, + 155, + 4, + 976, + 14, + 39979771, + 25641295, + 39664984, + 585935, + 12972, + 108, + 36, + ] + ), + mlp_bottom=np.array([13, 512, 256, 128]), + mlp_top=np.array([479, 1024, 1024, 512, 256, 1]), ) else: raise ValueError("only --max-in-range 10M or 40M is supported") else: - raise ValueError("only kaggle|terabyte dataset options are supported") + raise ValueError( + "only kaggle|terabyte dataset options are supported") else: raise ValueError("unknown backend: " + backend) @@ -308,7 +674,16 @@ def get_backend(backend, dataset, max_ind_range, data_sub_sample_rate, use_gpu): class Item: """An item that we queue for processing by the thread pool.""" - def __init__(self, query_id, content_id, batch_dense_X, batch_lS_o, batch_lS_i, batch_T=None, idx_offsets=None): + def __init__( + self, + query_id, + content_id, + batch_dense_X, + batch_lS_o, + batch_lS_i, + batch_T=None, + idx_offsets=None, + ): self.query_id = query_id self.content_id = content_id self.batch_dense_X = batch_dense_X @@ -318,6 +693,7 @@ def __init__(self, query_id, content_id, batch_dense_X, batch_lS_o, batch_lS_i, self.idx_offsets = idx_offsets self.start = time.time() + class RunnerBase: def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): 
self.take_accuracy = False @@ -341,8 +717,12 @@ def run_one_item(self, qitem): # run the prediction processed_results = [] try: - results = self.model.predict(qitem.batch_dense_X, qitem.batch_lS_o, qitem.batch_lS_i) - processed_results = self.post_process(results, qitem.batch_T, self.result_dict) + results = self.model.predict( + qitem.batch_dense_X, qitem.batch_lS_o, qitem.batch_lS_i + ) + processed_results = self.post_process( + results, qitem.batch_T, self.result_dict + ) if self.take_accuracy: self.post_process.add_results(processed_results) self.result_timing.append(time.time() - qitem.start) @@ -362,7 +742,10 @@ def run_one_item(self, qitem): e_idx = qitem.idx_offsets[idx + 1] # debug prints # print("s,e:",s_idx,e_idx, len(processed_results)) - response_array = array.array("B", np.array(processed_results[s_idx:e_idx], np.float32).tobytes()) + response_array = array.array( + "B", np.array( + processed_results[s_idx:e_idx], np.float32).tobytes() + ) response_array_refs.append(response_array) bi = response_array.buffer_info() response.append(lg.QuerySampleResponse(query_id, bi[0], bi[1])) @@ -374,14 +757,38 @@ def enqueue(self, query_samples): query_len = len(query_samples) if query_len < self.max_batchsize: - batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets = self.ds.get_samples(idx) - self.run_one_item(Item(query_id, idx, batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets)) + batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets = ( + self.ds.get_samples(idx) + ) + self.run_one_item( + Item( + query_id, + idx, + batch_dense_X, + batch_lS_o, + batch_lS_i, + batch_T, + idx_offsets, + ) + ) else: bs = self.max_batchsize for i in range(0, query_len, bs): ie = min(i + bs, query_len) - batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets = self.ds.get_samples(idx[i:ie]) - self.run_one_item(Item(query_id[i:ie], idx[i:ie], batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets)) + batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets = ( + self.ds.get_samples(idx[i:ie]) + ) + self.run_one_item( + Item( + query_id[i:ie], + idx[i:ie], + batch_dense_X, + batch_lS_o, + batch_lS_i, + batch_T, + idx_offsets, + ) + ) def finish(self): pass @@ -390,13 +797,17 @@ def finish(self): class QueueRunner(RunnerBase): def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): super().__init__(model, ds, threads, post_proc, max_batchsize) - queue_size_multiplier = 4 #(args.samples_per_query_offline + max_batchsize - 1) // max_batchsize) + queue_size_multiplier = ( + 4 # (args.samples_per_query_offline + max_batchsize - 1) // max_batchsize) + ) self.tasks = JoinableQueue(maxsize=threads * queue_size_multiplier) self.workers = [] self.result_dict = {} for _ in range(self.threads): - worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,)) + worker = threading.Thread( + target=self.handle_tasks, args=( + self.tasks,)) worker.daemon = True self.workers.append(worker) worker.start() @@ -418,14 +829,38 @@ def enqueue(self, query_samples): query_len = len(query_samples) if query_len < self.max_batchsize: - batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets = self.ds.get_samples(idx) - self.tasks.put(Item(query_id, idx, batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets)) + batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets = ( + self.ds.get_samples(idx) + ) + self.tasks.put( + Item( + query_id, + idx, + batch_dense_X, + batch_lS_o, + batch_lS_i, + batch_T, + idx_offsets, + ) + ) else: bs = self.max_batchsize for i 
in range(0, query_len, bs): ie = min(i + bs, query_len) - batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets = self.ds.get_samples(idx[i:ie]) - self.tasks.put(Item(query_id[i:ie], idx[i:ie], batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets)) + batch_dense_X, batch_lS_o, batch_lS_i, batch_T, idx_offsets = ( + self.ds.get_samples(idx[i:ie]) + ) + self.tasks.put( + Item( + query_id[i:ie], + idx[i:ie], + batch_dense_X, + batch_lS_o, + batch_lS_i, + batch_T, + idx_offsets, + ) + ) def finish(self): # exit all threads @@ -435,11 +870,14 @@ def finish(self): worker.join() - -def add_results(final_results, name, result_dict, result_list, took, show_accuracy=False): - percentiles = [50., 80., 90., 95., 99., 99.9] +def add_results( + final_results, name, result_dict, result_list, took, show_accuracy=False +): + percentiles = [50.0, 80.0, 90.0, 95.0, 99.0, 99.9] buckets = np.percentile(result_list, percentiles).tolist() - buckets_str = ",".join(["{}:{:.4f}".format(p, b) for p, b in zip(percentiles, buckets)]) + buckets_str = ",".join( + ["{}:{:.4f}".format(p, b) for p, b in zip(percentiles, buckets)] + ) if result_dict["total"] == 0: result_dict["total"] = len(result_list) @@ -456,19 +894,27 @@ def add_results(final_results, name, result_dict, result_list, took, show_accura } acc_str = "" if show_accuracy: - result["accuracy"] = 100. * result_dict["good"] / result_dict["total"] + result["accuracy"] = 100.0 * result_dict["good"] / result_dict["total"] acc_str = ", acc={:.3f}%".format(result["accuracy"]) if "roc_auc" in result_dict: - result["roc_auc"] = 100. * result_dict["roc_auc"] + result["roc_auc"] = 100.0 * result_dict["roc_auc"] acc_str += ", auc={:.3f}%".format(result["roc_auc"]) # add the result to the result dict final_results[name] = result # to stdout - print("{} qps={:.2f}, mean={:.4f}, time={:.3f}{}, queries={}, tiles={}".format( - name, result["qps"], result["mean"], took, acc_str, - len(result_list), buckets_str)) + print( + "{} qps={:.2f}, mean={:.4f}, time={:.3f}{}, queries={}, tiles={}".format( + name, + result["qps"], + result["mean"], + took, + acc_str, + len(result_list), + buckets_str, + ) + ) def main(): @@ -478,29 +924,40 @@ def main(): log.info(args) # find backend - backend = get_backend(args.backend, args.dataset, args.max_ind_range, args.data_sub_sample_rate, args.use_gpu) + backend = get_backend( + args.backend, + args.dataset, + args.max_ind_range, + args.data_sub_sample_rate, + args.use_gpu, + ) # dataset to use wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset] # --count-samples can be used to limit the number of samples used for testing - ds = wanted_dataset(data_path=args.dataset_path, - name=args.dataset, - pre_process=pre_proc, # currently an identity function - use_cache=args.cache, # currently not used - count=args.count_samples, - samples_to_aggregate_fix=args.samples_to_aggregate_fix, - samples_to_aggregate_min=args.samples_to_aggregate_min, - samples_to_aggregate_max=args.samples_to_aggregate_max, - samples_to_aggregate_quantile_file=args.samples_to_aggregate_quantile_file, - samples_to_aggregate_trace_file=args.samples_to_aggregate_trace_file, - test_num_workers=args.test_num_workers, - max_ind_range=args.max_ind_range, - sub_sample_rate=args.data_sub_sample_rate, - mlperf_bin_loader=args.mlperf_bin_loader, - **kwargs) + ds = wanted_dataset( + data_path=args.dataset_path, + name=args.dataset, + pre_process=pre_proc, # currently an identity function + use_cache=args.cache, # currently not used + 
count=args.count_samples, + samples_to_aggregate_fix=args.samples_to_aggregate_fix, + samples_to_aggregate_min=args.samples_to_aggregate_min, + samples_to_aggregate_max=args.samples_to_aggregate_max, + samples_to_aggregate_quantile_file=args.samples_to_aggregate_quantile_file, + samples_to_aggregate_trace_file=args.samples_to_aggregate_trace_file, + test_num_workers=args.test_num_workers, + max_ind_range=args.max_ind_range, + sub_sample_rate=args.data_sub_sample_rate, + mlperf_bin_loader=args.mlperf_bin_loader, + **kwargs + ) # load model to backend - model = backend.load(args.model_path, inputs=args.inputs, outputs=args.outputs) + model = backend.load( + args.model_path, + inputs=args.inputs, + outputs=args.outputs) final_results = { "runtime": model.name(), "version": model.version(), @@ -541,10 +998,12 @@ def main(): lg.TestScenario.SingleStream: RunnerBase, lg.TestScenario.MultiStream: QueueRunner, lg.TestScenario.Server: QueueRunner, - lg.TestScenario.Offline: QueueRunner + lg.TestScenario.Offline: QueueRunner, } - runner = runner_map[scenario](model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize) + runner = runner_map[scenario]( + model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize + ) def issue_queries(query_samples): runner.enqueue(query_samples) @@ -581,13 +1040,23 @@ def flush_queries(): if args.max_latency: settings.server_target_latency_ns = int(args.max_latency * NANO_SEC) - settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC) + settings.multi_stream_expected_latency_ns = int( + args.max_latency * NANO_SEC) sut = lg.ConstructSUT(issue_queries, flush_queries) - qsl = lg.ConstructQSL(count, min(count, args.samples_per_query_offline), ds.load_query_samples, ds.unload_query_samples) + qsl = lg.ConstructQSL( + count, + min(count, args.samples_per_query_offline), + ds.load_query_samples, + ds.unload_query_samples, + ) log.info("starting {}".format(scenario)) - result_dict = {"good": 0, "total": 0, "roc_auc": 0, "scenario": str(scenario)} + result_dict = { + "good": 0, + "total": 0, + "roc_auc": 0, + "scenario": str(scenario)} runner.start_run(result_dict, args.accuracy) lg.StartTest(sut, qsl, settings) @@ -599,9 +1068,14 @@ def flush_queries(): if args.accuracy: post_proc.finalize(result_dict, ds, output_dir=args.output) - add_results(final_results, "{}".format(scenario), - result_dict, last_timeing, time.time() - ds.last_loaded, args.accuracy) - + add_results( + final_results, + "{}".format(scenario), + result_dict, + last_timeing, + time.time() - ds.last_loaded, + args.accuracy, + ) runner.finish() lg.DestroyQSL(qsl) diff --git a/retired_benchmarks/recommendation/dlrm/pytorch/python/tf_dlrm.py b/retired_benchmarks/recommendation/dlrm/pytorch/python/tf_dlrm.py index f99117e98..269d1fd34 100644 --- a/retired_benchmarks/recommendation/dlrm/pytorch/python/tf_dlrm.py +++ b/retired_benchmarks/recommendation/dlrm/pytorch/python/tf_dlrm.py @@ -22,231 +22,251 @@ import tensorflow as tf from tensorflow.compiler.tf2xla.python import xla -from tensorflow.contrib import layers as contrib_layers +from tensorflow.contrib import layers as contrib_layers + # import utils # ti -import sys # ti +import sys # ti + def rand_features(batch_size): - """Emits random input features, used for testing.""" - features = {} - pos_size = batch_size // 2 - neg_size = batch_size - pos_size - features["clicked"] = tf.concat([ - tf.ones([pos_size, 1], dtype=tf.float32), - tf.zeros([neg_size, 1], dtype=tf.float32) - ], axis=0) - 
features["int-features"] = tf.random.uniform( - shape=(batch_size, 13), - maxval=100) - features["cat-features"] = tf.random.uniform( - shape=(batch_size, 26), - maxval=100, - dtype=tf.int32) - return features + """Emits random input features, used for testing.""" + features = {} + pos_size = batch_size // 2 + neg_size = batch_size - pos_size + features["clicked"] = tf.concat( + [ + tf.ones([pos_size, 1], dtype=tf.float32), + tf.zeros([neg_size, 1], dtype=tf.float32), + ], + axis=0, + ) + features["int-features"] = tf.random.uniform( + shape=(batch_size, 13), maxval=100) + features["cat-features"] = tf.random.uniform( + shape=(batch_size, 26), maxval=100, dtype=tf.int32 + ) + return features + def rand_features_np(batch_size, num_d, num_s, minsize): - """Emits random input features, used for testing.""" - # features = {} - # pos_size = batch_size // 2 - # neg_size = batch_size - pos_size - # features["clicked"] = tf.concat([ - # tf.ones([pos_size, 1], dtype=tf.float32), - # tf.zeros([neg_size, 1], dtype=tf.float32) - # ], axis=0) - features_int_np = np.random.randint(100, - size=(batch_size, num_d) - ) - features_cat_np = np.random.randint(minsize, - size=(batch_size, num_s), - dtype=np.int32 - ) - return features_int_np, features_cat_np + """Emits random input features, used for testing.""" + # features = {} + # pos_size = batch_size // 2 + # neg_size = batch_size - pos_size + # features["clicked"] = tf.concat([ + # tf.ones([pos_size, 1], dtype=tf.float32), + # tf.zeros([neg_size, 1], dtype=tf.float32) + # ], axis=0) + features_int_np = np.random.randint(100, size=(batch_size, num_d)) + features_cat_np = np.random.randint( + minsize, size=(batch_size, num_s), dtype=np.int32 + ) + return features_int_np, features_cat_np def dot_interact(concat_features, params=None): - """Performs feature interaction operation between dense and sparse. - - Input tensors represent dense and sparse features. - Pre-condition: The tensors have been stacked along dimension 1. - - Args: - concat_features: Tensor of features with shape [B, n_features, feature_dim]. - params: Model params. - - Returns: - activations: Tensor representing interacted features. - """ - # batch_size = concat_features.shape[0] - if not params: - params = {} - - # Interact features, select lower-triangular portion, and re-shape. - xactions = tf.matmul(concat_features, concat_features, transpose_b=True) - tf.logging.info("Model_FN: xactions shape: %s", xactions.get_shape()) - ones = tf.ones_like(xactions) - upper_tri_mask = tf.linalg.band_part(ones, 0, -1) - feature_dim = xactions.shape[-1] - - if params["opt_skip_gather"]: - upper_tri_bool = tf.cast(upper_tri_mask, tf.bool) - activations = tf.where( - condition=upper_tri_bool, x=tf.zeros_like(xactions), y=xactions) - tf.logging.info("Model_FN: activations shape: %s", activations.get_shape()) - out_dim = feature_dim * feature_dim - else: - lower_tri_mask = ones - upper_tri_mask - activations = tf.boolean_mask(xactions, lower_tri_mask) - tf.logging.info("Model_FN: activations shape: %s", activations.get_shape()) - out_dim = feature_dim * (feature_dim - 1) // 2 - - activations = tf.reshape(activations, (-1, out_dim)) - return activations + """Performs feature interaction operation between dense and sparse. + + Input tensors represent dense and sparse features. + Pre-condition: The tensors have been stacked along dimension 1. + + Args: + concat_features: Tensor of features with shape [B, n_features, feature_dim]. + params: Model params. 
+ + Returns: + activations: Tensor representing interacted features. + """ + # batch_size = concat_features.shape[0] + if not params: + params = {} + + # Interact features, select lower-triangular portion, and re-shape. + xactions = tf.matmul(concat_features, concat_features, transpose_b=True) + tf.logging.info("Model_FN: xactions shape: %s", xactions.get_shape()) + ones = tf.ones_like(xactions) + upper_tri_mask = tf.linalg.band_part(ones, 0, -1) + feature_dim = xactions.shape[-1] + + if params["opt_skip_gather"]: + upper_tri_bool = tf.cast(upper_tri_mask, tf.bool) + activations = tf.where( + condition=upper_tri_bool, x=tf.zeros_like(xactions), y=xactions + ) + tf.logging.info( + "Model_FN: activations shape: %s", + activations.get_shape()) + out_dim = feature_dim * feature_dim + else: + lower_tri_mask = ones - upper_tri_mask + activations = tf.boolean_mask(xactions, lower_tri_mask) + tf.logging.info( + "Model_FN: activations shape: %s", + activations.get_shape()) + out_dim = feature_dim * (feature_dim - 1) // 2 + + activations = tf.reshape(activations, (-1, out_dim)) + return activations def logits_fn(features_int, features_cat, params): - """Calculate predictions.""" - # tf.logging.info("Model_FN: Number of input features: %d", len(features)) - # for ft in sorted(features.keys()): - # tf.logging.info("Model_FN: Feature %s -- shape %s", ft, - # features[ft].get_shape()) - - reuse = False if params["is_training"] else True - - bot_mlp_input = features_int - tf.logging.info("Model_FN: Bottom MLP input (int features) shape: %s", - bot_mlp_input.get_shape()) - mlp_dims_bottom = params["mlp_bottom"] - - for layer_idx in range(len(mlp_dims_bottom)): - bot_mlp_input = tf.layers.dense( - bot_mlp_input, - mlp_dims_bottom[layer_idx], - activation="relu", - # ti: modules dont exist - # kernel_initializer=tf.compat.v2.initializers.GlorotNormal(), - kernel_initializer=tf.compat.v1.keras.initializers.glorot_normal(), - # bias_initializer=tf.compat.v2.initializers.RandomNormal( - # mean=0.0, stddev=math.sqrt(1.0 / mlp_dims_bottom[layer_idx])), - bias_initializer=tf.compat.v1.random_normal_initializer( - mean=0.0, stddev=math.sqrt(1.0 / mlp_dims_bottom[layer_idx])), - name="bottom_mlp_layer_%d" % layer_idx, - reuse=reuse + """Calculate predictions.""" + # tf.logging.info("Model_FN: Number of input features: %d", len(features)) + # for ft in sorted(features.keys()): + # tf.logging.info("Model_FN: Feature %s -- shape %s", ft, + # features[ft].get_shape()) + + reuse = False if params["is_training"] else True + + bot_mlp_input = features_int + tf.logging.info( + "Model_FN: Bottom MLP input (int features) shape: %s", bot_mlp_input.get_shape( + ) + ) + mlp_dims_bottom = params["mlp_bottom"] + + for layer_idx in range(len(mlp_dims_bottom)): + bot_mlp_input = tf.layers.dense( + bot_mlp_input, + mlp_dims_bottom[layer_idx], + activation="relu", + # ti: modules dont exist + # kernel_initializer=tf.compat.v2.initializers.GlorotNormal(), + kernel_initializer=tf.compat.v1.keras.initializers.glorot_normal(), + # bias_initializer=tf.compat.v2.initializers.RandomNormal( + # mean=0.0, stddev=math.sqrt(1.0 / mlp_dims_bottom[layer_idx])), + bias_initializer=tf.compat.v1.random_normal_initializer( + mean=0.0, stddev=math.sqrt(1.0 / mlp_dims_bottom[layer_idx]) + ), + name="bottom_mlp_layer_%d" % layer_idx, + reuse=reuse, + ) + bot_mlp_output = bot_mlp_input + + cat_features = [] + emb_tables = [] + # for feature_name, value in sorted(features.items()): + # if "categorical-feature" in feature_name: + # 
cat_features.append(value) + num_s = params["num_sparse_features"] + for j in range(num_s): + emb_init = tf.random_uniform( + [params["vocab_sizes"][j], params["dim_embed"]], -1.0, 1.0 + ) + emb_matrix = tf.get_variable( + name="embedding_table%02d" % j, + dtype=tf.float32, + trainable=True, + initializer=emb_init, + ) + emb_tables.append(emb_matrix) + + for j in range(num_s): + col = tf.slice(features_cat, [0, j], [-1, 1]) + ecol = tf.nn.embedding_lookup(emb_tables[j], col) + ecol = tf.reshape(ecol, [-1, params["dim_embed"]]) + cat_features.append(ecol) + + # tc_features = [] + # if "tc-features" in features: + # # Compute offsets for single concatenated table. + # batch_size = features_tc.shape[0] + # num_tc_features = features_tc.shape[1] + # num_tables_in_ec = params["num_tables_in_ec"] + # tc_table_sizes = feature_config.get_sorted_table_size()[num_tables_in_ec:] # params["vocab_sizes"] + # total_tbl_size = sum(tc_table_sizes) + # idx_offsets = [0] + list(np.cumsum(tc_table_sizes[:-1])) + # idx_offsets = tf.broadcast_to( + # tf.constant(idx_offsets), (batch_size, num_tc_features)) + # idxs = idx_offsets + features_tc + + # def _create_init_table(): + # """Table initialization varies depending on the vocab size.""" + # full_tbl = np.zeros( + # shape=(total_tbl_size, params["dim_embed"]), dtype=np.float32) + # start_idx = 0 + # for idx, tbl_size in enumerate(tc_table_sizes): + # end_idx = start_idx + tc_table_sizes[idx] + # cur_tbl_init = np.random.uniform( + # low=-1 / np.sqrt(tbl_size), + # high=1 / np.sqrt(tbl_size), + # size=(tbl_size, params["dim_embed"])).astype(np.float32) + # full_tbl[start_idx:end_idx, :] = cur_tbl_init + # start_idx += tc_table_sizes[idx] + # return tf.constant(full_tbl) + + # tc_embedding_table = tf.get_variable( + # name="tc_embedding_table", + # dtype=tf.float32, + # trainable=True, + # # pylint: disable=unnecessary-lambda + # initializer=lambda: _create_init_table()) + # tc_features = tf.gather(tc_embedding_table, idxs) + # tf.logging.info("TC features shape: {}".format(tc_features.get_shape())) + + # Dot feature interaction + # Concat and reshape, instead of stack. Better for performance. + # batch_size = bot_mlp_output.shape[0] + feature_stack = tf.concat([bot_mlp_output] + cat_features, axis=-1) + feature_stack = tf.reshape( + feature_stack, [-1, params["num_sparse_features"] + + 1, params["dim_embed"]] + ) + + # if "tc-features" in features: + # feature_stack = tf.concat([feature_stack, tc_features], axis=1) + tf.logging.info( + "Model_FN: concated feature shape: %s", + feature_stack.get_shape()) + dot_interact_output = dot_interact( + concat_features=feature_stack, params=params) + top_mlp_input = tf.concat([bot_mlp_output, dot_interact_output], axis=1) + tf.logging.info( + "Model_FN: Top MLP input (full features) shape: %s", top_mlp_input.get_shape( + ) + ) + + # Capture original MLP fan-in for proper kernel initialization. + num_fts = len(cat_features) + 1 + orig_top_mlp_dim = (num_fts * (num_fts - 1)) / 2 + params["dim_embed"] + tf.logging.info( + "Model_FN: Original feature len: {}".format(orig_top_mlp_dim)) + + # Top MLP + # NOTE: For the top MLP, the last layer is a sigmoid. The loss function should + # therefore take [0,1] probability values as inputs, instead of logits. 
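+    # For example, a probability-based loss such as
+    # tf.compat.v1.losses.log_loss(labels, predictions) matches this sigmoid
+    # output, whereas tf.nn.sigmoid_cross_entropy_with_logits expects raw
+    # logits and would effectively apply the sigmoid a second time.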
+ mlp_dims_top = params["mlp_top"] + num_layers_top = len(mlp_dims_top) + sigmoid_layer_top = num_layers_top - 1 + for layer_idx in range(num_layers_top): + fan_in = orig_top_mlp_dim if layer_idx == 0 else mlp_dims_top[layer_idx - 1] + fan_out = mlp_dims_top[layer_idx] + tf.logging.info( + " layer {}: fan_in={} fan_out={}".format( + layer_idx, fan_in, fan_out) ) - bot_mlp_output = bot_mlp_input - - cat_features = [] - emb_tables = [] - # for feature_name, value in sorted(features.items()): - # if "categorical-feature" in feature_name: - # cat_features.append(value) - num_s = params["num_sparse_features"] - for j in range(num_s): - emb_init = tf.random_uniform([params["vocab_sizes"][j], params["dim_embed"]], -1.0, 1.0) - emb_matrix = tf.get_variable( - name="embedding_table%02d" % j, - dtype=tf.float32, - trainable=True, - initializer=emb_init) - emb_tables.append(emb_matrix) - - for j in range(num_s): - col = tf.slice(features_cat, [0, j], [-1, 1]) - ecol = tf.nn.embedding_lookup(emb_tables[j], col) - ecol = tf.reshape(ecol, [-1, params["dim_embed"]]) - cat_features.append(ecol) - - # tc_features = [] - # if "tc-features" in features: - # # Compute offsets for single concatenated table. - # batch_size = features_tc.shape[0] - # num_tc_features = features_tc.shape[1] - # num_tables_in_ec = params["num_tables_in_ec"] - # tc_table_sizes = feature_config.get_sorted_table_size()[num_tables_in_ec:] # params["vocab_sizes"] - # total_tbl_size = sum(tc_table_sizes) - # idx_offsets = [0] + list(np.cumsum(tc_table_sizes[:-1])) - # idx_offsets = tf.broadcast_to( - # tf.constant(idx_offsets), (batch_size, num_tc_features)) - # idxs = idx_offsets + features_tc - - # def _create_init_table(): - # """Table initialization varies depending on the vocab size.""" - # full_tbl = np.zeros( - # shape=(total_tbl_size, params["dim_embed"]), dtype=np.float32) - # start_idx = 0 - # for idx, tbl_size in enumerate(tc_table_sizes): - # end_idx = start_idx + tc_table_sizes[idx] - # cur_tbl_init = np.random.uniform( - # low=-1 / np.sqrt(tbl_size), - # high=1 / np.sqrt(tbl_size), - # size=(tbl_size, params["dim_embed"])).astype(np.float32) - # full_tbl[start_idx:end_idx, :] = cur_tbl_init - # start_idx += tc_table_sizes[idx] - # return tf.constant(full_tbl) - - # tc_embedding_table = tf.get_variable( - # name="tc_embedding_table", - # dtype=tf.float32, - # trainable=True, - # # pylint: disable=unnecessary-lambda - # initializer=lambda: _create_init_table()) - # tc_features = tf.gather(tc_embedding_table, idxs) - # tf.logging.info("TC features shape: {}".format(tc_features.get_shape())) - - # Dot feature interaction - # Concat and reshape, instead of stack. Better for performance. - # batch_size = bot_mlp_output.shape[0] - feature_stack = tf.concat([bot_mlp_output] + cat_features, axis=-1) - feature_stack = tf.reshape(feature_stack, - [-1, params["num_sparse_features"] + 1, params["dim_embed"]]) - - # if "tc-features" in features: - # feature_stack = tf.concat([feature_stack, tc_features], axis=1) - tf.logging.info("Model_FN: concated feature shape: %s", - feature_stack.get_shape()) - dot_interact_output = dot_interact( - concat_features=feature_stack, params=params) - top_mlp_input = tf.concat([bot_mlp_output, dot_interact_output], axis=1) - tf.logging.info("Model_FN: Top MLP input (full features) shape: %s", - top_mlp_input.get_shape()) - - # Capture original MLP fan-in for proper kernel initialization. 
- num_fts = len(cat_features) + 1 - orig_top_mlp_dim = (num_fts * (num_fts - 1)) / 2 + params["dim_embed"] - tf.logging.info("Model_FN: Original feature len: {}".format(orig_top_mlp_dim)) - - # Top MLP - # NOTE: For the top MLP, the last layer is a sigmoid. The loss function should - # therefore take [0,1] probability values as inputs, instead of logits. - mlp_dims_top = params["mlp_top"] - num_layers_top = len(mlp_dims_top) - sigmoid_layer_top = num_layers_top - 1 - for layer_idx in range(num_layers_top): - fan_in = orig_top_mlp_dim if layer_idx == 0 else mlp_dims_top[layer_idx - 1] - fan_out = mlp_dims_top[layer_idx] - tf.logging.info(" layer {}: fan_in={} fan_out={}".format( - layer_idx, fan_in, fan_out)) - top_mlp_input = tf.layers.dense( - top_mlp_input, - mlp_dims_top[layer_idx], - activation="sigmoid" if layer_idx == sigmoid_layer_top else "relu", - # NOTE: We would usually use GlorotNormal() initializer here. But due to - # the skip_gather optimization, the GlorotNormal would result in a - # mathematical error, as Glorot is a function of the fan-in. - # The fan-in will be larger for skip-gather activations since we also - # pass in the zeros. Therefore we explicitly set the kernel intializer - # to RandomNormal(0, sqrt(2/(fan_in+fan_out)) - - # ti: see above - # kernel_initializer=tf.compat.v2.initializers.RandomNormal( - # mean=0.0, stddev=math.sqrt(2.0 / (fan_in + fan_out))), - # bias_initializer=tf.compat.v2.initializers.RandomNormal( - # mean=0.0, stddev=math.sqrt(1.0 / mlp_dims_top[layer_idx])), - kernel_initializer=tf.compat.v1.keras.initializers.glorot_normal(), - bias_initializer=tf.compat.v1.random_normal_initializer( - mean=0.0, stddev=math.sqrt(1.0 / mlp_dims_top[layer_idx])), - name="top_mlp_layer_%d" % layer_idx, - reuse=reuse + top_mlp_input = tf.layers.dense( + top_mlp_input, + mlp_dims_top[layer_idx], + activation="sigmoid" if layer_idx == sigmoid_layer_top else "relu", + # NOTE: We would usually use GlorotNormal() initializer here. But due to + # the skip_gather optimization, the GlorotNormal would result in a + # mathematical error, as Glorot is a function of the fan-in. + # The fan-in will be larger for skip-gather activations since we also + # pass in the zeros. 
Therefore we explicitly set the kernel intializer + # to RandomNormal(0, sqrt(2/(fan_in+fan_out)) + # ti: see above + # kernel_initializer=tf.compat.v2.initializers.RandomNormal( + # mean=0.0, stddev=math.sqrt(2.0 / (fan_in + fan_out))), + # bias_initializer=tf.compat.v2.initializers.RandomNormal( + # mean=0.0, stddev=math.sqrt(1.0 / mlp_dims_top[layer_idx])), + kernel_initializer=tf.compat.v1.keras.initializers.glorot_normal(), + bias_initializer=tf.compat.v1.random_normal_initializer( + mean=0.0, stddev=math.sqrt(1.0 / mlp_dims_top[layer_idx]) + ), + name="top_mlp_layer_%d" % layer_idx, + reuse=reuse, ) - predictions = top_mlp_input - return predictions + predictions = top_mlp_input + return predictions diff --git a/retired_benchmarks/recommendation/dlrm/pytorch/python/version.py b/retired_benchmarks/recommendation/dlrm/pytorch/python/version.py index 1152dbb41..570348596 100644 --- a/retired_benchmarks/recommendation/dlrm/pytorch/python/version.py +++ b/retired_benchmarks/recommendation/dlrm/pytorch/python/version.py @@ -1,3 +1,2 @@ - -version = '0.1.0' -git_version = '05df3bae82ef9fc933277385eb778e3f22cd0c6a' +version = "0.1.0" +git_version = "05df3bae82ef9fc933277385eb778e3f22cd0c6a" diff --git a/retired_benchmarks/recommendation/dlrm/pytorch/setup.py b/retired_benchmarks/recommendation/dlrm/pytorch/setup.py index c1e2fbcf0..758d874fb 100644 --- a/retired_benchmarks/recommendation/dlrm/pytorch/setup.py +++ b/retired_benchmarks/recommendation/dlrm/pytorch/setup.py @@ -13,17 +13,20 @@ from setuptools import setup, find_packages, Command TOP_DIR = os.path.realpath(os.path.dirname(__file__)) -SRC_DIR = os.path.join(TOP_DIR, 'python') +SRC_DIR = os.path.join(TOP_DIR, "python") try: - git_version = subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=TOP_DIR).decode('ascii').strip() + git_version = ( + subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=TOP_DIR) + .decode("ascii") + .strip() + ) except (OSError, subprocess.CalledProcessError): git_version = None -with open(os.path.join(TOP_DIR, 'VERSION_NUMBER')) as version_file: - VersionInfo = namedtuple('VersionInfo', ['version', 'git_version'])( - version=version_file.read().strip(), - git_version=git_version +with open(os.path.join(TOP_DIR, "VERSION_NUMBER")) as version_file: + VersionInfo = namedtuple("VersionInfo", ["version", "git_version"])( + version=version_file.read().strip(), git_version=git_version ) @@ -37,49 +40,67 @@ def finalize_options(self): pass def run(self): - with open(os.path.join(SRC_DIR, 'version.py'), 'w') as f: - f.write(dedent(''' + with open(os.path.join(SRC_DIR, "version.py"), "w") as f: + f.write( + dedent( + """ version = '{version}' git_version = '{git_version}' - '''.format(**dict(VersionInfo._asdict())))) + """.format( + **dict(VersionInfo._asdict()) + ) + ) + ) class build_py(setuptools.command.build_py.build_py): def run(self): - self.run_command('create_version') + self.run_command("create_version") setuptools.command.build_py.build_py.run(self) class build(distutils.command.build.build): def run(self): - self.run_command('build_py') + self.run_command("build_py") class develop(setuptools.command.develop.develop): def run(self): - self.run_command('create_version') - self.run_command('build') + self.run_command("create_version") + self.run_command("build") setuptools.command.develop.develop.run(self) cmdclass = { - 'create_version': create_version, - 'build_py': build_py, - 'build': build, - 'develop': develop, + "create_version": create_version, + "build_py": build_py, + "build": build, + 
"develop": develop, } setup( name="mlperf-inference", version=VersionInfo.version, - description='mlperf inference benchmark', - setup_requires=['pytest-runner'], - tests_require=['graphviz', 'parameterized', 'pytest', 'pytest-cov', 'pyyaml'], + description="mlperf inference benchmark", + setup_requires=["pytest-runner"], + tests_require=[ + "graphviz", + "parameterized", + "pytest", + "pytest-cov", + "pyyaml"], cmdclass=cmdclass, packages=find_packages(), - author='guschmue@microsoft.com', - author_email='guschmue@microsoft.com', - url='https://github.com/mlperf/inference', - install_requires=['numpy>=1.14.1', 'onnx>=1.5', 'pybind11', 'Cython', - 'pycocotools', 'mlperf_loadgen', 'opencv-python-headless'] + author="guschmue@microsoft.com", + author_email="guschmue@microsoft.com", + url="https://github.com/mlperf/inference", + install_requires=[ + "numpy>=1.14.1", + "onnx>=1.5", + "pybind11", + "Cython", + "pycocotools", + "mlperf_loadgen", + "opencv-python-headless", + ], ) diff --git a/retired_benchmarks/recommendation/dlrm/pytorch/tools/accuracy-dlrm.py b/retired_benchmarks/recommendation/dlrm/pytorch/tools/accuracy-dlrm.py index 873f6a0e1..fc8332ac4 100644 --- a/retired_benchmarks/recommendation/dlrm/pytorch/tools/accuracy-dlrm.py +++ b/retired_benchmarks/recommendation/dlrm/pytorch/tools/accuracy-dlrm.py @@ -16,25 +16,39 @@ # pylint: disable=missing-docstring + def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() - parser.add_argument("--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json") - parser.add_argument("--day-23-file", default=None, - help="path to day_23 file. If present, it is assumed that the accuracy log contains only the prediction, not the ground truth label.") - parser.add_argument("--aggregation-trace-file", default=None, - help="path to dlrm_trace_of_aggregated_samples.txt. Only needed if --day-23-file is specified") - parser.add_argument("--verbose", action="store_true", help="verbose messages") - parser.add_argument("--dtype", default="float32", choices=["float32", "int32", "int64"], help="data type of the label") + parser.add_argument( + "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" + ) + parser.add_argument( + "--day-23-file", + default=None, + help="path to day_23 file. If present, it is assumed that the accuracy log contains only the prediction, not the ground truth label.", + ) + parser.add_argument( + "--aggregation-trace-file", + default=None, + help="path to dlrm_trace_of_aggregated_samples.txt. Only needed if --day-23-file is specified", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="verbose messages") + parser.add_argument( + "--dtype", + default="float32", + choices=["float32", "int32", "int64"], + help="data type of the label", + ) args = parser.parse_args() return args -dtype_map = { - "float32": np.float32, - "int32": np.int32, - "int64": np.int64 -} +dtype_map = {"float32": np.float32, "int32": np.int32, "int64": np.int64} + def get_targets(args, qsl_indices): # Parse aggregation trace file to know the sample -> user-item pair mapping @@ -42,8 +56,11 @@ def get_targets(args, qsl_indices): sample_boundaries = [0] with open(args.aggregation_trace_file) as f: for line in f: - sample_boundaries.append(sample_boundaries[-1] + int(line.split(", ")[2])) - assert len(sample_boundaries) == len(qsl_indices) + 1, "Number of samples in trace file does not match number of samples in loadgen accuracy log!" 
+ sample_boundaries.append( + sample_boundaries[-1] + int(line.split(", ")[2])) + assert ( + len(sample_boundaries) == len(qsl_indices) + 1 + ), "Number of samples in trace file does not match number of samples in loadgen accuracy log!" # Get all the ground truth labels in the original order in day_23 print("Parsing ground truth labels from day_23 file...") ground_truths = [] @@ -52,18 +69,22 @@ def get_targets(args, qsl_indices): if line_idx >= sample_boundaries[-1]: break ground_truths.append(int(line.split("\t")[0])) - # Re-order the ground truth labels according to the qsl indices in the loadgen log. + # Re-order the ground truth labels according to the qsl indices in the + # loadgen log. print("Re-ordering ground truth labels...") targets = [] for qsl_idx in qsl_indices: - for i in range(sample_boundaries[qsl_idx], sample_boundaries[qsl_idx + 1]): + for i in range(sample_boundaries[qsl_idx], + sample_boundaries[qsl_idx + 1]): targets.append(ground_truths[i]) return targets + def main(): args = get_args() - # If "--day-23-file" is specified, assume that the accuracy log contains only the prediction, not the ground truth label. + # If "--day-23-file" is specified, assume that the accuracy log contains + # only the prediction, not the ground truth label. log_contains_gt = args.day_23_file is None if log_contains_gt: @@ -77,12 +98,12 @@ def main(): seen = set() good = 0 - total= 0 + total = 0 all_results = [] all_targets = [] qsl_indices = [] for j in results: - idx = j['qsl_idx'] + idx = j["qsl_idx"] # de-dupe in case loadgen sends the same sample multiple times if idx in seen: @@ -91,7 +112,7 @@ def main(): qsl_indices.append(idx) # reconstruct label from mlperf accuracy log - data = np.frombuffer(bytes.fromhex(j['data']), dtype_map[args.dtype]) + data = np.frombuffer(bytes.fromhex(j["data"]), dtype_map[args.dtype]) # data stores both predictions and targets output_count = 2 if log_contains_gt else 1 @@ -114,7 +135,11 @@ def main(): good += 1 else: if args.verbose: - print("{}:{}, expected: {}, found {}".format(idx, k, target, result.round())) + print( + "{}:{}, expected: {}, found {}".format( + idx, k, target, result.round() + ) + ) if not log_contains_gt: all_targets = get_targets(args, qsl_indices) @@ -129,9 +154,16 @@ def main(): roc_auc = sklearn.metrics.roc_auc_score(all_targets, all_results) # compute accuracy metric acc = good / total - print("AUC={:.3f}%, accuracy={:.3f}%, good={}, total={}, queries={}".format(100. * roc_auc, 100. 
* acc, good, total, len(seen))) + print( + "AUC={:.3f}%, accuracy={:.3f}%, good={}, total={}, queries={}".format( + 100.0 * roc_auc, 100.0 * acc, good, total, len(seen) + ) + ) if args.verbose: - print("found and ignored {} query dupes".format(len(results) - len(seen))) + print( + "found and ignored {} query dupes".format( + len(results) - + len(seen))) if __name__ == "__main__": diff --git a/retired_benchmarks/recommendation/dlrm/pytorch/tools/quickgen.py b/retired_benchmarks/recommendation/dlrm/pytorch/tools/quickgen.py index 9a8232271..bed524348 100644 --- a/retired_benchmarks/recommendation/dlrm/pytorch/tools/quickgen.py +++ b/retired_benchmarks/recommendation/dlrm/pytorch/tools/quickgen.py @@ -1,11 +1,12 @@ -''' +""" quick generator of random samples for debugging -''' +""" import sys import argparse import numpy as np + def quickgen(num_samples, num_t, num_d, num_s, ln_emb, text_file=None): # generate place holder random array, including dense features a = np.random.randint(0, 10, (num_t + num_d + num_s, num_samples)) @@ -23,23 +24,31 @@ def quickgen(num_samples, num_t, num_d, num_s, ln_emb, text_file=None): for _ in range(num_s): lstr.append("%x") if text_file is not None: - np.savetxt(text_file, a, fmt=lstr, delimiter='\t',) + np.savetxt( + text_file, + a, + fmt=lstr, + delimiter="\t", + ) return a + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Quick generator of random samples for debugging." ) - parser.add_argument("--num-samples", type=int, default=4096) - parser.add_argument("--num-dense-features", type=int, default=13) + parser.add_argument("--num-samples", type=int, default=4096) + parser.add_argument("--num-dense-features", type=int, default=13) parser.add_argument("--num-sparse-features", type=str, default="4-3-2") - parser.add_argument("--num-targets", type=int, default=1) - parser.add_argument("--profile", type=str, default="") # kaggle|terabyte0875|terabyte - parser.add_argument("--num-days", type=int, default=24) - parser.add_argument("--numpy-rand-seed", type=int, default=123) - parser.add_argument("--output-name", type=str, default="day_") - parser.add_argument("--output-dir", type=str, default="./") + parser.add_argument("--num-targets", type=int, default=1) + parser.add_argument( + "--profile", type=str, default="" + ) # kaggle|terabyte0875|terabyte + parser.add_argument("--num-days", type=int, default=24) + parser.add_argument("--numpy-rand-seed", type=int, default=123) + parser.add_argument("--output-name", type=str, default="day_") + parser.add_argument("--output-dir", type=str, default="./") args = parser.parse_args() np.random.seed(args.numpy_rand_seed) @@ -48,27 +57,118 @@ def quickgen(num_samples, num_t, num_d, num_s, ln_emb, text_file=None): out_name = args.output_name ln_emb = np.fromstring(args.num_sparse_features, dtype=int, sep="-") if args.profile == "kaggle": - # 1. Criteo Kaggle Display Advertisement Challenge Dataset (see ./bench/dlrm_s_criteo_kaggle.sh) + # 1. 
Criteo Kaggle Display Advertisement Challenge Dataset (see + # ./bench/dlrm_s_criteo_kaggle.sh) num_days = 1 out_name = "train.txt" - ln_emb = np.array([1460,583,10131227,2202608,305,24,12517,633,3,93145,5683,8351593,3194,27,14992,5461306,10,5652,2173,4,7046547,18,15,286181,105,142572]) + ln_emb = np.array( + [ + 1460, + 583, + 10131227, + 2202608, + 305, + 24, + 12517, + 633, + 3, + 93145, + 5683, + 8351593, + 3194, + 27, + 14992, + 5461306, + 10, + 5652, + 2173, + 4, + 7046547, + 18, + 15, + 286181, + 105, + 142572, + ] + ) elif args.profile == "terabyte0875": - # 2. Criteo Terabyte (see ./bench/dlrm_s_criteo_terabyte.sh [--sub-sample=0.875] --max-in-range=10000000) + # 2. Criteo Terabyte (see ./bench/dlrm_s_criteo_terabyte.sh + # [--sub-sample=0.875] --max-in-range=10000000) num_days = 24 out_name = "day_" - ln_emb = np.array([9980333,36084,17217,7378,20134,3,7112,1442,61, 9758201,1333352,313829,10,2208,11156,122,4,970,14, 9994222, 7267859, 9946608,415421,12420,101, 36]) + ln_emb = np.array( + [ + 9980333, + 36084, + 17217, + 7378, + 20134, + 3, + 7112, + 1442, + 61, + 9758201, + 1333352, + 313829, + 10, + 2208, + 11156, + 122, + 4, + 970, + 14, + 9994222, + 7267859, + 9946608, + 415421, + 12420, + 101, + 36, + ] + ) elif args.profile == "terabyte": - # 3. Criteo Terabyte MLPerf training (see ./bench/run_and_time.sh --max-in-range=40000000) + # 3. Criteo Terabyte MLPerf training (see ./bench/run_and_time.sh + # --max-in-range=40000000) num_days = 24 out_name = "day_" - ln_emb=np.array([39884406,39043,17289,7420,20263,3,7120,1543,63,38532951,2953546,403346,10,2208,11938,155,4,976,14,39979771,25641295,39664984,585935,12972,108,36]) + ln_emb = np.array( + [ + 39884406, + 39043, + 17289, + 7420, + 20263, + 3, + 7120, + 1543, + 63, + 38532951, + 2953546, + 403346, + 10, + 2208, + 11938, + 155, + 4, + 976, + 14, + 39979771, + 25641295, + 39664984, + 585935, + 12972, + 108, + 36, + ] + ) - num_d = args.num_dense_features - num_s = len(ln_emb) - num_t = args.num_targets + num_d = args.num_dense_features + num_s = len(ln_emb) + num_t = args.num_targets out_dir = args.output_dir for k in range(num_days): - text_file = out_dir + out_name + ("" if args.profile == "kaggle" else str(k)) + text_file = out_dir + out_name + \ + ("" if args.profile == "kaggle" else str(k)) print(text_file) quickgen(args.num_samples, num_t, num_d, num_s, ln_emb, text_file) diff --git a/retired_benchmarks/recommendation/dlrm/tf/common.py b/retired_benchmarks/recommendation/dlrm/tf/common.py index 20e0ce70a..76b2beacd 100644 --- a/retired_benchmarks/recommendation/dlrm/tf/common.py +++ b/retired_benchmarks/recommendation/dlrm/tf/common.py @@ -13,216 +13,240 @@ # limitations under the License. # ============================================================================== """Flags and common definitions for all modules in the DLRM module.""" - + import collections from typing import Dict, Any from absl import flags - - + + FLAGS = flags.FLAGS - + PARAMS = collections.defaultdict( lambda: None, # Set default value to None. default_batch_size=32, - # TPU-specific parameters use_tpu=True, ) - + FAKE_DATA_VOCAB_SIZE = 1000 + def get_params() -> Dict[str, Any]: - """Provides param dict and sets defaults. - - Returns: - A dict representing the params for the model execution. - Raises: - ValueError: If parameters violate model architecture constraints. 
- """ - - params = PARAMS.copy() - - params["data_dir"] = FLAGS.data_dir - params["model_dir"] = FLAGS.model_dir - params["summary_every_n_steps"] = FLAGS.summary_every_n_steps - params["batch_size"] = FLAGS.batch_size - params["eval_batch_size"] = FLAGS.eval_batch_size - params["dim_embed"] = FLAGS.dim_embed - params["vocab_sizes"] = [int(x) for x in FLAGS.vocab_sizes_embed] - params["num_dense_features"] = FLAGS.num_dense_features - params["num_tables_in_ec"] = FLAGS.num_tables_in_ec - params["mlp_bottom"] = [int(x) for x in FLAGS.mlp_bottom] - params["mlp_top"] = [int(x) for x in FLAGS.mlp_top] - params["learning_rate"] = FLAGS.learning_rate - params["lr_warmup_steps"] = FLAGS.lr_warmup_steps - params["decay_steps"] = FLAGS.decay_steps - params["decay_start_step"] = FLAGS.decay_start_step - params["optimizer"] = FLAGS.optimizer - params["adagrad_init_accum"] = FLAGS.adagrad_init_accum - params["num_shards"] = FLAGS.num_tpu_shards - params["eval_steps"] = FLAGS.eval_steps - params["replicas_per_host"] = FLAGS.replicas_per_host - params["bfloat16_grads_all_reduce"] = FLAGS.bfloat16_grads_all_reduce - # Dataset - params["terabyte"] = FLAGS.terabyte - params["use_synthetic_data"] = FLAGS.use_synthetic_data - params["use_cached_data"] = FLAGS.use_cached_data - if params["use_synthetic_data"]: - params["vocab_sizes"] = [FAKE_DATA_VOCAB_SIZE for _ in FLAGS.vse] - # Optimization - params["opt_skip_gather"] = True - - if params["dim_embed"] != params["mlp_bottom"][-1]: - raise ValueError("Dimensionality of latent features (embedding dim) " + - "must be equal to size of last layer of the bottom MLP.") - if params["batch_size"] % params["num_shards"]: - raise ValueError("Training batch size {} must be a multiple of num_cores {}" - .format(params["batch_size"], params["num_shards"])) - if params["eval_batch_size"] % params["num_shards"]: - raise ValueError("Eval batch size {} must be a multiple of num_cores {}" - .format(params["eval_batch_size"], params["num_shards"])) - return params - - + """Provides param dict and sets defaults. + + Returns: + A dict representing the params for the model execution. + Raises: + ValueError: If parameters violate model architecture constraints. 
+ """ + + params = PARAMS.copy() + + params["data_dir"] = FLAGS.data_dir + params["model_dir"] = FLAGS.model_dir + params["summary_every_n_steps"] = FLAGS.summary_every_n_steps + params["batch_size"] = FLAGS.batch_size + params["eval_batch_size"] = FLAGS.eval_batch_size + params["dim_embed"] = FLAGS.dim_embed + params["vocab_sizes"] = [int(x) for x in FLAGS.vocab_sizes_embed] + params["num_dense_features"] = FLAGS.num_dense_features + params["num_tables_in_ec"] = FLAGS.num_tables_in_ec + params["mlp_bottom"] = [int(x) for x in FLAGS.mlp_bottom] + params["mlp_top"] = [int(x) for x in FLAGS.mlp_top] + params["learning_rate"] = FLAGS.learning_rate + params["lr_warmup_steps"] = FLAGS.lr_warmup_steps + params["decay_steps"] = FLAGS.decay_steps + params["decay_start_step"] = FLAGS.decay_start_step + params["optimizer"] = FLAGS.optimizer + params["adagrad_init_accum"] = FLAGS.adagrad_init_accum + params["num_shards"] = FLAGS.num_tpu_shards + params["eval_steps"] = FLAGS.eval_steps + params["replicas_per_host"] = FLAGS.replicas_per_host + params["bfloat16_grads_all_reduce"] = FLAGS.bfloat16_grads_all_reduce + # Dataset + params["terabyte"] = FLAGS.terabyte + params["use_synthetic_data"] = FLAGS.use_synthetic_data + params["use_cached_data"] = FLAGS.use_cached_data + if params["use_synthetic_data"]: + params["vocab_sizes"] = [FAKE_DATA_VOCAB_SIZE for _ in FLAGS.vse] + # Optimization + params["opt_skip_gather"] = True + + if params["dim_embed"] != params["mlp_bottom"][-1]: + raise ValueError( + "Dimensionality of latent features (embedding dim) " + + "must be equal to size of last layer of the bottom MLP." + ) + if params["batch_size"] % params["num_shards"]: + raise ValueError( + "Training batch size {} must be a multiple of num_cores {}".format( + params["batch_size"], params["num_shards"] + ) + ) + if params["eval_batch_size"] % params["num_shards"]: + raise ValueError( + "Eval batch size {} must be a multiple of num_cores {}".format( + params["eval_batch_size"], params["num_shards"] + ) + ) + return params + + def define_dlrm_flags() -> None: - """Flags for running dlrm_main.""" - - # TODO(tayo): Merge flags with the low level runner. 
- flags.DEFINE_string( - "data_dir", - default=None, - help="Path to the data directory.") - flags.DEFINE_integer( - "batch_size", - default=32, - help="Batch size for training.") - flags.DEFINE_bool( - "use_synthetic_data", - default=False, - help="If true, uses synthetic data.") - flags.DEFINE_enum( - "optimizer", - default="sgd", - enum_values=["sgd", "adagrad"], - help="Optimizer to use for parameter updates.") - flags.DEFINE_float( - name="adagrad_init_accum", - default=0.01, - help="Adagrad initial accumulator values.") - flags.DEFINE_integer( - name="lr_warmup_steps", - default=0, - help="Number of warmup steps in learning rate.") - flags.DEFINE_integer( - name="decay_steps", - default=0, - help="Number of decay steps used in polynomial decay.") - flags.DEFINE_integer( - name="decay_start_step", - default=0, - help="Step to begin decay, if decay_steps > 0.") - flags.DEFINE_integer( - name="num_tpu_shards", - default=8, - help="Number of shards (cores).") - flags.DEFINE_integer( - name="eval_batch_size", - short_name="ebs", - default=16384, - help="Global batch size to use during eval.") - flags.DEFINE_float( - name="learning_rate", - short_name="lr", - default=0.01, - help="The learning rate.") - flags.DEFINE_integer( - name="train_steps", - short_name="ts", - default=1000, - help="The number of steps used to train.") - flags.DEFINE_integer( - name="eval_steps", - short_name="es", - default=5440, - help="The number of steps used to eval.") - flags.DEFINE_integer( - name="steps_between_evals", - short_name="sbe", - default=100, - help="The Number of training steps to run between evaluations. This is " - "used if --train_steps is defined.") - flags.DEFINE_integer( - name="summary_every_n_steps", - default=100, - help="Number of training steps to run before communicating with host to " - "send summaries.") - flags.DEFINE_bool( - name="use_cached_data", - default=False, - help="If true, take a few samples and repeat.") - flags.DEFINE_string( - name="mode", - default="train", - help="mode: train or eval") - flags.DEFINE_bool( - name="use_ctl", - default=False, - help="Whether the model runs with custom training loop.") - flags.DEFINE_bool( - name="terabyte", - default=True, - help="If true, data paths use terabyte format. Else kaggle.") - # System params. - flags.DEFINE_bool( - name="pipeline_execution", - default=False, - help="If true, pipeline embedding execution with TensorCore.") - flags.DEFINE_bool( - name="use_batched_tfrecords", - default=False, - help="If true, use dataset of batched TFRecords, instead of csv.") - flags.DEFINE_bool( - name="bfloat16_grads_all_reduce", - default=False, - help="If true, use bfloat16 for all-reduce computation.") - flags.DEFINE_string( - name="data_cell", - default="mb", - help="Data cell. Path to data directory is determined dynamically.") - flags.DEFINE_enum( - "partition_strategy", - default="div", - enum_values=["div", "mod"], - help="Partition strategy for the embeddings.") - # Model architecture params. - flags.DEFINE_integer( - name="dim_embed", - short_name="de", - default=4, - help="Embedding dimension.") - flags.DEFINE_list( - name="mlp_bottom", - default="8, 4", - help="Hidden layers for the bottom MLP. " - "To specify different sizes of MLP layers: --layers=32,16,8,4") - flags.DEFINE_list( - name="mlp_top", - default="128, 64, 1", - help="The sizes of hidden layers for MLP. 
" - "To specify different sizes of MLP layers: --layers=32,16,8,4") - flags.DEFINE_list( - name="vocab_sizes_embed", - short_name="vse", - default="8, 8, 8, 8", - help="Vocab sizes for each of the sparse features. The order agrees with " - "the order of the input data.") - flags.DEFINE_integer( - name="num_dense_features", - short_name="ndf", - default=3, - help="Number of dense features.") - flags.DEFINE_integer( - name="num_tables_in_ec", - default=26, - help="Number of embedding tables in the embedding core.") + """Flags for running dlrm_main.""" + # TODO(tayo): Merge flags with the low level runner. + flags.DEFINE_string( + "data_dir", + default=None, + help="Path to the data directory.") + flags.DEFINE_integer( + "batch_size", + default=32, + help="Batch size for training.") + flags.DEFINE_bool( + "use_synthetic_data", default=False, help="If true, uses synthetic data." + ) + flags.DEFINE_enum( + "optimizer", + default="sgd", + enum_values=["sgd", "adagrad"], + help="Optimizer to use for parameter updates.", + ) + flags.DEFINE_float( + name="adagrad_init_accum", + default=0.01, + help="Adagrad initial accumulator values.", + ) + flags.DEFINE_integer( + name="lr_warmup_steps", + default=0, + help="Number of warmup steps in learning rate.", + ) + flags.DEFINE_integer( + name="decay_steps", + default=0, + help="Number of decay steps used in polynomial decay.", + ) + flags.DEFINE_integer( + name="decay_start_step", + default=0, + help="Step to begin decay, if decay_steps > 0.", + ) + flags.DEFINE_integer( + name="num_tpu_shards", default=8, help="Number of shards (cores)." + ) + flags.DEFINE_integer( + name="eval_batch_size", + short_name="ebs", + default=16384, + help="Global batch size to use during eval.", + ) + flags.DEFINE_float( + name="learning_rate", short_name="lr", default=0.01, help="The learning rate." + ) + flags.DEFINE_integer( + name="train_steps", + short_name="ts", + default=1000, + help="The number of steps used to train.", + ) + flags.DEFINE_integer( + name="eval_steps", + short_name="es", + default=5440, + help="The number of steps used to eval.", + ) + flags.DEFINE_integer( + name="steps_between_evals", + short_name="sbe", + default=100, + help="The Number of training steps to run between evaluations. This is " + "used if --train_steps is defined.", + ) + flags.DEFINE_integer( + name="summary_every_n_steps", + default=100, + help="Number of training steps to run before communicating with host to " + "send summaries.", + ) + flags.DEFINE_bool( + name="use_cached_data", + default=False, + help="If true, take a few samples and repeat.", + ) + flags.DEFINE_string( + name="mode", + default="train", + help="mode: train or eval") + flags.DEFINE_bool( + name="use_ctl", + default=False, + help="Whether the model runs with custom training loop.", + ) + flags.DEFINE_bool( + name="terabyte", + default=True, + help="If true, data paths use terabyte format. Else kaggle.", + ) + # System params. + flags.DEFINE_bool( + name="pipeline_execution", + default=False, + help="If true, pipeline embedding execution with TensorCore.", + ) + flags.DEFINE_bool( + name="use_batched_tfrecords", + default=False, + help="If true, use dataset of batched TFRecords, instead of csv.", + ) + flags.DEFINE_bool( + name="bfloat16_grads_all_reduce", + default=False, + help="If true, use bfloat16 for all-reduce computation.", + ) + flags.DEFINE_string( + name="data_cell", + default="mb", + help="Data cell. 
Path to data directory is determined dynamically.", + ) + flags.DEFINE_enum( + "partition_strategy", + default="div", + enum_values=["div", "mod"], + help="Partition strategy for the embeddings.", + ) + # Model architecture params. + flags.DEFINE_integer( + name="dim_embed", short_name="de", default=4, help="Embedding dimension." + ) + flags.DEFINE_list( + name="mlp_bottom", + default="8, 4", + help="Hidden layers for the bottom MLP. " + "To specify different sizes of MLP layers: --layers=32,16,8,4", + ) + flags.DEFINE_list( + name="mlp_top", + default="128, 64, 1", + help="The sizes of hidden layers for MLP. " + "To specify different sizes of MLP layers: --layers=32,16,8,4", + ) + flags.DEFINE_list( + name="vocab_sizes_embed", + short_name="vse", + default="8, 8, 8, 8", + help="Vocab sizes for each of the sparse features. The order agrees with " + "the order of the input data.", + ) + flags.DEFINE_integer( + name="num_dense_features", + short_name="ndf", + default=3, + help="Number of dense features.", + ) + flags.DEFINE_integer( + name="num_tables_in_ec", + default=26, + help="Number of embedding tables in the embedding core.", + ) diff --git a/retired_benchmarks/recommendation/dlrm/tf/dataloader.py b/retired_benchmarks/recommendation/dlrm/tf/dataloader.py index daa73c0c7..681d9773a 100644 --- a/retired_benchmarks/recommendation/dlrm/tf/dataloader.py +++ b/retired_benchmarks/recommendation/dlrm/tf/dataloader.py @@ -20,244 +20,275 @@ def rand_features(batch_size): - """Emits random input features, used for testing.""" - features = {} - pos_size = batch_size // 2 - neg_size = batch_size - pos_size - features[fc.LABEL_FEATURE] = tf.concat([ - tf.ones([pos_size, 1], dtype=tf.float32), - tf.zeros([neg_size, 1], dtype=tf.float32) - ], - axis=0) - features["int-features"] = tf.random.uniform( - shape=(batch_size, len(fc.INT_FEATURES)), - maxval=fc.FAKE_DATA_INT_MAX) - features["cat-features"] = tf.random.uniform( - shape=(batch_size, len(fc.CATEGORICAL_FEATURES)), - maxval=fc.FAKE_DATA_VOCAB_SIZE, - dtype=tf.int32) - return features + """Emits random input features, used for testing.""" + features = {} + pos_size = batch_size // 2 + neg_size = batch_size - pos_size + features[fc.LABEL_FEATURE] = tf.concat( + [ + tf.ones([pos_size, 1], dtype=tf.float32), + tf.zeros([neg_size, 1], dtype=tf.float32), + ], + axis=0, + ) + features["int-features"] = tf.random.uniform( + shape=(batch_size, len(fc.INT_FEATURES)), maxval=fc.FAKE_DATA_INT_MAX + ) + features["cat-features"] = tf.random.uniform( + shape=(batch_size, len(fc.CATEGORICAL_FEATURES)), + maxval=fc.FAKE_DATA_VOCAB_SIZE, + dtype=tf.int32, + ) + return features class CriteoTFRecordReader(object): - """Input reader fn for TFRecords that have been serialized in batched form.""" - - def __init__(self, - file_path=None, - feature_config=None, - is_training=True, - use_cached_data=False, - use_synthetic_data=False, - params=None): - self._file_path = file_path - self._feature_config = feature_config - self._is_training = is_training - self._use_cached_data = use_cached_data - self._use_synthetic_data = use_synthetic_data - self._params = params - - def __call__(self, params): - - batch_size = params["batch_size"] - if self._use_synthetic_data: - ds = tf.data.Dataset.from_tensor_slices(rand_features(batch_size)) - ds = ds.batch(batch_size, drop_remainder=True) - ds = ds.take(1).cache().repeat() - return ds - - def _get_feature_spec(): - feature_spec = {} - feature_spec[fc.LABEL_FEATURE] = tf.FixedLenFeature([ - batch_size, - ], - dtype=tf.float32) - for 
int_ft in fc.INT_FEATURES: - feature_spec[int_ft] = tf.FixedLenFeature([ - batch_size, - ], - dtype=tf.float32) - for cat_ft in fc.CATEGORICAL_FEATURES: - feature_spec[cat_ft] = tf.FixedLenFeature([], dtype=tf.string) - return feature_spec - - def _parse_fn(serialized_example): - feature_spec = _get_feature_spec() - p_features = tf.parse_single_example(serialized_example, feature_spec) - - features = {} - features[fc.LABEL_FEATURE] = tf.reshape(p_features[fc.LABEL_FEATURE], - (batch_size, 1)) - - int_features = [] - for int_ft in fc.INT_FEATURES: - cur_feature = tf.reshape(p_features[int_ft], (batch_size, 1)) - int_features.append(cur_feature) - features["int-features"] = tf.concat(int_features, axis=-1) - cat_features = [] - tc_features = [] - - tbl_idxs_sorted = self._feature_config.get_table_idx_orderd_by_size() - for idx in range(len(fc.CATEGORICAL_FEATURES)): - # Add features from largest-vocab to smallest-vocab. - raw_tbl_idx = tbl_idxs_sorted[idx] - cat_feature_idx = raw_tbl_idx + 14 - cat_feature = "categorical-feature-%d" % cat_feature_idx - - # Decode from bytes to int32. - cat_ft_int32 = tf.io.decode_raw(p_features[cat_feature], tf.int32) - cat_ft_int32 = tf.reshape(cat_ft_int32, (batch_size, 1)) - if idx < self._feature_config.get_num_tables_in_ec(): - cat_features.append(cat_ft_int32) - else: - tc_features.append(cat_ft_int32) - features["cat-features"] = tf.concat(cat_features, axis=-1) - if tc_features: - features["tc-features"] = tf.concat(tc_features, axis=-1) - - return features - - ds = tf.data.Dataset.list_files(self._file_path, shuffle=False) - ds = ds.shard(params["dataset_num_shards"], - params["dataset_index"]) - - if self._is_training: - ds = ds.shuffle( - tf.to_int64( - max(256, params["dataset_num_shards"]) / - params["dataset_num_shards"])) - ds = ds.repeat() - - ds = tf.data.TFRecordDataset( - ds, buffer_size=64 * 1024 * 1024, num_parallel_reads=8) - ds = ds.map(_parse_fn, num_parallel_calls=8) - - if not self._is_training: - num_dataset_samples = self._params["eval_steps"] * ( - self._params["eval_batch_size"] // params["dataset_num_shards"]) - num_dataset_batches = num_dataset_samples // batch_size - def _mark_as_padding(features): - """Padding will be denoted with a label value of -1.""" - features[fc.LABEL_FEATURE] = -1 * tf.ones( - (batch_size, 1), dtype=tf.float32) - return features - # 100 steps worth of padding. 
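As a rough illustration of the padding arithmetic above (all numbers hypothetical, borrowed from the flag defaults rather than any particular run), each eval shard is padded out so that the final `.take()` always yields a fixed number of batches:

# Hypothetical per-shard eval sizing; real values come from the flags/params dict.
eval_steps, eval_batch_size, dataset_num_shards, batch_size = 5440, 16384, 8, 2048
num_dataset_samples = eval_steps * (eval_batch_size // dataset_num_shards)  # 11,141,120
num_dataset_batches = num_dataset_samples // batch_size                     # 5,440
# Padded batches carry label -1 so they can be discarded when computing metrics.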
- padding_ds = ds.take(self._params["replicas_per_host"]) - padding_ds = padding_ds.map(_mark_as_padding).repeat(100) - ds = ds.concatenate(padding_ds).take(num_dataset_batches) - ds = ds.prefetch(tf.data.experimental.AUTOTUNE) - - if self._use_cached_data: - ds = ds.take(100).cache().repeat() - return ds + """Input reader fn for TFRecords that have been serialized in batched form.""" + + def __init__( + self, + file_path=None, + feature_config=None, + is_training=True, + use_cached_data=False, + use_synthetic_data=False, + params=None, + ): + self._file_path = file_path + self._feature_config = feature_config + self._is_training = is_training + self._use_cached_data = use_cached_data + self._use_synthetic_data = use_synthetic_data + self._params = params + + def __call__(self, params): + + batch_size = params["batch_size"] + if self._use_synthetic_data: + ds = tf.data.Dataset.from_tensor_slices(rand_features(batch_size)) + ds = ds.batch(batch_size, drop_remainder=True) + ds = ds.take(1).cache().repeat() + return ds + + def _get_feature_spec(): + feature_spec = {} + feature_spec[fc.LABEL_FEATURE] = tf.FixedLenFeature( + [ + batch_size, + ], + dtype=tf.float32, + ) + for int_ft in fc.INT_FEATURES: + feature_spec[int_ft] = tf.FixedLenFeature( + [ + batch_size, + ], + dtype=tf.float32, + ) + for cat_ft in fc.CATEGORICAL_FEATURES: + feature_spec[cat_ft] = tf.FixedLenFeature([], dtype=tf.string) + return feature_spec + + def _parse_fn(serialized_example): + feature_spec = _get_feature_spec() + p_features = tf.parse_single_example( + serialized_example, feature_spec) + + features = {} + features[fc.LABEL_FEATURE] = tf.reshape( + p_features[fc.LABEL_FEATURE], (batch_size, 1) + ) + + int_features = [] + for int_ft in fc.INT_FEATURES: + cur_feature = tf.reshape(p_features[int_ft], (batch_size, 1)) + int_features.append(cur_feature) + features["int-features"] = tf.concat(int_features, axis=-1) + cat_features = [] + tc_features = [] + + tbl_idxs_sorted = self._feature_config.get_table_idx_orderd_by_size() + for idx in range(len(fc.CATEGORICAL_FEATURES)): + # Add features from largest-vocab to smallest-vocab. + raw_tbl_idx = tbl_idxs_sorted[idx] + cat_feature_idx = raw_tbl_idx + 14 + cat_feature = "categorical-feature-%d" % cat_feature_idx + + # Decode from bytes to int32. 
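For intuition, a tiny stand-alone sketch of that bytes-to-int32 decode, using made-up ids rather than real Criteo data:

import numpy as np
import tensorflow.compat.v1 as tf

# A batch of four int32 ids serialized to raw bytes, as in the batched TFRecords.
raw_bytes = np.array([3, 141, 59, 26], dtype=np.int32).tobytes()
ids = tf.io.decode_raw(tf.constant(raw_bytes), tf.int32)  # tensor holding [3, 141, 59, 26]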
+ cat_ft_int32 = tf.io.decode_raw( + p_features[cat_feature], tf.int32) + cat_ft_int32 = tf.reshape(cat_ft_int32, (batch_size, 1)) + if idx < self._feature_config.get_num_tables_in_ec(): + cat_features.append(cat_ft_int32) + else: + tc_features.append(cat_ft_int32) + features["cat-features"] = tf.concat(cat_features, axis=-1) + if tc_features: + features["tc-features"] = tf.concat(tc_features, axis=-1) + + return features + + ds = tf.data.Dataset.list_files(self._file_path, shuffle=False) + ds = ds.shard(params["dataset_num_shards"], params["dataset_index"]) + + if self._is_training: + ds = ds.shuffle( + tf.to_int64( + max(256, params["dataset_num_shards"]) + / params["dataset_num_shards"] + ) + ) + ds = ds.repeat() + + ds = tf.data.TFRecordDataset( + ds, buffer_size=64 * 1024 * 1024, num_parallel_reads=8 + ) + ds = ds.map(_parse_fn, num_parallel_calls=8) + + if not self._is_training: + num_dataset_samples = self._params["eval_steps"] * ( + self._params["eval_batch_size"] // params["dataset_num_shards"] + ) + num_dataset_batches = num_dataset_samples // batch_size + + def _mark_as_padding(features): + """Padding will be denoted with a label value of -1.""" + features[fc.LABEL_FEATURE] = -1 * tf.ones( + (batch_size, 1), dtype=tf.float32 + ) + return features + + # 100 steps worth of padding. + padding_ds = ds.take(self._params["replicas_per_host"]) + padding_ds = padding_ds.map(_mark_as_padding).repeat(100) + ds = ds.concatenate(padding_ds).take(num_dataset_batches) + ds = ds.prefetch(tf.data.experimental.AUTOTUNE) + + if self._use_cached_data: + ds = ds.take(100).cache().repeat() + return ds class CriteoTsvReader(object): - """Input reader fn for pre-processed Criteo data. - - Raw Criteo data is assumed to be preprocessed in the following way: - 1. Missing values are replaced with zeros. - 2. Negative values are replaced with zeros. - 3. Integer features are transformed by log(x+1) and are hence tf.float32. - 4. Categorical data is bucketized and are hence tf.int32. 
- """ - - def __init__(self, - file_path=None, - feature_config=None, - is_training=True, - distributed_eval=False, - parallelism=1, - use_cached_data=False, - use_synthetic_data=False): - self._file_path = file_path - self._feature_config = feature_config - self._is_training = is_training - self._distributed_eval = distributed_eval - self._parallelism = parallelism - self._use_cached_data = use_cached_data - self._use_synthetic_data = use_synthetic_data - - def __call__(self, params): - batch_size = params["batch_size"] - if self._use_synthetic_data: - ds = tf.data.Dataset.from_tensor_slices(rand_features(batch_size)) - ds = ds.batch(batch_size, drop_remainder=True) - ds = ds.take(1).cache().repeat() - return ds - - @tf.function - def _parse_example_fn(example): - """Parser function for pre-processed Criteo TSV records.""" - label_defaults = [[0.0]] - int_defaults = [ - [0.0] for _ in range(self._feature_config.get_num_dense_features()) - ] - categorical_defaults = [ - [0] for _ in range(self._feature_config.get_num_sparse_features()) - ] - record_defaults = label_defaults + int_defaults + categorical_defaults - fields = tf.decode_csv( - example, record_defaults, field_delim="\t", na_value="-1") - - num_labels = 1 - num_dense = len(int_defaults) - features = {} - features[fc.LABEL_FEATURE] = tf.reshape(fields[0], [batch_size, 1]) - - int_features = [] - for idx in range(num_dense): - int_features.append(fields[idx + num_labels]) - features["int-features"] = tf.stack(int_features, axis=1) - - cat_features = [] - tc_features = [] - # Features for tables in EmbeddingCore is in cat_features; features for - # tables in is in tc_features. The order of the input data - # follows the order of FLAG.vocab_sizes_embed, so we reorder the input - # data with resepct to the table sizes. - for idx, idx_by_size in enumerate( - self._feature_config.get_table_idx_orderd_by_size()): - if idx < self._feature_config.get_num_tables_in_ec(): - cat_features.append( - tf.cast( - fields[idx_by_size + num_dense + num_labels], dtype=tf.int32)) - else: - tc_features.append( - tf.cast( - fields[idx_by_size + num_dense + num_labels], dtype=tf.int32)) - features["cat-features"] = tf.stack(cat_features, axis=1) - if tc_features: - features["tc-features"] = tf.stack(tc_features, axis=1) - - return features - - filenames = tf.data.Dataset.list_files(self._file_path, shuffle=False) - filenames = filenames.shard(params["dataset_num_shards"], - params["dataset_index"]) - - def make_dataset(ds_index): - ds = filenames.shard(self._parallelism, ds_index) - ds = ds.repeat(2) - ds = ds.interleave( - tf.data.TextLineDataset, - cycle_length=16, - block_length=batch_size // 8, - num_parallel_calls=8, - deterministic=False) - ds = ds.batch(batch_size, drop_remainder=True) - ds = ds.map(_parse_example_fn, num_parallel_calls=16) - return ds - - ds_indices = tf.data.Dataset.range(self._parallelism) - ds = ds_indices.interleave( - make_dataset, - cycle_length=self._parallelism, - block_length=1, - num_parallel_calls=self._parallelism, - deterministic=False) - ds = ds.prefetch(tf.data.experimental.AUTOTUNE) - - if self._use_cached_data: - ds = ds.take(100).cache().repeat() - - return ds + """Input reader fn for pre-processed Criteo data. + + Raw Criteo data is assumed to be preprocessed in the following way: + 1. Missing values are replaced with zeros. + 2. Negative values are replaced with zeros. + 3. Integer features are transformed by log(x+1) and are hence tf.float32. + 4. Categorical data is bucketized and are hence tf.int32. 
+ """ + + def __init__( + self, + file_path=None, + feature_config=None, + is_training=True, + distributed_eval=False, + parallelism=1, + use_cached_data=False, + use_synthetic_data=False, + ): + self._file_path = file_path + self._feature_config = feature_config + self._is_training = is_training + self._distributed_eval = distributed_eval + self._parallelism = parallelism + self._use_cached_data = use_cached_data + self._use_synthetic_data = use_synthetic_data + + def __call__(self, params): + batch_size = params["batch_size"] + if self._use_synthetic_data: + ds = tf.data.Dataset.from_tensor_slices(rand_features(batch_size)) + ds = ds.batch(batch_size, drop_remainder=True) + ds = ds.take(1).cache().repeat() + return ds + + @tf.function + def _parse_example_fn(example): + """Parser function for pre-processed Criteo TSV records.""" + label_defaults = [[0.0]] + int_defaults = [ + [0.0] for _ in range(self._feature_config.get_num_dense_features()) + ] + categorical_defaults = [ + [0] for _ in range(self._feature_config.get_num_sparse_features()) + ] + record_defaults = label_defaults + int_defaults + categorical_defaults + fields = tf.decode_csv( + example, record_defaults, field_delim="\t", na_value="-1" + ) + + num_labels = 1 + num_dense = len(int_defaults) + features = {} + features[fc.LABEL_FEATURE] = tf.reshape(fields[0], [batch_size, 1]) + + int_features = [] + for idx in range(num_dense): + int_features.append(fields[idx + num_labels]) + features["int-features"] = tf.stack(int_features, axis=1) + + cat_features = [] + tc_features = [] + # Features for tables in EmbeddingCore is in cat_features; features for + # tables in is in tc_features. The order of the input data + # follows the order of FLAG.vocab_sizes_embed, so we reorder the input + # data with resepct to the table sizes. 
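To make the reordering described in that comment concrete, here is a small stand-alone sketch with invented vocab sizes; in the real code the order comes from FeatureConfig.get_table_idx_orderd_by_size():

vocab_sizes = [1000, 10, 500000, 42]   # hypothetical --vocab_sizes_embed
num_tables_in_ec = 2                   # hypothetical EmbeddingCore/TensorCore split
idx_by_size = sorted(range(len(vocab_sizes)), key=lambda i: -vocab_sizes[i])  # [2, 0, 3, 1]
ec_tables = idx_by_size[:num_tables_in_ec]  # largest vocabs -> EmbeddingCore
tc_tables = idx_by_size[num_tables_in_ec:]  # smallest vocabs -> TensorCore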
+ for idx, idx_by_size in enumerate( + self._feature_config.get_table_idx_orderd_by_size() + ): + if idx < self._feature_config.get_num_tables_in_ec(): + cat_features.append( + tf.cast( + fields[idx_by_size + num_dense + + num_labels], dtype=tf.int32 + ) + ) + else: + tc_features.append( + tf.cast( + fields[idx_by_size + num_dense + + num_labels], dtype=tf.int32 + ) + ) + features["cat-features"] = tf.stack(cat_features, axis=1) + if tc_features: + features["tc-features"] = tf.stack(tc_features, axis=1) + + return features + + filenames = tf.data.Dataset.list_files(self._file_path, shuffle=False) + filenames = filenames.shard( + params["dataset_num_shards"], params["dataset_index"] + ) + + def make_dataset(ds_index): + ds = filenames.shard(self._parallelism, ds_index) + ds = ds.repeat(2) + ds = ds.interleave( + tf.data.TextLineDataset, + cycle_length=16, + block_length=batch_size // 8, + num_parallel_calls=8, + deterministic=False, + ) + ds = ds.batch(batch_size, drop_remainder=True) + ds = ds.map(_parse_example_fn, num_parallel_calls=16) + return ds + + ds_indices = tf.data.Dataset.range(self._parallelism) + ds = ds_indices.interleave( + make_dataset, + cycle_length=self._parallelism, + block_length=1, + num_parallel_calls=self._parallelism, + deterministic=False, + ) + ds = ds.prefetch(tf.data.experimental.AUTOTUNE) + + if self._use_cached_data: + ds = ds.take(100).cache().repeat() + + return ds diff --git a/retired_benchmarks/recommendation/dlrm/tf/dlrm.py b/retired_benchmarks/recommendation/dlrm/tf/dlrm.py index bbc97e16c..1e3590db5 100644 --- a/retired_benchmarks/recommendation/dlrm/tf/dlrm.py +++ b/retired_benchmarks/recommendation/dlrm/tf/dlrm.py @@ -21,350 +21,397 @@ import tensorflow.compat.v1 as tf from tensorflow.compiler.tf2xla.python import xla -from tensorflow.contrib import layers as contrib_layers +from tensorflow.contrib import layers as contrib_layers import utils def dot_interact(concat_features, params=None): - """Performs feature interaction operation between dense and sparse. - - Input tensors represent dense and sparse features. - Pre-condition: The tensors have been stacked along dimension 1. - - Args: - concat_features: Tensor of features with shape [B, n_features, feature_dim]. - params: Model params. - - Returns: - activations: Tensor representing interacted features. - """ - batch_size = concat_features.shape[0] - if not params: - params = {} - - # Interact features, select lower-triangular portion, and re-shape. - xactions = tf.matmul(concat_features, concat_features, transpose_b=True) - tf.logging.info("Model_FN: xactions shape: %s", xactions.get_shape()) - ones = tf.ones_like(xactions) - upper_tri_mask = tf.linalg.band_part(ones, 0, -1) - feature_dim = xactions.shape[-1] - - if params["opt_skip_gather"]: - upper_tri_bool = tf.cast(upper_tri_mask, tf.bool) - activations = tf.where( - condition=upper_tri_bool, x=tf.zeros_like(xactions), y=xactions) - tf.logging.info("Model_FN: activations shape: %s", activations.get_shape()) - out_dim = feature_dim * feature_dim - else: - lower_tri_mask = ones - upper_tri_mask - activations = tf.boolean_mask(xactions, lower_tri_mask) - tf.logging.info("Model_FN: activations shape: %s", activations.get_shape()) - out_dim = feature_dim * (feature_dim - 1) // 2 - - activations = tf.reshape(activations, (batch_size, out_dim)) - return activations + """Performs feature interaction operation between dense and sparse. + + Input tensors represent dense and sparse features. 
+ Pre-condition: The tensors have been stacked along dimension 1. + + Args: + concat_features: Tensor of features with shape [B, n_features, feature_dim]. + params: Model params. + + Returns: + activations: Tensor representing interacted features. + """ + batch_size = concat_features.shape[0] + if not params: + params = {} + + # Interact features, select lower-triangular portion, and re-shape. + xactions = tf.matmul(concat_features, concat_features, transpose_b=True) + tf.logging.info("Model_FN: xactions shape: %s", xactions.get_shape()) + ones = tf.ones_like(xactions) + upper_tri_mask = tf.linalg.band_part(ones, 0, -1) + feature_dim = xactions.shape[-1] + + if params["opt_skip_gather"]: + upper_tri_bool = tf.cast(upper_tri_mask, tf.bool) + activations = tf.where( + condition=upper_tri_bool, x=tf.zeros_like(xactions), y=xactions + ) + tf.logging.info( + "Model_FN: activations shape: %s", + activations.get_shape()) + out_dim = feature_dim * feature_dim + else: + lower_tri_mask = ones - upper_tri_mask + activations = tf.boolean_mask(xactions, lower_tri_mask) + tf.logging.info( + "Model_FN: activations shape: %s", + activations.get_shape()) + out_dim = feature_dim * (feature_dim - 1) // 2 + + activations = tf.reshape(activations, (batch_size, out_dim)) + return activations def logits_fn(features, params, feature_config): - """Calculate predictions.""" - tf.logging.info("Model_FN: Number of input features: %d", len(features)) - for ft in sorted(features.keys()): - tf.logging.info("Model_FN: Feature %s -- shape %s", ft, - features[ft].get_shape()) - - bot_mlp_input = features["int-features"] - tf.logging.info("Model_FN: Bottom MLP input (int features) shape: %s", - bot_mlp_input.get_shape()) - mlp_dims_bottom = params["mlp_bottom"] - - for layer_idx in range(len(mlp_dims_bottom)): - bot_mlp_input = tf.layers.dense( - bot_mlp_input, - mlp_dims_bottom[layer_idx], - activation="relu", - kernel_initializer=tf.compat.v2.initializers.GlorotNormal(), - bias_initializer=tf.compat.v2.initializers.RandomNormal( - mean=0.0, stddev=math.sqrt(1.0 / mlp_dims_bottom[layer_idx])), - name="bottom_mlp_layer_%d" % layer_idx) - bot_mlp_output = bot_mlp_input - - cat_features = [] - for feature_name, value in sorted(features.items()): - if "categorical-feature" in feature_name: - cat_features.append(value) - - tc_features = [] - if "tc-features" in features: - # Compute offsets for single concatenated table. 
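A minimal sketch of the offset computation that follows; the table sizes here are invented, while the real ones come from feature_config.get_sorted_table_size():

import numpy as np

tc_table_sizes = [5, 3, 7]                                  # hypothetical TC vocab sizes
idx_offsets = [0] + list(np.cumsum(tc_table_sizes[:-1]))    # [0, 5, 8]
# Per-feature ids [2, 1, 4] land at rows [2, 6, 12] of the concatenated table.
row_ids = np.array([2, 1, 4]) + np.array(idx_offsets)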
- batch_size = features["tc-features"].shape[0] - num_tc_features = features["tc-features"].shape[1] - num_tables_in_ec = params["num_tables_in_ec"] - tc_table_sizes = feature_config.get_sorted_table_size()[num_tables_in_ec:] - total_tbl_size = sum(tc_table_sizes) - idx_offsets = [0] + list(np.cumsum(tc_table_sizes[:-1])) - idx_offsets = tf.broadcast_to( - tf.constant(idx_offsets), (batch_size, num_tc_features)) - idxs = idx_offsets + features["tc-features"] - - def _create_init_table(): - """Table initialization varies depending on the vocab size.""" - full_tbl = np.zeros( - shape=(total_tbl_size, params["dim_embed"]), dtype=np.float32) - start_idx = 0 - for idx, tbl_size in enumerate(tc_table_sizes): - end_idx = start_idx + tc_table_sizes[idx] - cur_tbl_init = np.random.uniform( - low=-1 / np.sqrt(tbl_size), - high=1 / np.sqrt(tbl_size), - size=(tbl_size, params["dim_embed"])).astype(np.float32) - full_tbl[start_idx:end_idx, :] = cur_tbl_init - start_idx += tc_table_sizes[idx] - return tf.constant(full_tbl) - - tc_embedding_table = tf.get_variable( - name="tc_embedding_table", - dtype=tf.float32, - trainable=True, - # pylint: disable=unnecessary-lambda - initializer=lambda: _create_init_table()) - tc_features = tf.gather(tc_embedding_table, idxs) - tf.logging.info("TC features shape: {}".format(tc_features.get_shape())) - - # Dot feature interaction - # Concat and reshape, instead of stack. Better for performance. - batch_size = bot_mlp_output.shape[0] - feature_stack = tf.concat([bot_mlp_output] + cat_features, axis=-1) - feature_stack = tf.reshape(feature_stack, - [batch_size, -1, params["dim_embed"]]) - if "tc-features" in features: - feature_stack = tf.concat([feature_stack, tc_features], axis=1) - tf.logging.info("Model_FN: concated feature shape: %s", - feature_stack.get_shape()) - dot_interact_output = dot_interact( - concat_features=feature_stack, params=params) - top_mlp_input = tf.concat([bot_mlp_output, dot_interact_output], axis=1) - tf.logging.info("Model_FN: Top MLP input (full features) shape: %s", - top_mlp_input.get_shape()) - - # Capture original MLP fan-in for proper kernel initialization. - num_fts = len(cat_features) + 1 - orig_top_mlp_dim = (num_fts * (num_fts - 1)) / 2 + params["dim_embed"] - tf.logging.info("Model_FN: Original feature len: {}".format(orig_top_mlp_dim)) - - # Top MLP - # NOTE: For the top MLP, the last layer is a sigmoid. The loss function should - # therefore take [0,1] probability values as inputs, instead of logits. - mlp_dims_top = params["mlp_top"] - num_layers_top = len(mlp_dims_top) - sigmoid_layer_top = num_layers_top - 1 - for layer_idx in range(num_layers_top): - fan_in = orig_top_mlp_dim if layer_idx == 0 else mlp_dims_top[layer_idx - 1] - fan_out = mlp_dims_top[layer_idx] - tf.logging.info(" layer {}: fan_in={} fan_out={}".format( - layer_idx, fan_in, fan_out)) - top_mlp_input = tf.layers.dense( - top_mlp_input, - mlp_dims_top[layer_idx], - activation="sigmoid" if layer_idx == sigmoid_layer_top else "relu", - # NOTE: We would usually use GlorotNormal() initializer here. But due to - # the skip_gather optimization, the GlorotNormal would result in a - # mathematical error, as Glorot is a function of the fan-in. - # The fan-in will be larger for skip-gather activations since we also - # pass in the zeros. 
Therefore we explicitly set the kernel intializer - # to RandomNormal(0, sqrt(2/(fan_in+fan_out)) - kernel_initializer=tf.compat.v2.initializers.RandomNormal( - mean=0.0, stddev=math.sqrt(2.0 / (fan_in + fan_out))), - bias_initializer=tf.compat.v2.initializers.RandomNormal( - mean=0.0, stddev=math.sqrt(1.0 / mlp_dims_top[layer_idx])), - name="top_mlp_layer_%d" % layer_idx) - predictions = top_mlp_input - return predictions, None + """Calculate predictions.""" + tf.logging.info("Model_FN: Number of input features: %d", len(features)) + for ft in sorted(features.keys()): + tf.logging.info( + "Model_FN: Feature %s -- shape %s", ft, features[ft].get_shape() + ) + + bot_mlp_input = features["int-features"] + tf.logging.info( + "Model_FN: Bottom MLP input (int features) shape: %s", bot_mlp_input.get_shape( + ) + ) + mlp_dims_bottom = params["mlp_bottom"] + + for layer_idx in range(len(mlp_dims_bottom)): + bot_mlp_input = tf.layers.dense( + bot_mlp_input, + mlp_dims_bottom[layer_idx], + activation="relu", + kernel_initializer=tf.compat.v2.initializers.GlorotNormal(), + bias_initializer=tf.compat.v2.initializers.RandomNormal( + mean=0.0, stddev=math.sqrt(1.0 / mlp_dims_bottom[layer_idx]) + ), + name="bottom_mlp_layer_%d" % layer_idx, + ) + bot_mlp_output = bot_mlp_input + + cat_features = [] + for feature_name, value in sorted(features.items()): + if "categorical-feature" in feature_name: + cat_features.append(value) + + tc_features = [] + if "tc-features" in features: + # Compute offsets for single concatenated table. + batch_size = features["tc-features"].shape[0] + num_tc_features = features["tc-features"].shape[1] + num_tables_in_ec = params["num_tables_in_ec"] + tc_table_sizes = feature_config.get_sorted_table_size()[ + num_tables_in_ec:] + total_tbl_size = sum(tc_table_sizes) + idx_offsets = [0] + list(np.cumsum(tc_table_sizes[:-1])) + idx_offsets = tf.broadcast_to( + tf.constant(idx_offsets), (batch_size, num_tc_features) + ) + idxs = idx_offsets + features["tc-features"] + + def _create_init_table(): + """Table initialization varies depending on the vocab size.""" + full_tbl = np.zeros( + shape=(total_tbl_size, params["dim_embed"]), dtype=np.float32 + ) + start_idx = 0 + for idx, tbl_size in enumerate(tc_table_sizes): + end_idx = start_idx + tc_table_sizes[idx] + cur_tbl_init = np.random.uniform( + low=-1 / np.sqrt(tbl_size), + high=1 / np.sqrt(tbl_size), + size=(tbl_size, params["dim_embed"]), + ).astype(np.float32) + full_tbl[start_idx:end_idx, :] = cur_tbl_init + start_idx += tc_table_sizes[idx] + return tf.constant(full_tbl) + + tc_embedding_table = tf.get_variable( + name="tc_embedding_table", + dtype=tf.float32, + trainable=True, + # pylint: disable=unnecessary-lambda + initializer=lambda: _create_init_table(), + ) + tc_features = tf.gather(tc_embedding_table, idxs) + tf.logging.info( + "TC features shape: {}".format( + tc_features.get_shape())) + + # Dot feature interaction + # Concat and reshape, instead of stack. Better for performance. 
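For intuition about the dot feature interaction invoked just below, a small NumPy sketch (shapes invented) of the pairwise dot products and the strictly lower-triangular selection that dot_interact() performs:

import numpy as np

batch, n_features, dim_embed = 2, 4, 3                 # hypothetical sizes
x = np.random.rand(batch, n_features, dim_embed).astype(np.float32)
xactions = np.einsum("bnd,bmd->bnm", x, x)             # [2, 4, 4] pairwise dot products
rows, cols = np.tril_indices(n_features, k=-1)         # strictly lower triangle
interactions = xactions[:, rows, cols]                 # [2, 6] == [batch, n*(n-1)//2]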
+ batch_size = bot_mlp_output.shape[0] + feature_stack = tf.concat([bot_mlp_output] + cat_features, axis=-1) + feature_stack = tf.reshape( + feature_stack, [ + batch_size, -1, params["dim_embed"]]) + if "tc-features" in features: + feature_stack = tf.concat([feature_stack, tc_features], axis=1) + tf.logging.info( + "Model_FN: concated feature shape: %s", + feature_stack.get_shape()) + dot_interact_output = dot_interact( + concat_features=feature_stack, params=params) + top_mlp_input = tf.concat([bot_mlp_output, dot_interact_output], axis=1) + tf.logging.info( + "Model_FN: Top MLP input (full features) shape: %s", top_mlp_input.get_shape( + ) + ) + + # Capture original MLP fan-in for proper kernel initialization. + num_fts = len(cat_features) + 1 + orig_top_mlp_dim = (num_fts * (num_fts - 1)) / 2 + params["dim_embed"] + tf.logging.info( + "Model_FN: Original feature len: {}".format(orig_top_mlp_dim)) + + # Top MLP + # NOTE: For the top MLP, the last layer is a sigmoid. The loss function should + # therefore take [0,1] probability values as inputs, instead of logits. + mlp_dims_top = params["mlp_top"] + num_layers_top = len(mlp_dims_top) + sigmoid_layer_top = num_layers_top - 1 + for layer_idx in range(num_layers_top): + fan_in = orig_top_mlp_dim if layer_idx == 0 else mlp_dims_top[layer_idx - 1] + fan_out = mlp_dims_top[layer_idx] + tf.logging.info( + " layer {}: fan_in={} fan_out={}".format( + layer_idx, fan_in, fan_out) + ) + top_mlp_input = tf.layers.dense( + top_mlp_input, + mlp_dims_top[layer_idx], + activation="sigmoid" if layer_idx == sigmoid_layer_top else "relu", + # NOTE: We would usually use GlorotNormal() initializer here. But due to + # the skip_gather optimization, the GlorotNormal would result in a + # mathematical error, as Glorot is a function of the fan-in. + # The fan-in will be larger for skip-gather activations since we also + # pass in the zeros. Therefore we explicitly set the kernel intializer + # to RandomNormal(0, sqrt(2/(fan_in+fan_out)) + kernel_initializer=tf.compat.v2.initializers.RandomNormal( + mean=0.0, stddev=math.sqrt(2.0 / (fan_in + fan_out)) + ), + bias_initializer=tf.compat.v2.initializers.RandomNormal( + mean=0.0, stddev=math.sqrt(1.0 / mlp_dims_top[layer_idx]) + ), + name="top_mlp_layer_%d" % layer_idx, + ) + predictions = top_mlp_input + return predictions, None def create_model_fn(): - """Creates the model_fn to be used by the TPUEstimator.""" + """Creates the model_fn to be used by the TPUEstimator.""" + + def _dlrm_model_fn(features, mode, params): + """Model function definition for DLRM. + + Args: + features: List of feature tensors used in model. + mode: Usage mode of the model function, e.g. train, eval, etc. + params: Hparams for the model. + + Returns: + TPUEstimatorSpec providing the train_op and loss operators. + + Raises: + NotImplementedError for unsupported execution modes. 
+ """ + + preds, host_call_fn = logits_fn(features, params, None) + tf.logging.info( + "Model_FN: Shape of predictions: %s", + preds.get_shape()) + labels = features["clicked"] + tf.logging.info("Model_FN: Shape of labels: %s", labels.get_shape()) + + if mode == tf.estimator.ModeKeys.EVAL: + labels = tf.reshape(labels, [-1]) + preds = tf.reshape(preds, [-1]) + bce_func = tf.keras.losses.BinaryCrossentropy( + from_logits=False, reduction=tf.compat.v2.keras.losses.Reduction.NONE + ) + eval_loss = tf.reduce_mean(bce_func(labels, preds)) + + def metric_fn(labels, predictions): + label_weights = tf.ones_like(labels, dtype=tf.float32) + prediction_labels = tf.round(predictions) + + return { + utils.ACC_KEY: tf.metrics.accuracy( + labels=labels, + predictions=prediction_labels, + weights=label_weights, + ), + utils.AUC_KEY: tf.metrics.auc( + labels=labels, + predictions=predictions, + weights=label_weights, + num_thresholds=1000, + curve="ROC", + ), + } + + eval_metrics = (metric_fn, [labels, preds]) + + tf.logging.info( + "Model_FN EVAL: Metrics have been set up. Now returning..") + + return tf.estimator.tpu.TPUEstimatorSpec( + mode=mode, + loss=eval_loss, + host_call=host_call_fn, + eval_metrics=eval_metrics, + ) + + elif mode == tf.estimator.ModeKeys.TRAIN: + + bce_func = tf.keras.losses.BinaryCrossentropy( + from_logits=False, reduction=tf.compat.v2.keras.losses.Reduction.NONE + ) + loss = tf.reduce_mean(bce_func(labels, preds)) + + global_step = tf.train.get_global_step() + optimizer = tf.train.GradientDescentOptimizer( + learning_rate=params["learning_rate"] + ) + optimizer = tf.tpu.CrossShardOptimizer(optimizer) + train_op = optimizer.minimize(loss, global_step=global_step) + + return tf.estimator.tpu.TPUEstimatorSpec( + mode=mode, + loss=loss, + train_op=train_op, + host_call=host_call_fn, + ) + + else: + raise NotImplementedError( + "Only TRAIN and EVAL modes are supported. Got: %s" % (mode) + ) + + return _dlrm_model_fn + + +class ConditionalOptimizer(tf.train.Optimizer): + """Conditional optimizer.""" + + def __init__(self, params, lr, global_step): + self._params = params + self._global_step = global_step + self._bfloat16_grads_all_reduce = params["bfloat16_grads_all_reduce"] + self._lr = lr + self._opt = tf.train.GradientDescentOptimizer( + learning_rate=utils.lr_fn(self._params, self._global_step) + ) + if params["optimizer"] == "adagrad": + self._opt = tf.train.AdagradOptimizer( + learning_rate=params["learning_rate"], + initial_accumulator_value=params["adagrad_init_accum"], + ) + + def _cast_like(self, x, y): + """Cast x to y's dtype, if necessary.""" + x = tf.convert_to_tensor(x) + y = tf.convert_to_tensor(y) + if x.dtype.base_dtype == y.dtype.base_dtype: + return x + cast_x = tf.cast(x, y.dtype) + if cast_x.device != x.device: + tf.logging.warning( + "Cast for %s may induce copy from '%s' to '%s'", + x.name, + x.device, + cast_x.device, + ) + return cast_x + + # pylint: disable=arguments-differ + def compute_gradients(self, loss, var_list=None, **kwargs): + gradients = self._opt.compute_gradients(loss, var_list, **kwargs) + + def cast_grad(g, v): + if v is not None and g is not None: + g = self._cast_like(g, v) + return (g, v) + + gradients = [cast_grad(g, v) for g, v in gradients] + if self._bfloat16_grads_all_reduce: + gradients = [(tf.cast(g, tf.bfloat16), v) for g, v in gradients] + return gradients + + def apply_gradients(self, grads_and_vars, global_step=None, name=None): + # Cast the gradients back to float32 for weight updates. 
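A toy sketch of the bfloat16 round trip this optimizer performs: compute_gradients casts down to shrink the cross-replica all-reduce, and apply_gradients casts back up before the update. A plain constant stands in for real gradients here:

import tensorflow.compat.v1 as tf

grad = tf.constant([0.125, -2.5], dtype=tf.float32)
grad_bf16 = tf.cast(grad, tf.bfloat16)      # sent through the all-reduce at half the bytes
grad_fp32 = tf.cast(grad_bf16, tf.float32)  # restored before the weight update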
+ if self._bfloat16_grads_all_reduce: + grads_and_vars = [(tf.cast(g, tf.float32), v) + for g, v in grads_and_vars] + return self._opt.apply_gradients( + grads_and_vars, global_step=global_step, name=name + ) - def _dlrm_model_fn(features, mode, params): - """Model function definition for DLRM. + +# TODO(tayo): Clean this up and merge with estimator invocation. +def dlrm_llr_model_fn( + params, + feature_config, + features, + labels, + is_training, + eval_step_num=None, + predictions=None, +): + """Model fn. Args: - features: List of feature tensors used in model. - mode: Usage mode of the model function, e.g. train, eval, etc. - params: Hparams for the model. + params: Params dict for the model. + feature_config: Configuration of features. + features: Features dict for the model. + labels: Labels tensor. Not used for this model. + is_training: Boolean, True if training. + eval_step_num: Int tensor, representing the batch number during eval. + predictions: [num_batches, batch_size, 2] tensor holding all predictions. Returns: - TPUEstimatorSpec providing the train_op and loss operators. - - Raises: - NotImplementedError for unsupported execution modes. + [train_op, predictions] """ - - preds, host_call_fn = logits_fn(features, params, None) - tf.logging.info("Model_FN: Shape of predictions: %s", preds.get_shape()) + assert labels is None, "Labels should be None. Reconfigure." labels = features["clicked"] - tf.logging.info("Model_FN: Shape of labels: %s", labels.get_shape()) - - if mode == tf.estimator.ModeKeys.EVAL: - labels = tf.reshape(labels, [-1]) - preds = tf.reshape(preds, [-1]) - bce_func = tf.keras.losses.BinaryCrossentropy( - from_logits=False, reduction=tf.compat.v2.keras.losses.Reduction.NONE) - eval_loss = tf.reduce_mean(bce_func(labels, preds)) - - def metric_fn(labels, predictions): - label_weights = tf.ones_like(labels, dtype=tf.float32) - prediction_labels = tf.round(predictions) - - return { - utils.ACC_KEY: - tf.metrics.accuracy( - labels=labels, - predictions=prediction_labels, - weights=label_weights), - utils.AUC_KEY: - tf.metrics.auc( - labels=labels, - predictions=predictions, - weights=label_weights, - num_thresholds=1000, - curve="ROC"), - } - - eval_metrics = (metric_fn, [labels, preds]) - - tf.logging.info( - "Model_FN EVAL: Metrics have been set up. 
Now returning..") - - return tf.estimator.tpu.TPUEstimatorSpec( - mode=mode, - loss=eval_loss, - host_call=host_call_fn, - eval_metrics=eval_metrics) - - elif mode == tf.estimator.ModeKeys.TRAIN: - - bce_func = tf.keras.losses.BinaryCrossentropy( - from_logits=False, reduction=tf.compat.v2.keras.losses.Reduction.NONE) - loss = tf.reduce_mean(bce_func(labels, preds)) - - global_step = tf.train.get_global_step() - optimizer = tf.train.GradientDescentOptimizer( - learning_rate=params["learning_rate"]) - optimizer = tf.tpu.CrossShardOptimizer(optimizer) - train_op = optimizer.minimize(loss, global_step=global_step) - - return tf.estimator.tpu.TPUEstimatorSpec( - mode=mode, - loss=loss, - train_op=train_op, - host_call=host_call_fn, - ) - + preds, _ = logits_fn(features, params, feature_config) + global_step = tf.train.get_or_create_global_step() + + if is_training: + bce_func = tf.keras.losses.BinaryCrossentropy( + from_logits=False, reduction=tf.compat.v2.keras.losses.Reduction.NONE + ) + loss = tf.reduce_mean(bce_func(labels, preds)) + learning_rate = utils.lr_fn(params, global_step) + optimizer = ConditionalOptimizer(params, learning_rate, global_step) + optimizer = tf.tpu.CrossShardOptimizer(optimizer) + train_op = contrib_layers.optimize_loss( + name="training", + loss=loss, + global_step=global_step, + learning_rate=learning_rate, + optimizer=optimizer, + colocate_gradients_with_ops=True, + ) + return train_op, None else: - raise NotImplementedError( - "Only TRAIN and EVAL modes are supported. Got: %s" % (mode)) + # TODO(tayo): Consider adding a local key-value sort. + new_preds = tf.concat([preds, tf.cast(labels, tf.float32)], axis=1) - return _dlrm_model_fn + predictions = xla.dynamic_update_slice( + predictions, + tf.expand_dims(new_preds, axis=0), + tf.stack([eval_step_num, tf.constant(0), tf.constant(0)]), + ) - -class ConditionalOptimizer(tf.train.Optimizer): - """Conditional optimizer.""" - - def __init__(self, params, lr, global_step): - self._params = params - self._global_step = global_step - self._bfloat16_grads_all_reduce = params["bfloat16_grads_all_reduce"] - self._lr = lr - self._opt = tf.train.GradientDescentOptimizer( - learning_rate=utils.lr_fn(self._params, self._global_step)) - if params["optimizer"] == "adagrad": - self._opt = tf.train.AdagradOptimizer( - learning_rate=params["learning_rate"], - initial_accumulator_value=params["adagrad_init_accum"]) - - def _cast_like(self, x, y): - """Cast x to y's dtype, if necessary.""" - x = tf.convert_to_tensor(x) - y = tf.convert_to_tensor(y) - if x.dtype.base_dtype == y.dtype.base_dtype: - return x - cast_x = tf.cast(x, y.dtype) - if cast_x.device != x.device: - tf.logging.warning("Cast for %s may induce copy from '%s' to '%s'", - x.name, x.device, cast_x.device) - return cast_x - - # pylint: disable=arguments-differ - def compute_gradients(self, loss, var_list=None, **kwargs): - gradients = self._opt.compute_gradients(loss, var_list, **kwargs) - - def cast_grad(g, v): - if v is not None and g is not None: - g = self._cast_like(g, v) - return (g, v) - - gradients = [cast_grad(g, v) for g, v in gradients] - if self._bfloat16_grads_all_reduce: - gradients = [(tf.cast(g, tf.bfloat16), v) for g, v in gradients] - return gradients - - def apply_gradients(self, grads_and_vars, global_step=None, name=None): - # Cast the gradients back to float32 for weight updates. 
- if self._bfloat16_grads_all_reduce: - grads_and_vars = [(tf.cast(g, tf.float32), v) for g, v in grads_and_vars] - return self._opt.apply_gradients( - grads_and_vars, global_step=global_step, name=name) - - -# TODO(tayo): Clean this up and merge with estimator invocation. -def dlrm_llr_model_fn(params, - feature_config, - features, - labels, - is_training, - eval_step_num=None, - predictions=None): - """Model fn. - - Args: - params: Params dict for the model. - feature_config: Configuration of features. - features: Features dict for the model. - labels: Labels tensor. Not used for this model. - is_training: Boolean, True if training. - eval_step_num: Int tensor, representing the batch number during eval. - predictions: [num_batches, batch_size, 2] tensor holding all predictions. - - Returns: - [train_op, predictions] - """ - assert labels is None, "Labels should be None. Reconfigure." - labels = features["clicked"] - preds, _ = logits_fn(features, params, feature_config) - global_step = tf.train.get_or_create_global_step() - - if is_training: - bce_func = tf.keras.losses.BinaryCrossentropy( - from_logits=False, reduction=tf.compat.v2.keras.losses.Reduction.NONE) - loss = tf.reduce_mean(bce_func(labels, preds)) - learning_rate = utils.lr_fn(params, global_step) - optimizer = ConditionalOptimizer(params, learning_rate, global_step) - optimizer = tf.tpu.CrossShardOptimizer(optimizer) - train_op = contrib_layers.optimize_loss( - name="training", - loss=loss, - global_step=global_step, - learning_rate=learning_rate, - optimizer=optimizer, - colocate_gradients_with_ops=True) - return train_op, None - else: - # TODO(tayo): Consider adding a local key-value sort. - new_preds = tf.concat([preds, tf.cast(labels, tf.float32)], axis=1) - - predictions = xla.dynamic_update_slice( - predictions, tf.expand_dims(new_preds, axis=0), - tf.stack([eval_step_num, tf.constant(0), - tf.constant(0)])) - - return None, dict(results=predictions) + return None, dict(results=predictions) diff --git a/retired_benchmarks/recommendation/dlrm/tf/dlrm_embedding_runner.py b/retired_benchmarks/recommendation/dlrm/tf/dlrm_embedding_runner.py index d3d5535fe..0e11d2442 100644 --- a/retired_benchmarks/recommendation/dlrm/tf/dlrm_embedding_runner.py +++ b/retired_benchmarks/recommendation/dlrm/tf/dlrm_embedding_runner.py @@ -53,148 +53,165 @@ class DLRMEmbeddingRunner(tr.TrainAndEvalRunner): - """Augmentation of the TrainAndEvalRunner with embedding support. - - This class uses the TPUEmbedding library as an API for organizing embedding - metadata for: - 1. Configuration - 2. Building infeed ops - 3. Buidling embedding table load/restore ops - 4. Building an embedding update/train op. - - Attributes: - sparse_features_key: String key used for all embedding features. This class - requires all embedding features to be keyed under this string. This is - necessary for the runner to properly strip away only those features and - enqueue them properly. - embedding: TPUEmbedding object representing the table and feature config. - This attribute is required. - **kwargs: See TrainAndEvalRunner. 
- """ - - def __init__(self, sparse_features_key, embedding, **kwargs): - """Initializes the runner.""" - super(DLRMEmbeddingRunner, self).__init__(**kwargs, do_initialize=False) - self.embedding = embedding - self.embedding_config = embedding.config_proto - self.features_key = sparse_features_key - self.embed_vars_and_ops = None - self.retrieve_ops = None - self.enqueue_datas_list = {True: [], False: []} - self.dummy_variables = None - self.dummy_variables_init = None - self.num_outfeeds = 1 - - with self.graph.as_default(): - self.embed_vars_and_ops = self.embedding.create_variables_and_ops() - self.dummy_variables, self.dummy_variables_init = ( - tpu_embedding_gradient.create_dummy_table_variables(self.embedding)) - self.device_topology = tf.Session( - self.master, config=self.config).run( - tpu.initialize_system(embedding_config=self.embedding_config)) - - def eval_step(self, step_num, preds): - """One evaluation step.""" - inp = self.infeed_op[False].generate_dequeue_op() - flatten_structure = tf.nest.flatten(self.feature_structure[False]) - inp = [ - tf.slice(i, [0] * i.shape.ndims, j.shape) - for i, j in zip(inp, flatten_structure) - ] - eval_has_labels = False - if eval_has_labels: - features, labels = tf.nest.pack_sequence_as(self.feature_structure[False], - inp) - else: - features = tf.nest.pack_sequence_as(self.feature_structure[False], inp) - labels = None - self.maybe_add_embedding_features(features, False) - _, self.predict_output = self.model_fn(features, labels, False, step_num, - preds) - for _ in self.predict_output: - self.dequeue_ops.append([]) - with tf.device(tr.device_for_tpu_core(self.get_host(0))): - return step_num + 1, self.predict_output["results"] - - @tpu_function.on_device_training_loop - def eval_loop(self): - per_replica_eval_batch_size = self.eval_batch_size // self.num_replicas - tf.get_variable_scope().reuse_variables() - predictions = tf.zeros([self.eval_steps, per_replica_eval_batch_size, 2]) - _, predictions = training_loop.repeat( - int(self.eval_steps), self.eval_step, [tf.constant(0), predictions]) - with tf.control_dependencies([tpu_ops.outfeed_enqueue_tuple([predictions]) - ]): - return tf.no_op() - - def maybe_capture_embedding_inputs(self, inputs, is_training): - """Removes sparse inputs and stores them. - - Args: - inputs: Dict of input features, resulting from iterator.get_next(). - is_training: Boolean that is True for training and False otherwise. + """Augmentation of the TrainAndEvalRunner with embedding support. + + This class uses the TPUEmbedding library as an API for organizing embedding + metadata for: + 1. Configuration + 2. Building infeed ops + 3. Buidling embedding table load/restore ops + 4. Building an embedding update/train op. + + Attributes: + sparse_features_key: String key used for all embedding features. This class + requires all embedding features to be keyed under this string. This is + necessary for the runner to properly strip away only those features and + enqueue them properly. + embedding: TPUEmbedding object representing the table and feature config. + This attribute is required. + **kwargs: See TrainAndEvalRunner. """ - sparse_inputs = inputs.pop(self.features_key) - sparse_inputs = tf.split(sparse_inputs, sparse_inputs.shape[-1], axis=1) - sparse_inputs = [tf.squeeze(x) for x in sparse_inputs] - self.enqueue_datas_list[is_training].append(sparse_inputs) - def maybe_add_embedding_enqueue_ops_int(self, is_training, enqueue_ops): - """Adds embedding input enqueue ops. 
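As a rough sketch of what maybe_capture_embedding_inputs() above does to the packed categorical tensor before enqueueing (batch size and table count invented):

import tensorflow.compat.v1 as tf

sparse_inputs = tf.constant([[1, 7, 3], [4, 2, 9]], dtype=tf.int32)  # [batch, num_tables]
per_table = tf.split(sparse_inputs, 3, axis=1)                        # 3 tensors of [batch, 1]
per_table = [tf.squeeze(x, axis=1) for x in per_table]                # 3 tensors of [batch]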
- - Args: - is_training: Boolean that is True for training and False otherwise. - enqueue_ops: List of existing enqueue ops used by the runner. - """ - sparse_enqueue_ops = [] - for i, batch_data in enumerate(self.enqueue_datas_list[is_training]): - enqueue_op = tpu_ops.enqueue_tpu_embedding_integer_batch( - batch=batch_data, - device_ordinal=i % FLAGS.replicas_per_host, - mode_override="inference" if not is_training else None) - sparse_enqueue_ops.append(enqueue_op) - enqueue_ops.extend(sparse_enqueue_ops) - # Clear sparse input list for this host. - del self.enqueue_datas_list[is_training][:] - - def maybe_get_embedding_train_op(self): - """Builds embedding table update op. - - Returns: - An op which computes gradients and updates tables. - """ - with tf.device(tr.device_for_tpu_core(self.get_host(0))): - sparse_grads = ( - tpu_embedding_gradient.get_gradients_through_dummy_table_variables( - self.embedding)) - embedding_train_op = self.embedding.generate_send_gradients_op( - sparse_grads, tf.compat.v1.train.get_global_step()) - return embedding_train_op - - def maybe_add_embedding_features(self, features, hook_dummy_variables): - """Adds sparse activations to feature list. - - Args: - features: Dict of features, used by the model_fn. - hook_dummy_variables: Boolean telling whether to back-propagate through - embedding activations. Set to true when training and desiring backprop - to extend to the embedding tables. - """ - if hook_dummy_variables: - with tf.device(tr.device_for_tpu_core(self.get_host(0))): - embedding_activations = self.embedding.get_activations() - new_embedding_activations = tpu_embedding_gradient.hook_dummy_table_variables_to_activations( - self.embedding, embedding_activations, self.dummy_variables) - features.update(new_embedding_activations) - else: - embedding_activations = self.embedding.get_activations() - features.update(embedding_activations) - - def maybe_load_embedding_vars(self): - """Loads tables into accelerator device memory.""" - self.sess.run(self.dummy_variables_init) - self.sess.run(self.embed_vars_and_ops.load_ops()) - self.retrieve_ops = self.embed_vars_and_ops.retrieve_ops() - - def retrieve_embedding_vars(self): - self.sess.run(self.retrieve_ops) + def __init__(self, sparse_features_key, embedding, **kwargs): + """Initializes the runner.""" + super( + DLRMEmbeddingRunner, + self).__init__( + **kwargs, + do_initialize=False) + self.embedding = embedding + self.embedding_config = embedding.config_proto + self.features_key = sparse_features_key + self.embed_vars_and_ops = None + self.retrieve_ops = None + self.enqueue_datas_list = {True: [], False: []} + self.dummy_variables = None + self.dummy_variables_init = None + self.num_outfeeds = 1 + + with self.graph.as_default(): + self.embed_vars_and_ops = self.embedding.create_variables_and_ops() + self.dummy_variables, self.dummy_variables_init = ( + tpu_embedding_gradient.create_dummy_table_variables( + self.embedding) + ) + self.device_topology = tf.Session(self.master, config=self.config).run( + tpu.initialize_system(embedding_config=self.embedding_config) + ) + + def eval_step(self, step_num, preds): + """One evaluation step.""" + inp = self.infeed_op[False].generate_dequeue_op() + flatten_structure = tf.nest.flatten(self.feature_structure[False]) + inp = [ + tf.slice(i, [0] * i.shape.ndims, j.shape) + for i, j in zip(inp, flatten_structure) + ] + eval_has_labels = False + if eval_has_labels: + features, labels = tf.nest.pack_sequence_as( + self.feature_structure[False], inp + ) + else: + features = 
tf.nest.pack_sequence_as( + self.feature_structure[False], inp) + labels = None + self.maybe_add_embedding_features(features, False) + _, self.predict_output = self.model_fn( + features, labels, False, step_num, preds) + for _ in self.predict_output: + self.dequeue_ops.append([]) + with tf.device(tr.device_for_tpu_core(self.get_host(0))): + return step_num + 1, self.predict_output["results"] + + @tpu_function.on_device_training_loop + def eval_loop(self): + per_replica_eval_batch_size = self.eval_batch_size // self.num_replicas + tf.get_variable_scope().reuse_variables() + predictions = tf.zeros( + [self.eval_steps, per_replica_eval_batch_size, 2]) + _, predictions = training_loop.repeat( + int(self.eval_steps), self.eval_step, [tf.constant(0), predictions] + ) + with tf.control_dependencies([tpu_ops.outfeed_enqueue_tuple([predictions])]): + return tf.no_op() + + def maybe_capture_embedding_inputs(self, inputs, is_training): + """Removes sparse inputs and stores them. + + Args: + inputs: Dict of input features, resulting from iterator.get_next(). + is_training: Boolean that is True for training and False otherwise. + """ + sparse_inputs = inputs.pop(self.features_key) + sparse_inputs = tf.split( + sparse_inputs, sparse_inputs.shape[-1], axis=1) + sparse_inputs = [tf.squeeze(x) for x in sparse_inputs] + self.enqueue_datas_list[is_training].append(sparse_inputs) + + def maybe_add_embedding_enqueue_ops_int(self, is_training, enqueue_ops): + """Adds embedding input enqueue ops. + + Args: + is_training: Boolean that is True for training and False otherwise. + enqueue_ops: List of existing enqueue ops used by the runner. + """ + sparse_enqueue_ops = [] + for i, batch_data in enumerate(self.enqueue_datas_list[is_training]): + enqueue_op = tpu_ops.enqueue_tpu_embedding_integer_batch( + batch=batch_data, + device_ordinal=i % FLAGS.replicas_per_host, + mode_override="inference" if not is_training else None, + ) + sparse_enqueue_ops.append(enqueue_op) + enqueue_ops.extend(sparse_enqueue_ops) + # Clear sparse input list for this host. + del self.enqueue_datas_list[is_training][:] + + def maybe_get_embedding_train_op(self): + """Builds embedding table update op. + + Returns: + An op which computes gradients and updates tables. + """ + with tf.device(tr.device_for_tpu_core(self.get_host(0))): + sparse_grads = ( + tpu_embedding_gradient.get_gradients_through_dummy_table_variables( + self.embedding + ) + ) + embedding_train_op = self.embedding.generate_send_gradients_op( + sparse_grads, tf.compat.v1.train.get_global_step() + ) + return embedding_train_op + + def maybe_add_embedding_features(self, features, hook_dummy_variables): + """Adds sparse activations to feature list. + + Args: + features: Dict of features, used by the model_fn. + hook_dummy_variables: Boolean telling whether to back-propagate through + embedding activations. Set to true when training and desiring backprop + to extend to the embedding tables. 
+ """ + if hook_dummy_variables: + with tf.device(tr.device_for_tpu_core(self.get_host(0))): + embedding_activations = self.embedding.get_activations() + new_embedding_activations = ( + tpu_embedding_gradient.hook_dummy_table_variables_to_activations( + self.embedding, embedding_activations, self.dummy_variables + ) + ) + features.update(new_embedding_activations) + else: + embedding_activations = self.embedding.get_activations() + features.update(embedding_activations) + + def maybe_load_embedding_vars(self): + """Loads tables into accelerator device memory.""" + self.sess.run(self.dummy_variables_init) + self.sess.run(self.embed_vars_and_ops.load_ops()) + self.retrieve_ops = self.embed_vars_and_ops.retrieve_ops() + + def retrieve_embedding_vars(self): + self.sess.run(self.retrieve_ops) diff --git a/retired_benchmarks/recommendation/dlrm/tf/dlrm_main.py b/retired_benchmarks/recommendation/dlrm/tf/dlrm_main.py index 8f0381e23..dc70feceb 100644 --- a/retired_benchmarks/recommendation/dlrm/tf/dlrm_main.py +++ b/retired_benchmarks/recommendation/dlrm/tf/dlrm_main.py @@ -1,6 +1,6 @@ """Training script for DLRM model.""" -import functools +import functools from absl import app as absl_app @@ -26,213 +26,234 @@ def get_input_fns(params, feature_config): - """Returns input function objects.""" - - def _csv_record_path(mode): - return "{data_dir}/terabyte_processed_golden_shuffled/{mode}/{mode}*".format( - data_dir=FLAGS.data_dir, mode=mode) - - def _batched_tfrecord_path(mode): - """Pre-generated data: 16 files per task for batch_size 64K.""" - replica_batch_size = params["batch_size"] // params["num_shards"] - file_cnt = (128 * 1024) // replica_batch_size - if replica_batch_size > 1024: - # Minimum number of files. - file_cnt = 64 - elif replica_batch_size == 432: - # Special case for batch 54K hparams. - file_cnt = 256 - return "{data_dir}/terabyte_tfrecords_batched{bs}/{mode}{file_cnt}shards/{mode}*".format( - data_dir=FLAGS.data_dir, - bs=replica_batch_size, - mode=mode, - file_cnt=file_cnt) - - if FLAGS.use_batched_tfrecords: - train_input_fn = dataloader.CriteoTFRecordReader( - file_path=_batched_tfrecord_path("train"), - feature_config=feature_config, - is_training=True, - use_cached_data=params["use_cached_data"], - use_synthetic_data=params["use_synthetic_data"], - params=params) - eval_input_fn = dataloader.CriteoTFRecordReader( - file_path=_batched_tfrecord_path("eval"), - feature_config=feature_config, - is_training=False, - use_cached_data=params["use_cached_data"], - use_synthetic_data=params["use_synthetic_data"], - params=params) - else: - train_input_fn = dataloader.CriteoTsvReader( - file_path=_csv_record_path("train"), - feature_config=feature_config, - is_training=True, - parallelism=16, - use_cached_data=params["use_cached_data"], - use_synthetic_data=params["use_synthetic_data"]) - - eval_input_fn = dataloader.CriteoTsvReader( - file_path=_csv_record_path("eval"), - feature_config=feature_config, - is_training=False, - parallelism=16, - use_cached_data=params["use_cached_data"], - use_synthetic_data=params["use_synthetic_data"]) - - return train_input_fn, eval_input_fn - - -def run_model(params, - eval_init_fn=None, - eval_finish_fn=None, - run_finish_fn=None): - """Run the DLRM model, using a pre-defined configuration. - - Args: - params: HPTuner object that provides new params for the trial. - eval_init_fn: Lambda to run at start of eval. None means use the default. - eval_finish_fn: Lambda for end of eval. None means use the default. 
- run_finish_fn: Lambda for end of execution. None means use the default. - - Returns: - A list of tuples, each entry describing the eval metric for one eval. Each - tuple entry is (global_step, metric_value). - """ - mlp_log.mlperf_print(key="cache_clear", value=True) - mlp_log.mlperf_print(key="init_start", value=None) - mlp_log.mlperf_print("global_batch_size", params["batch_size"]) - mlp_log.mlperf_print("train_samples", _NUM_TRAIN_EXAMPLES) - mlp_log.mlperf_print("eval_samples", _NUM_EVAL_EXAMPLES) - adjusted_lr = params["learning_rate"] * (params["batch_size"] / 2048.0) - mlp_log.mlperf_print("opt_base_learning_rate", adjusted_lr) - mlp_log.mlperf_print("sgd_opt_base_learning_rate", adjusted_lr) - mlp_log.mlperf_print("sgd_opt_learning_rate_decay_poly_power", 2) - mlp_log.mlperf_print("sgd_opt_learning_rate_decay_steps", - params["decay_steps"]) - mlp_log.mlperf_print("lr_decay_start_steps", params["decay_start_step"]) - mlp_log.mlperf_print("opt_learning_rate_warmup_steps", - params["lr_warmup_steps"]) - - # Used for vizier. List of tuples. Each entry is (global_step, auc_metric). - eval_metrics = [(0, 0.0)] - - feature_config = fc.FeatureConfig(params) - (feature_to_config_dict, - table_to_config_dict) = feature_config.get_feature_tbl_config() - opt_params = { - "sgd": - tpu_embedding.StochasticGradientDescentParameters( - learning_rate=params["learning_rate"]), - "adagrad": - tpu_embedding.AdagradParameters( - learning_rate=params["learning_rate"], - initial_accumulator=params["adagrad_init_accum"]) - } - embedding = tpu_embedding.TPUEmbedding( - table_to_config_dict, - feature_to_config_dict, - params["batch_size"], - mode=tpu_embedding.TRAINING, - optimization_parameters=opt_params[params["optimizer"]], - partition_strategy="mod", - pipeline_execution_with_tensor_core=FLAGS.pipeline_execution, - master=FLAGS.master) - - runner = dlrm_embedding_runner.DLRMEmbeddingRunner( - iterations_per_loop=FLAGS.steps_between_evals, - train_steps=FLAGS.train_steps, - eval_steps=FLAGS.eval_steps, - num_replicas=FLAGS.num_tpu_shards, - sparse_features_key="cat-features", - embedding=embedding) - - train_input_fn, eval_input_fn = get_input_fns(params, feature_config) - - runner.initialize( - train_input_fn, - eval_input_fn, - functools.partial(dlrm.dlrm_llr_model_fn, params, feature_config), - params["batch_size"], - params["eval_batch_size"], - train_has_labels=False, - eval_has_labels=False) - - mlp_log.mlperf_print("init_stop", None) - mlp_log.mlperf_print("run_start", None) - - def _default_eval_init_fn(cur_step): - """Logging statements executed before every eval.""" - eval_num = 0 - if FLAGS.steps_between_evals: - eval_num = cur_step // FLAGS.steps_between_evals - tf.logging.info("== Block {}. 
Step {} of {}".format(eval_num + 1, cur_step, - FLAGS.train_steps)) - mlp_log.mlperf_print( - "block_start", - None, - metadata={ - "first_epoch_num": eval_num + 1, - "epoch_count": 1 - }) - mlp_log.mlperf_print( - "eval_start", None, metadata={"epoch_num": eval_num + 1}) - - def _default_eval_finish_fn(cur_step, eval_output, summary_writer=None): - eval_num = 0 - if FLAGS.steps_between_evals: - eval_num = cur_step // FLAGS.steps_between_evals - mlp_log.mlperf_print( - "eval_stop", None, metadata={"epoch_num": eval_num + 1}) + """Returns input function objects.""" + + def _csv_record_path(mode): + return "{data_dir}/terabyte_processed_golden_shuffled/{mode}/{mode}*".format( + data_dir=FLAGS.data_dir, mode=mode + ) + + def _batched_tfrecord_path(mode): + """Pre-generated data: 16 files per task for batch_size 64K.""" + replica_batch_size = params["batch_size"] // params["num_shards"] + file_cnt = (128 * 1024) // replica_batch_size + if replica_batch_size > 1024: + # Minimum number of files. + file_cnt = 64 + elif replica_batch_size == 432: + # Special case for batch 54K hparams. + file_cnt = 256 + return "{data_dir}/terabyte_tfrecords_batched{bs}/{mode}{file_cnt}shards/{mode}*".format( + data_dir=FLAGS.data_dir, bs=replica_batch_size, mode=mode, file_cnt=file_cnt + ) + + if FLAGS.use_batched_tfrecords: + train_input_fn = dataloader.CriteoTFRecordReader( + file_path=_batched_tfrecord_path("train"), + feature_config=feature_config, + is_training=True, + use_cached_data=params["use_cached_data"], + use_synthetic_data=params["use_synthetic_data"], + params=params, + ) + eval_input_fn = dataloader.CriteoTFRecordReader( + file_path=_batched_tfrecord_path("eval"), + feature_config=feature_config, + is_training=False, + use_cached_data=params["use_cached_data"], + use_synthetic_data=params["use_synthetic_data"], + params=params, + ) + else: + train_input_fn = dataloader.CriteoTsvReader( + file_path=_csv_record_path("train"), + feature_config=feature_config, + is_training=True, + parallelism=16, + use_cached_data=params["use_cached_data"], + use_synthetic_data=params["use_synthetic_data"], + ) + + eval_input_fn = dataloader.CriteoTsvReader( + file_path=_csv_record_path("eval"), + feature_config=feature_config, + is_training=False, + parallelism=16, + use_cached_data=params["use_cached_data"], + use_synthetic_data=params["use_synthetic_data"], + ) + + return train_input_fn, eval_input_fn + + +def run_model(params, eval_init_fn=None, + eval_finish_fn=None, run_finish_fn=None): + """Run the DLRM model, using a pre-defined configuration. + + Args: + params: HPTuner object that provides new params for the trial. + eval_init_fn: Lambda to run at start of eval. None means use the default. + eval_finish_fn: Lambda for end of eval. None means use the default. + run_finish_fn: Lambda for end of execution. None means use the default. + + Returns: + A list of tuples, each entry describing the eval metric for one eval. Each + tuple entry is (global_step, metric_value). 
+ """ + mlp_log.mlperf_print(key="cache_clear", value=True) + mlp_log.mlperf_print(key="init_start", value=None) + mlp_log.mlperf_print("global_batch_size", params["batch_size"]) + mlp_log.mlperf_print("train_samples", _NUM_TRAIN_EXAMPLES) + mlp_log.mlperf_print("eval_samples", _NUM_EVAL_EXAMPLES) + adjusted_lr = params["learning_rate"] * (params["batch_size"] / 2048.0) + mlp_log.mlperf_print("opt_base_learning_rate", adjusted_lr) + mlp_log.mlperf_print("sgd_opt_base_learning_rate", adjusted_lr) + mlp_log.mlperf_print("sgd_opt_learning_rate_decay_poly_power", 2) mlp_log.mlperf_print( - "block_stop", None, metadata={"first_epoch_num": eval_num + 1}) - tf.logging.info( - "== Eval finished (step {}). Computing metric..".format(cur_step)) - - results_np = np.array(eval_output["results"]) - results_np = np.reshape(results_np, (-1, 2)) - predictions_np = results_np[:, 0].astype(np.float32) - targets_np = results_np[:, 1].astype(np.int32) - # TODO: Fix roc clif in cloud. - # roc_obj = roc_metrics.RocMetrics(predictions_np, targets_np) - # roc_auc = roc_obj.ComputeRocAuc() - roc_auc = 0.0 - tf.logging.info("== Eval shape: {}. AUC = {:.4f}".format( - predictions_np.shape, roc_auc)) - success = roc_auc >= _ACCURACY_THRESH + "sgd_opt_learning_rate_decay_steps", + params["decay_steps"]) + mlp_log.mlperf_print("lr_decay_start_steps", params["decay_start_step"]) mlp_log.mlperf_print( - "eval_accuracy", roc_auc, metadata={"epoch_num": eval_num + 1}) - if success: - mlp_log.mlperf_print("run_stop", None, metadata={"status": "success"}) - if summary_writer: - summary_writer.add_summary( - utils.create_scalar_summary("auc", roc_auc), - global_step=cur_step + FLAGS.steps_between_evals) - eval_metrics.append((cur_step + FLAGS.steps_between_evals, roc_auc)) - return success - - def _default_run_finish_fn(success_status): - if not success_status: - mlp_log.mlperf_print("run_stop", None, metadata={"status": "failure"}) - tf.logging.info("Retrieving embedding vars and writing stats.") - runner.retrieve_embedding_vars() - - runner.train_and_eval( - eval_init_fn=eval_init_fn or _default_eval_init_fn, - eval_finish_fn=eval_finish_fn or _default_eval_finish_fn, - run_finish_fn=run_finish_fn or _default_run_finish_fn) - - return eval_metrics + "opt_learning_rate_warmup_steps", + params["lr_warmup_steps"]) + + # Used for vizier. List of tuples. Each entry is (global_step, auc_metric). 
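A minimal standalone sketch of the linear learning-rate scaling recorded by the mlperf_print calls above (opt_base_learning_rate is the base rate scaled by global batch size against a reference batch of 2048). The function name and numeric values here are illustrative assumptions, not taken from the training script.

def scale_learning_rate(base_lr, global_batch_size, reference_batch=2048.0):
    # Linear scaling rule: the base rate grows proportionally with the
    # global batch size relative to the reference batch of 2048.
    return base_lr * (global_batch_size / reference_batch)

# With hypothetical values: a base rate of 0.01 at global batch 65536 scales
# to 0.32, the value run_model() would then log as opt_base_learning_rate.
assert abs(scale_learning_rate(0.01, 65536) - 0.32) < 1e-9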
+ eval_metrics = [(0, 0.0)] + + feature_config = fc.FeatureConfig(params) + (feature_to_config_dict, table_to_config_dict) = ( + feature_config.get_feature_tbl_config() + ) + opt_params = { + "sgd": tpu_embedding.StochasticGradientDescentParameters( + learning_rate=params["learning_rate"] + ), + "adagrad": tpu_embedding.AdagradParameters( + learning_rate=params["learning_rate"], + initial_accumulator=params["adagrad_init_accum"], + ), + } + embedding = tpu_embedding.TPUEmbedding( + table_to_config_dict, + feature_to_config_dict, + params["batch_size"], + mode=tpu_embedding.TRAINING, + optimization_parameters=opt_params[params["optimizer"]], + partition_strategy="mod", + pipeline_execution_with_tensor_core=FLAGS.pipeline_execution, + master=FLAGS.master, + ) + + runner = dlrm_embedding_runner.DLRMEmbeddingRunner( + iterations_per_loop=FLAGS.steps_between_evals, + train_steps=FLAGS.train_steps, + eval_steps=FLAGS.eval_steps, + num_replicas=FLAGS.num_tpu_shards, + sparse_features_key="cat-features", + embedding=embedding, + ) + + train_input_fn, eval_input_fn = get_input_fns(params, feature_config) + + runner.initialize( + train_input_fn, + eval_input_fn, + functools.partial(dlrm.dlrm_llr_model_fn, params, feature_config), + params["batch_size"], + params["eval_batch_size"], + train_has_labels=False, + eval_has_labels=False, + ) + + mlp_log.mlperf_print("init_stop", None) + mlp_log.mlperf_print("run_start", None) + + def _default_eval_init_fn(cur_step): + """Logging statements executed before every eval.""" + eval_num = 0 + if FLAGS.steps_between_evals: + eval_num = cur_step // FLAGS.steps_between_evals + tf.logging.info( + "== Block {}. Step {} of {}".format( + eval_num + 1, cur_step, FLAGS.train_steps + ) + ) + mlp_log.mlperf_print( + "block_start", + None, + metadata={"first_epoch_num": eval_num + 1, "epoch_count": 1}, + ) + mlp_log.mlperf_print( + "eval_start", None, metadata={ + "epoch_num": eval_num + 1}) + + def _default_eval_finish_fn(cur_step, eval_output, summary_writer=None): + eval_num = 0 + if FLAGS.steps_between_evals: + eval_num = cur_step // FLAGS.steps_between_evals + mlp_log.mlperf_print( + "eval_stop", None, metadata={ + "epoch_num": eval_num + 1}) + mlp_log.mlperf_print( + "block_stop", None, metadata={"first_epoch_num": eval_num + 1} + ) + tf.logging.info( + "== Eval finished (step {}). Computing metric..".format(cur_step) + ) + + results_np = np.array(eval_output["results"]) + results_np = np.reshape(results_np, (-1, 2)) + predictions_np = results_np[:, 0].astype(np.float32) + targets_np = results_np[:, 1].astype(np.int32) + # TODO: Fix roc clif in cloud. + # roc_obj = roc_metrics.RocMetrics(predictions_np, targets_np) + # roc_auc = roc_obj.ComputeRocAuc() + roc_auc = 0.0 + tf.logging.info( + "== Eval shape: {}. 
AUC = {:.4f}".format( + predictions_np.shape, roc_auc) + ) + success = roc_auc >= _ACCURACY_THRESH + mlp_log.mlperf_print( + "eval_accuracy", roc_auc, metadata={"epoch_num": eval_num + 1} + ) + if success: + mlp_log.mlperf_print( + "run_stop", None, metadata={ + "status": "success"}) + if summary_writer: + summary_writer.add_summary( + utils.create_scalar_summary("auc", roc_auc), + global_step=cur_step + FLAGS.steps_between_evals, + ) + eval_metrics.append((cur_step + FLAGS.steps_between_evals, roc_auc)) + return success + + def _default_run_finish_fn(success_status): + if not success_status: + mlp_log.mlperf_print( + "run_stop", None, metadata={ + "status": "failure"}) + tf.logging.info("Retrieving embedding vars and writing stats.") + runner.retrieve_embedding_vars() + + runner.train_and_eval( + eval_init_fn=eval_init_fn or _default_eval_init_fn, + eval_finish_fn=eval_finish_fn or _default_eval_finish_fn, + run_finish_fn=run_finish_fn or _default_run_finish_fn, + ) + + return eval_metrics def main(argv): - del argv - params = common.get_params() - run_model(params) + del argv + params = common.get_params() + run_model(params) if __name__ == "__main__": - tf.logging.set_verbosity(tf.logging.INFO) - tf.disable_v2_behavior() - common.define_dlrm_flags() - absl_app.run(main) + tf.logging.set_verbosity(tf.logging.INFO) + tf.disable_v2_behavior() + common.define_dlrm_flags() + absl_app.run(main) diff --git a/retired_benchmarks/recommendation/dlrm/tf/feature_config.py b/retired_benchmarks/recommendation/dlrm/tf/feature_config.py index 12318ed7d..ffdd57c5f 100644 --- a/retired_benchmarks/recommendation/dlrm/tf/feature_config.py +++ b/retired_benchmarks/recommendation/dlrm/tf/feature_config.py @@ -24,138 +24,153 @@ LABEL_FEATURE = "clicked" INT_FEATURES = ["int-feature-%d" % x for x in range(1, 14)] -CATEGORICAL_FEATURES = [ - "categorical-feature-%d" % x for x in range(14, 40) -] +CATEGORICAL_FEATURES = ["categorical-feature-%d" % x for x in range(14, 40)] FAKE_DATA_VOCAB_SIZE = 1000 FAKE_DATA_INT_MAX = 10.0 class FeatureConfig(object): - """Configure dense and sparse features. - - The embedding tables can be placed either in EmbeddingCore or . - In practice, large embedding tables are placed in EmbeddingCore while small - ones are in . FeatureConfig sorts the embedding table by its size - and stores the metadata such that input (dataloader.py) and model (dlrm.py) - see a consistent view of the placement of all embedding tables. - - """ - - def __init__(self, params): - """Init method.""" - # Hyperparameters - self._batch_size = params["batch_size"] - self._learning_rate = params["learning_rate"] - self._lr_warmup_steps = params["lr_warmup_steps"] - self._optimizer = params["optimizer"] - self._decay_steps = params["decay_steps"] - self._decay_start_steps = params["decay_start_step"] - - self._vse = params["vocab_sizes"] - self._de = params["dim_embed"] - self._num_dense_features = params["num_dense_features"] - self._num_sparse_features = len(self._vse) - - self._num_tables_in_ec = params["num_tables_in_ec"] - # Get the sorted table size in a descending order. - self._table_size_sorted = sorted(self._vse)[::-1] - # The following is equivalent to np.argsort in a descending order. 
- self._table_idx_ordered_by_size = sorted( - range(len(self._vse)), key=self._vse.__getitem__)[::-1] - tf.logging.info(self._table_size_sorted) - tf.logging.info(self._table_idx_ordered_by_size) - - def get_sorted_table_size(self): - return self._table_size_sorted - - def get_table_idx_orderd_by_size(self): - return self._table_idx_ordered_by_size - - def get_num_tables_in_ec(self): - return self._num_tables_in_ec - - def get_num_dense_features(self): - return self._num_dense_features - - def get_num_sparse_features(self): - return self._num_sparse_features - - def get_feature_tbl_config(self): - """Creates table configuration data structures. - - For all tables, vocab size and width are given by params. - - Table setup: - tbl0 - categorical-feature-14 - tbl1 - categorical-feature-15 - .. - - Feature setup: - categorical-feature-14 -- tbl0 (first sparse feature) - categorical-feature-15 -- tbl1 (second sparse feature) - - Returns: - A tuple of dicts, one for feature_to_config and one for table_to_config. + """Configure dense and sparse features. + + The embedding tables can be placed either in EmbeddingCore or . + In practice, large embedding tables are placed in EmbeddingCore while small + ones are in . FeatureConfig sorts the embedding table by its size + and stores the metadata such that input (dataloader.py) and model (dlrm.py) + see a consistent view of the placement of all embedding tables. + """ - def lr_fn(global_step): - """Learning function for the embeddings. Linear warmup and poly decay.""" - decay_exp = 2 - scal = self._batch_size / 2048 - adj_lr = self._learning_rate * scal - if self._lr_warmup_steps == 0: - return adj_lr - if self._optimizer == "adagrad": - return self._learning_rate - warmup_lr = tf.cast( - global_step, dtype=tf.float32) / self._lr_warmup_steps * adj_lr - - global_step = tf.cast(global_step, tf.float32) - decay_steps = tf.cast(self._decay_steps, tf.float32) - decay_start_step = tf.cast(self._decay_start_steps, tf.float32) - steps_since_decay_start = global_step - decay_start_step - already_decayed_steps = tf.minimum(steps_since_decay_start, decay_steps) - decay_lr = adj_lr * ( - (decay_steps - already_decayed_steps) / decay_steps)**decay_exp - decay_lr = tf.maximum(0.0000001, decay_lr) - - lr = tf.where( - global_step < self._lr_warmup_steps, warmup_lr, - tf.where( - tf.logical_and(decay_steps > 0, global_step > decay_start_step), - decay_lr, adj_lr)) - - return lr - - table_to_config_dict = {} - for i in range(self._num_tables_in_ec): - vocab_size = self._table_size_sorted[i] - table_to_config_dict["tbl%02d" % i] = tpu_embedding.TableConfig( - vocabulary_size=vocab_size, - dimension=self._de, - # NOTE: Default weight initializer uses trunc_normal, - # stddv=1/dimension. This is changed to match the mlperf - # reference model. - initializer=tf.random_uniform_initializer( - minval=-1 / math.sqrt(vocab_size), - maxval=1 / math.sqrt(vocab_size)), - combiner=None, - learning_rate_fn=lr_fn, - - # TODO(tayo): Using the utils lr_fn leads to problems with embedding - # table size. The embedding table stops being able to fit. - # learning_rate_fn=functools.partial(utils.lr_fn, params) - ) - - # Use an offset to allow the categorical feature numbering to be subsequent - # to the integer feature numbering. 
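A small sketch of the offset mapping described in the comment just above: categorical feature numbering continues after the label and the dense features, so the first sparse feature lands on tbl00, matching the table setup in the get_feature_tbl_config docstring. It assumes the Criteo layout used elsewhere in this file (13 dense features, categorical features 14 through 39); the real code sizes the dict with num_tables_in_ec rather than a fixed 26.

num_dense_features = 13          # int-feature-1 .. int-feature-13
offset = 1 + num_dense_features  # the label plus the dense features come first
feature_to_table = {
    "categorical-feature-%02d" % i: "tbl%02d" % (i - offset)
    for i in range(offset, offset + 26)
}
assert feature_to_table["categorical-feature-14"] == "tbl00"
assert feature_to_table["categorical-feature-39"] == "tbl25"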
- offset = 1 + self._num_dense_features - feature_to_config_dict = {} - feature_to_config_dict.update([ - ("categorical-feature-%02d" % i, - tpu_embedding.FeatureConfig(table_id="tbl%02d" % (i - offset))) - for i in range(offset, offset + self._num_tables_in_ec) - ]) - - return feature_to_config_dict, table_to_config_dict + def __init__(self, params): + """Init method.""" + # Hyperparameters + self._batch_size = params["batch_size"] + self._learning_rate = params["learning_rate"] + self._lr_warmup_steps = params["lr_warmup_steps"] + self._optimizer = params["optimizer"] + self._decay_steps = params["decay_steps"] + self._decay_start_steps = params["decay_start_step"] + + self._vse = params["vocab_sizes"] + self._de = params["dim_embed"] + self._num_dense_features = params["num_dense_features"] + self._num_sparse_features = len(self._vse) + + self._num_tables_in_ec = params["num_tables_in_ec"] + # Get the sorted table size in a descending order. + self._table_size_sorted = sorted(self._vse)[::-1] + # The following is equivalent to np.argsort in a descending order. + self._table_idx_ordered_by_size = sorted( + range(len(self._vse)), key=self._vse.__getitem__ + )[::-1] + tf.logging.info(self._table_size_sorted) + tf.logging.info(self._table_idx_ordered_by_size) + + def get_sorted_table_size(self): + return self._table_size_sorted + + def get_table_idx_orderd_by_size(self): + return self._table_idx_ordered_by_size + + def get_num_tables_in_ec(self): + return self._num_tables_in_ec + + def get_num_dense_features(self): + return self._num_dense_features + + def get_num_sparse_features(self): + return self._num_sparse_features + + def get_feature_tbl_config(self): + """Creates table configuration data structures. + + For all tables, vocab size and width are given by params. + + Table setup: + tbl0 - categorical-feature-14 + tbl1 - categorical-feature-15 + .. + + Feature setup: + categorical-feature-14 -- tbl0 (first sparse feature) + categorical-feature-15 -- tbl1 (second sparse feature) + + Returns: + A tuple of dicts, one for feature_to_config and one for table_to_config. + """ + + def lr_fn(global_step): + """Learning function for the embeddings. Linear warmup and poly decay.""" + decay_exp = 2 + scal = self._batch_size / 2048 + adj_lr = self._learning_rate * scal + if self._lr_warmup_steps == 0: + return adj_lr + if self._optimizer == "adagrad": + return self._learning_rate + warmup_lr = ( + tf.cast(global_step, dtype=tf.float32) / + self._lr_warmup_steps * adj_lr + ) + + global_step = tf.cast(global_step, tf.float32) + decay_steps = tf.cast(self._decay_steps, tf.float32) + decay_start_step = tf.cast(self._decay_start_steps, tf.float32) + steps_since_decay_start = global_step - decay_start_step + already_decayed_steps = tf.minimum( + steps_since_decay_start, decay_steps) + decay_lr = ( + adj_lr + * ((decay_steps - already_decayed_steps) / decay_steps) ** decay_exp + ) + decay_lr = tf.maximum(0.0000001, decay_lr) + + lr = tf.where( + global_step < self._lr_warmup_steps, + warmup_lr, + tf.where( + tf.logical_and( + decay_steps > 0, + global_step > decay_start_step), + decay_lr, + adj_lr, + ), + ) + + return lr + + table_to_config_dict = {} + for i in range(self._num_tables_in_ec): + vocab_size = self._table_size_sorted[i] + table_to_config_dict["tbl%02d" % i] = tpu_embedding.TableConfig( + vocabulary_size=vocab_size, + dimension=self._de, + # NOTE: Default weight initializer uses trunc_normal, + # stddv=1/dimension. This is changed to match the mlperf + # reference model. 
+ initializer=tf.random_uniform_initializer( + minval=-1 / math.sqrt(vocab_size), maxval=1 / math.sqrt(vocab_size) + ), + combiner=None, + learning_rate_fn=lr_fn, + # TODO(tayo): Using the utils lr_fn leads to problems with embedding + # table size. The embedding table stops being able to fit. + # learning_rate_fn=functools.partial(utils.lr_fn, params) + ) + + # Use an offset to allow the categorical feature numbering to be subsequent + # to the integer feature numbering. + offset = 1 + self._num_dense_features + feature_to_config_dict = {} + feature_to_config_dict.update( + [ + ( + "categorical-feature-%02d" % i, + tpu_embedding.FeatureConfig( + table_id="tbl%02d" % + (i - offset)), + ) + for i in range(offset, offset + self._num_tables_in_ec) + ] + ) + + return feature_to_config_dict, table_to_config_dict diff --git a/retired_benchmarks/recommendation/dlrm/tf/mlp_log.py b/retired_benchmarks/recommendation/dlrm/tf/mlp_log.py index 48d9f12d7..523f4be28 100644 --- a/retired_benchmarks/recommendation/dlrm/tf/mlp_log.py +++ b/retired_benchmarks/recommendation/dlrm/tf/mlp_log.py @@ -28,11 +28,11 @@ import sys import time -PATTERN = re.compile('[a-zA-Z0-9]+') +PATTERN = re.compile("[a-zA-Z0-9]+") -LOG_FILE = os.getenv('COMPLIANCE_FILE') +LOG_FILE = os.getenv("COMPLIANCE_FILE") # create logger with 'spam_application' -LOGGER = logging.getLogger('mlperf_compliance') +LOGGER = logging.getLogger("mlperf_compliance") LOGGER.setLevel(logging.DEBUG) _STREAM_HANDLER = logging.StreamHandler(stream=sys.stdout) @@ -40,21 +40,22 @@ LOGGER.addHandler(_STREAM_HANDLER) if LOG_FILE: - _FILE_HANDLER = logging.FileHandler(LOG_FILE) - _FILE_HANDLER.setLevel(logging.DEBUG) - LOGGER.addHandler(_FILE_HANDLER) + _FILE_HANDLER = logging.FileHandler(LOG_FILE) + _FILE_HANDLER.setLevel(logging.DEBUG) + LOGGER.addHandler(_FILE_HANDLER) else: - _STREAM_HANDLER.setLevel(logging.DEBUG) + _STREAM_HANDLER.setLevel(logging.DEBUG) def get_caller(stack_index=2, root_dir=None): - caller = inspect.getframeinfo(inspect.stack()[stack_index][0]) + caller = inspect.getframeinfo(inspect.stack()[stack_index][0]) + + # Trim the filenames for readability. + filename = caller.filename + if root_dir is not None: + filename = re.sub("^" + root_dir + "/", "", filename) + return (filename, caller.lineno) - # Trim the filenames for readability. 
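For context on the mlp_log.py hunks, a hypothetical sketch of the ":::MLL" line that mlperf_format() assembles. The real LOG_TEMPLATE is defined earlier in mlp_log.py and is not shown in this diff, so the exact template string below is an assumption; it is only meant to mirror the four fields the real call fills in.

import json
import time

def sketch_mlperf_format(key, value, metadata):
    # Assumed template; the real LOG_TEMPLATE is filled with the same four
    # fields (timestamp, key, JSON value, JSON metadata).
    template = ':::MLL {:.2f} {}: {{"value": {}, "metadata": {}}}'
    return template.format(time.time(), key, json.dumps(value), json.dumps(metadata))

# Produces a line shaped like the example in the comment below, e.g.
# :::MLL 1556733699.71 run_start: {"value": null, "metadata": {"lineno": 77, ...}}
print(sketch_mlperf_format("run_start", None, {"lineno": 77, "file": "main.py"}))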
- filename = caller.filename - if root_dir is not None: - filename = re.sub('^' + root_dir + '/', '', filename) - return (filename, caller.lineno) # :::MLL 1556733699.71 run_start: {"value": null, # "metadata": {"lineno": 77, "file": main.py}} @@ -62,21 +63,29 @@ def get_caller(stack_index=2, root_dir=None): def mlperf_format(key, value, stack_offset=0, metadata=None): - """Format a message for MLPerf.""" - if metadata is None: - metadata = {} + """Format a message for MLPerf.""" + if metadata is None: + metadata = {} - if 'lineno' not in metadata: - filename, lineno = get_caller(2 + stack_offset, root_dir=None) - metadata['lineno'] = lineno - metadata['file'] = filename + if "lineno" not in metadata: + filename, lineno = get_caller(2 + stack_offset, root_dir=None) + metadata["lineno"] = lineno + metadata["file"] = filename - now = time.time() - msg = LOG_TEMPLATE.format(now, key, json.dumps(value), json.dumps(metadata)) - return msg + now = time.time() + msg = LOG_TEMPLATE.format( + now, + key, + json.dumps(value), + json.dumps(metadata)) + return msg def mlperf_print(key, value, stack_offset=0, metadata=None): - LOGGER.info( - mlperf_format( - key, value, stack_offset=stack_offset + 1, metadata=metadata)) + LOGGER.info( + mlperf_format( + key, + value, + stack_offset=stack_offset + 1, + metadata=metadata) + ) diff --git a/retired_benchmarks/recommendation/dlrm/tf/train_and_eval_runner.py b/retired_benchmarks/recommendation/dlrm/tf/train_and_eval_runner.py index ca0651b8c..b2554cfda 100644 --- a/retired_benchmarks/recommendation/dlrm/tf/train_and_eval_runner.py +++ b/retired_benchmarks/recommendation/dlrm/tf/train_and_eval_runner.py @@ -42,46 +42,55 @@ "master", default=None, help="The Cloud TPU to use for training. This should be either the name " - "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.") + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.", +) flags.DEFINE_string( "gcp_project", default=None, help="Project name for the Cloud TPU-enabled project. If not specified, we " - "will attempt to automatically detect the GCE project from metadata.") + "will attempt to automatically detect the GCE project from metadata.", +) flags.DEFINE_string( "tpu_zone", default=None, help="GCE zone where the Cloud TPU is located in. 
If not specified, we " - "will attempt to automatically detect the GCE project from metadata.") + "will attempt to automatically detect the GCE project from metadata.", +) flags.DEFINE_integer( - "replicas_per_host", default=8, help=("Number of replicas per host.")) + "replicas_per_host", default=8, help=("Number of replicas per host.") +) flags.DEFINE_bool("enable_summary", default=False, help=("Enable summary")) flags.DEFINE_string( "model_dir", default=None, - help=("The directory where the model and summaries are stored.")) + help=("The directory where the model and summaries are stored."), +) flags.DEFINE_bool("save_checkpoint", default=False, help=("Save checkpoint")) flags.DEFINE_bool( - "restore_checkpoint", default=False, help=("Restore checkpoint")) + "restore_checkpoint", + default=False, + help=("Restore checkpoint")) flags.DEFINE_integer( - "sleep_after_init", default=60, help=("Sleep for N seconds after init.")) + "sleep_after_init", default=60, help=("Sleep for N seconds after init.") +) flags.DEFINE_bool( - "enable_mlir_bridge", default=False, help=("Enable TF/XLA MLIR bridge")) + "enable_mlir_bridge", default=False, help=("Enable TF/XLA MLIR bridge") +) flags.DEFINE_bool( "enable_profiling", default=False, - help=("Get xprof traces at" - "the start and middle of the train loops")) + help=("Get xprof traces at" "the start and middle of the train loops"), +) _NUM_CORES_TO_COMPUTATION_SHAPE = { 1: [1, 1, 1, 1], @@ -93,495 +102,586 @@ def _profiler_callback(comment, session_id): - if session_id is None: - tf.logging.info("Profiling failed for %s", comment) - else: - tf.logging.info("Profiling succeeded for %s. Overview page url:", comment) + if session_id is None: + tf.logging.info("Profiling failed for %s", comment) + else: + tf.logging.info( + "Profiling succeeded for %s. Overview page url:", + comment) # Decorator function for tpu computation func that was passed to tpu.rewrite() # if there are embedded train and eval loops in this func, trace tools will # generate step markers for each iteration. def on_device_train_and_eval_loops(func): - # Value for this attribute is from xla.DebugOptions.StepMarkerLocation. - setattr(func, "step_marker_location", "STEP_MARK_AT_SECOND_LEVEL_WHILE_LOOP") - return func + # Value for this attribute is from xla.DebugOptions.StepMarkerLocation. + setattr( + func, + "step_marker_location", + "STEP_MARK_AT_SECOND_LEVEL_WHILE_LOOP") + return func def device_for_tpu_core(host_name, core=0): - return host_name + "/device:TPU_REPLICATED_CORE:%d" % core + return host_name + "/device:TPU_REPLICATED_CORE:%d" % core def device_for_host(host_name): - return host_name + "/device:CPU:0" + return host_name + "/device:CPU:0" class TrainAndEvalRunner(object): - """Remove init overheads in TPU Estimator via direct session.run calls.""" - - def __init__(self, - iterations_per_loop, - train_steps, - eval_steps, - num_replicas, - eval_dataset_repeats=True, - do_initialize=True): - self.feature_structure = {} - self.infeed_op = {} - self.num_replicas = num_replicas - self.eval_dataset_repeats = eval_dataset_repeats - # Set number of input graphs to number of hosts up to a maximum of 32. 
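A short sketch of the input-pipeline fan-out that the comment just above describes: the runner builds one input graph per TPU host, capped at 32. The replica counts below are hypothetical; the real values come from num_replicas and the --replicas_per_host flag.

def num_input_graphs(num_replicas, replicas_per_host, max_graphs=32):
    # One enqueue graph per TPU host, never more than max_graphs.
    return min(max_graphs, num_replicas // replicas_per_host)

assert num_input_graphs(num_replicas=128, replicas_per_host=8) == 16
assert num_input_graphs(num_replicas=512, replicas_per_host=8) == 32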
- self.num_input_graphs = min(32, - self.num_replicas // FLAGS.replicas_per_host) - # Following data has separated copies for training and eval, thus - # represented as a map from is_train(boolean) to actual data - self.dataset_initializer = {True: [], False: []} - self.input_graph = {True: [], False: []} - self.input_sess = {True: [], False: []} - self.enqueue_ops = {True: [], False: []} - for _ in range(self.num_input_graphs): - self.input_graph[True].append(tf.Graph()) - self.input_graph[False].append(tf.Graph()) - self.dataset_initializer[True].append([]) - self.dataset_initializer[False].append([]) - self.enqueue_ops[True].append([]) - self.enqueue_ops[False].append([]) - self.input_sess[True].append([]) - self.input_sess[False].append([]) - # dequeue_ops is only for eval - self.dequeue_ops = [] - self.iterations_per_loop = iterations_per_loop - self.sess = None - self.output_sess = None - self.train_eval_thread = None - self.graph = tf.Graph() - if iterations_per_loop != 0 and train_steps % iterations_per_loop != 0: - train_steps = iterations_per_loop * int( - math.ceil(train_steps / iterations_per_loop)) - self.train_steps = train_steps - if iterations_per_loop == 0: - self.max_train_iterations = 1 - else: - self.max_train_iterations = train_steps // iterations_per_loop - self.eval_steps = int(eval_steps) - self.train_batch_size = 0 - self.eval_batch_size = 0 - self.eval_has_labels = 0 - self.model_fn = None - self.num_outfeeds = self.eval_steps - self.config = tf.ConfigProto( - operation_timeout_in_ms=600 * 60 * 1000, - allow_soft_placement=True, - graph_options=tf.GraphOptions( - rewrite_options=rewriter_config_pb2.RewriterConfig( - disable_meta_optimizer=True)), - isolate_session_state=True) - - if FLAGS.enable_mlir_bridge: - self.config.experimental.enable_mlir_bridge = True - - tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( - FLAGS.master, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project, - job_name="tpu_worker") - self.master = tpu_cluster_resolver.get_master() - self.job_name = tpu_cluster_resolver.get_job_name() or "tpu_worker" - self.embedding_config = None - self.device_topology = None - if do_initialize: - self.device_topology = tf.Session( - self.master, config=self.config).run( - tpu.initialize_system()) - - def maybe_capture_embedding_inputs(self, inputs, is_training): - pass - - def maybe_add_embedding_enqueue_ops_int(self, is_training, enqueue_ops): - pass - - def maybe_get_embedding_train_op(self): - return tf.no_op() - - def maybe_add_embedding_features(self, features, hook_dummy_variables): - pass - - def maybe_load_embedding_vars(self): - pass - - def get_host(self, host_id): - if self.master in ("", "local"): - return "/replica:0/task:0" - return "/job:%s/task:%d" % (self.job_name, host_id) - - def build_enqueue_ops(self, input_fn, is_training, input_partition_dims, - params): - """Build enqueue operations for the input pipeline in a given host. 
- - Args: - input_fn: dataset input graph generation function - is_training: boolean indicates if it is training - input_partition_dims: list of integers to partition input - params: hyper parameters - """ - - def _tpu_ordinal_fn(shard_index_in_host): - replica_id = self.device_assignment.lookup_replicas( - host_id, logical_core=0)[shard_index_in_host] - return self.device_assignment.tpu_ordinal( - replica=replica_id, logical_core=0) - - host_id = params["dataset_index"] - gindex = host_id % self.num_input_graphs - with self.input_graph[is_training][gindex].as_default(): - with tf.device(device_for_host(self.get_host(host_id))): - dataset = input_fn(params) - if not is_training and self.eval_dataset_repeats: - dataset = dataset.cache().repeat() - iterator = dataset.make_initializable_iterator() - self.dataset_initializer[is_training][gindex].append( - iterator.initializer) - - def enqueue_ops_fn(idx): - """Generate the infeed enqueue ops graph.""" - - per_host_sharded_inputs = [] - control_deps = [] - for _ in range(FLAGS.replicas_per_host): - with tf.control_dependencies(control_deps): - self.feature_structure[is_training] = iterator.get_next() - self.maybe_capture_embedding_inputs( - self.feature_structure[is_training], is_training) - flattened_inputs = tf.nest.flatten( - self.feature_structure[is_training]) - control_deps.extend(flattened_inputs) - if input_partition_dims: - padded_inputs = [] - for inp in flattened_inputs: - if inp.shape.ndims < len(input_partition_dims): - padded_inputs.append(inp) - continue - paddings = [] - for i, j in enumerate(input_partition_dims): - r = inp.shape.as_list()[i] % j - if r > 0: - paddings.append([0, j - r]) - else: - paddings.append([0, 0]) - for i in range(inp.shape.ndims - len(input_partition_dims)): - paddings.append([0, 0]) - padded_inputs.append(tf.pad(inp, paddings)) - per_host_sharded_inputs.append(padded_inputs) - else: - per_host_sharded_inputs.append(flattened_inputs) - - if input_partition_dims: - flattened_input_dims = [] - for i in per_host_sharded_inputs[0]: - if i.shape.ndims == len(input_partition_dims): - flattened_input_dims.append(input_partition_dims) - elif i.shape.ndims > len(input_partition_dims): - flattened_input_dims.append( - input_partition_dims + [1] * - (i.shape.ndims - len(input_partition_dims))) - else: - flattened_input_dims.append([1] * i.shape.ndims) - # pylint: disable=protected-access - self.infeed_op[is_training] = tpu_feed._PartitionedInfeedQueue( - number_of_tuple_elements=len(per_host_sharded_inputs[0]), - host_id=host_id, - input_partition_dims=flattened_input_dims, - device_assignment=self.device_assignment) - with tf.control_dependencies( - self.infeed_op[is_training].generate_enqueue_ops( - per_host_sharded_inputs)): - return idx + 1 - else: - self.infeed_op[is_training] = tpu_feed.InfeedQueue( - number_of_tuple_elements=len(per_host_sharded_inputs[0])) - per_host_enqueue_ops = ( - self.infeed_op[is_training].generate_enqueue_ops( - per_host_sharded_inputs, - tpu_ordinal_function=_tpu_ordinal_fn)) - - self.maybe_add_embedding_enqueue_ops_int( - is_training, per_host_enqueue_ops) - with tf.control_dependencies(per_host_enqueue_ops): - return idx + 1 - - iterations = self.iterations_per_loop if is_training else self.eval_steps - self.enqueue_ops[is_training][gindex].append( - tf.while_loop( - lambda i: tf.less(i, iterations), - enqueue_ops_fn, [tf.constant(0)], - parallel_iterations=1)) - - def launch_profiler(self): - """Launches a profiling session to collect a trace from worker-0.""" - if result == 
profiler_client.PROFILED_IN_NEW_THREAD: - tf.logging.info("A profiler session launched in a new thread.") - else: - tf.logging.info("profiler.collect() failed.") - - def eval_step(self): - """One evaluation step.""" - inp = self.infeed_op[False].generate_dequeue_op() - flatten_structure = tf.nest.flatten(self.feature_structure[False]) - inp = [ - tf.slice(i, [0] * i.shape.ndims, j.shape) - for i, j in zip(inp, flatten_structure) - ] - if self.eval_has_labels: - features, labels = tf.nest.pack_sequence_as( - self.feature_structure[False], inp) - else: - features = tf.nest.pack_sequence_as(self.feature_structure[False], inp) - labels = None - self.maybe_add_embedding_features(features, False) - _, self.predict_output = self.model_fn(features, labels, False) - for _ in self.predict_output: - self.dequeue_ops.append([]) - with tf.device(device_for_tpu_core(self.get_host(0))): - return [ - tpu_ops.outfeed_enqueue_tuple(tf.nest.flatten(self.predict_output)) - ] - - @tpu_function.on_device_training_loop - def eval_loop(self): - tf.get_variable_scope().reuse_variables() - return training_loop.repeat(int(self.eval_steps), self.eval_step) - - def initialize(self, - train_input_fn, - eval_input_fn, - model_fn, - train_batch_size, - eval_batch_size, - input_partition_dims=None, - init_fn=None, - train_has_labels=True, - eval_has_labels=True, - params=None, - num_partitions=None): - """Build graphs for the TPU device and the input pipelines.""" - num_cores_per_replica = 1 - num_cores_per_replica = functools.reduce( - operator.mul, input_partition_dims - ) if input_partition_dims else num_partitions if num_partitions else 1 - - self.device_assignment = device_assignment.device_assignment( - topology=self.device_topology, - computation_shape=_NUM_CORES_TO_COMPUTATION_SHAPE[ - num_cores_per_replica], - num_replicas=self.num_replicas) - self.train_batch_size = train_batch_size - self.eval_batch_size = eval_batch_size - self.eval_has_labels = eval_has_labels - self.model_fn = model_fn - - if params is None: - params = {} - params["dataset_num_shards"] = self.num_replicas // FLAGS.replicas_per_host - per_replica_train_batch_size = train_batch_size // self.num_replicas - per_replica_eval_batch_size = eval_batch_size // self.num_replicas - for i in range(self.num_replicas // FLAGS.replicas_per_host): - params["dataset_index"] = i - params["batch_size"] = per_replica_train_batch_size - self.build_enqueue_ops(train_input_fn, True, input_partition_dims, params) - if self.eval_steps > 0: - params["batch_size"] = per_replica_eval_batch_size - self.build_enqueue_ops(eval_input_fn, False, input_partition_dims, - params) - - def train_step(_): - """One train step.""" - inp = self.infeed_op[True].generate_dequeue_op() - flatten_structure = tf.nest.flatten(self.feature_structure[True]) - inp = [ - tf.slice(i, [0] * i.shape.ndims, j.shape) - for i, j in zip(inp, flatten_structure) - ] - if train_has_labels: - features, labels = tf.nest.pack_sequence_as( - self.feature_structure[True], inp) - else: - features = tf.nest.pack_sequence_as(self.feature_structure[True], inp) - labels = None - self.maybe_add_embedding_features(features, True) - train_op, _ = model_fn(features, labels, True) - embedding_train_op = self.maybe_get_embedding_train_op() - with tf.device(device_for_tpu_core(self.get_host(0))): - with tf.control_dependencies([train_op, embedding_train_op]): - return tf.constant(0) + """Remove init overheads in TPU Estimator via direct session.run calls.""" + + def __init__( + self, + iterations_per_loop, + 
train_steps, + eval_steps, + num_replicas, + eval_dataset_repeats=True, + do_initialize=True, + ): + self.feature_structure = {} + self.infeed_op = {} + self.num_replicas = num_replicas + self.eval_dataset_repeats = eval_dataset_repeats + # Set number of input graphs to number of hosts up to a maximum of 32. + self.num_input_graphs = min( + 32, self.num_replicas // FLAGS.replicas_per_host) + # Following data has separated copies for training and eval, thus + # represented as a map from is_train(boolean) to actual data + self.dataset_initializer = {True: [], False: []} + self.input_graph = {True: [], False: []} + self.input_sess = {True: [], False: []} + self.enqueue_ops = {True: [], False: []} + for _ in range(self.num_input_graphs): + self.input_graph[True].append(tf.Graph()) + self.input_graph[False].append(tf.Graph()) + self.dataset_initializer[True].append([]) + self.dataset_initializer[False].append([]) + self.enqueue_ops[True].append([]) + self.enqueue_ops[False].append([]) + self.input_sess[True].append([]) + self.input_sess[False].append([]) + # dequeue_ops is only for eval + self.dequeue_ops = [] + self.iterations_per_loop = iterations_per_loop + self.sess = None + self.output_sess = None + self.train_eval_thread = None + self.graph = tf.Graph() + if iterations_per_loop != 0 and train_steps % iterations_per_loop != 0: + train_steps = iterations_per_loop * int( + math.ceil(train_steps / iterations_per_loop) + ) + self.train_steps = train_steps + if iterations_per_loop == 0: + self.max_train_iterations = 1 + else: + self.max_train_iterations = train_steps // iterations_per_loop + self.eval_steps = int(eval_steps) + self.train_batch_size = 0 + self.eval_batch_size = 0 + self.eval_has_labels = 0 + self.model_fn = None + self.num_outfeeds = self.eval_steps + self.config = tf.ConfigProto( + operation_timeout_in_ms=600 * 60 * 1000, + allow_soft_placement=True, + graph_options=tf.GraphOptions( + rewrite_options=rewriter_config_pb2.RewriterConfig( + disable_meta_optimizer=True + ) + ), + isolate_session_state=True, + ) + + if FLAGS.enable_mlir_bridge: + self.config.experimental.enable_mlir_bridge = True + + tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( + FLAGS.master, + zone=FLAGS.tpu_zone, + project=FLAGS.gcp_project, + job_name="tpu_worker", + ) + self.master = tpu_cluster_resolver.get_master() + self.job_name = tpu_cluster_resolver.get_job_name() or "tpu_worker" + self.embedding_config = None + self.device_topology = None + if do_initialize: + self.device_topology = tf.Session(self.master, config=self.config).run( + tpu.initialize_system() + ) + + def maybe_capture_embedding_inputs(self, inputs, is_training): + pass + + def maybe_add_embedding_enqueue_ops_int(self, is_training, enqueue_ops): + pass + + def maybe_get_embedding_train_op(self): + return tf.no_op() + + def maybe_add_embedding_features(self, features, hook_dummy_variables): + pass + + def maybe_load_embedding_vars(self): + pass + + def get_host(self, host_id): + if self.master in ("", "local"): + return "/replica:0/task:0" + return "/job:%s/task:%d" % (self.job_name, host_id) + + def build_enqueue_ops(self, input_fn, is_training, + input_partition_dims, params): + """Build enqueue operations for the input pipeline in a given host. 
+ + Args: + input_fn: dataset input graph generation function + is_training: boolean indicates if it is training + input_partition_dims: list of integers to partition input + params: hyper parameters + """ + + def _tpu_ordinal_fn(shard_index_in_host): + replica_id = self.device_assignment.lookup_replicas( + host_id, logical_core=0 + )[shard_index_in_host] + return self.device_assignment.tpu_ordinal( + replica=replica_id, logical_core=0 + ) + + host_id = params["dataset_index"] + gindex = host_id % self.num_input_graphs + with self.input_graph[is_training][gindex].as_default(): + with tf.device(device_for_host(self.get_host(host_id))): + dataset = input_fn(params) + if not is_training and self.eval_dataset_repeats: + dataset = dataset.cache().repeat() + iterator = dataset.make_initializable_iterator() + self.dataset_initializer[is_training][gindex].append( + iterator.initializer + ) + + def enqueue_ops_fn(idx): + """Generate the infeed enqueue ops graph.""" + + per_host_sharded_inputs = [] + control_deps = [] + for _ in range(FLAGS.replicas_per_host): + with tf.control_dependencies(control_deps): + self.feature_structure[is_training] = iterator.get_next( + ) + self.maybe_capture_embedding_inputs( + self.feature_structure[is_training], is_training + ) + flattened_inputs = tf.nest.flatten( + self.feature_structure[is_training] + ) + control_deps.extend(flattened_inputs) + if input_partition_dims: + padded_inputs = [] + for inp in flattened_inputs: + if inp.shape.ndims < len(input_partition_dims): + padded_inputs.append(inp) + continue + paddings = [] + for i, j in enumerate(input_partition_dims): + r = inp.shape.as_list()[i] % j + if r > 0: + paddings.append([0, j - r]) + else: + paddings.append([0, 0]) + for i in range( + inp.shape.ndims - len(input_partition_dims) + ): + paddings.append([0, 0]) + padded_inputs.append(tf.pad(inp, paddings)) + per_host_sharded_inputs.append(padded_inputs) + else: + per_host_sharded_inputs.append(flattened_inputs) + + if input_partition_dims: + flattened_input_dims = [] + for i in per_host_sharded_inputs[0]: + if i.shape.ndims == len(input_partition_dims): + flattened_input_dims.append( + input_partition_dims) + elif i.shape.ndims > len(input_partition_dims): + flattened_input_dims.append( + input_partition_dims + + [1] * (i.shape.ndims - + len(input_partition_dims)) + ) + else: + flattened_input_dims.append( + [1] * i.shape.ndims) + # pylint: disable=protected-access + self.infeed_op[is_training] = tpu_feed._PartitionedInfeedQueue( + number_of_tuple_elements=len( + per_host_sharded_inputs[0]), + host_id=host_id, + input_partition_dims=flattened_input_dims, + device_assignment=self.device_assignment, + ) + with tf.control_dependencies( + self.infeed_op[is_training].generate_enqueue_ops( + per_host_sharded_inputs + ) + ): + return idx + 1 + else: + self.infeed_op[is_training] = tpu_feed.InfeedQueue( + number_of_tuple_elements=len( + per_host_sharded_inputs[0]) + ) + per_host_enqueue_ops = self.infeed_op[ + is_training + ].generate_enqueue_ops( + per_host_sharded_inputs, + tpu_ordinal_function=_tpu_ordinal_fn, + ) + + self.maybe_add_embedding_enqueue_ops_int( + is_training, per_host_enqueue_ops + ) + with tf.control_dependencies(per_host_enqueue_ops): + return idx + 1 + + iterations = ( + self.iterations_per_loop if is_training else self.eval_steps + ) + self.enqueue_ops[is_training][gindex].append( + tf.while_loop( + lambda i: tf.less(i, iterations), + enqueue_ops_fn, + [tf.constant(0)], + parallel_iterations=1, + ) + ) + + def launch_profiler(self): + 
"""Launches a profiling session to collect a trace from worker-0.""" + if result == profiler_client.PROFILED_IN_NEW_THREAD: + tf.logging.info("A profiler session launched in a new thread.") + else: + tf.logging.info("profiler.collect() failed.") + + def eval_step(self): + """One evaluation step.""" + inp = self.infeed_op[False].generate_dequeue_op() + flatten_structure = tf.nest.flatten(self.feature_structure[False]) + inp = [ + tf.slice(i, [0] * i.shape.ndims, j.shape) + for i, j in zip(inp, flatten_structure) + ] + if self.eval_has_labels: + features, labels = tf.nest.pack_sequence_as( + self.feature_structure[False], inp + ) + else: + features = tf.nest.pack_sequence_as( + self.feature_structure[False], inp) + labels = None + self.maybe_add_embedding_features(features, False) + _, self.predict_output = self.model_fn(features, labels, False) + for _ in self.predict_output: + self.dequeue_ops.append([]) + with tf.device(device_for_tpu_core(self.get_host(0))): + return [tpu_ops.outfeed_enqueue_tuple( + tf.nest.flatten(self.predict_output))] @tpu_function.on_device_training_loop - def train_loop(): - return training_loop.repeat(self.iterations_per_loop, train_step, - tf.constant(0)) - - def train_eval_step(): - with tf.control_dependencies(train_loop()): + def eval_loop(self): + tf.get_variable_scope().reuse_variables() + return training_loop.repeat(int(self.eval_steps), self.eval_step) + + def initialize( + self, + train_input_fn, + eval_input_fn, + model_fn, + train_batch_size, + eval_batch_size, + input_partition_dims=None, + init_fn=None, + train_has_labels=True, + eval_has_labels=True, + params=None, + num_partitions=None, + ): + """Build graphs for the TPU device and the input pipelines.""" + num_cores_per_replica = 1 + num_cores_per_replica = ( + functools.reduce(operator.mul, input_partition_dims) + if input_partition_dims + else num_partitions if num_partitions else 1 + ) + + self.device_assignment = device_assignment.device_assignment( + topology=self.device_topology, + computation_shape=_NUM_CORES_TO_COMPUTATION_SHAPE[num_cores_per_replica], + num_replicas=self.num_replicas, + ) + self.train_batch_size = train_batch_size + self.eval_batch_size = eval_batch_size + self.eval_has_labels = eval_has_labels + self.model_fn = model_fn + + if params is None: + params = {} + params["dataset_num_shards"] = self.num_replicas // FLAGS.replicas_per_host + per_replica_train_batch_size = train_batch_size // self.num_replicas + per_replica_eval_batch_size = eval_batch_size // self.num_replicas + for i in range(self.num_replicas // FLAGS.replicas_per_host): + params["dataset_index"] = i + params["batch_size"] = per_replica_train_batch_size + self.build_enqueue_ops( + train_input_fn, True, input_partition_dims, params) + if self.eval_steps > 0: + params["batch_size"] = per_replica_eval_batch_size + self.build_enqueue_ops( + eval_input_fn, False, input_partition_dims, params + ) + + def train_step(_): + """One train step.""" + inp = self.infeed_op[True].generate_dequeue_op() + flatten_structure = tf.nest.flatten(self.feature_structure[True]) + inp = [ + tf.slice(i, [0] * i.shape.ndims, j.shape) + for i, j in zip(inp, flatten_structure) + ] + if train_has_labels: + features, labels = tf.nest.pack_sequence_as( + self.feature_structure[True], inp + ) + else: + features = tf.nest.pack_sequence_as( + self.feature_structure[True], inp) + labels = None + self.maybe_add_embedding_features(features, True) + train_op, _ = model_fn(features, labels, True) + embedding_train_op = 
self.maybe_get_embedding_train_op() + with tf.device(device_for_tpu_core(self.get_host(0))): + with tf.control_dependencies([train_op, embedding_train_op]): + return tf.constant(0) + + @tpu_function.on_device_training_loop + def train_loop(): + return training_loop.repeat( + self.iterations_per_loop, train_step, tf.constant(0) + ) + + def train_eval_step(): + with tf.control_dependencies(train_loop()): + if self.eval_steps > 0: + return self.eval_loop() + else: + return tf.no_op() + + @on_device_train_and_eval_loops + def train_eval_loop(): + return training_loop.repeat( + self.max_train_iterations, train_eval_step) + + with self.graph.as_default(): + (self.train_eval_op,) = tpu.shard( + train_eval_loop, + inputs=[], + num_shards=self.num_replicas, + outputs_from_all_shards=False, + device_assignment=self.device_assignment, + ) + if FLAGS.model_dir: + tf.io.write_graph(self.graph, FLAGS.model_dir, "graph.pbtxt") + + output_graph = tf.Graph() if self.eval_steps > 0: - return self.eval_loop() + with output_graph.as_default(): + flatten_output = tf.nest.flatten(self.predict_output) + self.dequeue_ops = [[] for _ in flatten_output] + tensor_dtypes = [v.dtype for v in flatten_output] + tensor_shapes = [v.shape for v in flatten_output] + is_padded_index = ( + flatten_output.index(self.predict_output[_IS_PADDED]) + if _IS_PADDED in self.predict_output + else -1 + ) + for i in range(self.num_replicas // FLAGS.replicas_per_host): + with tf.device(device_for_host(self.get_host(i))): + host_dequeue_ops = [[] for _ in flatten_output] + for j in range(FLAGS.replicas_per_host): + replica_id = self.device_assignment.lookup_replicas(i, 0)[ + j] + ordinal = self.device_assignment.tpu_ordinal( + replica=replica_id, logical_core=0 + ) + dequeue_ops = tpu_ops.outfeed_dequeue_tuple( + dtypes=tensor_dtypes, + shapes=tensor_shapes, + device_ordinal=ordinal, + ) + if is_padded_index >= 0: + num_non_pad = tf.shape(dequeue_ops[is_padded_index])[ + 0 + ] - tf.reduce_sum( + tf.cast( + dequeue_ops[is_padded_index], tf.int32) + ) + dequeue_ops = [ + tf.slice( + k, + [0] * k.shape.ndims, + [num_non_pad] + [-1] * + (k.shape.ndims - 1), + ) + for k in dequeue_ops + ] + for k, item in enumerate(dequeue_ops): + host_dequeue_ops[k].append(item) + for k in range(len(self.predict_output)): + self.dequeue_ops[k].append( + tf.concat(host_dequeue_ops[k], axis=0) + ) + + self.sess = tf.Session( + self.master, + graph=self.graph, + config=self.config) + for is_training in [True, False]: + if is_training or self.eval_steps > 0: + for i in range(self.num_input_graphs): + with self.input_graph[is_training][i].as_default(): + self.input_sess[is_training][i] = tf.Session( + self.master, + graph=self.input_graph[is_training][i], + config=self.config, + ) + self.input_sess[is_training][i].run( + self.dataset_initializer[is_training][i] + ) + self.output_sess = tf.Session( + self.master, graph=output_graph, config=self.config + ) + + with self.graph.as_default(): + _ = tf.train.get_or_create_global_step() + if init_fn: + init_fn() + checkpoint_path = ( + tf.train.latest_checkpoint( + FLAGS.model_dir) if FLAGS.model_dir else None + ) + if FLAGS.restore_checkpoint and checkpoint_path: + tf.train.Saver().restore(self.sess, checkpoint_path) + else: + self.sess.run(tf.global_variables_initializer()) + self.sess.run(tf.local_variables_initializer()) + self.maybe_load_embedding_vars() + self.global_step = self.sess.run( + tf.train.get_global_step(self.graph)) + + def train_eval_thread_fn(sess, train_eval_op): + sess.run([train_eval_op]) + + # 
Start the just in time compilation of the model function + self.train_eval_thread = threading.Thread( + target=train_eval_thread_fn, args=(self.sess, self.train_eval_op) + ) + self.train_eval_thread.start() + + # Sleep for JTC to finish + time.sleep(FLAGS.sleep_after_init) + + def train_and_eval( + self, eval_init_fn=None, eval_finish_fn=None, run_finish_fn=None + ): + """Run the Train steps on the TPU device.""" + if FLAGS.enable_summary: + output_dir = os.path.join(FLAGS.model_dir, "eval") + tf.gfile.MakeDirs(output_dir) + summary_writer = tf.summary.FileWriter(output_dir) else: - return tf.no_op() - - @on_device_train_and_eval_loops - def train_eval_loop(): - return training_loop.repeat(self.max_train_iterations, train_eval_step) - - with self.graph.as_default(): - (self.train_eval_op,) = tpu.shard( - train_eval_loop, - inputs=[], - num_shards=self.num_replicas, - outputs_from_all_shards=False, - device_assignment=self.device_assignment) - if FLAGS.model_dir: - tf.io.write_graph(self.graph, FLAGS.model_dir, "graph.pbtxt") - - output_graph = tf.Graph() - if self.eval_steps > 0: - with output_graph.as_default(): - flatten_output = tf.nest.flatten(self.predict_output) - self.dequeue_ops = [[] for _ in flatten_output] - tensor_dtypes = [v.dtype for v in flatten_output] - tensor_shapes = [v.shape for v in flatten_output] - is_padded_index = flatten_output.index( - self.predict_output[_IS_PADDED] - ) if _IS_PADDED in self.predict_output else -1 - for i in range(self.num_replicas // FLAGS.replicas_per_host): - with tf.device(device_for_host(self.get_host(i))): - host_dequeue_ops = [[] for _ in flatten_output] - for j in range(FLAGS.replicas_per_host): - replica_id = self.device_assignment.lookup_replicas(i, 0)[j] - ordinal = self.device_assignment.tpu_ordinal( - replica=replica_id, logical_core=0) - dequeue_ops = tpu_ops.outfeed_dequeue_tuple( - dtypes=tensor_dtypes, - shapes=tensor_shapes, - device_ordinal=ordinal) - if is_padded_index >= 0: - num_non_pad = tf.shape( - dequeue_ops[is_padded_index])[0] - tf.reduce_sum( - tf.cast(dequeue_ops[is_padded_index], tf.int32)) - dequeue_ops = [ - tf.slice(k, [0] * k.shape.ndims, - [num_non_pad] + [-1] * (k.shape.ndims - 1)) - for k in dequeue_ops - ] - for k, item in enumerate(dequeue_ops): - host_dequeue_ops[k].append(item) - for k in range(len(self.predict_output)): - self.dequeue_ops[k].append(tf.concat(host_dequeue_ops[k], axis=0)) - - self.sess = tf.Session(self.master, graph=self.graph, config=self.config) - for is_training in [True, False]: - if is_training or self.eval_steps > 0: + summary_writer = None + + def infeed_thread_fn(thread_index): + # Wait for condition + """Build and infeed session.run calls in a background thread.""" + for _ in range(self.max_train_iterations): + self.input_sess[True][thread_index].run( + [self.enqueue_ops[True][thread_index]] + ) + if self.eval_steps > 0: + if not self.eval_dataset_repeats: + self.input_sess[False][thread_index].run( + self.dataset_initializer[False][thread_index] + ) + self.input_sess[False][thread_index].run( + [self.enqueue_ops[False][thread_index]] + ) + + infeed_threads = [] for i in range(self.num_input_graphs): - with self.input_graph[is_training][i].as_default(): - self.input_sess[is_training][i] = tf.Session( - self.master, - graph=self.input_graph[is_training][i], - config=self.config) - self.input_sess[is_training][i].run( - self.dataset_initializer[is_training][i]) - self.output_sess = tf.Session( - self.master, graph=output_graph, config=self.config) - - with 
self.graph.as_default(): - _ = tf.train.get_or_create_global_step() - if init_fn: - init_fn() - checkpoint_path = tf.train.latest_checkpoint( - FLAGS.model_dir) if FLAGS.model_dir else None - if FLAGS.restore_checkpoint and checkpoint_path: - tf.train.Saver().restore(self.sess, checkpoint_path) - else: - self.sess.run(tf.global_variables_initializer()) - self.sess.run(tf.local_variables_initializer()) - self.maybe_load_embedding_vars() - self.global_step = self.sess.run(tf.train.get_global_step(self.graph)) - - def train_eval_thread_fn(sess, train_eval_op): - sess.run([train_eval_op]) - - # Start the just in time compilation of the model function - self.train_eval_thread = threading.Thread( - target=train_eval_thread_fn, args=(self.sess, self.train_eval_op)) - self.train_eval_thread.start() - - # Sleep for JTC to finish - time.sleep(FLAGS.sleep_after_init) - - def train_and_eval(self, - eval_init_fn=None, - eval_finish_fn=None, - run_finish_fn=None): - """Run the Train steps on the TPU device.""" - if FLAGS.enable_summary: - output_dir = os.path.join(FLAGS.model_dir, "eval") - tf.gfile.MakeDirs(output_dir) - summary_writer = tf.summary.FileWriter(output_dir) - else: - summary_writer = None - - def infeed_thread_fn(thread_index): - # Wait for condition - """Build and infeed session.run calls in a background thread.""" - for _ in range(self.max_train_iterations): - self.input_sess[True][thread_index].run( - [self.enqueue_ops[True][thread_index]]) + thread = threading.Thread(target=infeed_thread_fn, args=([i])) + thread.start() + infeed_threads.append(thread) + + global_step = self.global_step + if self.eval_steps > 0: - if not self.eval_dataset_repeats: - self.input_sess[False][thread_index].run( - self.dataset_initializer[False][thread_index]) - self.input_sess[False][thread_index].run( - [self.enqueue_ops[False][thread_index]]) - - infeed_threads = [] - for i in range(self.num_input_graphs): - thread = threading.Thread(target=infeed_thread_fn, args=([i])) - thread.start() - infeed_threads.append(thread) - - global_step = self.global_step - - if self.eval_steps > 0: - enable_tracing = FLAGS.enable_profiling - if enable_tracing: - self.launch_profiler() - - success = False - step_range = [global_step] if self.iterations_per_loop == 0 else range( - global_step, global_step + self.train_steps, self.iterations_per_loop) - for cur_step in step_range: - if not success and eval_init_fn: - eval_init_fn(cur_step) - eval_output = [[] for _ in self.dequeue_ops] - for _ in range(self.num_outfeeds): - for i, t in enumerate(self.output_sess.run(self.dequeue_ops)): - eval_output[i] += list(t) - eval_output = tf.nest.pack_sequence_as(self.predict_output, eval_output) - if eval_finish_fn and not success and eval_finish_fn( - cur_step, eval_output, summary_writer): - success = True - if enable_tracing and cur_step > self.train_steps // 4: - self.launch_profiler() - enable_tracing = False - - if run_finish_fn: - run_finish_fn(success) - - if FLAGS.save_checkpoint: - with self.graph.as_default(): - self.global_step = self.sess.run(tf.train.get_global_step(self.graph)) - checkpoint_path = FLAGS.model_dir + "/model.ckpt-%d" % self.global_step - tf.train.Saver().save(self.sess, checkpoint_path) - tf.logging.info("Checkpoint saved to %s", checkpoint_path) - - if FLAGS.enable_summary: - summary_writer.close() - - self.train_eval_thread.join() - for i in range(self.num_input_graphs): - infeed_threads[i].join() - self.sess.close() + enable_tracing = FLAGS.enable_profiling + if enable_tracing: + self.launch_profiler() 
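A hedged sketch of the eval schedule that train_and_eval builds a few lines further down: when iterations_per_loop is nonzero, the runner drains eval outfeeds once per training chunk of iterations_per_loop steps, starting from the restored global step. The step counts in the example are made up.

def eval_step_schedule(global_step, train_steps, iterations_per_loop):
    # Steps at which eval outfeeds are collected; a single pass when the
    # runner is configured with iterations_per_loop == 0.
    if iterations_per_loop == 0:
        return [global_step]
    return list(range(global_step, global_step + train_steps, iterations_per_loop))

assert eval_step_schedule(0, 10000, 2500) == [0, 2500, 5000, 7500]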
+ + success = False + step_range = ( + [global_step] + if self.iterations_per_loop == 0 + else range( + global_step, + global_step + self.train_steps, + self.iterations_per_loop, + ) + ) + for cur_step in step_range: + if not success and eval_init_fn: + eval_init_fn(cur_step) + eval_output = [[] for _ in self.dequeue_ops] + for _ in range(self.num_outfeeds): + for i, t in enumerate( + self.output_sess.run(self.dequeue_ops)): + eval_output[i] += list(t) + eval_output = tf.nest.pack_sequence_as( + self.predict_output, eval_output) + if ( + eval_finish_fn + and not success + and eval_finish_fn(cur_step, eval_output, summary_writer) + ): + success = True + if enable_tracing and cur_step > self.train_steps // 4: + self.launch_profiler() + enable_tracing = False + + if run_finish_fn: + run_finish_fn(success) + + if FLAGS.save_checkpoint: + with self.graph.as_default(): + self.global_step = self.sess.run( + tf.train.get_global_step(self.graph)) + checkpoint_path = FLAGS.model_dir + "/model.ckpt-%d" % self.global_step + tf.train.Saver().save(self.sess, checkpoint_path) + tf.logging.info("Checkpoint saved to %s", checkpoint_path) + + if FLAGS.enable_summary: + summary_writer.close() + + self.train_eval_thread.join() + for i in range(self.num_input_graphs): + infeed_threads[i].join() + self.sess.close() diff --git a/retired_benchmarks/recommendation/dlrm/tf/utils.py b/retired_benchmarks/recommendation/dlrm/tf/utils.py index edcf2a79a..e3b3d8283 100644 --- a/retired_benchmarks/recommendation/dlrm/tf/utils.py +++ b/retired_benchmarks/recommendation/dlrm/tf/utils.py @@ -29,58 +29,68 @@ def create_scalar_summary(name, simple_value): - return tf.Summary( - value=[tf.Summary.Value(tag=name, simple_value=simple_value)]) + return tf.Summary(value=[tf.Summary.Value( + tag=name, simple_value=simple_value)]) def train_loop_iters(): - def _ceil(n, d): - return (n + d - 1) // d + def _ceil(n, d): + return (n + d - 1) // d - return _ceil(FLAGS.train_steps, FLAGS.steps_between_evals) + return _ceil(FLAGS.train_steps, FLAGS.steps_between_evals) def lr_fn(params, global_step): - """Calculates adjusted LR based on global step. - - Linear warmup and polynomial decay. - - Args: - params: Params dict for the model. - global_step: Variable representing the current step. - - Returns: - New learning rate tensor (float32). 
- """ - decay_exp = 2 - base_learning_rate = params["learning_rate"] - global_step = tf.cast(global_step, tf.float32) - lr_warmup_steps = tf.constant(params["lr_warmup_steps"], tf.float32) - decay_steps_float = tf.constant(params["decay_steps"], tf.float32) - decay_start_step_float = tf.constant(params["decay_start_step"], tf.float32) - global_batch_size = params["batch_size"] - scaling_factor = global_batch_size / 2048.0 - adjusted_lr = base_learning_rate * scaling_factor - adjusted_lr = tf.constant(adjusted_lr, tf.float32) - if not params["lr_warmup_steps"]: - return adjusted_lr - - change_rate = adjusted_lr / lr_warmup_steps - warmup_lr = adjusted_lr - (lr_warmup_steps - global_step) * change_rate - - steps_since_decay_start_float = global_step - decay_start_step_float - already_decayed_steps = tf.minimum(steps_since_decay_start_float, - decay_steps_float) - decay_lr = adjusted_lr * ((decay_steps_float - already_decayed_steps) / - decay_steps_float)**decay_exp - decay_lr = tf.maximum(decay_lr, tf.constant(0.0000001)) - - is_warmup_step = tf.cast(global_step < lr_warmup_steps, tf.float32) - is_decay_step = tf.cast(global_step > decay_start_step_float, tf.float32) - is_middle_step = tf.cast( - tf.equal(is_warmup_step + is_decay_step, 0.0), tf.float32) - - lr = (is_warmup_step * warmup_lr + is_middle_step * adjusted_lr + - is_decay_step * decay_lr) - return lr + """Calculates adjusted LR based on global step. + + Linear warmup and polynomial decay. + + Args: + params: Params dict for the model. + global_step: Variable representing the current step. + + Returns: + New learning rate tensor (float32). + """ + decay_exp = 2 + base_learning_rate = params["learning_rate"] + global_step = tf.cast(global_step, tf.float32) + lr_warmup_steps = tf.constant(params["lr_warmup_steps"], tf.float32) + decay_steps_float = tf.constant(params["decay_steps"], tf.float32) + decay_start_step_float = tf.constant( + params["decay_start_step"], tf.float32) + global_batch_size = params["batch_size"] + scaling_factor = global_batch_size / 2048.0 + adjusted_lr = base_learning_rate * scaling_factor + adjusted_lr = tf.constant(adjusted_lr, tf.float32) + if not params["lr_warmup_steps"]: + return adjusted_lr + + change_rate = adjusted_lr / lr_warmup_steps + warmup_lr = adjusted_lr - (lr_warmup_steps - global_step) * change_rate + + steps_since_decay_start_float = global_step - decay_start_step_float + already_decayed_steps = tf.minimum( + steps_since_decay_start_float, decay_steps_float) + decay_lr = ( + adjusted_lr + * ((decay_steps_float - already_decayed_steps) / decay_steps_float) ** decay_exp + ) + decay_lr = tf.maximum(decay_lr, tf.constant(0.0000001)) + + is_warmup_step = tf.cast(global_step < lr_warmup_steps, tf.float32) + is_decay_step = tf.cast(global_step > decay_start_step_float, tf.float32) + is_middle_step = tf.cast( + tf.equal( + is_warmup_step + + is_decay_step, + 0.0), + tf.float32) + + lr = ( + is_warmup_step * warmup_lr + + is_middle_step * adjusted_lr + + is_decay_step * decay_lr + ) + return lr diff --git a/retired_benchmarks/speech_recognition/rnnt/QSL.py b/retired_benchmarks/speech_recognition/rnnt/QSL.py index 9c0abe4e7..c7a242352 100644 --- a/retired_benchmarks/speech_recognition/rnnt/QSL.py +++ b/retired_benchmarks/speech_recognition/rnnt/QSL.py @@ -1,33 +1,35 @@ +import mlperf_loadgen as lg +import numpy as np +from parts.segment import AudioSegment +from parts.manifest import Manifest import sys import os -sys.path.insert(0, os.path.join(os.getcwd(), "pytorch")) -from parts.manifest import 
Manifest -from parts.segment import AudioSegment - -import numpy as np - -import mlperf_loadgen as lg +sys.path.insert(0, os.path.join(os.getcwd(), "pytorch")) class AudioQSL: - def __init__(self, dataset_dir, manifest_filepath, labels, - sample_rate=16000, perf_count=None): + def __init__( + self, dataset_dir, manifest_filepath, labels, sample_rate=16000, perf_count=None + ): m_paths = [manifest_filepath] - self.manifest = Manifest(dataset_dir, m_paths, labels, len(labels), - normalize=True, max_duration=15.0) + self.manifest = Manifest( + dataset_dir, m_paths, labels, len(labels), normalize=True, max_duration=15.0 + ) self.sample_rate = sample_rate self.count = len(self.manifest) perf_count = self.count if perf_count is None else perf_count self.sample_id_to_sample = {} - self.qsl = lg.ConstructQSL(self.count, perf_count, - self.load_query_samples, - self.unload_query_samples) + self.qsl = lg.ConstructQSL( + self.count, perf_count, self.load_query_samples, self.unload_query_samples + ) print( "Dataset loaded with {0:.2f} hours. Filtered {1:.2f} hours. Number of samples: {2}".format( self.manifest.duration / 3600, self.manifest.filtered_duration / 3600, - self.count)) + self.count, + ) + ) def load_query_samples(self, sample_list): for sample_id in sample_list: @@ -39,10 +41,13 @@ def unload_query_samples(self, sample_list): def _load_sample(self, index): sample = self.manifest[index] - segment = AudioSegment.from_file(sample['audio_filepath'][0], - target_sr=self.sample_rate) + segment = AudioSegment.from_file( + sample["audio_filepath"][0], target_sr=self.sample_rate + ) waveform = segment.samples - assert isinstance(waveform, np.ndarray) and waveform.dtype == np.float32 + assert isinstance( + waveform, + np.ndarray) and waveform.dtype == np.float32 return waveform def __getitem__(self, index): @@ -52,13 +57,16 @@ def __del__(self): lg.DestroyQSL(self.qsl) print("Finished destroying QSL.") + # We have no problem fitting all data in memory, so we do that, in # order to speed up execution of the benchmark. 
class AudioQSLInMemory(AudioQSL): - def __init__(self, dataset_dir, manifest_filepath, labels, - sample_rate=16000, perf_count=None): - super().__init__(dataset_dir, manifest_filepath, labels, - sample_rate, perf_count) + def __init__( + self, dataset_dir, manifest_filepath, labels, sample_rate=16000, perf_count=None + ): + super().__init__( + dataset_dir, manifest_filepath, labels, sample_rate, perf_count + ) super().load_query_samples(range(self.count)) def load_query_samples(self, sample_list): diff --git a/retired_benchmarks/speech_recognition/rnnt/accuracy_eval.py b/retired_benchmarks/speech_recognition/rnnt/accuracy_eval.py index 4341900c5..b85f46df3 100644 --- a/retired_benchmarks/speech_recognition/rnnt/accuracy_eval.py +++ b/retired_benchmarks/speech_recognition/rnnt/accuracy_eval.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +from parts.manifest import Manifest +from helpers import process_evaluation_epoch, __gather_predictions import argparse import array import json @@ -8,44 +10,92 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "pytorch")) -from helpers import process_evaluation_epoch, __gather_predictions -from parts.manifest import Manifest dtype_map = { - "int8": 'b', - "int16": 'h', - "int32": 'l', - "int64": 'q', + "int8": "b", + "int16": "h", + "int32": "l", + "int64": "q", } + def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--log_dir", required=True) parser.add_argument("--dataset_dir", required=True) parser.add_argument("--manifest", required=True) - parser.add_argument("--output_dtype", default="int64", choices=dtype_map.keys(), help="Output data type") + parser.add_argument( + "--output_dtype", + default="int64", + choices=dtype_map.keys(), + help="Output data type", + ) args = parser.parse_args() return args + def main(): args = get_args() - labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] - manifest = Manifest(args.dataset_dir, [args.manifest], labels, len(labels), normalize=True, max_duration=15.0) + labels = [ + " ", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "'", + ] + manifest = Manifest( + args.dataset_dir, + [args.manifest], + labels, + len(labels), + normalize=True, + max_duration=15.0, + ) with open(os.path.join(args.log_dir, "mlperf_log_accuracy.json")) as fh: results = json.load(fh) hypotheses = [] references = [] for result in results: - hypotheses.append(array.array(dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist()) + hypotheses.append( + array.array( + dtype_map[args.output_dtype], bytes.fromhex(result["data"]) + ).tolist() + ) references.append(manifest[result["qsl_idx"]]["transcript"]) references = __gather_predictions([references], labels=labels) hypotheses = __gather_predictions([hypotheses], labels=labels) - d = dict(predictions=hypotheses, - transcripts=references) + d = dict(predictions=hypotheses, transcripts=references) wer = process_evaluation_epoch(d) - print("Word Error Rate: {:}%, accuracy={:}%".format(wer * 100, (1 - wer) * 100)) + print( + "Word Error Rate: {:}%, accuracy={:}%".format( + wer * 100, + (1 - wer) * 100)) + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/dataset.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/dataset.py index 7b9036f1c..3c89ca6b3 
100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/dataset.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/dataset.py @@ -30,62 +30,72 @@ def seq_collate_fn(batch): Returns batches of tensors """ - audio_lengths = torch.LongTensor([sample.waveform.size(0) - for sample in batch]) - transcript_lengths = torch.LongTensor([sample.transcript.size(0) - for sample in batch]) + audio_lengths = torch.LongTensor( + [sample.waveform.size(0) for sample in batch]) + transcript_lengths = torch.LongTensor( + [sample.transcript.size(0) for sample in batch] + ) permute_indices = torch.argsort(audio_lengths, descending=True) audio_lengths = audio_lengths[permute_indices] transcript_lengths = transcript_lengths[permute_indices] padded_audio_signals = torch.nn.utils.rnn.pad_sequence( - [batch[i].waveform for i in permute_indices], - batch_first=True + [batch[i].waveform for i in permute_indices], batch_first=True + ) + transcript_list = [batch[i].transcript for i in permute_indices] + packed_transcripts = torch.nn.utils.rnn.pack_sequence( + transcript_list, enforce_sorted=False ) - transcript_list = [batch[i].transcript - for i in permute_indices] - packed_transcripts = torch.nn.utils.rnn.pack_sequence(transcript_list, - enforce_sorted=False) # TODO: Don't I need to stop grad at some point now? - return (padded_audio_signals, audio_lengths, transcript_list, - packed_transcripts, transcript_lengths) + return ( + padded_audio_signals, + audio_lengths, + transcript_list, + packed_transcripts, + transcript_lengths, + ) class AudioToTextDataLayer: - """Data layer with data loader - """ + """Data layer with data loader""" def __init__(self, **kwargs): - featurizer_config = kwargs['featurizer_config'] - pad_to_max = kwargs.get('pad_to_max', False) - perturb_config = kwargs.get('perturb_config', None) - manifest_filepath = kwargs['manifest_filepath'] - dataset_dir = kwargs['dataset_dir'] - labels = kwargs['labels'] - batch_size = kwargs['batch_size'] - drop_last = kwargs.get('drop_last', False) - shuffle = kwargs.get('shuffle', True) - min_duration = featurizer_config.get('min_duration', 0.1) - max_duration = featurizer_config.get('max_duration', None) - normalize_transcripts = kwargs.get('normalize_transcripts', True) - trim_silence = kwargs.get('trim_silence', False) - sampler_type = kwargs.get('sampler', 'default') - speed_perturbation = featurizer_config.get('speed_perturbation', False) - sort_by_duration = sampler_type == 'bucket' + featurizer_config = kwargs["featurizer_config"] + pad_to_max = kwargs.get("pad_to_max", False) + perturb_config = kwargs.get("perturb_config", None) + manifest_filepath = kwargs["manifest_filepath"] + dataset_dir = kwargs["dataset_dir"] + labels = kwargs["labels"] + batch_size = kwargs["batch_size"] + drop_last = kwargs.get("drop_last", False) + shuffle = kwargs.get("shuffle", True) + min_duration = featurizer_config.get("min_duration", 0.1) + max_duration = featurizer_config.get("max_duration", None) + normalize_transcripts = kwargs.get("normalize_transcripts", True) + trim_silence = kwargs.get("trim_silence", False) + sampler_type = kwargs.get("sampler", "default") + speed_perturbation = featurizer_config.get("speed_perturbation", False) + sort_by_duration = sampler_type == "bucket" self._featurizer = WaveformFeaturizer.from_config( - featurizer_config, perturbation_configs=perturb_config) + featurizer_config, perturbation_configs=perturb_config + ) self._dataset = AudioDataset( dataset_dir=dataset_dir, manifest_filepath=manifest_filepath, - labels=labels, 
blank_index=len(labels), + labels=labels, + blank_index=len(labels), sort_by_duration=sort_by_duration, pad_to_max=pad_to_max, - featurizer=self._featurizer, max_duration=max_duration, - min_duration=min_duration, normalize=normalize_transcripts, - trim=trim_silence, speed_perturbation=speed_perturbation) + featurizer=self._featurizer, + max_duration=max_duration, + min_duration=min_duration, + normalize=normalize_transcripts, + trim=trim_silence, + speed_perturbation=speed_perturbation, + ) - print('sort_by_duration', sort_by_duration) + print("sort_by_duration", sort_by_duration) self._dataloader = torch.utils.data.DataLoader( dataset=self._dataset, @@ -95,7 +105,7 @@ def __init__(self, **kwargs): shuffle=shuffle, num_workers=0, pin_memory=True, - sampler=None + sampler=None, ) def __len__(self): @@ -107,9 +117,22 @@ def data_iterator(self): class AudioDataset(Dataset): - def __init__(self, dataset_dir, manifest_filepath, labels, featurizer, max_duration=None, pad_to_max=False, - min_duration=None, blank_index=0, max_utts=0, normalize=True, sort_by_duration=False, - trim=False, speed_perturbation=False): + def __init__( + self, + dataset_dir, + manifest_filepath, + labels, + featurizer, + max_duration=None, + pad_to_max=False, + min_duration=None, + blank_index=0, + max_utts=0, + normalize=True, + sort_by_duration=False, + trim=False, + speed_perturbation=False, + ): """Dataset that loads tensors via a json file containing paths to audio files, transcripts, and durations (in seconds). Each entry is a different audio sample. Args: @@ -128,32 +151,44 @@ def __init__(self, dataset_dir, manifest_filepath, labels, featurizer, max_durat speed_perturbation: specify if using data contains speed perburbation """ m_paths = [manifest_filepath] - self.manifest = Manifest(dataset_dir, m_paths, labels, blank_index, pad_to_max=pad_to_max, - max_duration=max_duration, - sort_by_duration=sort_by_duration, - min_duration=min_duration, max_utts=max_utts, - normalize=normalize, speed_perturbation=speed_perturbation) + self.manifest = Manifest( + dataset_dir, + m_paths, + labels, + blank_index, + pad_to_max=pad_to_max, + max_duration=max_duration, + sort_by_duration=sort_by_duration, + min_duration=min_duration, + max_utts=max_utts, + normalize=normalize, + speed_perturbation=speed_perturbation, + ) self.featurizer = featurizer self.blank_index = blank_index self.trim = trim print( "Dataset loaded with {0:.2f} hours. 
Filtered {1:.2f} hours.".format( - self.manifest.duration / 3600, - self.manifest.filtered_duration / 3600)) + self.manifest.duration / 3600, self.manifest.filtered_duration / 3600 + ) + ) def __getitem__(self, index): sample = self.manifest[index] - rn_indx = np.random.randint(len(sample['audio_filepath'])) - duration = sample['audio_duration'][rn_indx] if 'audio_duration' in sample else 0 - offset = sample['offset'] if 'offset' in sample else 0 - features = self.featurizer.process(sample['audio_filepath'][rn_indx], - offset=offset, duration=duration, - trim=self.trim) - - AudioSample = namedtuple('AudioSample', ['waveform', - 'transcript']) - return AudioSample(features, - torch.LongTensor(sample["transcript"])) + rn_indx = np.random.randint(len(sample["audio_filepath"])) + duration = ( + sample["audio_duration"][rn_indx] if "audio_duration" in sample else 0 + ) + offset = sample["offset"] if "offset" in sample else 0 + features = self.featurizer.process( + sample["audio_filepath"][rn_indx], + offset=offset, + duration=duration, + trim=self.trim, + ) + + AudioSample = namedtuple("AudioSample", ["waveform", "transcript"]) + return AudioSample(features, torch.LongTensor(sample["transcript"])) def __len__(self): return len(self.manifest) diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/decoders.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/decoders.py index 56745d7ec..7a6239da5 100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/decoders.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/decoders.py @@ -42,12 +42,19 @@ def __init__(self, blank_index, model, max_symbols_per_step=30): self._model = model self._blank_id = blank_index self._SOS = -1 - self.dev = torch.device("cuda:0") if torch.cuda.is_available() and os.environ.get("USE_GPU", "").lower() not in [ "no", "false" ] else torch.device("cpu") + self.dev = ( + torch.device("cuda:0") + if torch.cuda.is_available() + and os.environ.get("USE_GPU", "").lower() not in ["no", "false"] + else torch.device("cpu") + ) assert max_symbols_per_step > 0 self._max_symbols_per_step = max_symbols_per_step @torch.jit.export - def forward(self, x: torch.Tensor, out_lens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, List[List[int]]]: + def forward( + self, x: torch.Tensor, out_lens: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[int]]]: """Returns a list of sentences given an input batch. 
Args: @@ -73,7 +80,8 @@ def forward(self, x: torch.Tensor, out_lens: torch.Tensor) -> Tuple[torch.Tensor return logits, logits_lens, output - def _greedy_decode(self, x: torch.Tensor, out_len: torch.Tensor) -> List[int]: + def _greedy_decode(self, x: torch.Tensor, + out_len: torch.Tensor) -> List[int]: hidden: Optional[Tuple[torch.Tensor, torch.Tensor]] = None label: List[int] = [] for time_idx in range(int(out_len.item())): @@ -84,9 +92,7 @@ def _greedy_decode(self, x: torch.Tensor, out_len: torch.Tensor) -> List[int]: while not_blank and symbols_added < self._max_symbols_per_step: g, hidden_prime = self._pred_step( - self._get_last_symb(label), - hidden - ) + self._get_last_symb(label), hidden) logp = self._joint_step(f, g, log_normalize=False)[0, :] # get index k, of max prob @@ -102,7 +108,9 @@ def _greedy_decode(self, x: torch.Tensor, out_len: torch.Tensor) -> List[int]: return label - def _pred_step(self, label: int, hidden: Optional[Tuple[torch.Tensor, torch.Tensor]]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + def _pred_step( + self, label: int, hidden: Optional[Tuple[torch.Tensor, torch.Tensor]] + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: if label == self._SOS: return self._model.prediction(None, hidden) if label > self._blank_id: @@ -110,7 +118,9 @@ def _pred_step(self, label: int, hidden: Optional[Tuple[torch.Tensor, torch.Tens label = torch.tensor([[label]], dtype=torch.int64) return self._model.prediction(label, hidden) - def _joint_step(self, enc: torch.Tensor, pred: torch.Tensor, log_normalize: bool=False) -> torch.Tensor: + def _joint_step( + self, enc: torch.Tensor, pred: torch.Tensor, log_normalize: bool = False + ) -> torch.Tensor: logits = self._model.joint(enc, pred)[:, 0, 0, :] if not log_normalize: return logits diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/helpers.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/helpers.py index cfe3b66f3..4427ad094 100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/helpers.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/helpers.py @@ -20,6 +20,7 @@ class Optimization(Enum): """Various levels of Optimization. 
WARNING: This might have effect on model accuracy.""" + nothing = 0 mxprO0 = 1 mxprO1 = 2 @@ -27,10 +28,12 @@ class Optimization(Enum): mxprO3 = 4 -AmpOptimizations = {Optimization.mxprO0: "O0", - Optimization.mxprO1: "O1", - Optimization.mxprO2: "O2", - Optimization.mxprO3: "O3"} +AmpOptimizations = { + Optimization.mxprO0: "O0", + Optimization.mxprO1: "O1", + Optimization.mxprO2: "O2", + Optimization.mxprO3: "O3", +} def add_blank_label(labels): @@ -53,7 +56,7 @@ def __rnnt_decoder_predictions_tensor(tensor, labels): labels_map = dict([(i, labels[i]) for i in range(len(labels))]) # iterate over batch for ind in range(len(tensor)): - hypothesis = ''.join([labels_map[c] for c in tensor[ind]]) + hypothesis = "".join([labels_map[c] for c in tensor[ind]]) hypotheses.append(hypothesis) return hypotheses @@ -65,13 +68,14 @@ def __gather_predictions(predictions_list: list, labels: list) -> list: return results -def __gather_transcripts(transcript_list: list, transcript_len_list: list, - labels: list) -> list: +def __gather_transcripts( + transcript_list: list, transcript_len_list: list, labels: list +) -> list: results = [] labels_map = dict([(i, labels[i]) for i in range(len(labels))]) for i, t in enumerate(transcript_list): target = t.numpy().tolist() - reference = ''.join([labels_map[c] for c in target]) + reference = "".join([labels_map[c] for c in target]) results.append(reference) return results @@ -85,17 +89,17 @@ def process_evaluation_batch(tensors: dict, global_vars: dict, labels: list): labels: A list of labels """ for kv, v in tensors.items(): - if kv.startswith('predictions'): - global_vars['predictions'] += __gather_predictions( + if kv.startswith("predictions"): + global_vars["predictions"] += __gather_predictions( v, labels=labels) - elif kv.startswith('transcript_length'): + elif kv.startswith("transcript_length"): transcript_len_list = v - elif kv.startswith('transcript'): + elif kv.startswith("transcript"): transcript_list = v - global_vars['transcripts'] += __gather_transcripts(transcript_list, - transcript_len_list, - labels=labels) + global_vars["transcripts"] += __gather_transcripts( + transcript_list, transcript_len_list, labels=labels + ) def process_evaluation_epoch(global_vars: dict, tag=None): @@ -107,17 +111,18 @@ def process_evaluation_epoch(global_vars: dict, tag=None): wer: final word error rate loss: final loss """ - hypotheses = global_vars['predictions'] - references = global_vars['transcripts'] + hypotheses = global_vars["predictions"] + references = global_vars["transcripts"] wer, scores, num_words = word_error_rate( - hypotheses=hypotheses, references=references) + hypotheses=hypotheses, references=references + ) return wer def print_dict(d): maxLen = max([len(ii) for ii in d.keys()]) - fmtString = '\t%' + str(maxLen) + 's : %s' - print('Arguments:') + fmtString = "\t%" + str(maxLen) + "s : %s" + print("Arguments:") for keyPair in sorted(d.items()): print(fmtString % keyPair) diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/metrics.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/metrics.py index 5426e3723..2fb1ad183 100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/metrics.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/metrics.py @@ -16,8 +16,7 @@ def __levenshtein(a: List, b: List) -> int: - """Calculates the Levenshtein distance between a and b. 
- """ + """Calculates the Levenshtein distance between a and b.""" n, m = len(a), len(b) if n > m: # Make sure n <= m, to use O(min(n,m)) space @@ -52,9 +51,12 @@ def word_error_rate(hypotheses: List[str], references: List[str]) -> float: scores = 0 words = 0 if len(hypotheses) != len(references): - raise ValueError("In word error rate calculation, hypotheses and reference" - " lists must have the same number of elements. But I got:" - "{0} and {1} correspondingly".format(len(hypotheses), len(references))) + raise ValueError( + "In word error rate calculation, hypotheses and reference" + " lists must have the same number of elements. But I got:" + "{0} and {1} correspondingly".format( + len(hypotheses), len(references)) + ) for h, r in zip(hypotheses, references): h_list = h.split() r_list = r.split() @@ -63,5 +65,5 @@ def word_error_rate(hypotheses: List[str], references: List[str]) -> float: if words != 0: wer = (1.0 * scores) / words else: - wer = float('inf') + wer = float("inf") return wer, scores, words diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/model_separable_rnnt.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/model_separable_rnnt.py index 39e181358..638d115a2 100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/model_separable_rnnt.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/model_separable_rnnt.py @@ -16,10 +16,11 @@ def __init__(self, rnnt=None, num_classes=1, **kwargs): feat_config = kwargs.get("feature_config") # This may be useful in the future, for MLPerf # configuration. - in_features = feat_config['features'] * \ + in_features = feat_config["features"] * \ feat_config.get("frame_splicing", 1) - self.encoder = Encoder(in_features, + self.encoder = Encoder( + in_features, rnnt["encoder_n_hidden"], rnnt["encoder_pre_rnn_layers"], rnnt["encoder_post_rnn_layers"], @@ -48,17 +49,32 @@ def __init__(self, rnnt=None, num_classes=1, **kwargs): rnnt["dropout"], ) - def forward(self, x_padded: torch.Tensor, x_lens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + def forward( + self, x_padded: torch.Tensor, x_lens: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: return self.encoder(x_padded, x_lens) - + class Encoder(torch.nn.Module): - def __init__(self, in_features, encoder_n_hidden, - encoder_pre_rnn_layers, encoder_post_rnn_layers, - forget_gate_bias, norm, rnn_type, encoder_stack_time_factor, - dropout): + def __init__( + self, + in_features, + encoder_n_hidden, + encoder_pre_rnn_layers, + encoder_post_rnn_layers, + forget_gate_bias, + norm, + rnn_type, + encoder_stack_time_factor, + dropout, + ): super().__init__() - self.dev = torch.device("cuda:0") if torch.cuda.is_available() and os.environ.get("USE_GPU", "").lower() not in [ "no", "false" ] else torch.device("cpu") + self.dev = ( + torch.device("cuda:0") + if torch.cuda.is_available() + and os.environ.get("USE_GPU", "").lower() not in ["no", "false"] + else torch.device("cpu") + ) self.pre_rnn = rnn( rnn=rnn_type, @@ -81,7 +97,9 @@ def __init__(self, in_features, encoder_n_hidden, dropout=dropout, ) - def forward(self, x_padded: torch.Tensor, x_lens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + def forward( + self, x_padded: torch.Tensor, x_lens: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: x_padded, _ = self.pre_rnn(x_padded.to(self.dev), None) x_padded, x_lens = self.stack_time(x_padded, x_lens) # (T, B, H) @@ -90,11 +108,25 @@ def forward(self, x_padded: torch.Tensor, x_lens: torch.Tensor) -> Tuple[torch.T x_padded = 
x_padded.transpose(0, 1) return x_padded, x_lens + class Prediction(torch.nn.Module): - def __init__(self, vocab_size, n_hidden, pred_rnn_layers, - forget_gate_bias, norm, rnn_type, dropout): + def __init__( + self, + vocab_size, + n_hidden, + pred_rnn_layers, + forget_gate_bias, + norm, + rnn_type, + dropout, + ): super().__init__() - self.dev = torch.device("cuda:0") if torch.cuda.is_available() and os.environ.get("USE_GPU", "").lower() not in [ "no", "false" ] else torch.device("cpu") + self.dev = ( + torch.device("cuda:0") + if torch.cuda.is_available() + and os.environ.get("USE_GPU", "").lower() not in ["no", "false"] + else torch.device("cpu") + ) self.embed = torch.nn.Embedding(vocab_size - 1, n_hidden) self.n_hidden = n_hidden @@ -108,8 +140,11 @@ def __init__(self, vocab_size, n_hidden, pred_rnn_layers, dropout=dropout, ) - def forward(self, y: Optional[torch.Tensor], - state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + def forward( + self, + y: Optional[torch.Tensor], + state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ B - batch size U - label length @@ -133,7 +168,9 @@ def forward(self, y: Optional[torch.Tensor], assert state is None # Hacky, no way to determine this right now! B = 1 - y = torch.zeros((B, 1, self.n_hidden), dtype=torch.float32).to(self.dev) + y = torch.zeros( + (B, 1, self.n_hidden), dtype=torch.float32).to( + self.dev) else: y = self.embed(y.to(self.dev)).to(self.dev) @@ -151,22 +188,35 @@ def forward(self, y: Optional[torch.Tensor], # del y, state return g, hid + class Joint(torch.nn.Module): - def __init__(self, vocab_size, pred_n_hidden, enc_n_hidden, - joint_n_hidden, dropout): + def __init__( + self, vocab_size, pred_n_hidden, enc_n_hidden, joint_n_hidden, dropout + ): super().__init__() - self.dev = torch.device("cuda:0") if torch.cuda.is_available() and os.environ.get("USE_GPU", "").lower() not in [ "no", "false" ] else torch.device("cpu") - - layers = [ - torch.nn.Linear(pred_n_hidden + enc_n_hidden, joint_n_hidden), - torch.nn.ReLU(), - ] + ([torch.nn.Dropout(p=dropout), ] if dropout else []) + [ - torch.nn.Linear(joint_n_hidden, vocab_size) - ] - self.net = torch.nn.Sequential( - *layers + self.dev = ( + torch.device("cuda:0") + if torch.cuda.is_available() + and os.environ.get("USE_GPU", "").lower() not in ["no", "false"] + else torch.device("cpu") ) + layers = ( + [ + torch.nn.Linear(pred_n_hidden + enc_n_hidden, joint_n_hidden), + torch.nn.ReLU(), + ] + + ( + [ + torch.nn.Dropout(p=dropout), + ] + if dropout + else [] + ) + + [torch.nn.Linear(joint_n_hidden, vocab_size)] + ) + self.net = torch.nn.Sequential(*layers) + def forward(self, f: torch.Tensor, g: torch.Tensor): """ f should be shape (B, T, H) @@ -179,17 +229,18 @@ def forward(self, f: torch.Tensor, g: torch.Tensor): B, T, H = f.shape B, U_, H2 = g.shape - f = f.unsqueeze(dim=2).to(self.dev) # (B, T, 1, H) + f = f.unsqueeze(dim=2).to(self.dev) # (B, T, 1, H) f = f.expand((B, T, U_, H)) - g = g.unsqueeze(dim=1).to(self.dev) # (B, 1, U + 1, H) + g = g.unsqueeze(dim=1).to(self.dev) # (B, 1, U + 1, H) g = g.expand((B, T, U_, H2)) - inp = torch.cat([f, g], dim=3) # (B, T, U, 2H) + inp = torch.cat([f, g], dim=3) # (B, T, U, 2H) res = self.net(inp) # del f, g, inp return res + def label_collate(labels): """Collates the label inputs for the rnn-t prediction network. 
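[Editor's note] The Joint module reformatted in the hunk above combines every encoder frame with every prediction-network step by broadcasting both tensors to a common (B, T, U, H) shape and concatenating along the feature axis before a small feed-forward net. A minimal standalone sketch of that shape algebra follows; the sizes and the two-layer net are made up for illustration and are not part of the patch.

import torch

B, T, U, H = 2, 5, 3, 4                  # batch, time frames, label steps, hidden size
f = torch.randn(B, T, H)                 # encoder output
g = torch.randn(B, U, H)                 # prediction-network output

f = f.unsqueeze(2).expand(B, T, U, H)    # (B, T, 1, H) -> (B, T, U, H)
g = g.unsqueeze(1).expand(B, T, U, H)    # (B, 1, U, H) -> (B, T, U, H)
joint_in = torch.cat([f, g], dim=3)      # (B, T, U, 2H)

vocab_size = 7                           # illustrative only
net = torch.nn.Sequential(
    torch.nn.Linear(2 * H, 16),
    torch.nn.ReLU(),
    torch.nn.Linear(16, vocab_size),
)
logits = net(joint_in)                   # (B, T, U, vocab_size)
print(logits.shape)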
@@ -206,15 +257,14 @@ def label_collate(labels): return labels.type(torch.int64) if not isinstance(labels, (list, tuple)): raise ValueError( - f"`labels` should be a list or tensor not {type(labels)}" - ) + f"`labels` should be a list or tensor not {type(labels)}") batch_size = len(labels) max_len = max(len(l) for l in labels) cat_labels = np.full((batch_size, max_len), fill_value=0.0, dtype=np.int32) for e, l in enumerate(labels): - cat_labels[e, :len(l)] = l + cat_labels[e, : len(l)] = l labels = torch.LongTensor(cat_labels) return labels diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/features.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/features.py index 5a4decd61..109509e38 100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/features.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/features.py @@ -27,11 +27,14 @@ def __init__(self, input_cfg): self.cfg = input_cfg def process(self, file_path, offset=0, duration=0, trim=False): - audio = AudioSegment.from_file(file_path, - target_sr=self.cfg['sample_rate'], - int_values=self.cfg.get( - 'int_values', False), - offset=offset, duration=duration, trim=trim) + audio = AudioSegment.from_file( + file_path, + target_sr=self.cfg["sample_rate"], + int_values=self.cfg.get("int_values", False), + offset=offset, + duration=duration, + trim=trim, + ) return self.process_segment(audio) def process_segment(self, audio_segment): @@ -47,13 +50,15 @@ def from_config(cls, input_config, perturbation_configs=None): def normalize_batch(x, seq_len, normalize_type): if normalize_type == "per_feature": - x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, - device=x.device) - x_std = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, - device=x.device) + x_mean = torch.zeros( + (seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device + ) + x_std = torch.zeros( + (seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device + ) for i in range(x.shape[0]): - x_mean[i, :] = x[i, :, :seq_len[i]].mean(dim=1) - x_std[i, :] = x[i, :, :seq_len[i]].std(dim=1) + x_mean[i, :] = x[i, :, : seq_len[i]].mean(dim=1) + x_std[i, :] = x[i, :, : seq_len[i]].std(dim=1) # make sure x_std is not zero x_std += constant return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2) @@ -61,8 +66,8 @@ def normalize_batch(x, seq_len, normalize_type): x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) for i in range(x.shape[0]): - x_mean[i] = x[i, :, :seq_len[i].item()].mean() - x_std[i] = x[i, :, :seq_len[i].item()].std() + x_mean[i] = x[i, :, : seq_len[i].item()].mean() + x_std[i] = x[i, :, : seq_len[i].item()].std() # make sure x_std is not zero x_std += constant return (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1) @@ -71,7 +76,7 @@ def normalize_batch(x, seq_len, normalize_type): def splice_frames(x, frame_splicing): - """ Stacks frames together across feature dim + """Stacks frames together across feature dim input is batch_size, feature_dim, num_frames output is batch_size, feature_dim*frame_splicing, num_frames @@ -86,23 +91,34 @@ def splice_frames(x, frame_splicing): class FilterbankFeatures(nn.Module): - def __init__(self, sample_rate=8000, window_size=0.02, window_stride=0.01, - window="hamming", normalize="per_feature", n_fft=None, - preemph=0.97, - nfilt=64, lowfreq=0, highfreq=None, log=True, dither=constant, - pad_to=8, - max_duration=16.7, - frame_splicing=1): + def __init__( + self, + 
sample_rate=8000, + window_size=0.02, + window_stride=0.01, + window="hamming", + normalize="per_feature", + n_fft=None, + preemph=0.97, + nfilt=64, + lowfreq=0, + highfreq=None, + log=True, + dither=constant, + pad_to=8, + max_duration=16.7, + frame_splicing=1, + ): super(FilterbankFeatures, self).__init__() -# print("PADDING: {}".format(pad_to)) + # print("PADDING: {}".format(pad_to)) torch_windows = { - 'hann': torch.hann_window, - 'hamming': torch.hamming_window, - 'blackman': torch.blackman_window, - 'bartlett': torch.bartlett_window, - 'none': None, - } + "hann": torch.hann_window, + "hamming": torch.hamming_window, + "blackman": torch.blackman_window, + "bartlett": torch.bartlett_window, + "none": None, + } self.win_length = int(sample_rate * window_size) # frame size self.hop_length = int(sample_rate * window_stride) @@ -116,23 +132,33 @@ def __init__(self, sample_rate=8000, window_size=0.02, window_stride=0.01, self.preemph = preemph self.pad_to = pad_to # For now, always enable this. - # See https://docs.google.com/presentation/d/1IVC3J-pHB-ipJpKsJox_SqmDHYdkIaoCXTbKmJmV2-I/edit?usp=sharing for elaboration + # See + # https://docs.google.com/presentation/d/1IVC3J-pHB-ipJpKsJox_SqmDHYdkIaoCXTbKmJmV2-I/edit?usp=sharing + # for elaboration self.use_deterministic_dithering = True highfreq = highfreq or sample_rate / 2 window_fn = torch_windows.get(window, None) - window_tensor = window_fn(self.win_length, - periodic=False) if window_fn else None + window_tensor = ( + window_fn(self.win_length, periodic=False) if window_fn else None + ) filterbanks = torch.tensor( - librosa.filters.mel(sr=sample_rate, n_fft=self.n_fft, n_mels=nfilt, fmin=lowfreq, - fmax=highfreq), dtype=torch.float).unsqueeze(0) + librosa.filters.mel( + sr=sample_rate, + n_fft=self.n_fft, + n_mels=nfilt, + fmin=lowfreq, + fmax=highfreq, + ), + dtype=torch.float, + ).unsqueeze(0) # self.fb = filterbanks # self.window = window_tensor self.register_buffer("fb", filterbanks) self.register_buffer("window", window_tensor) # Calculate maximum sequence length (# frames) max_length = 1 + math.ceil( - (max_duration * sample_rate - self.win_length) / self.hop_length - ) + (max_duration * sample_rate - self.win_length) / self.hop_length + ) max_pad = 16 - (max_length % 16) self.max_length = max_length + max_pad @@ -156,20 +182,27 @@ def forward(self, inp: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor: # do preemphasis # Ideally, we would mask immediately after this... 
Ugh :( if self.preemph is not None: - x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), - dim=1) + x = torch.cat( + (x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), dim=1 + ) # do stft - x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, - win_length=self.win_length, - center=True, window=self.window.to(dtype=torch.float), return_complex = True) + x = torch.stft( + x, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=self.win_length, + center=True, + window=self.window.to(dtype=torch.float), + return_complex=True, + ) x = torch.view_as_real(x) # get power spectrum x = x.pow(2).sum(-1) if self.dither > 0 and self.use_deterministic_dithering: - x = x + self.dither ** 2 + x = x + self.dither**2 # dot with filterbank energies x = torch.matmul(self.fb.to(x.dtype), x) @@ -184,18 +217,20 @@ def forward(self, inp: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor: tmp = torch.zeros_like(x) tmp[:, :, :-n] = x[:, :, n:] seq.append(tmp) - x = torch.cat(seq, dim=1)[:, :, ::self.frame_splicing] + x = torch.cat(seq, dim=1)[:, :, :: self.frame_splicing] # normalize if required constant = 1e-5 if self.normalize == "per_feature": - x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, - device=x.device) - x_std = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, - device=x.device) + x_mean = torch.zeros( + (seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device + ) + x_std = torch.zeros( + (seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device + ) for i in range(x.shape[0]): - x_mean[i, :] = x[i, :, :seq_len[i]].mean(dim=1) - x_std[i, :] = x[i, :, :seq_len[i]].std(dim=1) + x_mean[i, :] = x[i, :, : seq_len[i]].mean(dim=1) + x_std[i, :] = x[i, :, : seq_len[i]].std(dim=1) # make sure x_std is not zero x_std += constant x = (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2) @@ -203,8 +238,8 @@ def forward(self, inp: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor: x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device) for i in range(x.shape[0]): - x_mean[i] = x[i, :, :seq_len[i].item()].mean() - x_std[i] = x[i, :, :seq_len[i].item()].std() + x_mean[i] = x[i, :, : seq_len[i].item()].mean() + x_std[i] = x[i, :, : seq_len[i].item()].std() # make sure x_std is not zero x_std += constant x = (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1) @@ -215,9 +250,9 @@ def forward(self, inp: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor: # mask to zero any values beyond seq_len in batch, pad to multiple of `pad_to` (for efficiency) # max_len = x.size(-1) - x = x[:, :, :seq_len.max()] # rnnt loss requires lengths to match + x = x[:, :, : seq_len.max()] # rnnt loss requires lengths to match # mask = torch.arange(max_len).to(seq_len.dtype).to(x.device).expand(x.size(0), - # max_len) >= seq_len.unsqueeze(1) + # max_len) >= seq_len.unsqueeze(1) # x = x.masked_fill(mask.unsqueeze(1).to(device=x.device), 0) pad_to = self.pad_to @@ -234,27 +269,34 @@ def forward(self, inp: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor: @classmethod def from_config(cls, cfg, log=False): - return cls(sample_rate=cfg['sample_rate'], window_size=cfg['window_size'], - window_stride=cfg['window_stride'], n_fft=cfg['n_fft'], - nfilt=cfg['features'], window=cfg['window'], - normalize=cfg['normalize'], - max_duration=cfg.get('max_duration', 16.7), - dither=cfg['dither'], pad_to=cfg.get("pad_to", 0), - frame_splicing=cfg.get("frame_splicing", 1), log=log) + return cls( 
+ sample_rate=cfg["sample_rate"], + window_size=cfg["window_size"], + window_stride=cfg["window_stride"], + n_fft=cfg["n_fft"], + nfilt=cfg["features"], + window=cfg["window"], + normalize=cfg["normalize"], + max_duration=cfg.get("max_duration", 16.7), + dither=cfg["dither"], + pad_to=cfg.get("pad_to", 0), + frame_splicing=cfg.get("frame_splicing", 1), + log=log, + ) class FeatureFactory(object): featurizers = { - "logfbank": FilterbankFeatures, - "fbank": FilterbankFeatures, - } + "logfbank": FilterbankFeatures, + "fbank": FilterbankFeatures, + } def __init__(self): pass @classmethod def from_config(cls, cfg): - feat_type = cfg.get('feat_type', "logspect") + feat_type = cfg.get("feat_type", "logspect") featurizer = cls.featurizers[feat_type] # return featurizer.from_config(cfg, log="log" in cfg['feat_type']) return featurizer.from_config(cfg, log="log" in feat_type) diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/manifest.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/manifest.py index fb04c5da8..b2233c1ad 100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/manifest.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/manifest.py @@ -41,16 +41,28 @@ def good_token(token, labels): try: text = _clean_text(s, ["english_cleaners"], table).strip() - return ''.join([t for t in text if good_token(t, labels=labels)]) - except: + return "".join([t for t in text if good_token(t, labels=labels)]) + except BaseException: print("WARNING: Normalizing {} failed".format(s)) return None class Manifest(object): - def __init__(self, data_dir, manifest_paths, labels, blank_index, max_duration=None, pad_to_max=False, - min_duration=None, sort_by_duration=False, max_utts=0, - normalize=True, speed_perturbation=False, filter_speed=1.0): + def __init__( + self, + data_dir, + manifest_paths, + labels, + blank_index, + max_duration=None, + pad_to_max=False, + min_duration=None, + sort_by_duration=False, + max_utts=0, + normalize=True, + speed_perturbation=False, + filter_speed=1.0, + ): self.labels_map = dict([(labels[i], i) for i in range(len(labels))]) self.blank_index = blank_index self.max_duration = max_duration @@ -71,7 +83,8 @@ def __init__(self, data_dir, manifest_paths, labels, blank_index, max_duration=N # ~ -> tilde # _ -> underscore # % -> percent - # If a punctuation symbol is inside our vocab, we do not remove from text + # If a punctuation symbol is inside our vocab, we do not remove + # from text for l in labels: punctuation = punctuation.replace(l, "") # Turn all punctuation to whitespace @@ -80,72 +93,88 @@ def __init__(self, data_dir, manifest_paths, labels, blank_index, max_duration=N with open(manifest_path, "r", encoding="utf-8") as fh: a = json.load(fh) for data in a: - files_and_speeds = data['files'] + files_and_speeds = data["files"] if pad_to_max: if not speed_perturbation: min_speed = filter_speed else: - min_speed = min(x['speed'] + min_speed = min(x["speed"] for x in files_and_speeds) max_duration = self.max_duration * min_speed - data['duration'] = data['original_duration'] - if min_duration is not None and data['duration'] < min_duration: - filtered_duration += data['duration'] + data["duration"] = data["original_duration"] + if min_duration is not None and data["duration"] < min_duration: + filtered_duration += data["duration"] continue - if max_duration is not None and data['duration'] > max_duration: - filtered_duration += data['duration'] + if max_duration is not None and data["duration"] > max_duration: + 
filtered_duration += data["duration"] continue # Prune and normalize according to transcript - transcript_text = data[ - 'transcript'] if "transcript" in data else self.load_transcript( - data['text_filepath']) + transcript_text = ( + data["transcript"] + if "transcript" in data + else self.load_transcript(data["text_filepath"]) + ) if normalize: - transcript_text = normalize_string(transcript_text, labels=labels, - table=table) + transcript_text = normalize_string( + transcript_text, labels=labels, table=table + ) if not isinstance(transcript_text, str): print( "WARNING: Got transcript: {}. It is not a string. Dropping data point".format( - transcript_text)) - filtered_duration += data['duration'] + transcript_text + ) + ) + filtered_duration += data["duration"] continue data["transcript"] = self.parse_transcript( - transcript_text) # convert to vocab indices + transcript_text + ) # convert to vocab indices if speed_perturbation: - audio_paths = [x['fname'] for x in files_and_speeds] - data['audio_duration'] = [x['duration'] - for x in files_and_speeds] + audio_paths = [x["fname"] for x in files_and_speeds] + data["audio_duration"] = [ + x["duration"] for x in files_and_speeds + ] else: audio_paths = [ - x['fname'] for x in files_and_speeds if x['speed'] == filter_speed] - data['audio_duration'] = [x['duration'] - for x in files_and_speeds if x['speed'] == filter_speed] - data['audio_filepath'] = [os.path.join( - data_dir, x) for x in audio_paths] - data.pop('files') - data.pop('original_duration') + x["fname"] + for x in files_and_speeds + if x["speed"] == filter_speed + ] + data["audio_duration"] = [ + x["duration"] + for x in files_and_speeds + if x["speed"] == filter_speed + ] + data["audio_filepath"] = [ + os.path.join(data_dir, x) for x in audio_paths + ] + data.pop("files") + data.pop("original_duration") ids.append(data) - duration += data['duration'] + duration += data["duration"] if max_utts > 0 and len(ids) >= max_utts: print( - 'Stopping parsing %s as max_utts=%d' % (manifest_path, max_utts)) + "Stopping parsing %s as max_utts=%d" + % (manifest_path, max_utts) + ) break if sort_by_duration: - ids = sorted(ids, key=lambda x: x['duration']) + ids = sorted(ids, key=lambda x: x["duration"]) self._data = ids self._size = len(ids) self._duration = duration self._filtered_duration = filtered_duration def load_transcript(self, transcript_path): - with open(transcript_path, 'r', encoding="utf-8") as transcript_file: - transcript = transcript_file.read().replace('\n', '') + with open(transcript_path, "r", encoding="utf-8") as transcript_file: + transcript = transcript_file.read().replace("\n", "") return transcript def parse_transcript(self, transcript): diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/segment.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/segment.py index 08aa5c6a4..3c1cc0a1b 100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/segment.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/segment.py @@ -26,8 +26,8 @@ class AudioSegment(object): :raises TypeError: If the sample data type is not float or int. """ - def __init__(self, samples, sample_rate, target_sr=None, trim=False, - trim_db=60): + def __init__(self, samples, sample_rate, + target_sr=None, trim=False, trim_db=60): """Create audio segment from samples. Samples are convert float32 internally, with int scaled to [-1, 1]. 
""" @@ -44,7 +44,7 @@ def __init__(self, samples, sample_rate, target_sr=None, trim=False, def __eq__(self, other): """Return whether two objects are equal.""" - if type(other) is not type(self): + if not isinstance(other, type(self)): return False if self._sample_rate != other._sample_rate: return False @@ -60,9 +60,13 @@ def __ne__(self, other): def __str__(self): """Return human-readable representation of segment.""" - return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " - "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate, - self.duration, self.rms_db)) + return "%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " "rms=%.2fdB" % ( + type(self), + self.num_samples, + self.sample_rate, + self.duration, + self.rms_db, + ) @staticmethod def _convert_samples_to_float32(samples): @@ -70,19 +74,26 @@ def _convert_samples_to_float32(samples): Audio sample type is usually integer or float-point. Integers will be scaled to [-1, 1] in float32. """ - float32_samples = samples.astype('float32') - if samples.dtype in np.sctypes['int']: + float32_samples = samples.astype("float32") + if samples.dtype in np.sctypes["int"]: bits = np.iinfo(samples.dtype).bits - float32_samples *= (1. / 2 ** (bits - 1)) - elif samples.dtype in np.sctypes['float']: + float32_samples *= 1.0 / 2 ** (bits - 1) + elif samples.dtype in np.sctypes["float"]: pass else: raise TypeError("Unsupported sample type: %s." % samples.dtype) return float32_samples @classmethod - def from_file(cls, filename, target_sr=None, int_values=False, offset=0, - duration=0, trim=False): + def from_file( + cls, + filename, + target_sr=None, + int_values=False, + offset=0, + duration=0, + trim=False, + ): """ Load a file supported by librosa and return as an AudioSegment. :param filename: path of file to load @@ -92,8 +103,8 @@ def from_file(cls, filename, target_sr=None, int_values=False, offset=0, :param duration: duration in seconds when loading audio :return: numpy array of samples """ - with sf.SoundFile(filename, 'r') as f: - dtype = 'int32' if int_values else 'float32' + with sf.SoundFile(filename, "r") as f: + dtype = "int32" if int_values else "float32" sample_rate = f.samplerate if offset > 0: f.seek(int(offset * sample_rate)) @@ -122,20 +133,20 @@ def duration(self): @property def rms_db(self): - mean_square = np.mean(self._samples ** 2) + mean_square = np.mean(self._samples**2) return 10 * np.log10(mean_square) def gain_db(self, gain): - self._samples *= 10. ** (gain / 20.) + self._samples *= 10.0 ** (gain / 20.0) def pad(self, pad_size, symmetric=False): """Add zero padding to the sample. The pad size is given in number of samples. If symmetric=True, `pad_size` will be added to both sides. If false, `pad_size` zeros will be added only to the end. """ - self._samples = np.pad(self._samples, - (pad_size if symmetric else 0, pad_size), - mode='constant') + self._samples = np.pad( + self._samples, (pad_size if symmetric else 0, pad_size), mode="constant" + ) def subsegment(self, start_time=None, end_time=None): """Cut the AudioSegment between given boundaries. @@ -154,17 +165,23 @@ def subsegment(self, start_time=None, end_time=None): if end_time < 0.0: end_time = self.duration + end_time if start_time < 0.0: - raise ValueError("The slice start position (%f s) is out of " - "bounds." % start_time) + raise ValueError( + "The slice start position (%f s) is out of " "bounds." % start_time + ) if end_time < 0.0: - raise ValueError("The slice end position (%f s) is out of bounds." 
% - end_time) + raise ValueError( + "The slice end position (%f s) is out of bounds." % end_time + ) if start_time > end_time: - raise ValueError("The slice start position (%f s) is later than " - "the end position (%f s)." % (start_time, end_time)) + raise ValueError( + "The slice start position (%f s) is later than " + "the end position (%f s)." % (start_time, end_time) + ) if end_time > self.duration: - raise ValueError("The slice end position (%f s) is out of bounds " - "(> %f s)" % (end_time, self.duration)) + raise ValueError( + "The slice end position (%f s) is out of bounds " + "(> %f s)" % (end_time, self.duration) + ) start_sample = int(round(start_time * self._sample_rate)) end_sample = int(round(end_time * self._sample_rate)) self._samples = self._samples[start_sample:end_sample] diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/text/__init__.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/text/__init__.py index 61936879a..ae1c39abd 100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/text/__init__.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/text/__init__.py @@ -7,6 +7,6 @@ def _clean_text(text, cleaner_names, *args): for name in cleaner_names: cleaner = getattr(cleaners, name) if not cleaner: - raise Exception('Unknown cleaner: %s' % name) + raise Exception("Unknown cleaner: %s" % name) text = cleaner(text, *args) return text diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/text/cleaners.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/text/cleaners.py index 0187fb4ac..54f3a6b99 100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/text/cleaners.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/text/cleaners.py @@ -17,7 +17,7 @@ Modified to add puncturation removal """ -''' +""" Cleaners are transformations that run over the input text at both training and eval time. Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" @@ -28,36 +28,40 @@ 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update the symbols in symbols.py to match your data). -''' +""" # Regular expression matching whitespace: + import re from unidecode import unidecode from .numbers import normalize_numbers -_whitespace_re = re.compile(r'\s+') +_whitespace_re = re.compile(r"\s+") # List of (regular expression, replacement) pairs for abbreviations: -_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ - ('mrs', 'misess'), - ('mr', 'mister'), - ('dr', 'doctor'), - ('st', 'saint'), - ('co', 'company'), - ('jr', 'junior'), - ('maj', 'major'), - ('gen', 'general'), - ('drs', 'doctors'), - ('rev', 'reverend'), - ('lt', 'lieutenant'), - ('hon', 'honorable'), - ('sgt', 'sergeant'), - ('capt', 'captain'), - ('esq', 'esquire'), - ('ltd', 'limited'), - ('col', 'colonel'), - ('ft', 'fort'), -]] +_abbreviations = [ + (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) + for x in [ + ("mrs", "misess"), + ("mr", "mister"), + ("dr", "doctor"), + ("st", "saint"), + ("co", "company"), + ("jr", "junior"), + ("maj", "major"), + ("gen", "general"), + ("drs", "doctors"), + ("rev", "reverend"), + ("lt", "lieutenant"), + ("hon", "honorable"), + ("sgt", "sergeant"), + ("capt", "captain"), + ("esq", "esquire"), + ("ltd", "limited"), + ("col", "colonel"), + ("ft", "fort"), + ] +] def expand_abbreviations(text): @@ -75,7 +79,7 @@ def lowercase(text): def collapse_whitespace(text): - return re.sub(_whitespace_re, ' ', text) + return re.sub(_whitespace_re, " ", text) def convert_to_ascii(text): @@ -84,20 +88,20 @@ def convert_to_ascii(text): def remove_punctuation(text, table): text = text.translate(table) - text = re.sub(r'&', " and ", text) - text = re.sub(r'\+', " plus ", text) + text = re.sub(r"&", " and ", text) + text = re.sub(r"\+", " plus ", text) return text def basic_cleaners(text): - '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' + """Basic pipeline that lowercases and collapses whitespace without transliteration.""" text = lowercase(text) text = collapse_whitespace(text) return text def transliteration_cleaners(text): - '''Pipeline for non-English text that transliterates to ASCII.''' + """Pipeline for non-English text that transliterates to ASCII.""" text = convert_to_ascii(text) text = lowercase(text) text = collapse_whitespace(text) @@ -105,7 +109,7 @@ def transliteration_cleaners(text): def english_cleaners(text, table=None): - '''Pipeline for English text, including number and abbreviation expansion.''' + """Pipeline for English text, including number and abbreviation expansion.""" text = convert_to_ascii(text) text = lowercase(text) text = expand_numbers(text) diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/text/numbers.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/text/numbers.py index 3d2f77121..2f0579f40 100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/text/numbers.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/parts/text/numbers.py @@ -21,42 +21,42 @@ _inflect = inflect.engine() -_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') -_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') -_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') -_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') -_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') -_number_re = re.compile(r'[0-9]+') -_time_re = re.compile(r'([0-9]{1,2}):([0-9]{2})') +_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") +_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") +_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)") +_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)") +_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") +_number_re = re.compile(r"[0-9]+") +_time_re = re.compile(r"([0-9]{1,2}):([0-9]{2})") def _remove_commas(m): - return m.group(1).replace(',', '') + return m.group(1).replace(",", "") def _expand_decimal_point(m): - return m.group(1).replace('.', ' point ') + return m.group(1).replace(".", " point ") def _expand_dollars(m): match = m.group(1) - parts = match.split('.') + parts = match.split(".") if len(parts) > 2: - return match + ' dollars' # Unexpected format + return match + " dollars" # Unexpected format dollars = int(parts[0]) if parts[0] else 0 cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 if dollars and cents: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - cent_unit = 'cent' if cents == 1 else 'cents' - return 
'%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) + dollar_unit = "dollar" if dollars == 1 else "dollars" + cent_unit = "cent" if cents == 1 else "cents" + return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit) elif dollars: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - return '%s %s' % (dollars, dollar_unit) + dollar_unit = "dollar" if dollars == 1 else "dollars" + return "%s %s" % (dollars, dollar_unit) elif cents: - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s' % (cents, cent_unit) + cent_unit = "cent" if cents == 1 else "cents" + return "%s %s" % (cents, cent_unit) else: - return 'zero dollars' + return "zero dollars" def _expand_ordinal(m): @@ -65,34 +65,39 @@ def _expand_ordinal(m): def _expand_number(m): if int(m.group(0)[0]) == 0: - return _inflect.number_to_words(m.group(0), andword='', group=1) + return _inflect.number_to_words(m.group(0), andword="", group=1) num = int(m.group(0)) if num > 1000 and num < 3000: if num == 2000: - return 'two thousand' + return "two thousand" elif num > 2000 and num < 2010: - return 'two thousand ' + _inflect.number_to_words(num % 100) + return "two thousand " + _inflect.number_to_words(num % 100) elif num % 100 == 0: - return _inflect.number_to_words(num // 100) + ' hundred' + return _inflect.number_to_words(num // 100) + " hundred" else: - return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') + return _inflect.number_to_words( + num, andword="", zero="oh", group=2 + ).replace(", ", " ") # Add check for number phones and other large numbers elif num > 1000000000 and num % 10000 != 0: - return _inflect.number_to_words(num, andword='', group=1) + return _inflect.number_to_words(num, andword="", group=1) else: - return _inflect.number_to_words(num, andword='') + return _inflect.number_to_words(num, andword="") def _expand_time(m): mins = int(m.group(2)) if mins == 0: return _inflect.number_to_words(m.group(1)) - return " ".join([_inflect.number_to_words(m.group(1)), _inflect.number_to_words(m.group(2))]) + return " ".join( + [_inflect.number_to_words(m.group(1)), + _inflect.number_to_words(m.group(2))] + ) def normalize_numbers(text): text = re.sub(_comma_number_re, _remove_commas, text) - text = re.sub(_pounds_re, r'\1 pounds', text) + text = re.sub(_pounds_re, r"\1 pounds", text) text = re.sub(_dollars_re, _expand_dollars, text) text = re.sub(_decimal_number_re, _expand_decimal_point, text) text = re.sub(_ordinal_re, _expand_ordinal, text) diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/preprocessing.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/preprocessing.py index 581885466..2a7c59156 100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/preprocessing.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/preprocessing.py @@ -22,16 +22,17 @@ class AudioPreprocessing(nn.Module): - """GPU accelerated audio preprocessing - """ + """GPU accelerated audio preprocessing""" def __init__(self, **kwargs): - nn.Module.__init__(self) # For PyTorch API + nn.Module.__init__(self) # For PyTorch API self.optim_level = kwargs.get( - 'optimization_level', Optimization.nothing) + "optimization_level", Optimization.nothing) self.featurizer = FeatureFactory.from_config(kwargs) - def forward(self, x: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + def forward( + self, x: Tuple[torch.Tensor, torch.Tensor] + ) -> Tuple[torch.Tensor, torch.Tensor]: input_signal, length = x length.requires_grad_(False) processed_signal = 
self.featurizer(x) diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/rnn.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/rnn.py index 6bc825011..564eb9324 100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/rnn.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/rnn.py @@ -18,8 +18,16 @@ from typing import Optional, Tuple -def rnn(rnn, input_size, hidden_size, num_layers, norm=None, - forget_gate_bias=1.0, dropout=0.0, **kwargs): +def rnn( + rnn, + input_size, + hidden_size, + num_layers, + norm=None, + forget_gate_bias=1.0, + dropout=0.0, + **kwargs, +): """TODO""" if rnn != "lstm": raise ValueError(f"Unknown rnn={rnn}") @@ -33,14 +41,15 @@ def rnn(rnn, input_size, hidden_size, num_layers, norm=None, num_layers=num_layers, dropout=dropout, forget_gate_bias=forget_gate_bias, - **kwargs + **kwargs, ) class LstmDrop(torch.nn.Module): - def __init__(self, input_size, hidden_size, num_layers, dropout, forget_gate_bias, - **kwargs): + def __init__( + self, input_size, hidden_size, num_layers, dropout, forget_gate_bias, **kwargs + ): """Returns an LSTM with forget gate bias init to `forget_gate_bias`. Args: @@ -55,7 +64,12 @@ def __init__(self, input_size, hidden_size, num_layers, dropout, forget_gate_bia A `torch.nn.LSTM`. """ super(LstmDrop, self).__init__() - self.dev = torch.device("cuda:0") if torch.cuda.is_available() and os.environ.get("USE_GPU", "").lower() not in [ "no", "false" ] else torch.device("cpu") + self.dev = ( + torch.device("cuda:0") + if torch.cuda.is_available() + and os.environ.get("USE_GPU", "").lower() not in ["no", "false"] + else torch.device("cpu") + ) self.lstm = torch.nn.LSTM( input_size=input_size, @@ -67,18 +81,20 @@ def __init__(self, input_size, hidden_size, num_layers, dropout, forget_gate_bia for name, v in self.lstm.named_parameters(): if "bias_ih" in name: bias = getattr(self.lstm, name) - bias.data[hidden_size:2 * hidden_size].fill_(forget_gate_bias) + bias.data[hidden_size: 2 * + hidden_size].fill_(forget_gate_bias) if "bias_hh" in name: bias = getattr(self.lstm, name) - bias.data[hidden_size:2 * hidden_size].fill_(0) + bias.data[hidden_size: 2 * hidden_size].fill_(0) if dropout: self.inplace_dropout = torch.nn.Dropout(dropout, inplace=True) else: self.inplace_droput = None - def forward(self, x: torch.Tensor, - h: Optional[Tuple[torch.Tensor, torch.Tensor]] = None): + def forward( + self, x: torch.Tensor, h: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + ): x, h = self.lstm(x, h) if self.inplace_dropout is not None: @@ -94,15 +110,20 @@ class StackTime(torch.nn.Module): def __init__(self, factor): super().__init__() self.factor = int(factor) - self.dev = torch.device("cuda:0") if torch.cuda.is_available() and os.environ.get("USE_GPU", "").lower() not in [ "no", "false" ] else torch.device("cpu") - + self.dev = ( + torch.device("cuda:0") + if torch.cuda.is_available() + and os.environ.get("USE_GPU", "").lower() not in ["no", "false"] + else torch.device("cpu") + ) def forward(self, x, x_lens): # T, B, U r = torch.transpose(x, 0, 1).to(self.dev) s = r.shape zeros = torch.zeros( - s[0], (-s[1]) % self.factor, s[2], dtype=r.dtype, device=r.device) + s[0], (-s[1]) % self.factor, s[2], dtype=r.dtype, device=r.device + ) r = torch.cat([r, zeros], 1) s = r.shape rs = [s[0], s[1] // self.factor, s[2] * self.factor] diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/utils/convert_librispeech.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/utils/convert_librispeech.py index e90076cb0..54bcc9b3e 
100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/utils/convert_librispeech.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/utils/convert_librispeech.py @@ -24,40 +24,59 @@ from preprocessing_utils import parallel_preprocess -parser = argparse.ArgumentParser(description='Preprocess LibriSpeech.') -parser.add_argument('--input_dir', type=str, required=True, - help='LibriSpeech collection input dir') -parser.add_argument('--dest_dir', type=str, required=True, - help='Output dir') -parser.add_argument('--output_json', type=str, default='./', - help='name of the output json file.') -parser.add_argument('-s', '--speed', type=float, nargs='*', - help='Speed perturbation ratio') -parser.add_argument('--target_sr', type=int, default=None, - help='Target sample rate. ' - 'defaults to the input sample rate') -parser.add_argument('--overwrite', action='store_true', - help='Overwrite file if exists') -parser.add_argument('--parallel', type=int, default=multiprocessing.cpu_count(), - help='Number of threads to use when processing audio files') +parser = argparse.ArgumentParser(description="Preprocess LibriSpeech.") +parser.add_argument( + "--input_dir", type=str, required=True, help="LibriSpeech collection input dir" +) +parser.add_argument("--dest_dir", type=str, required=True, help="Output dir") +parser.add_argument( + "--output_json", type=str, default="./", help="name of the output json file." +) +parser.add_argument( + "-s", "--speed", type=float, nargs="*", help="Speed perturbation ratio" +) +parser.add_argument( + "--target_sr", + type=int, + default=None, + help="Target sample rate. " "defaults to the input sample rate", +) +parser.add_argument( + "--overwrite", + action="store_true", + help="Overwrite file if exists") +parser.add_argument( + "--parallel", + type=int, + default=multiprocessing.cpu_count(), + help="Number of threads to use when processing audio files", +) args = parser.parse_args() -args.input_dir = args.input_dir.rstrip('/') -args.dest_dir = args.dest_dir.rstrip('/') +args.input_dir = args.input_dir.rstrip("/") +args.dest_dir = args.dest_dir.rstrip("/") def build_input_arr(input_dir): - txt_files = glob.glob(os.path.join(input_dir, '**', '*.trans.txt'), - recursive=True) + txt_files = glob.glob( + os.path.join( + input_dir, + "**", + "*.trans.txt"), + recursive=True) input_data = [] for txt_file in txt_files: rel_path = os.path.relpath(txt_file, input_dir) with open(txt_file) as fp: for line in fp: - fname, _, transcript = line.partition(' ') - input_data.append(dict(input_relpath=os.path.dirname(rel_path), - input_fname=fname + '.flac', - transcript=transcript)) + fname, _, transcript = line.partition(" ") + input_data.append( + dict( + input_relpath=os.path.dirname(rel_path), + input_fname=fname + ".flac", + transcript=transcript, + ) + ) return input_data @@ -65,18 +84,20 @@ def build_input_arr(input_dir): dataset = build_input_arr(input_dir=args.input_dir) print("[%s] Converting audio files..." % args.output_json) -dataset = parallel_preprocess(dataset=dataset, - input_dir=args.input_dir, - dest_dir=args.dest_dir, - target_sr=args.target_sr, - speed=args.speed, - overwrite=args.overwrite, - parallel=args.parallel) +dataset = parallel_preprocess( + dataset=dataset, + input_dir=args.input_dir, + dest_dir=args.dest_dir, + target_sr=args.target_sr, + speed=args.speed, + overwrite=args.overwrite, + parallel=args.parallel, +) print("[%s] Generating json..." % args.output_json) df = pd.DataFrame(dataset, dtype=object) # Save json with python. 
df.to_json() produces back slashed in file paths -dataset = df.to_dict(orient='records') -with open(args.output_json, 'w') as fp: +dataset = df.to_dict(orient="records") +with open(args.output_json, "w") as fp: json.dump(dataset, fp, indent=2) diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/utils/download_librispeech.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/utils/download_librispeech.py index f7e5eda13..1a0875433 100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/utils/download_librispeech.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/utils/download_librispeech.py @@ -21,29 +21,39 @@ from download_utils import download_file, md5_checksum, extract parser = argparse.ArgumentParser( - description='Download, verify and extract dataset files') -parser.add_argument('csv', type=str, - help='CSV file with urls and checksums to download.') -parser.add_argument('dest', type=str, - help='Download destnation folder.') -parser.add_argument('-e', type=str, default=None, - help='Extraction destnation folder. Defaults to download folder if not provided') -parser.add_argument('--skip_download', action='store_true', - help='Skip downloading the files') -parser.add_argument('--skip_checksum', action='store_true', - help='Skip checksum') -parser.add_argument('--skip_extract', action='store_true', - help='Skip extracting files') + description="Download, verify and extract dataset files" +) +parser.add_argument( + "csv", type=str, help="CSV file with urls and checksums to download." +) +parser.add_argument("dest", type=str, help="Download destnation folder.") +parser.add_argument( + "-e", + type=str, + default=None, + help="Extraction destnation folder. Defaults to download folder if not provided", +) +parser.add_argument( + "--skip_download", action="store_true", help="Skip downloading the files" +) +parser.add_argument( + "--skip_checksum", + action="store_true", + help="Skip checksum") +parser.add_argument( + "--skip_extract", + action="store_true", + help="Skip extracting files") args = parser.parse_args() args.e = args.e or args.dest -df = pd.read_csv(args.csv, delimiter=',') +df = pd.read_csv(args.csv, delimiter=",") if not args.skip_download: for url in df.url: - fname = url.split('/')[-1] + fname = url.split("/")[-1] print("Downloading %s:" % fname) download_file(url=url, dest_folder=args.dest, fname=fname) else: @@ -52,11 +62,11 @@ if not args.skip_checksum: for index, row in df.iterrows(): - url = row['url'] - md5 = row['md5'] - fname = url.split('/')[-1] + url = row["url"] + md5 = row["md5"] + fname = url.split("/")[-1] fpath = os.path.join(args.dest, fname) - print("Verifing %s: " % fname, end='') + print("Verifing %s: " % fname, end="") ret = md5_checksum(fpath=fpath, target_hash=md5) if not ret: raise ValueError(f"Checksum for {fname} failed!") @@ -68,7 +78,7 @@ if not args.skip_extract: for url in df.url: - fname = url.split('/')[-1] + fname = url.split("/")[-1] fpath = os.path.join(args.dest, fname) print("Decompressing %s:" % fpath) extract(fpath=fpath, dest_folder=args.e) diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/utils/download_utils.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/utils/download_utils.py index bda4193fb..8b059229f 100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/utils/download_utils.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/utils/download_utils.py @@ -30,17 +30,18 @@ def download_file(url, dest_folder, fname, overwrite=False): print("File exists, skipping 
download.") return - tmp_fpath = fpath + '.tmp' + tmp_fpath = fpath + ".tmp" r = requests.get(url, stream=True) - file_size = int(r.headers['Content-Length']) + file_size = int(r.headers["Content-Length"]) chunk_size = 1024 * 1024 # 1MB total_chunks = int(file_size / chunk_size) - with open(tmp_fpath, 'wb') as fp: + with open(tmp_fpath, "wb") as fp: content_iterator = r.iter_content(chunk_size=chunk_size) - chunks = tqdm.tqdm(content_iterator, total=total_chunks, - unit='MB', desc=fpath, leave=True) + chunks = tqdm.tqdm( + content_iterator, total=total_chunks, unit="MB", desc=fpath, leave=True + ) for chunk in chunks: fp.write(chunk) @@ -56,14 +57,15 @@ def md5_checksum(fpath, target_hash): def extract(fpath, dest_folder): - if fpath.endswith('.tar.gz'): - mode = 'r:gz' - elif fpath.endswith('.tar'): - mode = 'r:' + if fpath.endswith(".tar.gz"): + mode = "r:gz" + elif fpath.endswith(".tar"): + mode = "r:" else: - raise IOError('fpath has unknown extention: %s' % fpath) + raise IOError("fpath has unknown extention: %s" % fpath) with tarfile.open(fpath, mode) as tar: members = tar.getmembers() - for member in tqdm.tqdm(iterable=members, total=len(members), leave=True): + for member in tqdm.tqdm( + iterable=members, total=len(members), leave=True): tar.extract(path=dest_folder, member=member) diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch/utils/preprocessing_utils.py b/retired_benchmarks/speech_recognition/rnnt/pytorch/utils/preprocessing_utils.py index 260e860b8..8d5eba12d 100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch/utils/preprocessing_utils.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch/utils/preprocessing_utils.py @@ -23,55 +23,62 @@ from tqdm import tqdm -def preprocess(data, input_dir, dest_dir, target_sr=None, speed=None, - overwrite=True): +def preprocess(data, input_dir, dest_dir, target_sr=None, + speed=None, overwrite=True): speed = speed or [] speed.append(1) speed = list(set(speed)) # Make uniqe - input_fname = os.path.join(input_dir, - data['input_relpath'], - data['input_fname']) + input_fname = os.path.join( + input_dir, + data["input_relpath"], + data["input_fname"]) input_sr = sox.file_info.sample_rate(input_fname) target_sr = target_sr or input_sr - os.makedirs(os.path.join(dest_dir, data['input_relpath']), exist_ok=True) + os.makedirs(os.path.join(dest_dir, data["input_relpath"]), exist_ok=True) output_dict = {} - output_dict['transcript'] = data['transcript'].lower().strip() - output_dict['files'] = [] + output_dict["transcript"] = data["transcript"].lower().strip() + output_dict["files"] = [] - fname = os.path.splitext(data['input_fname'])[0] + fname = os.path.splitext(data["input_fname"])[0] for s in speed: output_fname = fname + \ - '{}.wav'.format('' if s == 1 else '-{}'.format(s)) - output_fpath = os.path.join(dest_dir, - data['input_relpath'], - output_fname) + "{}.wav".format("" if s == 1 else "-{}".format(s)) + output_fpath = os.path.join( + dest_dir, data["input_relpath"], output_fname) if not os.path.exists(output_fpath) or overwrite: cbn = sox.Transformer().speed(factor=s).convert(target_sr) cbn.build(input_fname, output_fpath) file_info = sox.file_info.info(output_fpath) - file_info['fname'] = os.path.join(os.path.basename(dest_dir), - data['input_relpath'], - output_fname) - file_info['speed'] = s - output_dict['files'].append(file_info) + file_info["fname"] = os.path.join( + os.path.basename(dest_dir), data["input_relpath"], output_fname + ) + file_info["speed"] = s + output_dict["files"].append(file_info) if 
s == 1: file_info = sox.file_info.info(output_fpath) - output_dict['original_duration'] = file_info['duration'] - output_dict['original_num_samples'] = file_info['num_samples'] + output_dict["original_duration"] = file_info["duration"] + output_dict["original_num_samples"] = file_info["num_samples"] return output_dict -def parallel_preprocess(dataset, input_dir, dest_dir, target_sr, speed, overwrite, parallel): +def parallel_preprocess( + dataset, input_dir, dest_dir, target_sr, speed, overwrite, parallel +): with multiprocessing.Pool(parallel) as p: - func = functools.partial(preprocess, - input_dir=input_dir, dest_dir=dest_dir, - target_sr=target_sr, speed=speed, overwrite=overwrite) + func = functools.partial( + preprocess, + input_dir=input_dir, + dest_dir=dest_dir, + target_sr=target_sr, + speed=speed, + overwrite=overwrite, + ) dataset = list(tqdm(p.imap(func, dataset), total=len(dataset))) return dataset diff --git a/retired_benchmarks/speech_recognition/rnnt/pytorch_SUT.py b/retired_benchmarks/speech_recognition/rnnt/pytorch_SUT.py index 429d74234..02589becd 100644 --- a/retired_benchmarks/speech_recognition/rnnt/pytorch_SUT.py +++ b/retired_benchmarks/speech_recognition/rnnt/pytorch_SUT.py @@ -12,28 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. +from model_separable_rnnt import RNNT +from preprocessing import AudioPreprocessing +from helpers import add_blank_label +from decoders import ScriptGreedyDecoder +from QSL import AudioQSL, AudioQSLInMemory +from tqdm import tqdm +import mlperf_loadgen as lg +import toml +import numpy as np +import torch +import array import sys import os -sys.path.insert(0, os.path.join(os.getcwd(), "pytorch")) -import array -import torch -import numpy as np -import toml -import mlperf_loadgen as lg -from tqdm import tqdm - -from QSL import AudioQSL, AudioQSLInMemory -from decoders import ScriptGreedyDecoder -from helpers import add_blank_label -from preprocessing import AudioPreprocessing -from model_separable_rnnt import RNNT +sys.path.insert(0, os.path.join(os.getcwd(), "pytorch")) def load_and_migrate_checkpoint(ckpt_path): checkpoint = torch.load(ckpt_path, map_location="cpu") migrated_state_dict = {} - for key, value in checkpoint['state_dict'].items(): + for key, value in checkpoint["state_dict"].items(): key = key.replace("joint_net", "joint.net") migrated_state_dict[key] = value del migrated_state_dict["audio_preprocessor.featurizer.fb"] @@ -42,46 +41,59 @@ def load_and_migrate_checkpoint(ckpt_path): class PytorchSUT: - def __init__(self, config_toml, checkpoint_path, dataset_dir, - manifest_filepath, perf_count): + def __init__( + self, config_toml, checkpoint_path, dataset_dir, manifest_filepath, perf_count + ): config = toml.load(config_toml) - dataset_vocab = config['labels']['labels'] + dataset_vocab = config["labels"]["labels"] rnnt_vocab = add_blank_label(dataset_vocab) - featurizer_config = config['input_eval'] + featurizer_config = config["input_eval"] - self.dev = torch.device("cuda:0") if torch.cuda.is_available() and os.environ.get("USE_GPU", "").lower() not in [ "no", "false" ] else torch.device("cpu") + self.dev = ( + torch.device("cuda:0") + if torch.cuda.is_available() + and os.environ.get("USE_GPU", "").lower() not in ["no", "false"] + else torch.device("cpu") + ) self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) - self.qsl = AudioQSLInMemory(dataset_dir, - manifest_filepath, - dataset_vocab, - featurizer_config["sample_rate"], - perf_count) + 
self.qsl = AudioQSLInMemory( + dataset_dir, + manifest_filepath, + dataset_vocab, + featurizer_config["sample_rate"], + perf_count, + ) self.audio_preprocessor = AudioPreprocessing(**featurizer_config) self.audio_preprocessor.eval() self.audio_preprocessor = torch.jit.script(self.audio_preprocessor) self.audio_preprocessor = torch.jit._recursive.wrap_cpp_module( - torch._C._freeze_module(self.audio_preprocessor._c)) + torch._C._freeze_module(self.audio_preprocessor._c) + ) model = RNNT( feature_config=featurizer_config, - rnnt=config['rnnt'], - num_classes=len(rnnt_vocab) + rnnt=config["rnnt"], + num_classes=len(rnnt_vocab), ) - model.load_state_dict(load_and_migrate_checkpoint(checkpoint_path), - strict=True) + model.load_state_dict( + load_and_migrate_checkpoint(checkpoint_path), + strict=True) model.to(self.dev) model.eval() model.encoder = torch.jit.script(model.encoder) model.encoder = torch.jit._recursive.wrap_cpp_module( - torch._C._freeze_module(model.encoder._c)) + torch._C._freeze_module(model.encoder._c) + ) model.prediction = torch.jit.script(model.prediction) model.prediction = torch.jit._recursive.wrap_cpp_module( - torch._C._freeze_module(model.prediction._c)) + torch._C._freeze_module(model.prediction._c) + ) model.joint = torch.jit.script(model.joint) model.joint = torch.jit._recursive.wrap_cpp_module( - torch._C._freeze_module(model.joint._c)) + torch._C._freeze_module(model.joint._c) + ) model = torch.jit.script(model) self.greedy_decoder = ScriptGreedyDecoder(len(rnnt_vocab) - 1, model) @@ -96,18 +108,22 @@ def issue_queries(self, query_samples): with torch.no_grad(): waveform = torch.from_numpy(waveform) waveform_length = torch.from_numpy(waveform_length) - feature, feature_length = self.audio_preprocessor.forward((waveform, waveform_length)) + feature, feature_length = self.audio_preprocessor.forward( + (waveform, waveform_length) + ) assert feature.ndim == 3 assert feature_length.ndim == 1 feature = feature.permute(2, 0, 1) - _, _, transcript = self.greedy_decoder.forward(feature, feature_length) + _, _, transcript = self.greedy_decoder.forward( + feature, feature_length) assert len(transcript) == 1 - response_array = array.array('q', transcript[0]) + response_array = array.array("q", transcript[0]) bi = response_array.buffer_info() - response = lg.QuerySampleResponse(query_sample.id, bi[0], - bi[1] * response_array.itemsize) + response = lg.QuerySampleResponse( + query_sample.id, bi[0], bi[1] * response_array.itemsize + ) lg.QuerySamplesComplete([response]) def flush_queries(self): diff --git a/retired_benchmarks/speech_recognition/rnnt/run.py b/retired_benchmarks/speech_recognition/rnnt/run.py index 02f12e73b..1a8e4cef2 100644 --- a/retired_benchmarks/speech_recognition/rnnt/run.py +++ b/retired_benchmarks/speech_recognition/rnnt/run.py @@ -20,20 +20,46 @@ import os from pathlib import Path -MLPERF_CONF = Path(os.path.dirname(os.path.realpath(__file__))) / "../../mlperf.conf" +MLPERF_CONF = Path( + os.path.dirname( + os.path.realpath(__file__))) / "../../mlperf.conf" MLPERF_CONF = MLPERF_CONF.resolve() def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--backend", choices=["pytorch"], default="pytorch", help="Backend") - parser.add_argument("--scenario", choices=["SingleStream", "Offline", "Server"], default="Offline", help="Scenario") - parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") - parser.add_argument("--mlperf_conf", default=str(MLPERF_CONF), help="mlperf rules config") - 
parser.add_argument("--user_conf", default="user.conf", help="user config for user LoadGen settings such as target QPS") - parser.add_argument("--audit_conf", default="audit.conf", help="audit config for LoadGen settings during compliance runs") - parser.add_argument("--pytorch_config_toml", default="pytorch/configs/rnnt.toml") - parser.add_argument("--pytorch_checkpoint", default="pytorch/work_dir/rnnt.pt") + parser.add_argument( + "--backend", choices=["pytorch"], default="pytorch", help="Backend" + ) + parser.add_argument( + "--scenario", + choices=["SingleStream", "Offline", "Server"], + default="Offline", + help="Scenario", + ) + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") + parser.add_argument( + "--mlperf_conf", default=str(MLPERF_CONF), help="mlperf rules config" + ) + parser.add_argument( + "--user_conf", + default="user.conf", + help="user config for user LoadGen settings such as target QPS", + ) + parser.add_argument( + "--audit_conf", + default="audit.conf", + help="audit config for LoadGen settings during compliance runs", + ) + parser.add_argument( + "--pytorch_config_toml", + default="pytorch/configs/rnnt.toml") + parser.add_argument( + "--pytorch_checkpoint", + default="pytorch/work_dir/rnnt.pt") parser.add_argument("--dataset_dir", required=True) parser.add_argument("--manifest", required=True) parser.add_argument("--perf_count", type=int, default=None) @@ -54,8 +80,14 @@ def main(): if args.backend == "pytorch": from pytorch_SUT import PytorchSUT - sut = PytorchSUT(args.pytorch_config_toml, args.pytorch_checkpoint, - args.dataset_dir, args.manifest, args.perf_count) + + sut = PytorchSUT( + args.pytorch_config_toml, + args.pytorch_checkpoint, + args.dataset_dir, + args.manifest, + args.perf_count, + ) else: raise ValueError("Unknown backend: {:}".format(args.backend)) @@ -78,7 +110,9 @@ def main(): log_settings.log_output = log_output_settings print("Running Loadgen test...") - lg.StartTestWithLogSettings(sut.sut, sut.qsl.qsl, settings, log_settings, args.audit_conf) + lg.StartTestWithLogSettings( + sut.sut, sut.qsl.qsl, settings, log_settings, args.audit_conf + ) if args.accuracy: cmd = f"python3 accuracy_eval.py --log_dir {log_path} --dataset_dir {args.dataset_dir} --manifest {args.manifest}" diff --git a/retired_benchmarks/translation/gnmt/tensorflow/generic_loadgen.py b/retired_benchmarks/translation/gnmt/tensorflow/generic_loadgen.py index 3c280bdd7..50c7bd3b0 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/generic_loadgen.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/generic_loadgen.py @@ -20,14 +20,17 @@ import mlperf_loadgen import array -class ImplementationException (Exception): + +class ImplementationException(Exception): def __init__(self, msg): self.msg = msg def __repr__(self): return "ImplementationException: {}".format(self.msg) -def flush_queries(): pass + +def flush_queries(): + pass class Task: @@ -35,8 +38,9 @@ def __init__(self, query_id, sample_id): self.query_id = query_id self.sample_id = sample_id + class Runner: - + def __init__(self, qSize=5): self.tasks = Queue(maxsize=qSize) @@ -59,7 +63,7 @@ def handle_tasks(self): # Block until an item becomes available qitem = self.tasks.get(block=True) - # When a "None" item was added, it is a + # When a "None" item was added, it is a # signal from the parent to indicate we should stop # working (see finish) if qitem is None: @@ -71,7 +75,7 @@ def handle_tasks(self): self.post_process(qitem.query_id, results) self.tasks.task_done() - + ## 
# @brief Post process results # @note This should serialize the results for query_ids and hand it over to loadgen @@ -106,7 +110,8 @@ def start_worker(self): self.worker.daemon = True self.worker.start() -class DummyRunner (Runner): + +class DummyRunner(Runner): def __init__(self): Runner.__init__(self) self.count = 0 @@ -118,11 +123,16 @@ def enqueue(self, query_samples): self.tasks.put(task) def process(self, qitem): - print("Default dummy process, processing the {}'th query for sample ID {}.".format(self.count, qitem.sample_id[0])) + print( + "Default dummy process, processing the {}'th query for sample ID {}.".format( + self.count, qitem.sample_id[0] + ) + ) self.count += 1 - + return self.count + if __name__ == "__main__": runner = DummyRunner() @@ -135,14 +145,17 @@ def process(self, qitem): # Specify exactly how many queries need to be made settings.min_query_count = 3003 settings.max_query_count = 3003 - - total_queries = 256 # Maximum sample ID + 1 - perf_queries = 8 # TBD: Doesn't seem to have an effect + + total_queries = 256 # Maximum sample ID + 1 + perf_queries = 8 # TBD: Doesn't seem to have an effect sut = mlperf_loadgen.ConstructSUT(runner.enqueue, flush_queries) qsl = mlperf_loadgen.ConstructQSL( - total_queries, perf_queries, runner.load_samples_to_ram, runner.unload_samples_from_ram) + total_queries, + perf_queries, + runner.load_samples_to_ram, + runner.unload_samples_from_ram, + ) mlperf_loadgen.StartTest(sut, qsl, settings) mlperf_loadgen.DestroyQSL(qsl) mlperf_loadgen.DestroySUT(sut) - diff --git a/retired_benchmarks/translation/gnmt/tensorflow/loadgen_gnmt.py b/retired_benchmarks/translation/gnmt/tensorflow/loadgen_gnmt.py index b99debf66..2661bd274 100755 --- a/retired_benchmarks/translation/gnmt/tensorflow/loadgen_gnmt.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/loadgen_gnmt.py @@ -28,6 +28,7 @@ NANO_SEC = 1e9 + ## # @brief Translation task that contains 1 sentence ID. 
class TranslationTask: @@ -37,13 +38,14 @@ def __init__(self, query_id, sentence_id, output_file): self.output_file = output_file self.start = time.time() + ## # @brief Translation task that contains an array of sentence IDs class BatchTranslationTask: def __init__(self, sentence_id_list, query_id_list): self.sentence_id_list = sentence_id_list self.query_id_list = query_id_list - self.query_id = query_id_list #FIXME generic_loadgen needs this + self.query_id = query_id_list # FIXME generic_loadgen needs this ## @@ -56,33 +58,46 @@ class GNMTWrapper: # @param vocab_prefix: Path to vocabulary file (note: don't add .en or .de suffixes) # @param outdir: Output directory to optionally write translations to # @param batch_size: batch size to use when processing BatchTranslationTasks - def __init__(self, ckpt_path=None, hparams_path=None, vocab_prefix=None, outdir=None, batch_size=32): - # If no value is provided for the construtor arguments, set defaults here + def __init__( + self, + ckpt_path=None, + hparams_path=None, + vocab_prefix=None, + outdir=None, + batch_size=32, + ): + # If no value is provided for the construtor arguments, set defaults + # here if ckpt_path is None: - ckpt_path = os.path.join(os.getcwd(), 'ende_gnmt_model_4_layer', - 'translate.ckpt') + ckpt_path = os.path.join( + os.getcwd(), "ende_gnmt_model_4_layer", "translate.ckpt" + ) if hparams_path is None: - hparams_path= os.path.join(os.getcwd(), 'nmt', 'standard_hparams', - 'wmt16_gnmt_4_layer.json') + hparams_path = os.path.join( + os.getcwd(), "nmt", "standard_hparams", "wmt16_gnmt_4_layer.json" + ) if vocab_prefix is None: - vocab_prefix = os.path.join(os.getcwd(), 'nmt', 'data', 'vocab.bpe.32000') + vocab_prefix = os.path.join( + os.getcwd(), "nmt", "data", "vocab.bpe.32000") - - flags = self.parse_options(ckpt_path, hparams_path, vocab_prefix, outdir, batch_size) + flags = self.parse_options( + ckpt_path, hparams_path, vocab_prefix, outdir, batch_size + ) self.setup(flags) self.count = 0 self.infer_data = [] # This will be filled by load_sentences - ## # @brief Parse GNMT-specific options before setting up - def parse_options(self, ckpt_path, hparams_path, vocab_prefix, outdir, batch_size): + def parse_options(self, ckpt_path, hparams_path, + vocab_prefix, outdir, batch_size): FLAGS = None - # TBD remove argument parsing, and just have it return all default values. + # TBD remove argument parsing, and just have it return all default + # values. nmt_parser = argparse.ArgumentParser() add_arguments(nmt_parser) FLAGS, unparsed = nmt_parser.parse_known_args() @@ -93,58 +108,66 @@ def parse_options(self, ckpt_path, hparams_path, vocab_prefix, outdir, batch_siz FLAGS.infer_batch_size = batch_size FLAGS.num_inter_threads = 1 FLAGS.num_intra_threads = 1 - FLAGS.run = "accuracy" # Needs to be set to accuracy to generate output + FLAGS.run = "accuracy" # Needs to be set to accuracy to generate output # Pass in inference specific flags FLAGS.ckpt = ckpt_path - FLAGS.src = 'en' - FLAGS.tgt = 'de' + FLAGS.src = "en" + FLAGS.tgt = "de" FLAGS.hparams_path = hparams_path FLAGS.out_dir = outdir FLAGS.vocab_prefix = vocab_prefix - + return FLAGS ## - # @brief Configure hparams and setup GNMT graph + # @brief Configure hparams and setup GNMT graph # @pre Requires output from parse_options def setup(self, flags): # Model output directory out_dir = flags.out_dir if out_dir and not tf.gfile.Exists(out_dir): - tf.gfile.MakeDirs(out_dir) + tf.gfile.MakeDirs(out_dir) # Load hparams. 
default_hparams = create_hparams(flags) loaded_hparams = False if flags.ckpt: # Try to load hparams from the same directory as ckpt - ckpt_dir = os.path.dirname(flags.ckpt) - ckpt_hparams_file = os.path.join(ckpt_dir, "hparams") - if tf.gfile.Exists(ckpt_hparams_file) or flags.hparams_path: - # Note: for some reason this will create an empty "best_bleu" directory and copy vocab files - hparams = create_or_load_hparams(ckpt_dir, default_hparams, flags.hparams_path, save_hparams=False) + ckpt_dir = os.path.dirname(flags.ckpt) + ckpt_hparams_file = os.path.join(ckpt_dir, "hparams") + if tf.gfile.Exists(ckpt_hparams_file) or flags.hparams_path: + # Note: for some reason this will create an empty "best_bleu" + # directory and copy vocab files + hparams = create_or_load_hparams( + ckpt_dir, default_hparams, flags.hparams_path, save_hparams=False + ) loaded_hparams = True - + assert loaded_hparams # GPU device config_proto = utils.get_config_proto( allow_soft_placement=True, num_intra_threads=hparams.num_intra_threads, - num_inter_threads=hparams.num_inter_threads) + num_inter_threads=hparams.num_inter_threads, + ) utils.print_out( - "# Devices visible to TensorFlow: %s" - % repr(tf.Session(config=config_proto).list_devices())) - + "# Devices visible to TensorFlow: %s" + % repr(tf.Session(config=config_proto).list_devices()) + ) - # Inference indices (inference_indices is broken, but without setting it to None we'll crash) + # Inference indices (inference_indices is broken, but without setting + # it to None we'll crash) hparams.inference_indices = None - + # Create the graph model_creator = get_model_creator(hparams) - infer_model = model_helper.create_infer_model(model_creator, hparams, scope=None) - sess, loaded_infer_model = start_sess_and_load_model(infer_model, flags.ckpt, - hparams) + infer_model = model_helper.create_infer_model( + model_creator, hparams, scope=None + ) + sess, loaded_infer_model = start_sess_and_load_model( + infer_model, flags.ckpt, hparams + ) # Parameters needed by TF GNMT self.hparams = hparams @@ -165,28 +188,37 @@ def translate(self, sentence_id_list): self.sess.run( self.infer_model.iterator.initializer, feed_dict={ - self.infer_model.src_placeholder: [self.infer_data[i] for i in sentence_id_list], - self.infer_model.batch_size_placeholder: min(self.hparams.infer_batch_size, len(sentence_id_list)) - }) + self.infer_model.src_placeholder: [ + self.infer_data[i] for i in sentence_id_list + ], + self.infer_model.batch_size_placeholder: min( + self.hparams.infer_batch_size, len(sentence_id_list) + ), + }, + ) # Start the translation nmt_outputs, _ = self.loaded_infer_model.decode(self.sess) if infer_mode != "beam_search": - nmt_outputs = np.expand_dims(nmt_outputs, 0) + nmt_outputs = np.expand_dims(nmt_outputs, 0) batch_size = nmt_outputs.shape[1] assert batch_size <= self.hparams.infer_batch_size - # Whether beam search is being used or not, we only want 1 final translation + # Whether beam search is being used or not, we only want 1 final + # translation assert self.hparams.num_translations_per_input == 1 translation = [] for decoded_id in range(batch_size): - translation += [nmt_utils.get_translation( - nmt_outputs[0], - decoded_id, - tgt_eos=self.hparams.eos, - subword_option=self.hparams.subword_option)] + translation += [ + nmt_utils.get_translation( + nmt_outputs[0], + decoded_id, + tgt_eos=self.hparams.eos, + subword_option=self.hparams.subword_option, + ) + ] # Keeping track of how many translations happened self.count += len(translation) @@ -205,24 +237,28 
@@ def getBatchSize(self): def load_sentences(self, input_file): self.infer_data = load_data(input_file, self.hparams) + ## # @brief Basic class in which LoadGen can store queries that will be processed by GNMT -class GNMTRunner (Runner): +class GNMTRunner(Runner): ## - # @brief Constructor + # @brief Constructor # @param model: GNMTWrapper object # @param input_file: path to the input text # @param verbose: provide some information on the progress def __init__(self, model, input_file=None, verbose=False): Runner.__init__(self) - # If no value is provided for the construtor arguments, set defaults here + # If no value is provided for the construtor arguments, set defaults + # here if input_file is None: - input_file = os.path.join(os.getcwd(), 'nmt', 'data', 'newstest2014.tok.bpe.32000.en') + input_file = os.path.join( + os.getcwd(), "nmt", "data", "newstest2014.tok.bpe.32000.en" + ) self.gnmt = model self.input_file = input_file - + self.VERBOSE = verbose ## @@ -258,12 +294,14 @@ def process(self, qitem): # Split the samples over batches for i in range(0, num_samples, bs): - cur_sentid_list = [index for index in qitem.sentence_id_list[i:min(i+bs, num_samples)]] + cur_sentid_list = [ + index for index in qitem.sentence_id_list[i: min(i + bs, num_samples)] + ] translation += self.gnmt.translate(cur_sentid_list) if self.VERBOSE: print("Performed {} translations".format(self.gnmt.getCount())) - + return translation ## @@ -272,40 +310,50 @@ def enqueue(self, query_samples): if self.VERBOSE: print("Received query") query_id_list = [sample.id for sample in query_samples] - sentence_id_list = [sample.index for sample in query_samples] + sentence_id_list = [sample.index for sample in query_samples] task = BatchTranslationTask(sentence_id_list, query_id_list) self.tasks.put(task) ## # @brief Serialize the result and give it to mlperf_loadgen # @param query_ids is a list of query ids that generated the samples - # @param results is a list of UTF-8 encoded strings + # @param results is a list of UTF-8 encoded strings # @note Because of Python's Garbage Collection, we need to call QuerySamplesComplete before returning def post_process(self, query_ids, results): response = [] # To prevent the garbage collector from removing serialized data before the call to QuerySamplesComplete # we need to keep track of serialized data here. 
- gc_hack = [] + gc_hack = [] for res, q_id in zip(results, query_ids): - result_arr = array.array('B', res) + result_arr = array.array("B", res) gc_hack.append(result_arr) r_info = result_arr.buffer_info() - response.append(mlperf_loadgen.QuerySampleResponse(q_id, r_info[0], r_info[1])) + response.append( + mlperf_loadgen.QuerySampleResponse(q_id, r_info[0], r_info[1]) + ) # Tell loadgen that we're ready with this query mlperf_loadgen.QuerySamplesComplete(response) + ## # @brief Subclass of GNMTRunner, specialized for batch size 1 -class SingleStreamGNMTRunner (GNMTRunner): +class SingleStreamGNMTRunner(GNMTRunner): ## - # @brief Constructor + # @brief Constructor # @param model: GNMTWrapper object # @param input_file: path to the input text # @param store_translation: whether output should be stored # @param verbose: provide some information on the progress # @param outdir: Output directory to optionally write translations to - def __init__(self, model, input_file=None, store_translation=False, verbose=False, outdir=None): + def __init__( + self, + model, + input_file=None, + store_translation=False, + verbose=False, + outdir=None, + ): GNMTRunner.__init__(self, model, input_file, verbose=verbose) # SingleStreamGNMTRunner only handles batch sizes of 1 @@ -319,12 +367,14 @@ def __init__(self, model, input_file=None, store_translation=False, verbose=Fals def process(self, qitem): if self.store_translation or self.VERBOSE: assert len(qitem.query_id) == 1 - msg = "translate {} (QID {}): Sentence ID {}".format(self.gnmt.getCount(), qitem.query_id[0], qitem.sentence_id) + msg = "translate {} (QID {}): Sentence ID {}".format( + self.gnmt.getCount(), qitem.query_id[0], qitem.sentence_id + ) if self.store_translation: msg += " --> " + qitem.output_file - print (msg) - - sentence_id = qitem.sentence_id + print(msg) + + sentence_id = qitem.sentence_id translation = self.gnmt.translate([sentence_id]) @@ -337,8 +387,9 @@ def process(self, qitem): ## # @brief Write translation to file def write_output(self, translation, trans_file): - with codecs.getwriter("utf-8")( - tf.gfile.GFile(trans_file, mode="wb")) as trans_f: + with codecs.getwriter("utf-8")( + tf.gfile.GFile(trans_file, mode="wb") + ) as trans_f: trans_f.write((translation + b"\n").decode("utf-8")) ## @@ -347,21 +398,25 @@ def enqueue(self, query_samples): assert len(query_samples) == 1 sample = query_samples[0] sentence_id = sample.index - output_file = os.path.join(self.out_dir, "sentence_{}_de".format(sample.index)) + output_file = os.path.join( + self.out_dir, + "sentence_{}_de".format( + sample.index)) task = TranslationTask(sample.id, sentence_id, output_file) self.tasks.put(task) + ## # @brief subclass of GNMTRunner, specialized for grouping multiple querries together class ServerGNMTRunner(GNMTRunner): ## - # @brief Constructor + # @brief Constructor # @param model: GNMTWrapper object # @param input_file: path to the input text # @param verbose: provide some information on the progress def __init__(self, model, input_file=None, verbose=False): - GNMTRunner.__init__(self, model, input_file, verbose) + GNMTRunner.__init__(self, model, input_file, verbose) ## # @brief Override the default handle_tasks loop for smart batching @@ -382,7 +437,7 @@ def handle_tasks(self): # or until we aggregated all current qurries try: # @note that by definition, Server queries should have no more than 1 element - # Therefore we don't need to worry that batched_querries would be come larger than + # Therefore we don't need to worry that batched_querries 
would be come larger than # the batch size while len(sentence_id_list) < self.gnmt.getBatchSize(): qitem = self.tasks.get(block=False) @@ -396,10 +451,15 @@ def handle_tasks(self): except queue.Empty as e: pass - batched_qitem = BatchTranslationTask(sentence_id_list, query_id_list) + batched_qitem = BatchTranslationTask( + sentence_id_list, query_id_list) if self.VERBOSE: - print("Aggregated {} single-sample querries.".format(len(batched_qitem.sentence_id_list))) + print( + "Aggregated {} single-sample querries.".format( + len(batched_qitem.sentence_id_list) + ) + ) results = self.process(batched_qitem) response = [] @@ -412,43 +472,71 @@ def handle_tasks(self): if __name__ == "__main__": SCENARIO_MAP = { - "SingleStream": mlperf_loadgen.TestScenario.SingleStream, - "MultiStream": mlperf_loadgen.TestScenario.MultiStream, - "Server": mlperf_loadgen.TestScenario.Server, - "Offline": mlperf_loadgen.TestScenario.Offline, + "SingleStream": mlperf_loadgen.TestScenario.SingleStream, + "MultiStream": mlperf_loadgen.TestScenario.MultiStream, + "Server": mlperf_loadgen.TestScenario.Server, + "Offline": mlperf_loadgen.TestScenario.Offline, } MODE_MAP = { "Performance": mlperf_loadgen.TestMode.PerformanceOnly, - "Accuracy": mlperf_loadgen.TestMode.AccuracyOnly + "Accuracy": mlperf_loadgen.TestMode.AccuracyOnly, } parser = argparse.ArgumentParser() - parser.add_argument('--scenario', type=str, default='SingleStream', - help="Scenario to be run: can be one of {SingleStream, Offline, MultiStream, Server}") - - parser.add_argument('--batch_size', type=int, default=32, - help="Max batch size to use in Offline and MultiStream scenarios.") - - parser.add_argument('--store_translation', default=False, action='store_true', - help="Store the output of translation? Note: Only valid with SingleStream scenario.") - - parser.add_argument('--verbose', default=False, action='store_true', - help="Verbose output.") - - parser.add_argument("--mode", default="Performance", help="Can be one of {Performance, Accuracy}") - - parser.add_argument("--debug_settings", default=False, action='store_true', - help="For debugging purposes, modify settings to small number of querries.") - - parser.add_argument("--qps", type=int, default=10, help="target qps estimate") - - parser.add_argument("--max-latency", type=str, default="0.100", help="mlperf max latency in 99pct tile") + parser.add_argument( + "--scenario", + type=str, + default="SingleStream", + help="Scenario to be run: can be one of {SingleStream, Offline, MultiStream, Server}", + ) + + parser.add_argument( + "--batch_size", + type=int, + default=32, + help="Max batch size to use in Offline and MultiStream scenarios.", + ) + + parser.add_argument( + "--store_translation", + default=False, + action="store_true", + help="Store the output of translation? Note: Only valid with SingleStream scenario.", + ) + + parser.add_argument( + "--verbose", default=False, action="store_true", help="Verbose output." 
+ ) + + parser.add_argument( + "--mode", default="Performance", help="Can be one of {Performance, Accuracy}" + ) + + parser.add_argument( + "--debug_settings", + default=False, + action="store_true", + help="For debugging purposes, modify settings to small number of querries.", + ) + + parser.add_argument( + "--qps", + type=int, + default=10, + help="target qps estimate") + + parser.add_argument( + "--max-latency", + type=str, + default="0.100", + help="mlperf max latency in 99pct tile", + ) args = parser.parse_args() - outdir = os.path.join(os.getcwd(), 'lg_output') + outdir = os.path.join(os.getcwd(), "lg_output") # Create loadGen settings settings = mlperf_loadgen.TestSettings() @@ -468,9 +556,13 @@ def handle_tasks(self): # Specify input file if args.mode == "Accuracy": - input_file = os.path.join(os.getcwd(), 'nmt', 'data', "newstest2014.tok.bpe.32000.en") + input_file = os.path.join( + os.getcwd(), "nmt", "data", "newstest2014.tok.bpe.32000.en" + ) else: - input_file = os.path.join(os.getcwd(), 'nmt', 'data', "newstest2014.tok.bpe.32000.en.large") + input_file = os.path.join( + os.getcwd(), "nmt", "data", "newstest2014.tok.bpe.32000.en.large" + ) # Build the GNMT model if args.scenario == "SingleStream": @@ -478,27 +570,39 @@ def handle_tasks(self): else: batch_size = args.batch_size - gnmt_model = GNMTWrapper(batch_size = batch_size, outdir=outdir) + gnmt_model = GNMTWrapper(batch_size=batch_size, outdir=outdir) if args.scenario == "SingleStream": - runner = SingleStreamGNMTRunner(gnmt_model, input_file=input_file, store_translation=args.store_translation, verbose=args.verbose, outdir=outdir) - + runner = SingleStreamGNMTRunner( + gnmt_model, + input_file=input_file, + store_translation=args.store_translation, + verbose=args.verbose, + outdir=outdir, + ) + # Specify exactly how many queries need to be made if args.debug_settings: settings.min_query_count = 80 settings.max_query_count = 80 elif args.scenario == "Offline": - runner = GNMTRunner(gnmt_model, input_file=input_file, verbose=args.verbose) - + runner = GNMTRunner( + gnmt_model, + input_file=input_file, + verbose=args.verbose) + # Specify exactly how many queries need to be made if args.debug_settings: settings.min_query_count = 1 settings.max_query_count = 1 elif args.scenario == "MultiStream": - runner = GNMTRunner(gnmt_model, input_file=input_file, verbose=args.verbose) - + runner = GNMTRunner( + gnmt_model, + input_file=input_file, + verbose=args.verbose) + # Specify exactly how many queries need to be made if args.debug_settings: settings.min_query_count = 100 @@ -506,8 +610,10 @@ def handle_tasks(self): settings.multi_stream_samples_per_query = 8 elif args.scenario == "Server": - runner = ServerGNMTRunner(gnmt_model, input_file=input_file, verbose=args.verbose) - + runner = ServerGNMTRunner( + gnmt_model, input_file=input_file, verbose=args.verbose + ) + # Specify exactly how many queries need to be made if args.debug_settings: settings.min_query_count = 20 @@ -519,13 +625,18 @@ def handle_tasks(self): # Create a thread in the GNMTRunner to start accepting work runner.start_worker() - total_queries = runner.getTotalNumSentences() # Maximum sample ID + 1 - perf_queries = min(total_queries, 3003) # Select the same subset of $perf_queries samples + total_queries = runner.getTotalNumSentences() # Maximum sample ID + 1 + perf_queries = min( + total_queries, 3003 + ) # Select the same subset of $perf_queries samples sut = mlperf_loadgen.ConstructSUT(runner.enqueue, flush_queries) qsl = mlperf_loadgen.ConstructQSL( - 
total_queries, perf_queries, runner.load_samples_to_ram, runner.unload_samples_from_ram) - + total_queries, + perf_queries, + runner.load_samples_to_ram, + runner.unload_samples_from_ram, + ) # Start generating queries by starting the test # A single test for all non-server scenarios @@ -536,7 +647,10 @@ def handle_tasks(self): # Multiple tests (depending on target latency array) for server scenario else: for target_latency in max_latency: - print("starting {} scenario, latency={}".format(args.scenario, target_latency)) + print( + "starting {} scenario, latency={}".format( + args.scenario, target_latency) + ) settings.server_target_latency_ns = int(target_latency * NANO_SEC) mlperf_loadgen.StartTest(sut, qsl, settings) @@ -544,4 +658,3 @@ def handle_tasks(self): runner.finish() mlperf_loadgen.DestroyQSL(qsl) mlperf_loadgen.DestroySUT(sut) - diff --git a/retired_benchmarks/translation/gnmt/tensorflow/mlcube.py b/retired_benchmarks/translation/gnmt/tensorflow/mlcube.py index f24708869..0aab3c3e1 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/mlcube.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/mlcube.py @@ -1,4 +1,5 @@ """MLCube handler file""" + import os import yaml import typer @@ -21,7 +22,9 @@ def run(data_dir: str) -> None: env = os.environ.copy() env.update( - {"DATA_DIR": data_dir,} + { + "DATA_DIR": data_dir, + } ) process = subprocess.Popen("./download_dataset.sh", cwd=".", env=env) @@ -42,10 +45,13 @@ def run(model_dir: str) -> None: env = os.environ.copy() env.update( - {"MODEL_DIR": model_dir,} + { + "MODEL_DIR": model_dir, + } ) - process = subprocess.Popen("./download_trained_model.sh", cwd=".", env=env) + process = subprocess.Popen( + "./download_trained_model.sh", cwd=".", env=env) process.wait() @@ -59,18 +65,21 @@ class RunPerformanceTask(object): Then executes the benchmark script""" @staticmethod - def run(data_dir: str, model_dir: str, output_dir: str, parameters_file: str) -> None: + def run( + data_dir: str, model_dir: str, output_dir: str, parameters_file: str + ) -> None: with open(parameters_file, "r") as stream: parameters = yaml.safe_load(stream) - env = os.environ.copy() - env.update({ - 'DATA_DIR': os.path.join(data_dir, 'nmt', 'data'), - 'MODEL_DIR': os.path.join(model_dir, 'ende_gnmt_model_4_layer'), - 'OUTPUT_DIR': output_dir, - 'TASK': "performance" - }) + env.update( + { + "DATA_DIR": os.path.join(data_dir, "nmt", "data"), + "MODEL_DIR": os.path.join(model_dir, "ende_gnmt_model_4_layer"), + "OUTPUT_DIR": output_dir, + "TASK": "performance", + } + ) env.update(parameters) @@ -88,18 +97,21 @@ class RunAccuracyTask(object): Then executes the benchmark script""" @staticmethod - def run(data_dir: str, model_dir: str, output_dir: str, parameters_file: str) -> None: + def run( + data_dir: str, model_dir: str, output_dir: str, parameters_file: str + ) -> None: with open(parameters_file, "r") as stream: parameters = yaml.safe_load(stream) - env = os.environ.copy() - env.update({ - 'DATA_DIR': os.path.join(data_dir, 'nmt', 'data'), - 'MODEL_DIR': os.path.join(model_dir, 'ende_gnmt_model_4_layer'), - 'OUTPUT_DIR': output_dir, - 'TASK': "accuracy" - }) + env.update( + { + "DATA_DIR": os.path.join(data_dir, "nmt", "data"), + "MODEL_DIR": os.path.join(model_dir, "ende_gnmt_model_4_layer"), + "OUTPUT_DIR": output_dir, + "TASK": "accuracy", + } + ) env.update(parameters) @@ -126,6 +138,7 @@ def run_performance( ): RunPerformanceTask.run(data_dir, model_dir, output_dir, parameters_file) + @app.command("run_accuracy") def run_accuracy( data_dir: str = 
typer.Option(..., "--data_dir"), diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/attention_model.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/attention_model.py index d262b8e8e..908be0824 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/attention_model.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/attention_model.py @@ -26,169 +26,184 @@ class AttentionModel(model.Model): - """Sequence-to-sequence dynamic model with attention. - - This class implements a multi-layer recurrent neural network as encoder, - and an attention-based decoder. This is the same as the model described in - (Luong et al., EMNLP'2015) paper: https://arxiv.org/pdf/1508.04025v5.pdf. - This class also allows to use GRU cells in addition to LSTM cells with - support for dropout. - """ - - def __init__(self, - hparams, - mode, - iterator, - source_vocab_table, - target_vocab_table, - reverse_target_vocab_table=None, - scope=None, - extra_args=None): - self.has_attention = hparams.attention_architecture and hparams.attention - - # Set attention_mechanism_fn - if self.has_attention: - if extra_args and extra_args.attention_mechanism_fn: - self.attention_mechanism_fn = extra_args.attention_mechanism_fn - else: - self.attention_mechanism_fn = create_attention_mechanism - - super(AttentionModel, self).__init__( - hparams=hparams, - mode=mode, - iterator=iterator, - source_vocab_table=source_vocab_table, - target_vocab_table=target_vocab_table, - reverse_target_vocab_table=reverse_target_vocab_table, - scope=scope, - extra_args=extra_args) - - def _prepare_beam_search_decoder_inputs( - self, beam_width, memory, source_sequence_length, encoder_state): - memory = tf.contrib.seq2seq.tile_batch( - memory, multiplier=beam_width) - source_sequence_length = tf.contrib.seq2seq.tile_batch( - source_sequence_length, multiplier=beam_width) - encoder_state = tf.contrib.seq2seq.tile_batch( - encoder_state, multiplier=beam_width) - batch_size = self.batch_size * beam_width - return memory, source_sequence_length, encoder_state, batch_size - - def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, - source_sequence_length): - """Build a RNN cell with attention mechanism that can be used by decoder.""" - # No Attention - if not self.has_attention: - return super(AttentionModel, self)._build_decoder_cell( - hparams, encoder_outputs, encoder_state, source_sequence_length) - elif hparams.attention_architecture != "standard": - raise ValueError( - "Unknown attention architecture %s" % hparams.attention_architecture) - - num_units = hparams.num_units - num_layers = self.num_decoder_layers - num_residual_layers = self.num_decoder_residual_layers - infer_mode = hparams.infer_mode - - dtype = tf.float32 - - # Ensure memory is batch-major - if self.time_major: - memory = tf.transpose(encoder_outputs, [1, 0, 2]) + """Sequence-to-sequence dynamic model with attention. + + This class implements a multi-layer recurrent neural network as encoder, + and an attention-based decoder. This is the same as the model described in + (Luong et al., EMNLP'2015) paper: https://arxiv.org/pdf/1508.04025v5.pdf. + This class also allows to use GRU cells in addition to LSTM cells with + support for dropout. 
+ """ + + def __init__( + self, + hparams, + mode, + iterator, + source_vocab_table, + target_vocab_table, + reverse_target_vocab_table=None, + scope=None, + extra_args=None, + ): + self.has_attention = hparams.attention_architecture and hparams.attention + + # Set attention_mechanism_fn + if self.has_attention: + if extra_args and extra_args.attention_mechanism_fn: + self.attention_mechanism_fn = extra_args.attention_mechanism_fn + else: + self.attention_mechanism_fn = create_attention_mechanism + + super(AttentionModel, self).__init__( + hparams=hparams, + mode=mode, + iterator=iterator, + source_vocab_table=source_vocab_table, + target_vocab_table=target_vocab_table, + reverse_target_vocab_table=reverse_target_vocab_table, + scope=scope, + extra_args=extra_args, + ) + + def _prepare_beam_search_decoder_inputs( + self, beam_width, memory, source_sequence_length, encoder_state + ): + memory = tf.contrib.seq2seq.tile_batch(memory, multiplier=beam_width) + source_sequence_length = tf.contrib.seq2seq.tile_batch( + source_sequence_length, multiplier=beam_width + ) + encoder_state = tf.contrib.seq2seq.tile_batch( + encoder_state, multiplier=beam_width + ) + batch_size = self.batch_size * beam_width + return memory, source_sequence_length, encoder_state, batch_size + + def _build_decoder_cell( + self, hparams, encoder_outputs, encoder_state, source_sequence_length + ): + """Build a RNN cell with attention mechanism that can be used by decoder.""" + # No Attention + if not self.has_attention: + return super(AttentionModel, self)._build_decoder_cell( + hparams, encoder_outputs, encoder_state, source_sequence_length + ) + elif hparams.attention_architecture != "standard": + raise ValueError( + "Unknown attention architecture %s" % hparams.attention_architecture + ) + + num_units = hparams.num_units + num_layers = self.num_decoder_layers + num_residual_layers = self.num_decoder_residual_layers + infer_mode = hparams.infer_mode + + dtype = tf.float32 + + # Ensure memory is batch-major + if self.time_major: + memory = tf.transpose(encoder_outputs, [1, 0, 2]) + else: + memory = encoder_outputs + + if self.mode == tf.contrib.learn.ModeKeys.INFER and infer_mode == "beam_search": + memory, source_sequence_length, encoder_state, batch_size = ( + self._prepare_beam_search_decoder_inputs( + hparams.beam_width, memory, source_sequence_length, encoder_state + ) + ) + else: + batch_size = self.batch_size + + # Attention + attention_mechanism = self.attention_mechanism_fn( + hparams.attention, num_units, memory, source_sequence_length, self.mode + ) + + cell = model_helper.create_rnn_cell( + unit_type=hparams.unit_type, + num_units=num_units, + num_layers=num_layers, + num_residual_layers=num_residual_layers, + forget_bias=hparams.forget_bias, + dropout=hparams.dropout, + num_gpus=self.num_gpus, + mode=self.mode, + single_cell_fn=self.single_cell_fn, + ) + + # Only generate alignment in greedy INFER mode. + alignment_history = ( + self.mode == tf.contrib.learn.ModeKeys.INFER and infer_mode != "beam_search" + ) + cell = tf.contrib.seq2seq.AttentionWrapper( + cell, + attention_mechanism, + attention_layer_size=num_units, + alignment_history=alignment_history, + output_attention=hparams.output_attention, + name="attention", + ) + + # TODO(thangluong): do we need num_layers, num_gpus? 
+ cell = tf.contrib.rnn.DeviceWrapper( + cell, model_helper.get_device_str(num_layers - 1, self.num_gpus) + ) + + if hparams.pass_hidden_state: + decoder_initial_state = cell.zero_state(batch_size, dtype).clone( + cell_state=encoder_state + ) + else: + decoder_initial_state = cell.zero_state(batch_size, dtype) + + return cell, decoder_initial_state + + def _get_infer_summary(self, hparams): + if not self.has_attention or hparams.infer_mode == "beam_search": + return tf.no_op() + return _create_attention_images_summary(self.final_context_state) + + +def create_attention_mechanism( + attention_option, num_units, memory, source_sequence_length, mode +): + """Create attention mechanism based on the attention_option.""" + del mode # unused + + # Mechanism + if attention_option == "luong": + attention_mechanism = tf.contrib.seq2seq.LuongAttention( + num_units, memory, memory_sequence_length=source_sequence_length + ) + elif attention_option == "scaled_luong": + attention_mechanism = tf.contrib.seq2seq.LuongAttention( + num_units, memory, memory_sequence_length=source_sequence_length, scale=True + ) + elif attention_option == "bahdanau": + attention_mechanism = tf.contrib.seq2seq.BahdanauAttention( + num_units, memory, memory_sequence_length=source_sequence_length + ) + elif attention_option == "normed_bahdanau": + attention_mechanism = tf.contrib.seq2seq.BahdanauAttention( + num_units, + memory, + memory_sequence_length=source_sequence_length, + normalize=True, + ) else: - memory = encoder_outputs - - if (self.mode == tf.contrib.learn.ModeKeys.INFER and - infer_mode == "beam_search"): - memory, source_sequence_length, encoder_state, batch_size = ( - self._prepare_beam_search_decoder_inputs( - hparams.beam_width, memory, source_sequence_length, - encoder_state)) - else: - batch_size = self.batch_size - - # Attention - attention_mechanism = self.attention_mechanism_fn( - hparams.attention, num_units, memory, source_sequence_length, self.mode) - - cell = model_helper.create_rnn_cell( - unit_type=hparams.unit_type, - num_units=num_units, - num_layers=num_layers, - num_residual_layers=num_residual_layers, - forget_bias=hparams.forget_bias, - dropout=hparams.dropout, - num_gpus=self.num_gpus, - mode=self.mode, - single_cell_fn=self.single_cell_fn) - - # Only generate alignment in greedy INFER mode. - alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and - infer_mode != "beam_search") - cell = tf.contrib.seq2seq.AttentionWrapper( - cell, - attention_mechanism, - attention_layer_size=num_units, - alignment_history=alignment_history, - output_attention=hparams.output_attention, - name="attention") - - # TODO(thangluong): do we need num_layers, num_gpus? 
- cell = tf.contrib.rnn.DeviceWrapper(cell, - model_helper.get_device_str( - num_layers - 1, self.num_gpus)) - - if hparams.pass_hidden_state: - decoder_initial_state = cell.zero_state(batch_size, dtype).clone( - cell_state=encoder_state) - else: - decoder_initial_state = cell.zero_state(batch_size, dtype) - - return cell, decoder_initial_state - - def _get_infer_summary(self, hparams): - if not self.has_attention or hparams.infer_mode == "beam_search": - return tf.no_op() - return _create_attention_images_summary(self.final_context_state) - - -def create_attention_mechanism(attention_option, num_units, memory, - source_sequence_length, mode): - """Create attention mechanism based on the attention_option.""" - del mode # unused - - # Mechanism - if attention_option == "luong": - attention_mechanism = tf.contrib.seq2seq.LuongAttention( - num_units, memory, memory_sequence_length=source_sequence_length) - elif attention_option == "scaled_luong": - attention_mechanism = tf.contrib.seq2seq.LuongAttention( - num_units, - memory, - memory_sequence_length=source_sequence_length, - scale=True) - elif attention_option == "bahdanau": - attention_mechanism = tf.contrib.seq2seq.BahdanauAttention( - num_units, memory, memory_sequence_length=source_sequence_length) - elif attention_option == "normed_bahdanau": - attention_mechanism = tf.contrib.seq2seq.BahdanauAttention( - num_units, - memory, - memory_sequence_length=source_sequence_length, - normalize=True) - else: - raise ValueError("Unknown attention option %s" % attention_option) - - return attention_mechanism + raise ValueError("Unknown attention option %s" % attention_option) + + return attention_mechanism def _create_attention_images_summary(final_context_state): - """create attention image and attention summary.""" - attention_images = (final_context_state.alignment_history.stack()) - # Reshape to (batch, src_seq_len, tgt_seq_len,1) - attention_images = tf.expand_dims( - tf.transpose(attention_images, [1, 2, 0]), -1) - # Scale to range [0, 255] - attention_images *= 255 - attention_summary = tf.summary.image("attention_images", attention_images) - return attention_summary + """create attention image and attention summary.""" + attention_images = final_context_state.alignment_history.stack() + # Reshape to (batch, src_seq_len, tgt_seq_len,1) + attention_images = tf.expand_dims( + tf.transpose(attention_images, [1, 2, 0]), -1) + # Scale to range [0, 255] + attention_images *= 255 + attention_summary = tf.summary.image("attention_images", attention_images) + return attention_summary diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/gnmt_model.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/gnmt_model.py index 468a5d00c..d86e6e23a 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/gnmt_model.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/gnmt_model.py @@ -29,305 +29,344 @@ class GNMTModel(attention_model.AttentionModel): - """Sequence-to-sequence dynamic model with GNMT attention architecture. 
- """ - - def __init__(self, - hparams, - mode, - iterator, - source_vocab_table, - target_vocab_table, - reverse_target_vocab_table=None, - scope=None, - extra_args=None): - self.is_gnmt_attention = ( - hparams.attention_architecture in ["gnmt", "gnmt_v2"]) - - super(GNMTModel, self).__init__( - hparams=hparams, - mode=mode, - iterator=iterator, - source_vocab_table=source_vocab_table, - target_vocab_table=target_vocab_table, - reverse_target_vocab_table=reverse_target_vocab_table, - scope=scope, - extra_args=extra_args) - - def _build_encoder(self, hparams): - """Build a GNMT encoder.""" - if hparams.encoder_type == "uni" or hparams.encoder_type == "bi": - return super(GNMTModel, self)._build_encoder(hparams) - - if hparams.encoder_type != "gnmt": - raise ValueError("Unknown encoder_type %s" % hparams.encoder_type) - - # Build GNMT encoder. - num_bi_layers = 1 - num_uni_layers = self.num_encoder_layers - num_bi_layers - utils.print_out("# Build a GNMT encoder") - utils.print_out(" num_bi_layers = %d" % num_bi_layers) - utils.print_out(" num_uni_layers = %d" % num_uni_layers) - - iterator = self.iterator - source = iterator.source - if self.time_major: - source = tf.transpose(source) - - with tf.variable_scope("encoder") as scope: - dtype = scope.dtype - - self.encoder_emb_inp = self.encoder_emb_lookup_fn( - self.embedding_encoder, source) - - # Execute _build_bidirectional_rnn from Model class - bi_encoder_outputs, bi_encoder_state = self._build_bidirectional_rnn( - inputs=self.encoder_emb_inp, - sequence_length=iterator.source_sequence_length, - dtype=dtype, - hparams=hparams, - num_bi_layers=num_bi_layers, - num_bi_residual_layers=0, # no residual connection - ) - - # Build unidirectional layers - if self.extract_encoder_layers: - encoder_state, encoder_outputs = self._build_individual_encoder_layers( - bi_encoder_outputs, num_uni_layers, dtype, hparams) - else: - encoder_state, encoder_outputs = self._build_all_encoder_layers( - bi_encoder_outputs, num_uni_layers, dtype, hparams) - - # Pass all encoder states to the decoder - # except the first bi-directional layer - encoder_state = (bi_encoder_state[1],) + ( - (encoder_state,) if num_uni_layers == 1 else encoder_state) - - return encoder_outputs, encoder_state - - def _build_all_encoder_layers(self, bi_encoder_outputs, - num_uni_layers, dtype, hparams): - """Build encoder layers all at once.""" - uni_cell = model_helper.create_rnn_cell( - unit_type=hparams.unit_type, - num_units=hparams.num_units, - num_layers=num_uni_layers, - num_residual_layers=self.num_encoder_residual_layers, - forget_bias=hparams.forget_bias, - dropout=hparams.dropout, - num_gpus=self.num_gpus, - base_gpu=1, - mode=self.mode, - single_cell_fn=self.single_cell_fn) - encoder_outputs, encoder_state = tf.nn.dynamic_rnn( - uni_cell, - bi_encoder_outputs, - dtype=dtype, - sequence_length=self.iterator.source_sequence_length, - time_major=self.time_major) - - # Use the top layer for now - self.encoder_state_list = [encoder_outputs] - - return encoder_state, encoder_outputs - - def _build_individual_encoder_layers(self, bi_encoder_outputs, - num_uni_layers, dtype, hparams): - """Run each of the encoder layer separately, not used in general seq2seq.""" - uni_cell_lists = model_helper._cell_list( - unit_type=hparams.unit_type, - num_units=hparams.num_units, - num_layers=num_uni_layers, - num_residual_layers=self.num_encoder_residual_layers, - forget_bias=hparams.forget_bias, - dropout=hparams.dropout, - num_gpus=self.num_gpus, - base_gpu=1, - mode=self.mode, - 
single_cell_fn=self.single_cell_fn) - - encoder_inp = bi_encoder_outputs - encoder_states = [] - self.encoder_state_list = [bi_encoder_outputs[:, :, :hparams.num_units], - bi_encoder_outputs[:, :, hparams.num_units:]] - with tf.variable_scope("rnn/multi_rnn_cell"): - for i, cell in enumerate(uni_cell_lists): - with tf.variable_scope("cell_%d" % i) as scope: - encoder_inp, encoder_state = tf.nn.dynamic_rnn( - cell, - encoder_inp, - dtype=dtype, - sequence_length=self.iterator.source_sequence_length, - time_major=self.time_major, - scope=scope) - encoder_states.append(encoder_state) - self.encoder_state_list.append(encoder_inp) - - encoder_state = tuple(encoder_states) - encoder_outputs = self.encoder_state_list[-1] - return encoder_state, encoder_outputs - - def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, - source_sequence_length): - """Build a RNN cell with GNMT attention architecture.""" - # Standard attention - if not self.is_gnmt_attention: - return super(GNMTModel, self)._build_decoder_cell( - hparams, encoder_outputs, encoder_state, source_sequence_length) - - # GNMT attention - attention_option = hparams.attention - attention_architecture = hparams.attention_architecture - num_units = hparams.num_units - infer_mode = hparams.infer_mode - - dtype = tf.float32 - - if self.time_major: - memory = tf.transpose(encoder_outputs, [1, 0, 2]) - else: - memory = encoder_outputs - - if (self.mode == tf.contrib.learn.ModeKeys.INFER and - infer_mode == "beam_search"): - memory, source_sequence_length, encoder_state, batch_size = ( - self._prepare_beam_search_decoder_inputs( - hparams.beam_width, memory, source_sequence_length, - encoder_state)) - else: - batch_size = self.batch_size - - attention_mechanism = self.attention_mechanism_fn( - attention_option, num_units, memory, source_sequence_length, self.mode) - - cell_list = model_helper._cell_list( # pylint: disable=protected-access - unit_type=hparams.unit_type, - num_units=num_units, - num_layers=self.num_decoder_layers, - num_residual_layers=self.num_decoder_residual_layers, - forget_bias=hparams.forget_bias, - dropout=hparams.dropout, - num_gpus=self.num_gpus, - mode=self.mode, - single_cell_fn=self.single_cell_fn, - residual_fn=gnmt_residual_fn - ) - - # Only wrap the bottom layer with the attention mechanism. - attention_cell = cell_list.pop(0) - - # Only generate alignment in greedy INFER mode. - alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and - infer_mode != "beam_search") - attention_cell = tf.contrib.seq2seq.AttentionWrapper( - attention_cell, - attention_mechanism, - attention_layer_size=None, # don't use attention layer. 
- output_attention=False, - alignment_history=alignment_history, - name="attention") - - if attention_architecture == "gnmt": - cell = GNMTAttentionMultiCell( - attention_cell, cell_list) - elif attention_architecture == "gnmt_v2": - cell = GNMTAttentionMultiCell( - attention_cell, cell_list, use_new_attention=True) - else: - raise ValueError( - "Unknown attention_architecture %s" % attention_architecture) - - if hparams.pass_hidden_state: - decoder_initial_state = tuple( - zs.clone(cell_state=es) - if isinstance(zs, tf.contrib.seq2seq.AttentionWrapperState) else es - for zs, es in zip( - cell.zero_state(batch_size, dtype), encoder_state)) - else: - decoder_initial_state = cell.zero_state(batch_size, dtype) - - return cell, decoder_initial_state - - def _get_infer_summary(self, hparams): - if hparams.infer_mode == "beam_search": - return tf.no_op() - elif self.is_gnmt_attention: - return attention_model._create_attention_images_summary( - self.final_context_state[0]) - else: - return super(GNMTModel, self)._get_infer_summary(hparams) + """Sequence-to-sequence dynamic model with GNMT attention architecture.""" + + def __init__( + self, + hparams, + mode, + iterator, + source_vocab_table, + target_vocab_table, + reverse_target_vocab_table=None, + scope=None, + extra_args=None, + ): + self.is_gnmt_attention = hparams.attention_architecture in [ + "gnmt", "gnmt_v2"] + + super(GNMTModel, self).__init__( + hparams=hparams, + mode=mode, + iterator=iterator, + source_vocab_table=source_vocab_table, + target_vocab_table=target_vocab_table, + reverse_target_vocab_table=reverse_target_vocab_table, + scope=scope, + extra_args=extra_args, + ) + + def _build_encoder(self, hparams): + """Build a GNMT encoder.""" + if hparams.encoder_type == "uni" or hparams.encoder_type == "bi": + return super(GNMTModel, self)._build_encoder(hparams) + + if hparams.encoder_type != "gnmt": + raise ValueError("Unknown encoder_type %s" % hparams.encoder_type) + + # Build GNMT encoder. 
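# --- Reviewer illustration (not part of the patch) -------------------------
# The GNMT encoder built below stacks a single bidirectional layer under
# num_encoder_layers - 1 unidirectional layers, and only the second element
# (index 1) of the bidirectional state pair is forwarded to the decoder,
# matching the "except the first bi-directional layer" comment further down.
# A minimal sketch of that tuple arithmetic, assuming 4 encoder layers and
# placeholder state objects:
num_encoder_layers = 4
num_bi_layers = 1
num_uni_layers = num_encoder_layers - num_bi_layers    # 3 unidirectional layers

bi_encoder_state = ("bi_state_0", "bi_state_1")        # state pair of the bi layer
uni_encoder_state = ("uni_0", "uni_1", "uni_2")        # one state per uni layer

encoder_state = (bi_encoder_state[1],) + (
    (uni_encoder_state,) if num_uni_layers == 1 else uni_encoder_state
)
assert encoder_state == ("bi_state_1", "uni_0", "uni_1", "uni_2")
# ----------------------------------------------------------------------------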
+ num_bi_layers = 1 + num_uni_layers = self.num_encoder_layers - num_bi_layers + utils.print_out("# Build a GNMT encoder") + utils.print_out(" num_bi_layers = %d" % num_bi_layers) + utils.print_out(" num_uni_layers = %d" % num_uni_layers) + + iterator = self.iterator + source = iterator.source + if self.time_major: + source = tf.transpose(source) + + with tf.variable_scope("encoder") as scope: + dtype = scope.dtype + + self.encoder_emb_inp = self.encoder_emb_lookup_fn( + self.embedding_encoder, source + ) + + # Execute _build_bidirectional_rnn from Model class + bi_encoder_outputs, bi_encoder_state = self._build_bidirectional_rnn( + inputs=self.encoder_emb_inp, + sequence_length=iterator.source_sequence_length, + dtype=dtype, + hparams=hparams, + num_bi_layers=num_bi_layers, + num_bi_residual_layers=0, # no residual connection + ) + + # Build unidirectional layers + if self.extract_encoder_layers: + encoder_state, encoder_outputs = self._build_individual_encoder_layers( + bi_encoder_outputs, num_uni_layers, dtype, hparams + ) + else: + encoder_state, encoder_outputs = self._build_all_encoder_layers( + bi_encoder_outputs, num_uni_layers, dtype, hparams + ) + + # Pass all encoder states to the decoder + # except the first bi-directional layer + encoder_state = (bi_encoder_state[1],) + ( + (encoder_state,) if num_uni_layers == 1 else encoder_state + ) + + return encoder_outputs, encoder_state + + def _build_all_encoder_layers( + self, bi_encoder_outputs, num_uni_layers, dtype, hparams + ): + """Build encoder layers all at once.""" + uni_cell = model_helper.create_rnn_cell( + unit_type=hparams.unit_type, + num_units=hparams.num_units, + num_layers=num_uni_layers, + num_residual_layers=self.num_encoder_residual_layers, + forget_bias=hparams.forget_bias, + dropout=hparams.dropout, + num_gpus=self.num_gpus, + base_gpu=1, + mode=self.mode, + single_cell_fn=self.single_cell_fn, + ) + encoder_outputs, encoder_state = tf.nn.dynamic_rnn( + uni_cell, + bi_encoder_outputs, + dtype=dtype, + sequence_length=self.iterator.source_sequence_length, + time_major=self.time_major, + ) + + # Use the top layer for now + self.encoder_state_list = [encoder_outputs] + + return encoder_state, encoder_outputs + + def _build_individual_encoder_layers( + self, bi_encoder_outputs, num_uni_layers, dtype, hparams + ): + """Run each of the encoder layer separately, not used in general seq2seq.""" + uni_cell_lists = model_helper._cell_list( + unit_type=hparams.unit_type, + num_units=hparams.num_units, + num_layers=num_uni_layers, + num_residual_layers=self.num_encoder_residual_layers, + forget_bias=hparams.forget_bias, + dropout=hparams.dropout, + num_gpus=self.num_gpus, + base_gpu=1, + mode=self.mode, + single_cell_fn=self.single_cell_fn, + ) + + encoder_inp = bi_encoder_outputs + encoder_states = [] + self.encoder_state_list = [ + bi_encoder_outputs[:, :, : hparams.num_units], + bi_encoder_outputs[:, :, hparams.num_units:], + ] + with tf.variable_scope("rnn/multi_rnn_cell"): + for i, cell in enumerate(uni_cell_lists): + with tf.variable_scope("cell_%d" % i) as scope: + encoder_inp, encoder_state = tf.nn.dynamic_rnn( + cell, + encoder_inp, + dtype=dtype, + sequence_length=self.iterator.source_sequence_length, + time_major=self.time_major, + scope=scope, + ) + encoder_states.append(encoder_state) + self.encoder_state_list.append(encoder_inp) + + encoder_state = tuple(encoder_states) + encoder_outputs = self.encoder_state_list[-1] + return encoder_state, encoder_outputs + + def _build_decoder_cell( + self, hparams, 
encoder_outputs, encoder_state, source_sequence_length + ): + """Build a RNN cell with GNMT attention architecture.""" + # Standard attention + if not self.is_gnmt_attention: + return super(GNMTModel, self)._build_decoder_cell( + hparams, encoder_outputs, encoder_state, source_sequence_length + ) + + # GNMT attention + attention_option = hparams.attention + attention_architecture = hparams.attention_architecture + num_units = hparams.num_units + infer_mode = hparams.infer_mode + + dtype = tf.float32 + + if self.time_major: + memory = tf.transpose(encoder_outputs, [1, 0, 2]) + else: + memory = encoder_outputs + + if self.mode == tf.contrib.learn.ModeKeys.INFER and infer_mode == "beam_search": + memory, source_sequence_length, encoder_state, batch_size = ( + self._prepare_beam_search_decoder_inputs( + hparams.beam_width, memory, source_sequence_length, encoder_state + ) + ) + else: + batch_size = self.batch_size + + attention_mechanism = self.attention_mechanism_fn( + attention_option, num_units, memory, source_sequence_length, self.mode + ) + + cell_list = model_helper._cell_list( # pylint: disable=protected-access + unit_type=hparams.unit_type, + num_units=num_units, + num_layers=self.num_decoder_layers, + num_residual_layers=self.num_decoder_residual_layers, + forget_bias=hparams.forget_bias, + dropout=hparams.dropout, + num_gpus=self.num_gpus, + mode=self.mode, + single_cell_fn=self.single_cell_fn, + residual_fn=gnmt_residual_fn, + ) + + # Only wrap the bottom layer with the attention mechanism. + attention_cell = cell_list.pop(0) + + # Only generate alignment in greedy INFER mode. + alignment_history = ( + self.mode == tf.contrib.learn.ModeKeys.INFER and infer_mode != "beam_search" + ) + attention_cell = tf.contrib.seq2seq.AttentionWrapper( + attention_cell, + attention_mechanism, + attention_layer_size=None, # don't use attention layer. + output_attention=False, + alignment_history=alignment_history, + name="attention", + ) + + if attention_architecture == "gnmt": + cell = GNMTAttentionMultiCell(attention_cell, cell_list) + elif attention_architecture == "gnmt_v2": + cell = GNMTAttentionMultiCell( + attention_cell, cell_list, use_new_attention=True + ) + else: + raise ValueError( + "Unknown attention_architecture %s" % attention_architecture + ) + + if hparams.pass_hidden_state: + decoder_initial_state = tuple( + ( + zs.clone(cell_state=es) + if isinstance(zs, tf.contrib.seq2seq.AttentionWrapperState) + else es + ) + for zs, es in zip(cell.zero_state(batch_size, dtype), encoder_state) + ) + else: + decoder_initial_state = cell.zero_state(batch_size, dtype) + + return cell, decoder_initial_state + + def _get_infer_summary(self, hparams): + if hparams.infer_mode == "beam_search": + return tf.no_op() + elif self.is_gnmt_attention: + return attention_model._create_attention_images_summary( + self.final_context_state[0] + ) + else: + return super(GNMTModel, self)._get_infer_summary(hparams) class GNMTAttentionMultiCell(tf.nn.rnn_cell.MultiRNNCell): - """A MultiCell with GNMT attention style.""" - - def __init__(self, attention_cell, cells, use_new_attention=False): - """Creates a GNMTAttentionMultiCell. - - Args: - attention_cell: An instance of AttentionWrapper. - cells: A list of RNNCell wrapped with AttentionInputWrapper. - use_new_attention: Whether to use the attention generated from current - step bottom layer's output. Default is False. 
- """ - cells = [attention_cell] + cells - self.use_new_attention = use_new_attention - super(GNMTAttentionMultiCell, self).__init__(cells, state_is_tuple=True) - - def __call__(self, inputs, state, scope=None): - """Run the cell with bottom layer's attention copied to all upper layers.""" - if not tf.contrib.framework.nest.is_sequence(state): - raise ValueError( - "Expected state to be a tuple of length %d, but received: %s" - % (len(self.state_size), state)) + """A MultiCell with GNMT attention style.""" + + def __init__(self, attention_cell, cells, use_new_attention=False): + """Creates a GNMTAttentionMultiCell. + + Args: + attention_cell: An instance of AttentionWrapper. + cells: A list of RNNCell wrapped with AttentionInputWrapper. + use_new_attention: Whether to use the attention generated from current + step bottom layer's output. Default is False. + """ + cells = [attention_cell] + cells + self.use_new_attention = use_new_attention + super( + GNMTAttentionMultiCell, + self).__init__( + cells, + state_is_tuple=True) + + def __call__(self, inputs, state, scope=None): + """Run the cell with bottom layer's attention copied to all upper layers.""" + if not tf.contrib.framework.nest.is_sequence(state): + raise ValueError( + "Expected state to be a tuple of length %d, but received: %s" + % (len(self.state_size), state) + ) + + with tf.variable_scope(scope or "multi_rnn_cell"): + new_states = [] + + with tf.variable_scope("cell_0_attention"): + attention_cell = self._cells[0] + attention_state = state[0] + cur_inp, new_attention_state = attention_cell( + inputs, attention_state) + new_states.append(new_attention_state) + + for i in range(1, len(self._cells)): + with tf.variable_scope("cell_%d" % i): + + cell = self._cells[i] + cur_state = state[i] + + if self.use_new_attention: + cur_inp = tf.concat( + [cur_inp, new_attention_state.attention], -1 + ) + else: + cur_inp = tf.concat( + [cur_inp, attention_state.attention], -1) + + cur_inp, new_state = cell(cur_inp, cur_state) + new_states.append(new_state) + + return cur_inp, tuple(new_states) - with tf.variable_scope(scope or "multi_rnn_cell"): - new_states = [] - with tf.variable_scope("cell_0_attention"): - attention_cell = self._cells[0] - attention_state = state[0] - cur_inp, new_attention_state = attention_cell(inputs, attention_state) - new_states.append(new_attention_state) - - for i in range(1, len(self._cells)): - with tf.variable_scope("cell_%d" % i): +def gnmt_residual_fn(inputs, outputs): + """Residual function that handles different inputs and outputs inner dims. - cell = self._cells[i] - cur_state = state[i] + Args: + inputs: cell inputs, this is actual inputs concatenated with the attention + vector. + outputs: cell outputs - if self.use_new_attention: - cur_inp = tf.concat([cur_inp, new_attention_state.attention], -1) - else: - cur_inp = tf.concat([cur_inp, attention_state.attention], -1) + Returns: + outputs + actual inputs + """ - cur_inp, new_state = cell(cur_inp, cur_state) - new_states.append(new_state) + def split_input(inp, out): + out_dim = out.get_shape().as_list()[-1] + inp_dim = inp.get_shape().as_list()[-1] + return tf.split(inp, [out_dim, inp_dim - out_dim], axis=-1) - return cur_inp, tuple(new_states) + actual_inputs, _ = tf.contrib.framework.nest.map_structure( + split_input, inputs, outputs + ) + def assert_shape_match(inp, out): + inp.get_shape().assert_is_compatible_with(out.get_shape()) -def gnmt_residual_fn(inputs, outputs): - """Residual function that handles different inputs and outputs inner dims. 
- - Args: - inputs: cell inputs, this is actual inputs concatenated with the attention - vector. - outputs: cell outputs - - Returns: - outputs + actual inputs - """ - def split_input(inp, out): - out_dim = out.get_shape().as_list()[-1] - inp_dim = inp.get_shape().as_list()[-1] - return tf.split(inp, [out_dim, inp_dim - out_dim], axis=-1) - actual_inputs, _ = tf.contrib.framework.nest.map_structure( - split_input, inputs, outputs) - def assert_shape_match(inp, out): - inp.get_shape().assert_is_compatible_with(out.get_shape()) - tf.contrib.framework.nest.assert_same_structure(actual_inputs, outputs) - tf.contrib.framework.nest.map_structure( - assert_shape_match, actual_inputs, outputs) - return tf.contrib.framework.nest.map_structure( - lambda inp, out: inp + out, actual_inputs, outputs) + tf.contrib.framework.nest.assert_same_structure(actual_inputs, outputs) + tf.contrib.framework.nest.map_structure( + assert_shape_match, actual_inputs, outputs) + return tf.contrib.framework.nest.map_structure( + lambda inp, out: inp + out, actual_inputs, outputs + ) diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/inference.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/inference.py index dd425d2d4..7d7127ecb 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/inference.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/inference.py @@ -28,244 +28,284 @@ from .utils import misc_utils as utils from .utils import nmt_utils -__all__ = ["load_data", "inference", - "single_worker_inference", "multi_worker_inference"] - - -def _decode_inference_indices(model, sess, output_infer, - output_infer_summary_prefix, - inference_indices, - tgt_eos, - subword_option): - """Decoding only a specific set of sentences.""" - utils.print_out(" decoding to output %s , num sents %d." % - (output_infer, len(inference_indices))) - start_time = time.time() - with codecs.getwriter("utf-8")( - tf.gfile.GFile(output_infer, mode="wb")) as trans_f: - trans_f.write("") # Write empty string to ensure file is created. - for decode_id in inference_indices: - nmt_outputs, infer_summary = model.decode(sess) - - # get text translation - assert nmt_outputs.shape[0] == 1 - translation = nmt_utils.get_translation( - nmt_outputs, - sent_id=0, - tgt_eos=tgt_eos, - subword_option=subword_option) - - if infer_summary is not None: # Attention models - image_file = output_infer_summary_prefix + str(decode_id) + ".png" - utils.print_out(" save attention image to %s*" % image_file) - image_summ = tf.Summary() - image_summ.ParseFromString(infer_summary) - with tf.gfile.GFile(image_file, mode="w") as img_f: - img_f.write(image_summ.value[0].image.encoded_image_string) - - trans_f.write("%s\n" % translation) - utils.print_out(translation + b"\n") - utils.print_time(" done", start_time) +__all__ = [ + "load_data", + "inference", + "single_worker_inference", + "multi_worker_inference", +] + + +def _decode_inference_indices( + model, + sess, + output_infer, + output_infer_summary_prefix, + inference_indices, + tgt_eos, + subword_option, +): + """Decoding only a specific set of sentences.""" + utils.print_out( + " decoding to output %s , num sents %d." + % (output_infer, len(inference_indices)) + ) + start_time = time.time() + with codecs.getwriter("utf-8")(tf.gfile.GFile(output_infer, mode="wb")) as trans_f: + trans_f.write("") # Write empty string to ensure file is created. 
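# --- Reviewer illustration (not part of the patch) -------------------------
# gnmt_residual_fn in gnmt_model.py above adds a residual connection when the
# cell input is the previous layer's output concatenated with the attention
# vector, so the input is wider than the output; it splits off the leading
# slice that matches the output width and adds only that part. A NumPy
# sketch with made-up sizes (cell output width 4, attention width 4):
import numpy as np

def gnmt_residual_fn_np(inputs, outputs):
    out_dim = outputs.shape[-1]
    actual_inputs = inputs[..., :out_dim]   # drop the trailing attention slice
    return actual_inputs + outputs

inputs = np.ones((2, 8), dtype=np.float32)        # [cell input | attention]
outputs = np.full((2, 4), 0.5, dtype=np.float32)
res = gnmt_residual_fn_np(inputs, outputs)
assert res.shape == (2, 4) and np.all(res == 1.5)
# ----------------------------------------------------------------------------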
+ for decode_id in inference_indices: + nmt_outputs, infer_summary = model.decode(sess) + + # get text translation + assert nmt_outputs.shape[0] == 1 + translation = nmt_utils.get_translation( + nmt_outputs, sent_id=0, tgt_eos=tgt_eos, subword_option=subword_option + ) + + if infer_summary is not None: # Attention models + image_file = output_infer_summary_prefix + \ + str(decode_id) + ".png" + utils.print_out(" save attention image to %s*" % image_file) + image_summ = tf.Summary() + image_summ.ParseFromString(infer_summary) + with tf.gfile.GFile(image_file, mode="w") as img_f: + img_f.write(image_summ.value[0].image.encoded_image_string) + + trans_f.write("%s\n" % translation) + utils.print_out(translation + b"\n") + utils.print_time(" done", start_time) def load_data(inference_input_file, hparams=None): - """Load inference data.""" - with codecs.getreader("utf-8")( - tf.gfile.GFile(inference_input_file, mode="rb")) as f: - inference_data = f.read().splitlines() + """Load inference data.""" + with codecs.getreader("utf-8")( + tf.gfile.GFile(inference_input_file, mode="rb") + ) as f: + inference_data = f.read().splitlines() - if hparams and hparams.inference_indices: - inference_data = [inference_data[i] for i in hparams.inference_indices] + if hparams and hparams.inference_indices: + inference_data = [inference_data[i] for i in hparams.inference_indices] - return inference_data + return inference_data def get_model_creator(hparams): - """Get the right model class depending on configuration.""" - if (hparams.encoder_type == "gnmt" or - hparams.attention_architecture in ["gnmt", "gnmt_v2"]): - model_creator = gnmt_model.GNMTModel - elif hparams.attention_architecture == "standard": - model_creator = attention_model.AttentionModel - elif not hparams.attention: - model_creator = nmt_model.Model - else: - raise ValueError("Unknown attention architecture %s" % - hparams.attention_architecture) - return model_creator + """Get the right model class depending on configuration.""" + if hparams.encoder_type == "gnmt" or hparams.attention_architecture in [ + "gnmt", + "gnmt_v2", + ]: + model_creator = gnmt_model.GNMTModel + elif hparams.attention_architecture == "standard": + model_creator = attention_model.AttentionModel + elif not hparams.attention: + model_creator = nmt_model.Model + else: + raise ValueError( + "Unknown attention architecture %s" % hparams.attention_architecture + ) + return model_creator def start_sess_and_load_model(infer_model, ckpt_path, hparams): - """Start session and load model.""" - print("num_intra_threads = %d, num_inter_threads = %d \n" - %(hparams.num_intra_threads, hparams.num_inter_threads)) - sess = tf.Session(graph=infer_model.graph, - config=utils.get_config_proto( - num_intra_threads=hparams.num_intra_threads, - num_inter_threads=hparams.num_inter_threads) - ) - with infer_model.graph.as_default(): - loaded_infer_model = model_helper.load_model( - infer_model.model, ckpt_path, sess, "infer") - return sess, loaded_infer_model - - -def inference(run, - iterations, - ckpt_path, - inference_input_file, - inference_output_file, - hparams, - num_workers=1, - jobid=0, - scope=None): - """Perform translation.""" - if hparams.inference_indices: - assert num_workers == 1 - - model_creator = get_model_creator(hparams) - infer_model = model_helper.create_infer_model(model_creator, hparams, scope) - sess, loaded_infer_model = start_sess_and_load_model(infer_model, ckpt_path, - hparams) - - if num_workers == 1: - single_worker_inference( - run, - iterations, - sess, - 
infer_model, - loaded_infer_model, - inference_input_file, - inference_output_file, - hparams) - else: - multi_worker_inference( - sess, - infer_model, - loaded_infer_model, - inference_input_file, - inference_output_file, - hparams, - num_workers=num_workers, - jobid=jobid) - sess.close() - - -def single_worker_inference(run, - iterations, - sess, - infer_model, - loaded_infer_model, - inference_input_file, - inference_output_file, - hparams): - """Inference with a single worker.""" - output_infer = inference_output_file - - # Read data - infer_data = load_data(inference_input_file, hparams) - - with infer_model.graph.as_default(): - sess.run( - infer_model.iterator.initializer, - feed_dict={ - infer_model.src_placeholder: infer_data, - infer_model.batch_size_placeholder: hparams.infer_batch_size - }) - # Decode - utils.print_out("# Start decoding") + """Start session and load model.""" + print( + "num_intra_threads = %d, num_inter_threads = %d \n" + % (hparams.num_intra_threads, hparams.num_inter_threads) + ) + sess = tf.Session( + graph=infer_model.graph, + config=utils.get_config_proto( + num_intra_threads=hparams.num_intra_threads, + num_inter_threads=hparams.num_inter_threads, + ), + ) + with infer_model.graph.as_default(): + loaded_infer_model = model_helper.load_model( + infer_model.model, ckpt_path, sess, "infer" + ) + return sess, loaded_infer_model + + +def inference( + run, + iterations, + ckpt_path, + inference_input_file, + inference_output_file, + hparams, + num_workers=1, + jobid=0, + scope=None, +): + """Perform translation.""" if hparams.inference_indices: - _decode_inference_indices( - loaded_infer_model, - sess, - output_infer=output_infer, - output_infer_summary_prefix=output_infer, - inference_indices=hparams.inference_indices, - tgt_eos=hparams.eos, - subword_option=hparams.subword_option) + assert num_workers == 1 + + model_creator = get_model_creator(hparams) + infer_model = model_helper.create_infer_model( + model_creator, hparams, scope) + sess, loaded_infer_model = start_sess_and_load_model( + infer_model, ckpt_path, hparams + ) + + if num_workers == 1: + single_worker_inference( + run, + iterations, + sess, + infer_model, + loaded_infer_model, + inference_input_file, + inference_output_file, + hparams, + ) else: - nmt_utils.decode_and_evaluate( - run, - iterations, - "infer", - loaded_infer_model, - sess, - output_infer, - ref_file=None, - metrics=hparams.metrics, - subword_option=hparams.subword_option, - beam_width=hparams.beam_width, - tgt_eos=hparams.eos, - num_translations_per_input=hparams.num_translations_per_input, - infer_mode=hparams.infer_mode) - - -def multi_worker_inference(sess, - infer_model, - loaded_infer_model, - inference_input_file, - inference_output_file, - hparams, - num_workers, - jobid): - """Inference using multiple workers.""" - assert num_workers > 1 - - final_output_infer = inference_output_file - output_infer = "%s_%d" % (inference_output_file, jobid) - output_infer_done = "%s_done_%d" % (inference_output_file, jobid) - - # Read data - infer_data = load_data(inference_input_file, hparams) - - # Split data to multiple workers - total_load = len(infer_data) - load_per_worker = int((total_load - 1) / num_workers) + 1 - start_position = jobid * load_per_worker - end_position = min(start_position + load_per_worker, total_load) - infer_data = infer_data[start_position:end_position] - - with infer_model.graph.as_default(): - sess.run(infer_model.iterator.initializer, - { - infer_model.src_placeholder: infer_data, - 
infer_model.batch_size_placeholder: hparams.infer_batch_size - }) - # Decode - utils.print_out("# Start decoding") - nmt_utils.decode_and_evaluate( - "infer", - loaded_infer_model, - sess, - output_infer, - ref_file=None, - metrics=hparams.metrics, - subword_option=hparams.subword_option, - beam_width=hparams.beam_width, - tgt_eos=hparams.eos, - num_translations_per_input=hparams.num_translations_per_input, - infer_mode=hparams.infer_mode) - - # Change file name to indicate the file writing is completed. - tf.gfile.Rename(output_infer, output_infer_done, overwrite=True) - - # Job 0 is responsible for the clean up. - if jobid != 0: return - - # Now write all translations - with codecs.getwriter("utf-8")( - tf.gfile.GFile(final_output_infer, mode="wb")) as final_f: - for worker_id in range(num_workers): - worker_infer_done = "%s_done_%d" % (inference_output_file, worker_id) - while not tf.gfile.Exists(worker_infer_done): - utils.print_out(" waiting job %d to complete." % worker_id) - time.sleep(10) - - with codecs.getreader("utf-8")( - tf.gfile.GFile(worker_infer_done, mode="rb")) as f: - for translation in f: - final_f.write("%s" % translation) - - for worker_id in range(num_workers): - worker_infer_done = "%s_done_%d" % (inference_output_file, worker_id) - tf.gfile.Remove(worker_infer_done) + multi_worker_inference( + sess, + infer_model, + loaded_infer_model, + inference_input_file, + inference_output_file, + hparams, + num_workers=num_workers, + jobid=jobid, + ) + sess.close() + + +def single_worker_inference( + run, + iterations, + sess, + infer_model, + loaded_infer_model, + inference_input_file, + inference_output_file, + hparams, +): + """Inference with a single worker.""" + output_infer = inference_output_file + + # Read data + infer_data = load_data(inference_input_file, hparams) + + with infer_model.graph.as_default(): + sess.run( + infer_model.iterator.initializer, + feed_dict={ + infer_model.src_placeholder: infer_data, + infer_model.batch_size_placeholder: hparams.infer_batch_size, + }, + ) + # Decode + utils.print_out("# Start decoding") + if hparams.inference_indices: + _decode_inference_indices( + loaded_infer_model, + sess, + output_infer=output_infer, + output_infer_summary_prefix=output_infer, + inference_indices=hparams.inference_indices, + tgt_eos=hparams.eos, + subword_option=hparams.subword_option, + ) + else: + nmt_utils.decode_and_evaluate( + run, + iterations, + "infer", + loaded_infer_model, + sess, + output_infer, + ref_file=None, + metrics=hparams.metrics, + subword_option=hparams.subword_option, + beam_width=hparams.beam_width, + tgt_eos=hparams.eos, + num_translations_per_input=hparams.num_translations_per_input, + infer_mode=hparams.infer_mode, + ) + + +def multi_worker_inference( + sess, + infer_model, + loaded_infer_model, + inference_input_file, + inference_output_file, + hparams, + num_workers, + jobid, +): + """Inference using multiple workers.""" + assert num_workers > 1 + + final_output_infer = inference_output_file + output_infer = "%s_%d" % (inference_output_file, jobid) + output_infer_done = "%s_done_%d" % (inference_output_file, jobid) + + # Read data + infer_data = load_data(inference_input_file, hparams) + + # Split data to multiple workers + total_load = len(infer_data) + load_per_worker = int((total_load - 1) / num_workers) + 1 + start_position = jobid * load_per_worker + end_position = min(start_position + load_per_worker, total_load) + infer_data = infer_data[start_position:end_position] + + with infer_model.graph.as_default(): + sess.run( + 
infer_model.iterator.initializer, + { + infer_model.src_placeholder: infer_data, + infer_model.batch_size_placeholder: hparams.infer_batch_size, + }, + ) + # Decode + utils.print_out("# Start decoding") + nmt_utils.decode_and_evaluate( + "infer", + loaded_infer_model, + sess, + output_infer, + ref_file=None, + metrics=hparams.metrics, + subword_option=hparams.subword_option, + beam_width=hparams.beam_width, + tgt_eos=hparams.eos, + num_translations_per_input=hparams.num_translations_per_input, + infer_mode=hparams.infer_mode, + ) + + # Change file name to indicate the file writing is completed. + tf.gfile.Rename(output_infer, output_infer_done, overwrite=True) + + # Job 0 is responsible for the clean up. + if jobid != 0: + return + + # Now write all translations + with codecs.getwriter("utf-8")( + tf.gfile.GFile(final_output_infer, mode="wb") + ) as final_f: + for worker_id in range(num_workers): + worker_infer_done = "%s_done_%d" % ( + inference_output_file, worker_id) + while not tf.gfile.Exists(worker_infer_done): + utils.print_out( + " waiting job %d to complete." % + worker_id) + time.sleep(10) + + with codecs.getreader("utf-8")( + tf.gfile.GFile(worker_infer_done, mode="rb") + ) as f: + for translation in f: + final_f.write("%s" % translation) + + for worker_id in range(num_workers): + worker_infer_done = "%s_done_%d" % ( + inference_output_file, worker_id) + tf.gfile.Remove(worker_infer_done) diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/inference_test.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/inference_test.py index 317024b81..989f9fe33 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/inference_test.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/inference_test.py @@ -34,142 +34,153 @@ class InferenceTest(tf.test.TestCase): - def _createTestInferCheckpoint(self, hparams, name): - # Prepare - hparams.vocab_prefix = ( - "nmt/testdata/test_infer_vocab") - hparams.src_vocab_file = hparams.vocab_prefix + "." + hparams.src - hparams.tgt_vocab_file = hparams.vocab_prefix + "." 
+ hparams.tgt - out_dir = os.path.join(tf.test.get_temp_dir(), name) - os.makedirs(out_dir) - hparams.out_dir = out_dir - - # Create check point - model_creator = inference.get_model_creator(hparams) - infer_model = model_helper.create_infer_model(model_creator, hparams) - with self.test_session(graph=infer_model.graph) as sess: - loaded_model, global_step = model_helper.create_or_load_model( - infer_model.model, out_dir, sess, "infer_name") - ckpt_path = loaded_model.saver.save( - sess, os.path.join(out_dir, "translate.ckpt"), - global_step=global_step) - return ckpt_path - - def testBasicModel(self): - hparams = common_test_utils.create_test_hparams( - encoder_type="uni", - num_layers=1, - attention="", - attention_architecture="", - use_residual=False,) - ckpt_path = self._createTestInferCheckpoint(hparams, "basic_infer") - infer_file = "nmt/testdata/test_infer_file" - output_infer = os.path.join(hparams.out_dir, "output_infer") - inference.inference(ckpt_path, infer_file, output_infer, hparams) - with open(output_infer) as f: - self.assertEqual(5, len(list(f))) - - def testBasicModelWithMultipleTranslations(self): - hparams = common_test_utils.create_test_hparams( - encoder_type="uni", - num_layers=1, - attention="", - attention_architecture="", - use_residual=False, - num_translations_per_input=2, - beam_width=2, - ) - hparams.infer_mode = "beam_search" - - ckpt_path = self._createTestInferCheckpoint(hparams, "multi_basic_infer") - infer_file = "nmt/testdata/test_infer_file" - output_infer = os.path.join(hparams.out_dir, "output_infer") - inference.inference(ckpt_path, infer_file, output_infer, hparams) - with open(output_infer) as f: - self.assertEqual(10, len(list(f))) - - def testAttentionModel(self): - hparams = common_test_utils.create_test_hparams( - encoder_type="uni", - num_layers=1, - attention="scaled_luong", - attention_architecture="standard", - use_residual=False,) - ckpt_path = self._createTestInferCheckpoint(hparams, "attention_infer") - infer_file = "nmt/testdata/test_infer_file" - output_infer = os.path.join(hparams.out_dir, "output_infer") - inference.inference(ckpt_path, infer_file, output_infer, hparams) - with open(output_infer) as f: - self.assertEqual(5, len(list(f))) - - def testMultiWorkers(self): - hparams = common_test_utils.create_test_hparams( - encoder_type="uni", - num_layers=2, - attention="scaled_luong", - attention_architecture="standard", - use_residual=False,) - - num_workers = 3 - - # There are 5 examples, make batch_size=3 makes job0 has 3 examples, job1 - # has 2 examples, and job2 has 0 example. This helps testing some edge - # cases. - hparams.batch_size = 3 - - ckpt_path = self._createTestInferCheckpoint(hparams, "multi_worker_infer") - infer_file = "nmt/testdata/test_infer_file" - output_infer = os.path.join(hparams.out_dir, "output_infer") - inference.inference( - ckpt_path, infer_file, output_infer, hparams, num_workers, jobid=1) - - inference.inference( - ckpt_path, infer_file, output_infer, hparams, num_workers, jobid=2) - - # Note: Need to start job 0 at the end; otherwise, it will block the testing - # thread. 
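# --- Reviewer illustration (not part of the patch) -------------------------
# multi_worker_inference above coordinates workers through the file system:
# each worker writes "<output>_<jobid>", renames it to "<output>_done_<jobid>"
# when finished, and job 0 polls for every done-file, concatenates them into
# the final output, then deletes them; this is also why the test runs job 0
# last. A minimal local-filesystem sketch of that rendezvous (the paths are
# hypothetical; the poll interval mirrors the 10s sleep in the source):
import os
import time

def collect_worker_outputs(final_path, base_path, num_workers, poll_secs=10):
    with open(final_path, "w", encoding="utf-8") as final_f:
        for worker_id in range(num_workers):
            done_path = "%s_done_%d" % (base_path, worker_id)
            while not os.path.exists(done_path):   # wait for this worker
                time.sleep(poll_secs)
            with open(done_path, encoding="utf-8") as f:
                final_f.write(f.read())
    for worker_id in range(num_workers):
        os.remove("%s_done_%d" % (base_path, worker_id))
# ----------------------------------------------------------------------------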
- inference.inference( - ckpt_path, infer_file, output_infer, hparams, num_workers, jobid=0) - - with open(output_infer) as f: - self.assertEqual(5, len(list(f))) - - def testBasicModelWithInferIndices(self): - hparams = common_test_utils.create_test_hparams( - encoder_type="uni", - num_layers=1, - attention="", - attention_architecture="", - use_residual=False, - inference_indices=[0]) - ckpt_path = self._createTestInferCheckpoint(hparams, - "basic_infer_with_indices") - infer_file = "nmt/testdata/test_infer_file" - output_infer = os.path.join(hparams.out_dir, "output_infer") - inference.inference(ckpt_path, infer_file, output_infer, hparams) - with open(output_infer) as f: - self.assertEqual(1, len(list(f))) - - def testAttentionModelWithInferIndices(self): - hparams = common_test_utils.create_test_hparams( - encoder_type="uni", - num_layers=1, - attention="scaled_luong", - attention_architecture="standard", - use_residual=False, - inference_indices=[1, 2]) - # TODO(rzhao): Make infer indices support batch_size > 1. - hparams.infer_batch_size = 1 - ckpt_path = self._createTestInferCheckpoint(hparams, - "attention_infer_with_indices") - infer_file = "nmt/testdata/test_infer_file" - output_infer = os.path.join(hparams.out_dir, "output_infer") - inference.inference(ckpt_path, infer_file, output_infer, hparams) - with open(output_infer) as f: - self.assertEqual(2, len(list(f))) - self.assertTrue(os.path.exists(output_infer+str(1)+".png")) - self.assertTrue(os.path.exists(output_infer+str(2)+".png")) + def _createTestInferCheckpoint(self, hparams, name): + # Prepare + hparams.vocab_prefix = "nmt/testdata/test_infer_vocab" + hparams.src_vocab_file = hparams.vocab_prefix + "." + hparams.src + hparams.tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt + out_dir = os.path.join(tf.test.get_temp_dir(), name) + os.makedirs(out_dir) + hparams.out_dir = out_dir + + # Create check point + model_creator = inference.get_model_creator(hparams) + infer_model = model_helper.create_infer_model(model_creator, hparams) + with self.test_session(graph=infer_model.graph) as sess: + loaded_model, global_step = model_helper.create_or_load_model( + infer_model.model, out_dir, sess, "infer_name" + ) + ckpt_path = loaded_model.saver.save( + sess, os.path.join(out_dir, "translate.ckpt"), global_step=global_step + ) + return ckpt_path + + def testBasicModel(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="uni", + num_layers=1, + attention="", + attention_architecture="", + use_residual=False, + ) + ckpt_path = self._createTestInferCheckpoint(hparams, "basic_infer") + infer_file = "nmt/testdata/test_infer_file" + output_infer = os.path.join(hparams.out_dir, "output_infer") + inference.inference(ckpt_path, infer_file, output_infer, hparams) + with open(output_infer) as f: + self.assertEqual(5, len(list(f))) + + def testBasicModelWithMultipleTranslations(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="uni", + num_layers=1, + attention="", + attention_architecture="", + use_residual=False, + num_translations_per_input=2, + beam_width=2, + ) + hparams.infer_mode = "beam_search" + + ckpt_path = self._createTestInferCheckpoint( + hparams, "multi_basic_infer") + infer_file = "nmt/testdata/test_infer_file" + output_infer = os.path.join(hparams.out_dir, "output_infer") + inference.inference(ckpt_path, infer_file, output_infer, hparams) + with open(output_infer) as f: + self.assertEqual(10, len(list(f))) + + def testAttentionModel(self): + hparams = 
common_test_utils.create_test_hparams( + encoder_type="uni", + num_layers=1, + attention="scaled_luong", + attention_architecture="standard", + use_residual=False, + ) + ckpt_path = self._createTestInferCheckpoint(hparams, "attention_infer") + infer_file = "nmt/testdata/test_infer_file" + output_infer = os.path.join(hparams.out_dir, "output_infer") + inference.inference(ckpt_path, infer_file, output_infer, hparams) + with open(output_infer) as f: + self.assertEqual(5, len(list(f))) + + def testMultiWorkers(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="uni", + num_layers=2, + attention="scaled_luong", + attention_architecture="standard", + use_residual=False, + ) + + num_workers = 3 + + # There are 5 examples, make batch_size=3 makes job0 has 3 examples, job1 + # has 2 examples, and job2 has 0 example. This helps testing some edge + # cases. + hparams.batch_size = 3 + + ckpt_path = self._createTestInferCheckpoint( + hparams, "multi_worker_infer") + infer_file = "nmt/testdata/test_infer_file" + output_infer = os.path.join(hparams.out_dir, "output_infer") + inference.inference( + ckpt_path, infer_file, output_infer, hparams, num_workers, jobid=1 + ) + + inference.inference( + ckpt_path, infer_file, output_infer, hparams, num_workers, jobid=2 + ) + + # Note: Need to start job 0 at the end; otherwise, it will block the testing + # thread. + inference.inference( + ckpt_path, infer_file, output_infer, hparams, num_workers, jobid=0 + ) + + with open(output_infer) as f: + self.assertEqual(5, len(list(f))) + + def testBasicModelWithInferIndices(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="uni", + num_layers=1, + attention="", + attention_architecture="", + use_residual=False, + inference_indices=[0], + ) + ckpt_path = self._createTestInferCheckpoint( + hparams, "basic_infer_with_indices") + infer_file = "nmt/testdata/test_infer_file" + output_infer = os.path.join(hparams.out_dir, "output_infer") + inference.inference(ckpt_path, infer_file, output_infer, hparams) + with open(output_infer) as f: + self.assertEqual(1, len(list(f))) + + def testAttentionModelWithInferIndices(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="uni", + num_layers=1, + attention="scaled_luong", + attention_architecture="standard", + use_residual=False, + inference_indices=[1, 2], + ) + # TODO(rzhao): Make infer indices support batch_size > 1. 
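# --- Reviewer illustration (not part of the patch) -------------------------
# When hparams.inference_indices is set, load_data keeps only the listed
# sentences and _decode_inference_indices writes one attention image per
# decoded index alongside the translation file, which is what the assertions
# in this test check. A small naming sketch, assuming a hypothetical output
# path and inference_indices == [1, 2]:
output_infer = "/tmp/output_infer"                 # made-up path
inference_indices = [1, 2]
image_files = [output_infer + str(i) + ".png" for i in inference_indices]
assert image_files == ["/tmp/output_infer1.png", "/tmp/output_infer2.png"]
# ----------------------------------------------------------------------------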
+ hparams.infer_batch_size = 1 + ckpt_path = self._createTestInferCheckpoint( + hparams, "attention_infer_with_indices" + ) + infer_file = "nmt/testdata/test_infer_file" + output_infer = os.path.join(hparams.out_dir, "output_infer") + inference.inference(ckpt_path, infer_file, output_infer, hparams) + with open(output_infer) as f: + self.assertEqual(2, len(list(f))) + self.assertTrue(os.path.exists(output_infer + str(1) + ".png")) + self.assertTrue(os.path.exists(output_infer + str(2) + ".png")) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/model.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/model.py index e0c4f4e03..99639668e 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/model.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/model.py @@ -34,844 +34,975 @@ __all__ = ["BaseModel", "Model"] -class TrainOutputTuple(collections.namedtuple( - "TrainOutputTuple", ("train_summary", "train_loss", "predict_count", - "global_step", "word_count", "batch_size", "grad_norm", - "learning_rate"))): - """To allow for flexibily in returing different outputs.""" - pass +class TrainOutputTuple( + collections.namedtuple( + "TrainOutputTuple", + ( + "train_summary", + "train_loss", + "predict_count", + "global_step", + "word_count", + "batch_size", + "grad_norm", + "learning_rate", + ), + ) +): + """To allow for flexibily in returing different outputs.""" + pass -class EvalOutputTuple(collections.namedtuple( - "EvalOutputTuple", ("eval_loss", "predict_count", "batch_size"))): - """To allow for flexibily in returing different outputs.""" - pass +class EvalOutputTuple( + collections.namedtuple( + "EvalOutputTuple", ("eval_loss", "predict_count", "batch_size") + ) +): + """To allow for flexibily in returing different outputs.""" -class InferOutputTuple(collections.namedtuple( - "InferOutputTuple", ("infer_logits", "infer_summary", "sample_id", - "sample_words"))): - """To allow for flexibily in returing different outputs.""" - pass + pass -class BaseModel(object): - """Sequence-to-sequence base class. - """ - - def __init__(self, - hparams, - mode, - iterator, - source_vocab_table, - target_vocab_table, - reverse_target_vocab_table=None, - scope=None, - extra_args=None): - """Create the model. - - Args: - hparams: Hyperparameter configurations. - mode: TRAIN | EVAL | INFER - iterator: Dataset Iterator that feeds data. - source_vocab_table: Lookup table mapping source words to ids. - target_vocab_table: Lookup table mapping target words to ids. - reverse_target_vocab_table: Lookup table mapping ids to target words. Only - required in INFER mode. Defaults to None. - scope: scope of the model. - extra_args: model_helper.ExtraArgs, for passing customizable functions. 
+class InferOutputTuple( + collections.namedtuple( + "InferOutputTuple", + ("infer_logits", "infer_summary", "sample_id", "sample_words"), + ) +): + """To allow for flexibily in returing different outputs.""" - """ - # Set params - self._set_params_initializer(hparams, mode, iterator, - source_vocab_table, target_vocab_table, - scope, extra_args) - - # Not used in general seq2seq models; when True, ignore decoder & training - self.extract_encoder_layers = (hasattr(hparams, "extract_encoder_layers") - and hparams.extract_encoder_layers) - - # Train graph - res = self.build_graph(hparams, scope=scope) - if not self.extract_encoder_layers: - self._set_train_or_infer(res, reverse_target_vocab_table, hparams) - - # Saver - self.saver = tf.train.Saver( - tf.global_variables(), max_to_keep=hparams.num_keep_ckpts) - - def _set_params_initializer(self, - hparams, - mode, - iterator, - source_vocab_table, - target_vocab_table, - scope, - extra_args=None): - """Set various params for self and initialize.""" - assert isinstance(iterator, iterator_utils.BatchedInput) - self.iterator = iterator - self.mode = mode - self.src_vocab_table = source_vocab_table - self.tgt_vocab_table = target_vocab_table - - self.src_vocab_size = hparams.src_vocab_size - self.tgt_vocab_size = hparams.tgt_vocab_size - self.num_gpus = hparams.num_gpus - self.time_major = hparams.time_major - - if hparams.use_char_encode: - assert (not self.time_major), ("Can't use time major for" - " char-level inputs.") - - self.dtype = tf.float32 - self.num_sampled_softmax = hparams.num_sampled_softmax - - # extra_args: to make it flexible for adding external customizable code - self.single_cell_fn = None - if extra_args: - self.single_cell_fn = extra_args.single_cell_fn - - # Set num units - self.num_units = hparams.num_units - - # Set num layers - self.num_encoder_layers = hparams.num_encoder_layers - self.num_decoder_layers = hparams.num_decoder_layers - assert self.num_encoder_layers - assert self.num_decoder_layers - - # Set num residual layers - if hasattr(hparams, "num_residual_layers"): # compatible common_test_utils - self.num_encoder_residual_layers = hparams.num_residual_layers - self.num_decoder_residual_layers = hparams.num_residual_layers - else: - self.num_encoder_residual_layers = hparams.num_encoder_residual_layers - self.num_decoder_residual_layers = hparams.num_decoder_residual_layers - - # Batch size - self.batch_size = tf.size(self.iterator.source_sequence_length) - - # Global step - self.global_step = tf.Variable(0, trainable=False) - - # Initializer - self.random_seed = hparams.random_seed - initializer = model_helper.get_initializer( - hparams.init_op, self.random_seed, hparams.init_weight) - tf.get_variable_scope().set_initializer(initializer) - - # Embeddings - if extra_args and extra_args.encoder_emb_lookup_fn: - self.encoder_emb_lookup_fn = extra_args.encoder_emb_lookup_fn - else: - self.encoder_emb_lookup_fn = tf.nn.embedding_lookup - self.init_embeddings(hparams, scope) - - def _set_train_or_infer(self, res, reverse_target_vocab_table, hparams): - """Set up training and inference.""" - if self.mode == tf.contrib.learn.ModeKeys.TRAIN: - self.train_loss = res[1] - self.word_count = tf.reduce_sum( - self.iterator.source_sequence_length) + tf.reduce_sum( - self.iterator.target_sequence_length) - elif self.mode == tf.contrib.learn.ModeKeys.EVAL: - self.eval_loss = res[1] - elif self.mode == tf.contrib.learn.ModeKeys.INFER: - self.infer_logits, _, self.final_context_state, self.sample_id = res - self.sample_words = 
reverse_target_vocab_table.lookup( - tf.to_int64(self.sample_id)) - - if self.mode != tf.contrib.learn.ModeKeys.INFER: - ## Count the number of predicted words for compute ppl. - self.predict_count = tf.reduce_sum( - self.iterator.target_sequence_length) - - params = tf.trainable_variables() - - # Gradients and SGD update operation for training the model. - # Arrange for the embedding vars to appear at the beginning. - if self.mode == tf.contrib.learn.ModeKeys.TRAIN: - self.learning_rate = tf.constant(hparams.learning_rate) - # warm-up - self.learning_rate = self._get_learning_rate_warmup(hparams) - # decay - self.learning_rate = self._get_learning_rate_decay(hparams) - - # Optimizer - if hparams.optimizer == "sgd": - opt = tf.train.GradientDescentOptimizer(self.learning_rate) - elif hparams.optimizer == "adam": - opt = tf.train.AdamOptimizer(self.learning_rate) - else: - raise ValueError("Unknown optimizer type %s" % hparams.optimizer) - - # Gradients - gradients = tf.gradients( - self.train_loss, - params, - colocate_gradients_with_ops=hparams.colocate_gradients_with_ops) - - clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip( - gradients, max_gradient_norm=hparams.max_gradient_norm) - self.grad_norm_summary = grad_norm_summary - self.grad_norm = grad_norm - - self.update = opt.apply_gradients( - zip(clipped_grads, params), global_step=self.global_step) - - # Summary - self.train_summary = self._get_train_summary() - elif self.mode == tf.contrib.learn.ModeKeys.INFER: - self.infer_summary = self._get_infer_summary(hparams) - - # Print trainable variables - utils.print_out("# Trainable variables") - utils.print_out("Format: , , <(soft) device placement>") - for param in params: - utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()), - param.op.device)) - - def _get_learning_rate_warmup(self, hparams): - """Get learning rate warmup.""" - warmup_steps = hparams.warmup_steps - warmup_scheme = hparams.warmup_scheme - utils.print_out(" learning_rate=%g, warmup_steps=%d, warmup_scheme=%s" % - (hparams.learning_rate, warmup_steps, warmup_scheme)) - - # Apply inverse decay if global steps less than warmup steps. 
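# --- Reviewer illustration (not part of the patch) -------------------------
# Worked example of the "t2t" warmup scheme implemented just below, assuming
# learning_rate=1.0 and warmup_steps=200: the rate starts 100x smaller and
# the gap shrinks exponentially until the configured rate is reached at
# step == warmup_steps.
learning_rate = 1.0
warmup_steps = 200
warmup_factor = 0.01 ** (1.0 / warmup_steps)   # exp(log(0.01) / warmup_steps)

def warmed_up_lr(step):
    if step < warmup_steps:
        return learning_rate * warmup_factor ** (warmup_steps - step)
    return learning_rate

assert abs(warmed_up_lr(0) - 0.01) < 1e-9            # 100x smaller at step 0
assert abs(warmed_up_lr(warmup_steps) - 1.0) < 1e-9  # full rate after warmup
# ----------------------------------------------------------------------------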
- # Inspired by https://arxiv.org/pdf/1706.03762.pdf (Section 5.3) - # When step < warmup_steps, - # learing_rate *= warmup_factor ** (warmup_steps - step) - if warmup_scheme == "t2t": - # 0.01^(1/warmup_steps): we start with a lr, 100 times smaller - warmup_factor = tf.exp(tf.log(0.01) / warmup_steps) - inv_decay = warmup_factor**( - tf.to_float(warmup_steps - self.global_step)) - else: - raise ValueError("Unknown warmup scheme %s" % warmup_scheme) - - return tf.cond( - self.global_step < hparams.warmup_steps, - lambda: inv_decay * self.learning_rate, - lambda: self.learning_rate, - name="learning_rate_warump_cond") - - def _get_decay_info(self, hparams): - """Return decay info based on decay_scheme.""" - if hparams.decay_scheme in ["luong5", "luong10", "luong234"]: - decay_factor = 0.5 - if hparams.decay_scheme == "luong5": - start_decay_step = int(hparams.num_train_steps / 2) - decay_times = 5 - elif hparams.decay_scheme == "luong10": - start_decay_step = int(hparams.num_train_steps / 2) - decay_times = 10 - elif hparams.decay_scheme == "luong234": - start_decay_step = int(hparams.num_train_steps * 2 / 3) - decay_times = 4 - remain_steps = hparams.num_train_steps - start_decay_step - decay_steps = int(remain_steps / decay_times) - elif not hparams.decay_scheme: # no decay - start_decay_step = hparams.num_train_steps - decay_steps = 0 - decay_factor = 1.0 - elif hparams.decay_scheme: - raise ValueError("Unknown decay scheme %s" % hparams.decay_scheme) - return start_decay_step, decay_steps, decay_factor - - def _get_learning_rate_decay(self, hparams): - """Get learning rate decay.""" - start_decay_step, decay_steps, decay_factor = self._get_decay_info(hparams) - utils.print_out(" decay_scheme=%s, start_decay_step=%d, decay_steps %d, " - "decay_factor %g" % (hparams.decay_scheme, - start_decay_step, - decay_steps, - decay_factor)) - - return tf.cond( - self.global_step < start_decay_step, - lambda: self.learning_rate, - lambda: tf.train.exponential_decay( - self.learning_rate, - (self.global_step - start_decay_step), - decay_steps, decay_factor, staircase=True), - name="learning_rate_decay_cond") - - def init_embeddings(self, hparams, scope): - """Init embeddings.""" - self.embedding_encoder, self.embedding_decoder = ( - model_helper.create_emb_for_encoder_and_decoder( - share_vocab=hparams.share_vocab, - src_vocab_size=self.src_vocab_size, - tgt_vocab_size=self.tgt_vocab_size, - src_embed_size=self.num_units, - tgt_embed_size=self.num_units, - num_enc_partitions=hparams.num_enc_emb_partitions, - num_dec_partitions=hparams.num_dec_emb_partitions, - src_vocab_file=hparams.src_vocab_file, - tgt_vocab_file=hparams.tgt_vocab_file, - src_embed_file=hparams.src_embed_file, - tgt_embed_file=hparams.tgt_embed_file, - use_char_encode=hparams.use_char_encode, - scope=scope,)) - - def _get_train_summary(self): - """Get train summary.""" - train_summary = tf.summary.merge( - [tf.summary.scalar("lr", self.learning_rate), - tf.summary.scalar("train_loss", self.train_loss)] + - self.grad_norm_summary) - return train_summary - - def train(self, sess): - """Execute train graph.""" - assert self.mode == tf.contrib.learn.ModeKeys.TRAIN - output_tuple = TrainOutputTuple(train_summary=self.train_summary, - train_loss=self.train_loss, - predict_count=self.predict_count, - global_step=self.global_step, - word_count=self.word_count, - batch_size=self.batch_size, - grad_norm=self.grad_norm, - learning_rate=self.learning_rate) - return sess.run([self.update, output_tuple]) - - def eval(self, sess): - """Execute 
eval graph.""" - assert self.mode == tf.contrib.learn.ModeKeys.EVAL - output_tuple = EvalOutputTuple(eval_loss=self.eval_loss, - predict_count=self.predict_count, - batch_size=self.batch_size) - return sess.run(output_tuple) - - def build_graph(self, hparams, scope=None): - """Subclass must implement this method. - - Creates a sequence-to-sequence model with dynamic RNN decoder API. - Args: - hparams: Hyperparameter configurations. - scope: VariableScope for the created subgraph; default "dynamic_seq2seq". - - Returns: - A tuple of the form (logits, loss_tuple, final_context_state, sample_id), - where: - logits: float32 Tensor [batch_size x num_decoder_symbols]. - loss: loss = the total loss / batch_size. - final_context_state: the final state of decoder RNN. - sample_id: sampling indices. - - Raises: - ValueError: if encoder_type differs from mono and bi, or - attention_option is not (luong | scaled_luong | - bahdanau | normed_bahdanau). - """ - utils.print_out("# Creating %s graph ..." % self.mode) - - # Projection - if not self.extract_encoder_layers: - with tf.variable_scope(scope or "build_network"): - with tf.variable_scope("decoder/output_projection"): - self.output_layer = tf.layers.Dense( - self.tgt_vocab_size, use_bias=False, name="output_projection") - - with tf.variable_scope(scope or "dynamic_seq2seq", dtype=self.dtype): - # Encoder - if hparams.language_model: # no encoder for language modeling - utils.print_out(" language modeling: no encoder") - self.encoder_outputs = None - encoder_state = None - else: - self.encoder_outputs, encoder_state = self._build_encoder(hparams) - - # Skip decoder if extracting only encoder layers - if self.extract_encoder_layers: - return - - ## Decoder - logits, decoder_cell_outputs, sample_id, final_context_state = ( - self._build_decoder(self.encoder_outputs, encoder_state, hparams)) - - ## Loss - if self.mode != tf.contrib.learn.ModeKeys.INFER: - with tf.device(model_helper.get_device_str(self.num_encoder_layers - 1, - self.num_gpus)): - loss = self._compute_loss(logits, decoder_cell_outputs) - else: - loss = tf.constant(0.0) - - return logits, loss, final_context_state, sample_id - - @abc.abstractmethod - def _build_encoder(self, hparams): - """Subclass must implement this. - - Build and run an RNN encoder. - - Args: - hparams: Hyperparameters configurations. - - Returns: - A tuple of encoder_outputs and encoder_state. 
- """ pass - def _build_encoder_cell(self, hparams, num_layers, num_residual_layers, - base_gpu=0): - """Build a multi-layer RNN cell that can be used by encoder.""" - - return model_helper.create_rnn_cell( - unit_type=hparams.unit_type, - num_units=self.num_units, - num_layers=num_layers, - num_residual_layers=num_residual_layers, - forget_bias=hparams.forget_bias, - dropout=hparams.dropout, - num_gpus=hparams.num_gpus, - mode=self.mode, - base_gpu=base_gpu, - single_cell_fn=self.single_cell_fn) - - def _get_infer_maximum_iterations(self, hparams, source_sequence_length): - """Maximum decoding steps at inference time.""" - if hparams.tgt_max_len_infer: - maximum_iterations = hparams.tgt_max_len_infer - utils.print_out(" decoding maximum_iterations %d" % maximum_iterations) - else: - # TODO(thangluong): add decoding_length_factor flag - decoding_length_factor = 2.0 - max_encoder_length = tf.reduce_max(source_sequence_length) - maximum_iterations = tf.to_int32(tf.round( - tf.to_float(max_encoder_length) * decoding_length_factor)) - return maximum_iterations - - def _build_decoder(self, encoder_outputs, encoder_state, hparams): - """Build and run a RNN decoder with a final projection layer. - - Args: - encoder_outputs: The outputs of encoder for every time step. - encoder_state: The final state of the encoder. - hparams: The Hyperparameters configurations. - - Returns: - A tuple of final logits and final decoder state: - logits: size [time, batch_size, vocab_size] when time_major=True. - """ - tgt_sos_id = tf.cast(self.tgt_vocab_table.lookup(tf.constant(hparams.sos)), - tf.int32) - tgt_eos_id = tf.cast(self.tgt_vocab_table.lookup(tf.constant(hparams.eos)), - tf.int32) - iterator = self.iterator - - # maximum_iteration: The maximum decoding steps. - maximum_iterations = self._get_infer_maximum_iterations( - hparams, iterator.source_sequence_length) - - ## Decoder. - with tf.variable_scope("decoder") as decoder_scope: - cell, decoder_initial_state = self._build_decoder_cell( - hparams, encoder_outputs, encoder_state, - iterator.source_sequence_length) - - # Optional ops depends on which mode we are in and which loss function we - # are using. - logits = tf.no_op() - decoder_cell_outputs = None - - ## Train or eval - if self.mode != tf.contrib.learn.ModeKeys.INFER: - # decoder_emp_inp: [max_time, batch_size, num_units] - target_input = iterator.target_input - if self.time_major: - target_input = tf.transpose(target_input) - decoder_emb_inp = tf.nn.embedding_lookup( - self.embedding_decoder, target_input) - - # Helper - helper = tf.contrib.seq2seq.TrainingHelper( - decoder_emb_inp, iterator.target_sequence_length, - time_major=self.time_major) - - # Decoder - my_decoder = tf.contrib.seq2seq.BasicDecoder( - cell, - helper, - decoder_initial_state,) - - # Dynamic decoding - outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode( - my_decoder, - output_time_major=self.time_major, - swap_memory=True, - scope=decoder_scope) - sample_id = outputs.sample_id +class BaseModel(object): + """Sequence-to-sequence base class.""" + + def __init__( + self, + hparams, + mode, + iterator, + source_vocab_table, + target_vocab_table, + reverse_target_vocab_table=None, + scope=None, + extra_args=None, + ): + """Create the model. + + Args: + hparams: Hyperparameter configurations. + mode: TRAIN | EVAL | INFER + iterator: Dataset Iterator that feeds data. + source_vocab_table: Lookup table mapping source words to ids. + target_vocab_table: Lookup table mapping target words to ids. 
+ reverse_target_vocab_table: Lookup table mapping ids to target words. Only + required in INFER mode. Defaults to None. + scope: scope of the model. + extra_args: model_helper.ExtraArgs, for passing customizable functions. + + """ + # Set params + self._set_params_initializer( + hparams, + mode, + iterator, + source_vocab_table, + target_vocab_table, + scope, + extra_args, + ) + + # Not used in general seq2seq models; when True, ignore decoder & + # training + self.extract_encoder_layers = ( + hasattr(hparams, "extract_encoder_layers") + and hparams.extract_encoder_layers + ) + + # Train graph + res = self.build_graph(hparams, scope=scope) + if not self.extract_encoder_layers: + self._set_train_or_infer(res, reverse_target_vocab_table, hparams) + + # Saver + self.saver = tf.train.Saver( + tf.global_variables(), max_to_keep=hparams.num_keep_ckpts + ) + + def _set_params_initializer( + self, + hparams, + mode, + iterator, + source_vocab_table, + target_vocab_table, + scope, + extra_args=None, + ): + """Set various params for self and initialize.""" + assert isinstance(iterator, iterator_utils.BatchedInput) + self.iterator = iterator + self.mode = mode + self.src_vocab_table = source_vocab_table + self.tgt_vocab_table = target_vocab_table + + self.src_vocab_size = hparams.src_vocab_size + self.tgt_vocab_size = hparams.tgt_vocab_size + self.num_gpus = hparams.num_gpus + self.time_major = hparams.time_major + + if hparams.use_char_encode: + assert not self.time_major, "Can't use time major for" " char-level inputs." + + self.dtype = tf.float32 + self.num_sampled_softmax = hparams.num_sampled_softmax + + # extra_args: to make it flexible for adding external customizable code + self.single_cell_fn = None + if extra_args: + self.single_cell_fn = extra_args.single_cell_fn + + # Set num units + self.num_units = hparams.num_units + + # Set num layers + self.num_encoder_layers = hparams.num_encoder_layers + self.num_decoder_layers = hparams.num_decoder_layers + assert self.num_encoder_layers + assert self.num_decoder_layers + + # Set num residual layers + if hasattr( + hparams, "num_residual_layers"): # compatible common_test_utils + self.num_encoder_residual_layers = hparams.num_residual_layers + self.num_decoder_residual_layers = hparams.num_residual_layers + else: + self.num_encoder_residual_layers = hparams.num_encoder_residual_layers + self.num_decoder_residual_layers = hparams.num_decoder_residual_layers - if self.num_sampled_softmax > 0: - # Note: this is required when using sampled_softmax_loss. - decoder_cell_outputs = outputs.rnn_output - - # Note: there's a subtle difference here between train and inference. - # We could have set output_layer when create my_decoder - # and shared more code between train and inference. - # We chose to apply the output_layer to all timesteps for speed: - # 10% improvements for small models & 20% for larger ones. - # If memory is a concern, we should apply output_layer per timestep. - num_layers = self.num_decoder_layers - num_gpus = self.num_gpus - device_id = num_layers if num_layers < num_gpus else (num_layers - 1) - # Colocate output layer with the last RNN cell if there is no extra GPU - # available. Otherwise, put last layer on a separate GPU. - with tf.device(model_helper.get_device_str(device_id, num_gpus)): - logits = self.output_layer(outputs.rnn_output) + # Batch size + self.batch_size = tf.size(self.iterator.source_sequence_length) - if self.num_sampled_softmax > 0: - logits = tf.no_op() # unused when using sampled softmax loss. 
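The comment carried over in the hunk above weighs applying `output_layer` to all decoder timesteps at once against applying it per timestep. A minimal NumPy sketch, outside the diff proper and with all shapes and names invented for illustration, shows the two strategies give identical logits; per the original comment, the one-shot projection is faster while the per-step version keeps peak memory lower.

import numpy as np

# Invented sizes for the illustration only.
time_steps, batch_size, num_units, vocab_size = 5, 3, 8, 11
rng = np.random.default_rng(0)
rnn_output = rng.normal(size=(time_steps, batch_size, num_units))
kernel = rng.normal(size=(num_units, vocab_size))  # stand-in for output_layer.kernel

# One projection over every timestep at once (what the decoder code does for speed).
logits_all = rnn_output @ kernel

# Per-timestep projection: lower peak memory, same numbers.
logits_per_step = np.stack([rnn_output[t] @ kernel for t in range(time_steps)])

assert np.allclose(logits_all, logits_per_step)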
+ # Global step + self.global_step = tf.Variable(0, trainable=False) - ## Inference - else: - infer_mode = hparams.infer_mode - start_tokens = tf.fill([self.batch_size], tgt_sos_id) - end_token = tgt_eos_id + # Initializer + self.random_seed = hparams.random_seed + initializer = model_helper.get_initializer( + hparams.init_op, self.random_seed, hparams.init_weight + ) + tf.get_variable_scope().set_initializer(initializer) + + # Embeddings + if extra_args and extra_args.encoder_emb_lookup_fn: + self.encoder_emb_lookup_fn = extra_args.encoder_emb_lookup_fn + else: + self.encoder_emb_lookup_fn = tf.nn.embedding_lookup + self.init_embeddings(hparams, scope) + + def _set_train_or_infer(self, res, reverse_target_vocab_table, hparams): + """Set up training and inference.""" + if self.mode == tf.contrib.learn.ModeKeys.TRAIN: + self.train_loss = res[1] + self.word_count = tf.reduce_sum( + self.iterator.source_sequence_length + ) + tf.reduce_sum(self.iterator.target_sequence_length) + elif self.mode == tf.contrib.learn.ModeKeys.EVAL: + self.eval_loss = res[1] + elif self.mode == tf.contrib.learn.ModeKeys.INFER: + self.infer_logits, _, self.final_context_state, self.sample_id = res + self.sample_words = reverse_target_vocab_table.lookup( + tf.to_int64(self.sample_id) + ) + + if self.mode != tf.contrib.learn.ModeKeys.INFER: + # Count the number of predicted words for compute ppl. + self.predict_count = tf.reduce_sum( + self.iterator.target_sequence_length) + + params = tf.trainable_variables() + + # Gradients and SGD update operation for training the model. + # Arrange for the embedding vars to appear at the beginning. + if self.mode == tf.contrib.learn.ModeKeys.TRAIN: + self.learning_rate = tf.constant(hparams.learning_rate) + # warm-up + self.learning_rate = self._get_learning_rate_warmup(hparams) + # decay + self.learning_rate = self._get_learning_rate_decay(hparams) + + # Optimizer + if hparams.optimizer == "sgd": + opt = tf.train.GradientDescentOptimizer(self.learning_rate) + elif hparams.optimizer == "adam": + opt = tf.train.AdamOptimizer(self.learning_rate) + else: + raise ValueError( + "Unknown optimizer type %s" % + hparams.optimizer) + + # Gradients + gradients = tf.gradients( + self.train_loss, + params, + colocate_gradients_with_ops=hparams.colocate_gradients_with_ops, + ) + + clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip( + gradients, max_gradient_norm=hparams.max_gradient_norm + ) + self.grad_norm_summary = grad_norm_summary + self.grad_norm = grad_norm + + self.update = opt.apply_gradients( + zip(clipped_grads, params), global_step=self.global_step + ) + + # Summary + self.train_summary = self._get_train_summary() + elif self.mode == tf.contrib.learn.ModeKeys.INFER: + self.infer_summary = self._get_infer_summary(hparams) + + # Print trainable variables + utils.print_out("# Trainable variables") + utils.print_out("Format: , , <(soft) device placement>") + for param in params: + utils.print_out( + " %s, %s, %s" % (param.name, str( + param.get_shape()), param.op.device) + ) + + def _get_learning_rate_warmup(self, hparams): + """Get learning rate warmup.""" + warmup_steps = hparams.warmup_steps + warmup_scheme = hparams.warmup_scheme utils.print_out( - " decoder: infer_mode=%sbeam_width=%d, length_penalty=%f" % ( - infer_mode, hparams.beam_width, hparams.length_penalty_weight)) - - if infer_mode == "beam_search": - beam_width = hparams.beam_width - length_penalty_weight = hparams.length_penalty_weight - - my_decoder = tf.contrib.seq2seq.BeamSearchDecoder( - 
cell=cell, - embedding=self.embedding_decoder, - start_tokens=start_tokens, - end_token=end_token, - initial_state=decoder_initial_state, - beam_width=beam_width, - output_layer=self.output_layer, - length_penalty_weight=length_penalty_weight) - elif infer_mode == "sample": - # Helper - sampling_temperature = hparams.sampling_temperature - assert sampling_temperature > 0.0, ( - "sampling_temperature must greater than 0.0 when using sample" - " decoder.") - helper = tf.contrib.seq2seq.SampleEmbeddingHelper( - self.embedding_decoder, start_tokens, end_token, - softmax_temperature=sampling_temperature, - seed=self.random_seed) - elif infer_mode == "greedy": - helper = tf.contrib.seq2seq.GreedyEmbeddingHelper( - self.embedding_decoder, start_tokens, end_token) + " learning_rate=%g, warmup_steps=%d, warmup_scheme=%s" + % (hparams.learning_rate, warmup_steps, warmup_scheme) + ) + + # Apply inverse decay if global steps less than warmup steps. + # Inspired by https://arxiv.org/pdf/1706.03762.pdf (Section 5.3) + # When step < warmup_steps, + # learing_rate *= warmup_factor ** (warmup_steps - step) + if warmup_scheme == "t2t": + # 0.01^(1/warmup_steps): we start with a lr, 100 times smaller + warmup_factor = tf.exp(tf.log(0.01) / warmup_steps) + inv_decay = warmup_factor ** ( + tf.to_float(warmup_steps - self.global_step)) else: - raise ValueError("Unknown infer_mode '%s'", infer_mode) - - if infer_mode != "beam_search": - my_decoder = tf.contrib.seq2seq.BasicDecoder( - cell, - helper, - decoder_initial_state, - output_layer=self.output_layer # applied per timestep - ) - - # Dynamic decoding - outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode( - my_decoder, - maximum_iterations=maximum_iterations, - output_time_major=self.time_major, - swap_memory=True, - scope=decoder_scope) - - if infer_mode == "beam_search": - sample_id = outputs.predicted_ids + raise ValueError("Unknown warmup scheme %s" % warmup_scheme) + + return tf.cond( + self.global_step < hparams.warmup_steps, + lambda: inv_decay * self.learning_rate, + lambda: self.learning_rate, + name="learning_rate_warump_cond", + ) + + def _get_decay_info(self, hparams): + """Return decay info based on decay_scheme.""" + if hparams.decay_scheme in ["luong5", "luong10", "luong234"]: + decay_factor = 0.5 + if hparams.decay_scheme == "luong5": + start_decay_step = int(hparams.num_train_steps / 2) + decay_times = 5 + elif hparams.decay_scheme == "luong10": + start_decay_step = int(hparams.num_train_steps / 2) + decay_times = 10 + elif hparams.decay_scheme == "luong234": + start_decay_step = int(hparams.num_train_steps * 2 / 3) + decay_times = 4 + remain_steps = hparams.num_train_steps - start_decay_step + decay_steps = int(remain_steps / decay_times) + elif not hparams.decay_scheme: # no decay + start_decay_step = hparams.num_train_steps + decay_steps = 0 + decay_factor = 1.0 + elif hparams.decay_scheme: + raise ValueError("Unknown decay scheme %s" % hparams.decay_scheme) + return start_decay_step, decay_steps, decay_factor + + def _get_learning_rate_decay(self, hparams): + """Get learning rate decay.""" + start_decay_step, decay_steps, decay_factor = self._get_decay_info( + hparams) + utils.print_out( + " decay_scheme=%s, start_decay_step=%d, decay_steps %d, " + "decay_factor %g" + % (hparams.decay_scheme, start_decay_step, decay_steps, decay_factor) + ) + + return tf.cond( + self.global_step < start_decay_step, + lambda: self.learning_rate, + lambda: tf.train.exponential_decay( + self.learning_rate, + (self.global_step - 
start_decay_step), + decay_steps, + decay_factor, + staircase=True, + ), + name="learning_rate_decay_cond", + ) + + def init_embeddings(self, hparams, scope): + """Init embeddings.""" + self.embedding_encoder, self.embedding_decoder = ( + model_helper.create_emb_for_encoder_and_decoder( + share_vocab=hparams.share_vocab, + src_vocab_size=self.src_vocab_size, + tgt_vocab_size=self.tgt_vocab_size, + src_embed_size=self.num_units, + tgt_embed_size=self.num_units, + num_enc_partitions=hparams.num_enc_emb_partitions, + num_dec_partitions=hparams.num_dec_emb_partitions, + src_vocab_file=hparams.src_vocab_file, + tgt_vocab_file=hparams.tgt_vocab_file, + src_embed_file=hparams.src_embed_file, + tgt_embed_file=hparams.tgt_embed_file, + use_char_encode=hparams.use_char_encode, + scope=scope, + ) + ) + + def _get_train_summary(self): + """Get train summary.""" + train_summary = tf.summary.merge( + [ + tf.summary.scalar("lr", self.learning_rate), + tf.summary.scalar("train_loss", self.train_loss), + ] + + self.grad_norm_summary + ) + return train_summary + + def train(self, sess): + """Execute train graph.""" + assert self.mode == tf.contrib.learn.ModeKeys.TRAIN + output_tuple = TrainOutputTuple( + train_summary=self.train_summary, + train_loss=self.train_loss, + predict_count=self.predict_count, + global_step=self.global_step, + word_count=self.word_count, + batch_size=self.batch_size, + grad_norm=self.grad_norm, + learning_rate=self.learning_rate, + ) + return sess.run([self.update, output_tuple]) + + def eval(self, sess): + """Execute eval graph.""" + assert self.mode == tf.contrib.learn.ModeKeys.EVAL + output_tuple = EvalOutputTuple( + eval_loss=self.eval_loss, + predict_count=self.predict_count, + batch_size=self.batch_size, + ) + return sess.run(output_tuple) + + def build_graph(self, hparams, scope=None): + """Subclass must implement this method. + + Creates a sequence-to-sequence model with dynamic RNN decoder API. + Args: + hparams: Hyperparameter configurations. + scope: VariableScope for the created subgraph; default "dynamic_seq2seq". + + Returns: + A tuple of the form (logits, loss_tuple, final_context_state, sample_id), + where: + logits: float32 Tensor [batch_size x num_decoder_symbols]. + loss: loss = the total loss / batch_size. + final_context_state: the final state of decoder RNN. + sample_id: sampling indices. + + Raises: + ValueError: if encoder_type differs from mono and bi, or + attention_option is not (luong | scaled_luong | + bahdanau | normed_bahdanau). + """ + utils.print_out("# Creating %s graph ..." 
% self.mode) + + # Projection + if not self.extract_encoder_layers: + with tf.variable_scope(scope or "build_network"): + with tf.variable_scope("decoder/output_projection"): + self.output_layer = tf.layers.Dense( + self.tgt_vocab_size, use_bias=False, name="output_projection" + ) + + with tf.variable_scope(scope or "dynamic_seq2seq", dtype=self.dtype): + # Encoder + if hparams.language_model: # no encoder for language modeling + utils.print_out(" language modeling: no encoder") + self.encoder_outputs = None + encoder_state = None + else: + self.encoder_outputs, encoder_state = self._build_encoder( + hparams) + + # Skip decoder if extracting only encoder layers + if self.extract_encoder_layers: + return + + # Decoder + logits, decoder_cell_outputs, sample_id, final_context_state = ( + self._build_decoder( + self.encoder_outputs, encoder_state, hparams) + ) + + # Loss + if self.mode != tf.contrib.learn.ModeKeys.INFER: + with tf.device( + model_helper.get_device_str( + self.num_encoder_layers - 1, self.num_gpus + ) + ): + loss = self._compute_loss(logits, decoder_cell_outputs) + else: + loss = tf.constant(0.0) + + return logits, loss, final_context_state, sample_id + + @abc.abstractmethod + def _build_encoder(self, hparams): + """Subclass must implement this. + + Build and run an RNN encoder. + + Args: + hparams: Hyperparameters configurations. + + Returns: + A tuple of encoder_outputs and encoder_state. + """ + pass + + def _build_encoder_cell(self, hparams, num_layers, + num_residual_layers, base_gpu=0): + """Build a multi-layer RNN cell that can be used by encoder.""" + + return model_helper.create_rnn_cell( + unit_type=hparams.unit_type, + num_units=self.num_units, + num_layers=num_layers, + num_residual_layers=num_residual_layers, + forget_bias=hparams.forget_bias, + dropout=hparams.dropout, + num_gpus=hparams.num_gpus, + mode=self.mode, + base_gpu=base_gpu, + single_cell_fn=self.single_cell_fn, + ) + + def _get_infer_maximum_iterations(self, hparams, source_sequence_length): + """Maximum decoding steps at inference time.""" + if hparams.tgt_max_len_infer: + maximum_iterations = hparams.tgt_max_len_infer + utils.print_out( + " decoding maximum_iterations %d" % + maximum_iterations) else: - logits = outputs.rnn_output - sample_id = outputs.sample_id - - return logits, decoder_cell_outputs, sample_id, final_context_state - - def get_max_time(self, tensor): - time_axis = 0 if self.time_major else 1 - return tensor.shape[time_axis].value or tf.shape(tensor)[time_axis] - - @abc.abstractmethod - def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, - source_sequence_length): - """Subclass must implement this. + # TODO(thangluong): add decoding_length_factor flag + decoding_length_factor = 2.0 + max_encoder_length = tf.reduce_max(source_sequence_length) + maximum_iterations = tf.to_int32( + tf.round( + tf.to_float(max_encoder_length) * + decoding_length_factor) + ) + return maximum_iterations + + def _build_decoder(self, encoder_outputs, encoder_state, hparams): + """Build and run a RNN decoder with a final projection layer. + + Args: + encoder_outputs: The outputs of encoder for every time step. + encoder_state: The final state of the encoder. + hparams: The Hyperparameters configurations. + + Returns: + A tuple of final logits and final decoder state: + logits: size [time, batch_size, vocab_size] when time_major=True. 
+ """ + tgt_sos_id = tf.cast( + self.tgt_vocab_table.lookup(tf.constant(hparams.sos)), tf.int32 + ) + tgt_eos_id = tf.cast( + self.tgt_vocab_table.lookup(tf.constant(hparams.eos)), tf.int32 + ) + iterator = self.iterator + + # maximum_iteration: The maximum decoding steps. + maximum_iterations = self._get_infer_maximum_iterations( + hparams, iterator.source_sequence_length + ) + + # Decoder. + with tf.variable_scope("decoder") as decoder_scope: + cell, decoder_initial_state = self._build_decoder_cell( + hparams, encoder_outputs, encoder_state, iterator.source_sequence_length + ) + + # Optional ops depends on which mode we are in and which loss function we + # are using. + logits = tf.no_op() + decoder_cell_outputs = None + + # Train or eval + if self.mode != tf.contrib.learn.ModeKeys.INFER: + # decoder_emp_inp: [max_time, batch_size, num_units] + target_input = iterator.target_input + if self.time_major: + target_input = tf.transpose(target_input) + decoder_emb_inp = tf.nn.embedding_lookup( + self.embedding_decoder, target_input + ) + + # Helper + helper = tf.contrib.seq2seq.TrainingHelper( + decoder_emb_inp, + iterator.target_sequence_length, + time_major=self.time_major, + ) + + # Decoder + my_decoder = tf.contrib.seq2seq.BasicDecoder( + cell, + helper, + decoder_initial_state, + ) + + # Dynamic decoding + outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode( + my_decoder, + output_time_major=self.time_major, + swap_memory=True, + scope=decoder_scope, + ) + + sample_id = outputs.sample_id + + if self.num_sampled_softmax > 0: + # Note: this is required when using sampled_softmax_loss. + decoder_cell_outputs = outputs.rnn_output + + # Note: there's a subtle difference here between train and inference. + # We could have set output_layer when create my_decoder + # and shared more code between train and inference. + # We chose to apply the output_layer to all timesteps for speed: + # 10% improvements for small models & 20% for larger ones. + # If memory is a concern, we should apply output_layer per + # timestep. + num_layers = self.num_decoder_layers + num_gpus = self.num_gpus + device_id = num_layers if num_layers < num_gpus else ( + num_layers - 1) + # Colocate output layer with the last RNN cell if there is no extra GPU + # available. Otherwise, put last layer on a separate GPU. + with tf.device(model_helper.get_device_str(device_id, num_gpus)): + logits = self.output_layer(outputs.rnn_output) + + if self.num_sampled_softmax > 0: + # unused when using sampled softmax loss. + logits = tf.no_op() + + # Inference + else: + infer_mode = hparams.infer_mode + start_tokens = tf.fill([self.batch_size], tgt_sos_id) + end_token = tgt_eos_id + utils.print_out( + " decoder: infer_mode=%sbeam_width=%d, length_penalty=%f" + % (infer_mode, hparams.beam_width, hparams.length_penalty_weight) + ) + + if infer_mode == "beam_search": + beam_width = hparams.beam_width + length_penalty_weight = hparams.length_penalty_weight + + my_decoder = tf.contrib.seq2seq.BeamSearchDecoder( + cell=cell, + embedding=self.embedding_decoder, + start_tokens=start_tokens, + end_token=end_token, + initial_state=decoder_initial_state, + beam_width=beam_width, + output_layer=self.output_layer, + length_penalty_weight=length_penalty_weight, + ) + elif infer_mode == "sample": + # Helper + sampling_temperature = hparams.sampling_temperature + assert sampling_temperature > 0.0, ( + "sampling_temperature must greater than 0.0 when using sample" + " decoder." 
+ ) + helper = tf.contrib.seq2seq.SampleEmbeddingHelper( + self.embedding_decoder, + start_tokens, + end_token, + softmax_temperature=sampling_temperature, + seed=self.random_seed, + ) + elif infer_mode == "greedy": + helper = tf.contrib.seq2seq.GreedyEmbeddingHelper( + self.embedding_decoder, start_tokens, end_token + ) + else: + raise ValueError("Unknown infer_mode '%s'", infer_mode) + + if infer_mode != "beam_search": + my_decoder = tf.contrib.seq2seq.BasicDecoder( + cell, + helper, + decoder_initial_state, + output_layer=self.output_layer, # applied per timestep + ) + + # Dynamic decoding + outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode( + my_decoder, + maximum_iterations=maximum_iterations, + output_time_major=self.time_major, + swap_memory=True, + scope=decoder_scope, + ) + + if infer_mode == "beam_search": + sample_id = outputs.predicted_ids + else: + logits = outputs.rnn_output + sample_id = outputs.sample_id + + return logits, decoder_cell_outputs, sample_id, final_context_state + + def get_max_time(self, tensor): + time_axis = 0 if self.time_major else 1 + return tensor.shape[time_axis].value or tf.shape(tensor)[time_axis] + + @abc.abstractmethod + def _build_decoder_cell( + self, hparams, encoder_outputs, encoder_state, source_sequence_length + ): + """Subclass must implement this. + + Args: + hparams: Hyperparameters configurations. + encoder_outputs: The outputs of encoder for every time step. + encoder_state: The final state of the encoder. + source_sequence_length: sequence length of encoder_outputs. + + Returns: + A tuple of a multi-layer RNN cell used by decoder and the intial state of + the decoder RNN. + """ + pass + + def _softmax_cross_entropy_loss( + self, logits, decoder_cell_outputs, labels): + """Compute softmax loss or sampled softmax loss.""" + if self.num_sampled_softmax > 0: - Args: - hparams: Hyperparameters configurations. - encoder_outputs: The outputs of encoder for every time step. - encoder_state: The final state of the encoder. - source_sequence_length: sequence length of encoder_outputs. + is_sequence = decoder_cell_outputs.shape.ndims == 3 + + if is_sequence: + labels = tf.reshape(labels, [-1, 1]) + inputs = tf.reshape(decoder_cell_outputs, [-1, self.num_units]) + + crossent = tf.nn.sampled_softmax_loss( + weights=tf.transpose(self.output_layer.kernel), + biases=self.output_layer.bias or tf.zeros( + [self.tgt_vocab_size]), + labels=labels, + inputs=inputs, + num_sampled=self.num_sampled_softmax, + num_classes=self.tgt_vocab_size, + partition_strategy="div", + seed=self.random_seed, + ) + + if is_sequence: + if self.time_major: + crossent = tf.reshape(crossent, [-1, self.batch_size]) + else: + crossent = tf.reshape(crossent, [self.batch_size, -1]) - Returns: - A tuple of a multi-layer RNN cell used by decoder and the intial state of - the decoder RNN. 
- """ - pass - - def _softmax_cross_entropy_loss( - self, logits, decoder_cell_outputs, labels): - """Compute softmax loss or sampled softmax loss.""" - if self.num_sampled_softmax > 0: + else: + crossent = tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=labels, logits=logits + ) - is_sequence = (decoder_cell_outputs.shape.ndims == 3) + return crossent - if is_sequence: - labels = tf.reshape(labels, [-1, 1]) - inputs = tf.reshape(decoder_cell_outputs, [-1, self.num_units]) + def _compute_loss(self, logits, decoder_cell_outputs): + """Compute optimization loss.""" + target_output = self.iterator.target_output + if self.time_major: + target_output = tf.transpose(target_output) + max_time = self.get_max_time(target_output) - crossent = tf.nn.sampled_softmax_loss( - weights=tf.transpose(self.output_layer.kernel), - biases=self.output_layer.bias or tf.zeros([self.tgt_vocab_size]), - labels=labels, - inputs=inputs, - num_sampled=self.num_sampled_softmax, - num_classes=self.tgt_vocab_size, - partition_strategy="div", - seed=self.random_seed) + crossent = self._softmax_cross_entropy_loss( + logits, decoder_cell_outputs, target_output + ) - if is_sequence: + target_weights = tf.sequence_mask( + self.iterator.target_sequence_length, max_time, dtype=self.dtype + ) if self.time_major: - crossent = tf.reshape(crossent, [-1, self.batch_size]) + target_weights = tf.transpose(target_weights) + + loss = tf.reduce_sum(crossent * target_weights) / \ + tf.to_float(self.batch_size) + return loss + + def _get_infer_summary(self, hparams): + del hparams + return tf.no_op() + + def infer(self, sess): + assert self.mode == tf.contrib.learn.ModeKeys.INFER + output_tuple = InferOutputTuple( + infer_logits=self.infer_logits, + infer_summary=self.infer_summary, + sample_id=self.sample_id, + sample_words=self.sample_words, + ) + return sess.run(output_tuple) + + def decode(self, sess): + """Decode a batch. + + Args: + sess: tensorflow session to use. + + Returns: + A tuple consiting of outputs, infer_summary. + outputs: of size [batch_size, time] + """ + output_tuple = self.infer(sess) + sample_words = output_tuple.sample_words + infer_summary = output_tuple.infer_summary + + # make sure outputs is of shape [batch_size, time] or [beam_width, + # batch_size, time] when using beam search. + if self.time_major: + sample_words = sample_words.transpose() + elif sample_words.ndim == 3: + # beam search output in [batch_size, time, beam_width] shape. 
+ sample_words = sample_words.transpose([2, 0, 1]) + return sample_words, infer_summary + + def build_encoder_states(self, include_embeddings=False): + """Stack encoder states and return tensor [batch, length, layer, size].""" + assert self.mode == tf.contrib.learn.ModeKeys.INFER + if include_embeddings: + stack_state_list = tf.stack( + [self.encoder_emb_inp] + self.encoder_state_list, 2 + ) else: - crossent = tf.reshape(crossent, [self.batch_size, -1]) - - else: - crossent = tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=labels, logits=logits) - - return crossent - - def _compute_loss(self, logits, decoder_cell_outputs): - """Compute optimization loss.""" - target_output = self.iterator.target_output - if self.time_major: - target_output = tf.transpose(target_output) - max_time = self.get_max_time(target_output) - - crossent = self._softmax_cross_entropy_loss( - logits, decoder_cell_outputs, target_output) - - target_weights = tf.sequence_mask( - self.iterator.target_sequence_length, max_time, dtype=self.dtype) - if self.time_major: - target_weights = tf.transpose(target_weights) - - loss = tf.reduce_sum( - crossent * target_weights) / tf.to_float(self.batch_size) - return loss - - def _get_infer_summary(self, hparams): - del hparams - return tf.no_op() - - def infer(self, sess): - assert self.mode == tf.contrib.learn.ModeKeys.INFER - output_tuple = InferOutputTuple(infer_logits=self.infer_logits, - infer_summary=self.infer_summary, - sample_id=self.sample_id, - sample_words=self.sample_words) - return sess.run(output_tuple) - - def decode(self, sess): - """Decode a batch. - - Args: - sess: tensorflow session to use. - - Returns: - A tuple consiting of outputs, infer_summary. - outputs: of size [batch_size, time] - """ - output_tuple = self.infer(sess) - sample_words = output_tuple.sample_words - infer_summary = output_tuple.infer_summary - - # make sure outputs is of shape [batch_size, time] or [beam_width, - # batch_size, time] when using beam search. - if self.time_major: - sample_words = sample_words.transpose() - elif sample_words.ndim == 3: - # beam search output in [batch_size, time, beam_width] shape. - sample_words = sample_words.transpose([2, 0, 1]) - return sample_words, infer_summary - - def build_encoder_states(self, include_embeddings=False): - """Stack encoder states and return tensor [batch, length, layer, size].""" - assert self.mode == tf.contrib.learn.ModeKeys.INFER - if include_embeddings: - stack_state_list = tf.stack( - [self.encoder_emb_inp] + self.encoder_state_list, 2) - else: - stack_state_list = tf.stack(self.encoder_state_list, 2) - - # transform from [length, batch, ...] -> [batch, length, ...] - if self.time_major: - stack_state_list = tf.transpose(stack_state_list, [1, 0, 2, 3]) - - return stack_state_list + stack_state_list = tf.stack(self.encoder_state_list, 2) + # transform from [length, batch, ...] -> [batch, length, ...] + if self.time_major: + stack_state_list = tf.transpose(stack_state_list, [1, 0, 2, 3]) -class Model(BaseModel): - """Sequence-to-sequence dynamic model. - - This class implements a multi-layer recurrent neural network as encoder, - and a multi-layer recurrent neural network decoder. - """ - def _build_encoder_from_sequence(self, hparams, sequence, sequence_length): - """Build an encoder from a sequence. + return stack_state_list - Args: - hparams: hyperparameters. - sequence: tensor with input sequence data. - sequence_length: tensor with length of the input sequence. - Returns: - encoder_outputs: RNN encoder outputs. 
- encoder_state: RNN encoder state. +class Model(BaseModel): + """Sequence-to-sequence dynamic model. - Raises: - ValueError: if encoder_type is neither "uni" nor "bi". + This class implements a multi-layer recurrent neural network as encoder, + and a multi-layer recurrent neural network decoder. """ - num_layers = self.num_encoder_layers - num_residual_layers = self.num_encoder_residual_layers - if self.time_major: - sequence = tf.transpose(sequence) + def _build_encoder_from_sequence(self, hparams, sequence, sequence_length): + """Build an encoder from a sequence. - with tf.variable_scope("encoder") as scope: - dtype = scope.dtype + Args: + hparams: hyperparameters. + sequence: tensor with input sequence data. + sequence_length: tensor with length of the input sequence. - self.encoder_emb_inp = self.encoder_emb_lookup_fn( - self.embedding_encoder, sequence) + Returns: + encoder_outputs: RNN encoder outputs. + encoder_state: RNN encoder state. - # Encoder_outputs: [max_time, batch_size, num_units] - if hparams.encoder_type == "uni": - utils.print_out(" num_layers = %d, num_residual_layers=%d" % - (num_layers, num_residual_layers)) - cell = self._build_encoder_cell(hparams, num_layers, - num_residual_layers) + Raises: + ValueError: if encoder_type is neither "uni" nor "bi". + """ + num_layers = self.num_encoder_layers + num_residual_layers = self.num_encoder_residual_layers - encoder_outputs, encoder_state = tf.nn.dynamic_rnn( - cell, - self.encoder_emb_inp, + if self.time_major: + sequence = tf.transpose(sequence) + + with tf.variable_scope("encoder") as scope: + dtype = scope.dtype + + self.encoder_emb_inp = self.encoder_emb_lookup_fn( + self.embedding_encoder, sequence + ) + + # Encoder_outputs: [max_time, batch_size, num_units] + if hparams.encoder_type == "uni": + utils.print_out( + " num_layers = %d, num_residual_layers=%d" + % (num_layers, num_residual_layers) + ) + cell = self._build_encoder_cell( + hparams, num_layers, num_residual_layers + ) + + encoder_outputs, encoder_state = tf.nn.dynamic_rnn( + cell, + self.encoder_emb_inp, + dtype=dtype, + sequence_length=sequence_length, + time_major=self.time_major, + swap_memory=True, + ) + elif hparams.encoder_type == "bi": + num_bi_layers = int(num_layers / 2) + num_bi_residual_layers = int(num_residual_layers / 2) + utils.print_out( + " num_bi_layers = %d, num_bi_residual_layers=%d" + % (num_bi_layers, num_bi_residual_layers) + ) + + encoder_outputs, bi_encoder_state = self._build_bidirectional_rnn( + inputs=self.encoder_emb_inp, + sequence_length=sequence_length, + dtype=dtype, + hparams=hparams, + num_bi_layers=num_bi_layers, + num_bi_residual_layers=num_bi_residual_layers, + ) + + if num_bi_layers == 1: + encoder_state = bi_encoder_state + else: + # alternatively concat forward and backward states + encoder_state = [] + for layer_id in range(num_bi_layers): + encoder_state.append( + bi_encoder_state[0][layer_id]) # forward + encoder_state.append( + bi_encoder_state[1][layer_id]) # backward + encoder_state = tuple(encoder_state) + else: + raise ValueError( + "Unknown encoder_type %s" % + hparams.encoder_type) + + # Use the top layer for now + self.encoder_state_list = [encoder_outputs] + + return encoder_outputs, encoder_state + + def _build_encoder(self, hparams): + """Build encoder from source.""" + utils.print_out("# Build a basic encoder") + return self._build_encoder_from_sequence( + hparams, self.iterator.source, self.iterator.source_sequence_length + ) + + def _build_bidirectional_rnn( + self, + inputs, + sequence_length, + 
dtype, + hparams, + num_bi_layers, + num_bi_residual_layers, + base_gpu=0, + ): + """Create and call biddirectional RNN cells. + + Args: + num_residual_layers: Number of residual layers from top to bottom. For + example, if `num_bi_layers=4` and `num_residual_layers=2`, the last 2 RNN + layers in each RNN cell will be wrapped with `ResidualWrapper`. + base_gpu: The gpu device id to use for the first forward RNN layer. The + i-th forward RNN layer will use `(base_gpu + i) % num_gpus` as its + device id. The `base_gpu` for backward RNN cell is `(base_gpu + + num_bi_layers)`. + + Returns: + The concatenated bidirectional output and the bidirectional RNN cell"s + state. + """ + # Construct forward and backward cells + fw_cell = self._build_encoder_cell( + hparams, num_bi_layers, num_bi_residual_layers, base_gpu=base_gpu + ) + bw_cell = self._build_encoder_cell( + hparams, + num_bi_layers, + num_bi_residual_layers, + base_gpu=(base_gpu + num_bi_layers), + ) + + bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn( + fw_cell, + bw_cell, + inputs, dtype=dtype, sequence_length=sequence_length, time_major=self.time_major, - swap_memory=True) - elif hparams.encoder_type == "bi": - num_bi_layers = int(num_layers / 2) - num_bi_residual_layers = int(num_residual_layers / 2) - utils.print_out(" num_bi_layers = %d, num_bi_residual_layers=%d" % - (num_bi_layers, num_bi_residual_layers)) - - encoder_outputs, bi_encoder_state = ( - self._build_bidirectional_rnn( - inputs=self.encoder_emb_inp, - sequence_length=sequence_length, - dtype=dtype, - hparams=hparams, - num_bi_layers=num_bi_layers, - num_bi_residual_layers=num_bi_residual_layers)) - - if num_bi_layers == 1: - encoder_state = bi_encoder_state + swap_memory=True, + ) + + return tf.concat(bi_outputs, -1), bi_state + + def _build_decoder_cell( + self, + hparams, + encoder_outputs, + encoder_state, + source_sequence_length, + base_gpu=0, + ): + """Build an RNN cell that can be used by decoder.""" + # We only make use of encoder_outputs in attention-based models + if hparams.attention: + raise ValueError("BasicModel doesn't support attention.") + + cell = model_helper.create_rnn_cell( + unit_type=hparams.unit_type, + num_units=self.num_units, + num_layers=self.num_decoder_layers, + num_residual_layers=self.num_decoder_residual_layers, + forget_bias=hparams.forget_bias, + dropout=hparams.dropout, + num_gpus=self.num_gpus, + mode=self.mode, + single_cell_fn=self.single_cell_fn, + base_gpu=base_gpu, + ) + + if hparams.language_model: + encoder_state = cell.zero_state(self.batch_size, self.dtype) + elif not hparams.pass_hidden_state: + raise ValueError( + "For non-attentional model, " + "pass_hidden_state needs to be set to True" + ) + + # For beam search, we need to replicate encoder infos beam_width times + if ( + self.mode == tf.contrib.learn.ModeKeys.INFER + and hparams.infer_mode == "beam_search" + ): + decoder_initial_state = tf.contrib.seq2seq.tile_batch( + encoder_state, multiplier=hparams.beam_width + ) else: - # alternatively concat forward and backward states - encoder_state = [] - for layer_id in range(num_bi_layers): - encoder_state.append(bi_encoder_state[0][layer_id]) # forward - encoder_state.append(bi_encoder_state[1][layer_id]) # backward - encoder_state = tuple(encoder_state) - else: - raise ValueError("Unknown encoder_type %s" % hparams.encoder_type) - - # Use the top layer for now - self.encoder_state_list = [encoder_outputs] - - return encoder_outputs, encoder_state - - def _build_encoder(self, hparams): - """Build encoder from 
source.""" - utils.print_out("# Build a basic encoder") - return self._build_encoder_from_sequence( - hparams, self.iterator.source, self.iterator.source_sequence_length) - - def _build_bidirectional_rnn(self, inputs, sequence_length, - dtype, hparams, - num_bi_layers, - num_bi_residual_layers, - base_gpu=0): - """Create and call biddirectional RNN cells. - - Args: - num_residual_layers: Number of residual layers from top to bottom. For - example, if `num_bi_layers=4` and `num_residual_layers=2`, the last 2 RNN - layers in each RNN cell will be wrapped with `ResidualWrapper`. - base_gpu: The gpu device id to use for the first forward RNN layer. The - i-th forward RNN layer will use `(base_gpu + i) % num_gpus` as its - device id. The `base_gpu` for backward RNN cell is `(base_gpu + - num_bi_layers)`. - - Returns: - The concatenated bidirectional output and the bidirectional RNN cell"s - state. - """ - # Construct forward and backward cells - fw_cell = self._build_encoder_cell(hparams, - num_bi_layers, - num_bi_residual_layers, - base_gpu=base_gpu) - bw_cell = self._build_encoder_cell(hparams, - num_bi_layers, - num_bi_residual_layers, - base_gpu=(base_gpu + num_bi_layers)) - - bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn( - fw_cell, - bw_cell, - inputs, - dtype=dtype, - sequence_length=sequence_length, - time_major=self.time_major, - swap_memory=True) - - return tf.concat(bi_outputs, -1), bi_state - - def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state, - source_sequence_length, base_gpu=0): - """Build an RNN cell that can be used by decoder.""" - # We only make use of encoder_outputs in attention-based models - if hparams.attention: - raise ValueError("BasicModel doesn't support attention.") - - cell = model_helper.create_rnn_cell( - unit_type=hparams.unit_type, - num_units=self.num_units, - num_layers=self.num_decoder_layers, - num_residual_layers=self.num_decoder_residual_layers, - forget_bias=hparams.forget_bias, - dropout=hparams.dropout, - num_gpus=self.num_gpus, - mode=self.mode, - single_cell_fn=self.single_cell_fn, - base_gpu=base_gpu - ) + decoder_initial_state = encoder_state - if hparams.language_model: - encoder_state = cell.zero_state(self.batch_size, self.dtype) - elif not hparams.pass_hidden_state: - raise ValueError("For non-attentional model, " - "pass_hidden_state needs to be set to True") - - # For beam search, we need to replicate encoder infos beam_width times - if (self.mode == tf.contrib.learn.ModeKeys.INFER and - hparams.infer_mode == "beam_search"): - decoder_initial_state = tf.contrib.seq2seq.tile_batch( - encoder_state, multiplier=hparams.beam_width) - else: - decoder_initial_state = encoder_state - - return cell, decoder_initial_state + return cell, decoder_initial_state diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/model_helper.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/model_helper.py index 65e111414..5a2ab2ac8 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/model_helper.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/model_helper.py @@ -29,11 +29,18 @@ from .utils import vocab_utils __all__ = [ - "get_initializer", "get_device_str", "create_train_model", - "create_eval_model", "create_infer_model", - "create_emb_for_encoder_and_decoder", "create_rnn_cell", "gradient_clip", - "create_or_load_model", "load_model", "avg_checkpoints", - "compute_perplexity" + "get_initializer", + "get_device_str", + "create_train_model", + "create_eval_model", + "create_infer_model", + 
"create_emb_for_encoder_and_decoder", + "create_rnn_cell", + "gradient_clip", + "create_or_load_model", + "load_model", + "avg_checkpoints", + "compute_perplexity", ] # If a vocab size is greater than this value, put the embedding on cpu instead @@ -41,623 +48,725 @@ def get_initializer(init_op, seed=None, init_weight=None): - """Create an initializer. init_weight is only for uniform.""" - if init_op == "uniform": - assert init_weight - return tf.random_uniform_initializer( - -init_weight, init_weight, seed=seed) - elif init_op == "glorot_normal": - return tf.keras.initializers.glorot_normal( - seed=seed) - elif init_op == "glorot_uniform": - return tf.keras.initializers.glorot_uniform( - seed=seed) - else: - raise ValueError("Unknown init_op %s" % init_op) + """Create an initializer. init_weight is only for uniform.""" + if init_op == "uniform": + assert init_weight + return tf.random_uniform_initializer(-init_weight, + init_weight, seed=seed) + elif init_op == "glorot_normal": + return tf.keras.initializers.glorot_normal(seed=seed) + elif init_op == "glorot_uniform": + return tf.keras.initializers.glorot_uniform(seed=seed) + else: + raise ValueError("Unknown init_op %s" % init_op) def get_device_str(device_id, num_gpus): - """Return a device string for multi-GPU setup.""" - if num_gpus == 0: - return "/cpu:0" - device_str_output = "/gpu:%d" % (device_id % num_gpus) - return device_str_output - - -class ExtraArgs(collections.namedtuple( - "ExtraArgs", ("single_cell_fn", "model_device_fn", - "attention_mechanism_fn", "encoder_emb_lookup_fn"))): - pass + """Return a device string for multi-GPU setup.""" + if num_gpus == 0: + return "/cpu:0" + device_str_output = "/gpu:%d" % (device_id % num_gpus) + return device_str_output + + +class ExtraArgs( + collections.namedtuple( + "ExtraArgs", + ( + "single_cell_fn", + "model_device_fn", + "attention_mechanism_fn", + "encoder_emb_lookup_fn", + ), + ) +): + pass class TrainModel( - collections.namedtuple("TrainModel", ("graph", "model", "iterator", - "skip_count_placeholder"))): - pass + collections.namedtuple( + "TrainModel", ("graph", "model", "iterator", "skip_count_placeholder") + ) +): + pass def create_train_model( - model_creator, hparams, scope=None, num_workers=1, jobid=0, - extra_args=None): - """Create train graph, model, and iterator.""" - src_file = "%s.%s" % (hparams.train_prefix, hparams.src) - tgt_file = "%s.%s" % (hparams.train_prefix, hparams.tgt) - src_vocab_file = hparams.src_vocab_file - tgt_vocab_file = hparams.tgt_vocab_file - - graph = tf.Graph() - - with graph.as_default(), tf.container(scope or "train"): - src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables( - src_vocab_file, tgt_vocab_file, hparams.share_vocab) - - src_dataset = tf.data.TextLineDataset(tf.gfile.Glob(src_file)) - tgt_dataset = tf.data.TextLineDataset(tf.gfile.Glob(tgt_file)) - skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64) - - iterator = iterator_utils.get_iterator( - src_dataset, - tgt_dataset, - src_vocab_table, - tgt_vocab_table, - batch_size=hparams.batch_size, - sos=hparams.sos, - eos=hparams.eos, - random_seed=hparams.random_seed, - num_buckets=hparams.num_buckets, - src_max_len=hparams.src_max_len, - tgt_max_len=hparams.tgt_max_len, - skip_count=skip_count_placeholder, - num_shards=num_workers, - shard_index=jobid, - use_char_encode=hparams.use_char_encode) - - # Note: One can set model_device_fn to - # `tf.train.replica_device_setter(ps_tasks)` for distributed training. 
- model_device_fn = None - if extra_args: model_device_fn = extra_args.model_device_fn - with tf.device(model_device_fn): - model = model_creator( - hparams, - iterator=iterator, - mode=tf.contrib.learn.ModeKeys.TRAIN, - source_vocab_table=src_vocab_table, - target_vocab_table=tgt_vocab_table, - scope=scope, - extra_args=extra_args) - - return TrainModel( - graph=graph, - model=model, - iterator=iterator, - skip_count_placeholder=skip_count_placeholder) + model_creator, hparams, scope=None, num_workers=1, jobid=0, extra_args=None +): + """Create train graph, model, and iterator.""" + src_file = "%s.%s" % (hparams.train_prefix, hparams.src) + tgt_file = "%s.%s" % (hparams.train_prefix, hparams.tgt) + src_vocab_file = hparams.src_vocab_file + tgt_vocab_file = hparams.tgt_vocab_file + + graph = tf.Graph() + + with graph.as_default(), tf.container(scope or "train"): + src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables( + src_vocab_file, tgt_vocab_file, hparams.share_vocab + ) + + src_dataset = tf.data.TextLineDataset(tf.gfile.Glob(src_file)) + tgt_dataset = tf.data.TextLineDataset(tf.gfile.Glob(tgt_file)) + skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64) + + iterator = iterator_utils.get_iterator( + src_dataset, + tgt_dataset, + src_vocab_table, + tgt_vocab_table, + batch_size=hparams.batch_size, + sos=hparams.sos, + eos=hparams.eos, + random_seed=hparams.random_seed, + num_buckets=hparams.num_buckets, + src_max_len=hparams.src_max_len, + tgt_max_len=hparams.tgt_max_len, + skip_count=skip_count_placeholder, + num_shards=num_workers, + shard_index=jobid, + use_char_encode=hparams.use_char_encode, + ) + + # Note: One can set model_device_fn to + # `tf.train.replica_device_setter(ps_tasks)` for distributed training. + model_device_fn = None + if extra_args: + model_device_fn = extra_args.model_device_fn + with tf.device(model_device_fn): + model = model_creator( + hparams, + iterator=iterator, + mode=tf.contrib.learn.ModeKeys.TRAIN, + source_vocab_table=src_vocab_table, + target_vocab_table=tgt_vocab_table, + scope=scope, + extra_args=extra_args, + ) + + return TrainModel( + graph=graph, + model=model, + iterator=iterator, + skip_count_placeholder=skip_count_placeholder, + ) class EvalModel( - collections.namedtuple("EvalModel", - ("graph", "model", "src_file_placeholder", - "tgt_file_placeholder", "iterator"))): - pass + collections.namedtuple( + "EvalModel", + ("graph", "model", "src_file_placeholder", + "tgt_file_placeholder", "iterator"), + ) +): + pass def create_eval_model(model_creator, hparams, scope=None, extra_args=None): - """Create train graph, model, src/tgt file holders, and iterator.""" - src_vocab_file = hparams.src_vocab_file - tgt_vocab_file = hparams.tgt_vocab_file - graph = tf.Graph() - - with graph.as_default(), tf.container(scope or "eval"): - src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables( - src_vocab_file, tgt_vocab_file, hparams.share_vocab) - reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_file( - tgt_vocab_file, default_value=vocab_utils.UNK) - - src_file_placeholder = tf.placeholder(shape=(), dtype=tf.string) - tgt_file_placeholder = tf.placeholder(shape=(), dtype=tf.string) - src_dataset = tf.data.TextLineDataset(src_file_placeholder) - tgt_dataset = tf.data.TextLineDataset(tgt_file_placeholder) - iterator = iterator_utils.get_iterator( - src_dataset, - tgt_dataset, - src_vocab_table, - tgt_vocab_table, - hparams.batch_size, - sos=hparams.sos, - eos=hparams.eos, - random_seed=hparams.random_seed, - 
num_buckets=hparams.num_buckets, - src_max_len=hparams.src_max_len_infer, - tgt_max_len=hparams.tgt_max_len_infer, - use_char_encode=hparams.use_char_encode) - model = model_creator( - hparams, + """Create train graph, model, src/tgt file holders, and iterator.""" + src_vocab_file = hparams.src_vocab_file + tgt_vocab_file = hparams.tgt_vocab_file + graph = tf.Graph() + + with graph.as_default(), tf.container(scope or "eval"): + src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables( + src_vocab_file, tgt_vocab_file, hparams.share_vocab + ) + reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_file( + tgt_vocab_file, default_value=vocab_utils.UNK + ) + + src_file_placeholder = tf.placeholder(shape=(), dtype=tf.string) + tgt_file_placeholder = tf.placeholder(shape=(), dtype=tf.string) + src_dataset = tf.data.TextLineDataset(src_file_placeholder) + tgt_dataset = tf.data.TextLineDataset(tgt_file_placeholder) + iterator = iterator_utils.get_iterator( + src_dataset, + tgt_dataset, + src_vocab_table, + tgt_vocab_table, + hparams.batch_size, + sos=hparams.sos, + eos=hparams.eos, + random_seed=hparams.random_seed, + num_buckets=hparams.num_buckets, + src_max_len=hparams.src_max_len_infer, + tgt_max_len=hparams.tgt_max_len_infer, + use_char_encode=hparams.use_char_encode, + ) + model = model_creator( + hparams, + iterator=iterator, + mode=tf.contrib.learn.ModeKeys.EVAL, + source_vocab_table=src_vocab_table, + target_vocab_table=tgt_vocab_table, + reverse_target_vocab_table=reverse_tgt_vocab_table, + scope=scope, + extra_args=extra_args, + ) + return EvalModel( + graph=graph, + model=model, + src_file_placeholder=src_file_placeholder, + tgt_file_placeholder=tgt_file_placeholder, iterator=iterator, - mode=tf.contrib.learn.ModeKeys.EVAL, - source_vocab_table=src_vocab_table, - target_vocab_table=tgt_vocab_table, - reverse_target_vocab_table=reverse_tgt_vocab_table, - scope=scope, - extra_args=extra_args) - return EvalModel( - graph=graph, - model=model, - src_file_placeholder=src_file_placeholder, - tgt_file_placeholder=tgt_file_placeholder, - iterator=iterator) + ) class InferModel( - collections.namedtuple("InferModel", - ("graph", "model", "src_placeholder", - "batch_size_placeholder", "iterator"))): - pass + collections.namedtuple( + "InferModel", + ("graph", "model", "src_placeholder", + "batch_size_placeholder", "iterator"), + ) +): + pass def create_infer_model(model_creator, hparams, scope=None, extra_args=None): - """Create inference model.""" - graph = tf.Graph() - src_vocab_file = hparams.src_vocab_file - tgt_vocab_file = hparams.tgt_vocab_file - - with graph.as_default(), tf.container(scope or "infer"): - src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables( - src_vocab_file, tgt_vocab_file, hparams.share_vocab) - reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_file( - tgt_vocab_file, default_value=vocab_utils.UNK) - - src_placeholder = tf.placeholder(shape=[None], dtype=tf.string) - batch_size_placeholder = tf.placeholder(shape=[], dtype=tf.int64) - - src_dataset = tf.data.Dataset.from_tensor_slices( - src_placeholder) - iterator = iterator_utils.get_infer_iterator( - src_dataset, - src_vocab_table, - batch_size=batch_size_placeholder, - eos=hparams.eos, - src_max_len=hparams.src_max_len_infer, - use_char_encode=hparams.use_char_encode) - model = model_creator( - hparams, + """Create inference model.""" + graph = tf.Graph() + src_vocab_file = hparams.src_vocab_file + tgt_vocab_file = hparams.tgt_vocab_file + + with graph.as_default(), 
tf.container(scope or "infer"): + src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables( + src_vocab_file, tgt_vocab_file, hparams.share_vocab + ) + reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_file( + tgt_vocab_file, default_value=vocab_utils.UNK + ) + + src_placeholder = tf.placeholder(shape=[None], dtype=tf.string) + batch_size_placeholder = tf.placeholder(shape=[], dtype=tf.int64) + + src_dataset = tf.data.Dataset.from_tensor_slices(src_placeholder) + iterator = iterator_utils.get_infer_iterator( + src_dataset, + src_vocab_table, + batch_size=batch_size_placeholder, + eos=hparams.eos, + src_max_len=hparams.src_max_len_infer, + use_char_encode=hparams.use_char_encode, + ) + model = model_creator( + hparams, + iterator=iterator, + mode=tf.contrib.learn.ModeKeys.INFER, + source_vocab_table=src_vocab_table, + target_vocab_table=tgt_vocab_table, + reverse_target_vocab_table=reverse_tgt_vocab_table, + scope=scope, + extra_args=extra_args, + ) + return InferModel( + graph=graph, + model=model, + src_placeholder=src_placeholder, + batch_size_placeholder=batch_size_placeholder, iterator=iterator, - mode=tf.contrib.learn.ModeKeys.INFER, - source_vocab_table=src_vocab_table, - target_vocab_table=tgt_vocab_table, - reverse_target_vocab_table=reverse_tgt_vocab_table, - scope=scope, - extra_args=extra_args) - return InferModel( - graph=graph, - model=model, - src_placeholder=src_placeholder, - batch_size_placeholder=batch_size_placeholder, - iterator=iterator) + ) def _get_embed_device(vocab_size): - """Decide on which device to place an embed matrix given its vocab size.""" - if vocab_size > VOCAB_SIZE_THRESHOLD_CPU: - return "/cpu:0" - else: - return "/gpu:0" + """Decide on which device to place an embed matrix given its vocab size.""" + if vocab_size > VOCAB_SIZE_THRESHOLD_CPU: + return "/cpu:0" + else: + return "/gpu:0" def _create_pretrained_emb_from_txt( - vocab_file, embed_file, num_trainable_tokens=3, dtype=tf.float32, - scope=None): - """Load pretrain embeding from embed_file, and return an embedding matrix. - - Args: - embed_file: Path to a Glove formated embedding txt file. - num_trainable_tokens: Make the first n tokens in the vocab file as trainable - variables. Default is 3, which is "", "" and "". - """ - vocab, _ = vocab_utils.load_vocab(vocab_file) - trainable_tokens = vocab[:num_trainable_tokens] - - utils.print_out("# Using pretrained embedding: %s." 
% embed_file) - utils.print_out(" with trainable tokens: ") - - emb_dict, emb_size = vocab_utils.load_embed_txt(embed_file) - for token in trainable_tokens: - utils.print_out(" %s" % token) - if token not in emb_dict: - emb_dict[token] = [0.0] * emb_size - - emb_mat = np.array( - [emb_dict[token] for token in vocab], dtype=dtype.as_numpy_dtype()) - emb_mat = tf.constant(emb_mat) - emb_mat_const = tf.slice(emb_mat, [num_trainable_tokens, 0], [-1, -1]) - with tf.variable_scope(scope or "pretrain_embeddings", dtype=dtype) as scope: - with tf.device(_get_embed_device(num_trainable_tokens)): - emb_mat_var = tf.get_variable( - "emb_mat_var", [num_trainable_tokens, emb_size]) - return tf.concat([emb_mat_var, emb_mat_const], 0) - - -def _create_or_load_embed(embed_name, vocab_file, embed_file, - vocab_size, embed_size, dtype): - """Create a new or load an existing embedding matrix.""" - if vocab_file and embed_file: - embedding = _create_pretrained_emb_from_txt(vocab_file, embed_file) - else: - with tf.device(_get_embed_device(vocab_size)): - embedding = tf.get_variable( - embed_name, [vocab_size, embed_size], dtype) - return embedding - - -def create_emb_for_encoder_and_decoder(share_vocab, - src_vocab_size, - tgt_vocab_size, - src_embed_size, - tgt_embed_size, - dtype=tf.float32, - num_enc_partitions=0, - num_dec_partitions=0, - src_vocab_file=None, - tgt_vocab_file=None, - src_embed_file=None, - tgt_embed_file=None, - use_char_encode=False, - scope=None): - """Create embedding matrix for both encoder and decoder. - - Args: - share_vocab: A boolean. Whether to share embedding matrix for both - encoder and decoder. - src_vocab_size: An integer. The source vocab size. - tgt_vocab_size: An integer. The target vocab size. - src_embed_size: An integer. The embedding dimension for the encoder's - embedding. - tgt_embed_size: An integer. The embedding dimension for the decoder's - embedding. - dtype: dtype of the embedding matrix. Default to float32. - num_enc_partitions: number of partitions used for the encoder's embedding - vars. - num_dec_partitions: number of partitions used for the decoder's embedding - vars. - scope: VariableScope for the created subgraph. Default to "embedding". - - Returns: - embedding_encoder: Encoder's embedding matrix. - embedding_decoder: Decoder's embedding matrix. - - Raises: - ValueError: if use share_vocab but source and target have different vocab - size. - """ - if num_enc_partitions <= 1: - enc_partitioner = None - else: - # Note: num_partitions > 1 is required for distributed training due to - # embedding_lookup tries to colocate single partition-ed embedding variable - # with lookup ops. This may cause embedding variables being placed on worker - # jobs. - enc_partitioner = tf.fixed_size_partitioner(num_enc_partitions) - - if num_dec_partitions <= 1: - dec_partitioner = None - else: - # Note: num_partitions > 1 is required for distributed training due to - # embedding_lookup tries to colocate single partition-ed embedding variable - # with lookup ops. This may cause embedding variables being placed on worker - # jobs. 
- dec_partitioner = tf.fixed_size_partitioner(num_dec_partitions) - - if src_embed_file and enc_partitioner: - raise ValueError( - "Can't set num_enc_partitions > 1 when using pretrained encoder " - "embedding") - - if tgt_embed_file and dec_partitioner: - raise ValueError( - "Can't set num_dec_partitions > 1 when using pretrained decdoer " - "embedding") - - with tf.variable_scope( - scope or "embeddings", dtype=dtype, partitioner=enc_partitioner) as scope: - # Share embedding - if share_vocab: - if src_vocab_size != tgt_vocab_size: - raise ValueError("Share embedding but different src/tgt vocab sizes" - " %d vs. %d" % (src_vocab_size, tgt_vocab_size)) - assert src_embed_size == tgt_embed_size - utils.print_out("# Use the same embedding for source and target") - vocab_file = src_vocab_file or tgt_vocab_file - embed_file = src_embed_file or tgt_embed_file - - embedding_encoder = _create_or_load_embed( - "embedding_share", vocab_file, embed_file, - src_vocab_size, src_embed_size, dtype) - embedding_decoder = embedding_encoder + vocab_file, embed_file, num_trainable_tokens=3, dtype=tf.float32, scope=None +): + """Load pretrain embeding from embed_file, and return an embedding matrix. + + Args: + embed_file: Path to a Glove formated embedding txt file. + num_trainable_tokens: Make the first n tokens in the vocab file as trainable + variables. Default is 3, which is "", "" and "". + """ + vocab, _ = vocab_utils.load_vocab(vocab_file) + trainable_tokens = vocab[:num_trainable_tokens] + + utils.print_out("# Using pretrained embedding: %s." % embed_file) + utils.print_out(" with trainable tokens: ") + + emb_dict, emb_size = vocab_utils.load_embed_txt(embed_file) + for token in trainable_tokens: + utils.print_out(" %s" % token) + if token not in emb_dict: + emb_dict[token] = [0.0] * emb_size + + emb_mat = np.array( + [emb_dict[token] for token in vocab], dtype=dtype.as_numpy_dtype() + ) + emb_mat = tf.constant(emb_mat) + emb_mat_const = tf.slice(emb_mat, [num_trainable_tokens, 0], [-1, -1]) + with tf.variable_scope(scope or "pretrain_embeddings", dtype=dtype) as scope: + with tf.device(_get_embed_device(num_trainable_tokens)): + emb_mat_var = tf.get_variable( + "emb_mat_var", [num_trainable_tokens, emb_size] + ) + return tf.concat([emb_mat_var, emb_mat_const], 0) + + +def _create_or_load_embed( + embed_name, vocab_file, embed_file, vocab_size, embed_size, dtype +): + """Create a new or load an existing embedding matrix.""" + if vocab_file and embed_file: + embedding = _create_pretrained_emb_from_txt(vocab_file, embed_file) else: - if not use_char_encode: - with tf.variable_scope("encoder", partitioner=enc_partitioner): - embedding_encoder = _create_or_load_embed( - "embedding_encoder", src_vocab_file, src_embed_file, - src_vocab_size, src_embed_size, dtype) - else: - embedding_encoder = None - - with tf.variable_scope("decoder", partitioner=dec_partitioner): - embedding_decoder = _create_or_load_embed( - "embedding_decoder", tgt_vocab_file, tgt_embed_file, - tgt_vocab_size, tgt_embed_size, dtype) - - return embedding_encoder, embedding_decoder - - -def _single_cell(unit_type, num_units, forget_bias, dropout, mode, - residual_connection=False, device_str=None, residual_fn=None): - """Create an instance of a single RNN cell.""" - # dropout (= 1 - keep_prob) is set to 0 during eval and infer - dropout = dropout if mode == tf.contrib.learn.ModeKeys.TRAIN else 0.0 - - # Cell Type - if unit_type == "lstm": - utils.print_out(" LSTM, forget_bias=%g" % forget_bias, new_line=False) - single_cell = 
tf.contrib.rnn.BasicLSTMCell( - num_units, - forget_bias=forget_bias) - elif unit_type == "gru": - utils.print_out(" GRU", new_line=False) - single_cell = tf.contrib.rnn.GRUCell(num_units) - elif unit_type == "layer_norm_lstm": - utils.print_out(" Layer Normalized LSTM, forget_bias=%g" % forget_bias, - new_line=False) - single_cell = tf.contrib.rnn.LayerNormBasicLSTMCell( - num_units, - forget_bias=forget_bias, - layer_norm=True) - elif unit_type == "nas": - utils.print_out(" NASCell", new_line=False) - single_cell = tf.contrib.rnn.NASCell(num_units) - else: - raise ValueError("Unknown unit type %s!" % unit_type) - - # Dropout (= 1 - keep_prob) - if dropout > 0.0: - single_cell = tf.contrib.rnn.DropoutWrapper( - cell=single_cell, input_keep_prob=(1.0 - dropout)) - utils.print_out(" %s, dropout=%g " %(type(single_cell).__name__, dropout), - new_line=False) - - # Residual - if residual_connection: - single_cell = tf.contrib.rnn.ResidualWrapper( - single_cell, residual_fn=residual_fn) - utils.print_out(" %s" % type(single_cell).__name__, new_line=False) - - # Device Wrapper - if device_str: - single_cell = tf.contrib.rnn.DeviceWrapper(single_cell, device_str) - utils.print_out(" %s, device=%s" % - (type(single_cell).__name__, device_str), new_line=False) - - return single_cell - - -def _cell_list(unit_type, num_units, num_layers, num_residual_layers, - forget_bias, dropout, mode, num_gpus, base_gpu=0, - single_cell_fn=None, residual_fn=None): - """Create a list of RNN cells.""" - if not single_cell_fn: - single_cell_fn = _single_cell - - # Multi-GPU - cell_list = [] - for i in range(num_layers): - utils.print_out(" cell %d" % i, new_line=False) - single_cell = single_cell_fn( + with tf.device(_get_embed_device(vocab_size)): + embedding = tf.get_variable( + embed_name, [vocab_size, embed_size], dtype) + return embedding + + +def create_emb_for_encoder_and_decoder( + share_vocab, + src_vocab_size, + tgt_vocab_size, + src_embed_size, + tgt_embed_size, + dtype=tf.float32, + num_enc_partitions=0, + num_dec_partitions=0, + src_vocab_file=None, + tgt_vocab_file=None, + src_embed_file=None, + tgt_embed_file=None, + use_char_encode=False, + scope=None, +): + """Create embedding matrix for both encoder and decoder. + + Args: + share_vocab: A boolean. Whether to share embedding matrix for both + encoder and decoder. + src_vocab_size: An integer. The source vocab size. + tgt_vocab_size: An integer. The target vocab size. + src_embed_size: An integer. The embedding dimension for the encoder's + embedding. + tgt_embed_size: An integer. The embedding dimension for the decoder's + embedding. + dtype: dtype of the embedding matrix. Default to float32. + num_enc_partitions: number of partitions used for the encoder's embedding + vars. + num_dec_partitions: number of partitions used for the decoder's embedding + vars. + scope: VariableScope for the created subgraph. Default to "embedding". + + Returns: + embedding_encoder: Encoder's embedding matrix. + embedding_decoder: Decoder's embedding matrix. + + Raises: + ValueError: if use share_vocab but source and target have different vocab + size. + """ + if num_enc_partitions <= 1: + enc_partitioner = None + else: + # Note: num_partitions > 1 is required for distributed training due to + # embedding_lookup tries to colocate single partition-ed embedding variable + # with lookup ops. This may cause embedding variables being placed on worker + # jobs. 
+ enc_partitioner = tf.fixed_size_partitioner(num_enc_partitions) + + if num_dec_partitions <= 1: + dec_partitioner = None + else: + # Note: num_partitions > 1 is required for distributed training due to + # embedding_lookup tries to colocate single partition-ed embedding variable + # with lookup ops. This may cause embedding variables being placed on worker + # jobs. + dec_partitioner = tf.fixed_size_partitioner(num_dec_partitions) + + if src_embed_file and enc_partitioner: + raise ValueError( + "Can't set num_enc_partitions > 1 when using pretrained encoder " + "embedding" + ) + + if tgt_embed_file and dec_partitioner: + raise ValueError( + "Can't set num_dec_partitions > 1 when using pretrained decdoer " + "embedding" + ) + + with tf.variable_scope( + scope or "embeddings", dtype=dtype, partitioner=enc_partitioner + ) as scope: + # Share embedding + if share_vocab: + if src_vocab_size != tgt_vocab_size: + raise ValueError( + "Share embedding but different src/tgt vocab sizes" + " %d vs. %d" % (src_vocab_size, tgt_vocab_size) + ) + assert src_embed_size == tgt_embed_size + utils.print_out("# Use the same embedding for source and target") + vocab_file = src_vocab_file or tgt_vocab_file + embed_file = src_embed_file or tgt_embed_file + + embedding_encoder = _create_or_load_embed( + "embedding_share", + vocab_file, + embed_file, + src_vocab_size, + src_embed_size, + dtype, + ) + embedding_decoder = embedding_encoder + else: + if not use_char_encode: + with tf.variable_scope("encoder", partitioner=enc_partitioner): + embedding_encoder = _create_or_load_embed( + "embedding_encoder", + src_vocab_file, + src_embed_file, + src_vocab_size, + src_embed_size, + dtype, + ) + else: + embedding_encoder = None + + with tf.variable_scope("decoder", partitioner=dec_partitioner): + embedding_decoder = _create_or_load_embed( + "embedding_decoder", + tgt_vocab_file, + tgt_embed_file, + tgt_vocab_size, + tgt_embed_size, + dtype, + ) + + return embedding_encoder, embedding_decoder + + +def _single_cell( + unit_type, + num_units, + forget_bias, + dropout, + mode, + residual_connection=False, + device_str=None, + residual_fn=None, +): + """Create an instance of a single RNN cell.""" + # dropout (= 1 - keep_prob) is set to 0 during eval and infer + dropout = dropout if mode == tf.contrib.learn.ModeKeys.TRAIN else 0.0 + + # Cell Type + if unit_type == "lstm": + utils.print_out(" LSTM, forget_bias=%g" % forget_bias, new_line=False) + single_cell = tf.contrib.rnn.BasicLSTMCell( + num_units, forget_bias=forget_bias) + elif unit_type == "gru": + utils.print_out(" GRU", new_line=False) + single_cell = tf.contrib.rnn.GRUCell(num_units) + elif unit_type == "layer_norm_lstm": + utils.print_out( + " Layer Normalized LSTM, forget_bias=%g" % forget_bias, new_line=False + ) + single_cell = tf.contrib.rnn.LayerNormBasicLSTMCell( + num_units, forget_bias=forget_bias, layer_norm=True + ) + elif unit_type == "nas": + utils.print_out(" NASCell", new_line=False) + single_cell = tf.contrib.rnn.NASCell(num_units) + else: + raise ValueError("Unknown unit type %s!" 
% unit_type) + + # Dropout (= 1 - keep_prob) + if dropout > 0.0: + single_cell = tf.contrib.rnn.DropoutWrapper( + cell=single_cell, input_keep_prob=(1.0 - dropout) + ) + utils.print_out( + " %s, dropout=%g " % (type(single_cell).__name__, dropout), new_line=False + ) + + # Residual + if residual_connection: + single_cell = tf.contrib.rnn.ResidualWrapper( + single_cell, residual_fn=residual_fn + ) + utils.print_out(" %s" % type(single_cell).__name__, new_line=False) + + # Device Wrapper + if device_str: + single_cell = tf.contrib.rnn.DeviceWrapper(single_cell, device_str) + utils.print_out( + " %s, device=%s" % (type(single_cell).__name__, device_str), new_line=False + ) + + return single_cell + + +def _cell_list( + unit_type, + num_units, + num_layers, + num_residual_layers, + forget_bias, + dropout, + mode, + num_gpus, + base_gpu=0, + single_cell_fn=None, + residual_fn=None, +): + """Create a list of RNN cells.""" + if not single_cell_fn: + single_cell_fn = _single_cell + + # Multi-GPU + cell_list = [] + for i in range(num_layers): + utils.print_out(" cell %d" % i, new_line=False) + single_cell = single_cell_fn( + unit_type=unit_type, + num_units=num_units, + forget_bias=forget_bias, + dropout=dropout, + mode=mode, + residual_connection=(i >= num_layers - num_residual_layers), + device_str=get_device_str(i + base_gpu, num_gpus), + residual_fn=residual_fn, + ) + utils.print_out("") + cell_list.append(single_cell) + + return cell_list + + +def create_rnn_cell( + unit_type, + num_units, + num_layers, + num_residual_layers, + forget_bias, + dropout, + mode, + num_gpus, + base_gpu=0, + single_cell_fn=None, +): + """Create multi-layer RNN cell. + + Args: + unit_type: string representing the unit type, i.e. "lstm". + num_units: the depth of each unit. + num_layers: number of cells. + num_residual_layers: Number of residual layers from top to bottom. For + example, if `num_layers=4` and `num_residual_layers=2`, the last 2 RNN + cells in the returned list will be wrapped with `ResidualWrapper`. + forget_bias: the initial forget bias of the RNNCell(s). + dropout: floating point value between 0.0 and 1.0: + the probability of dropout. this is ignored if `mode != TRAIN`. + mode: either tf.contrib.learn.TRAIN/EVAL/INFER + num_gpus: The number of gpus to use when performing round-robin + placement of layers. + base_gpu: The gpu device id to use for the first RNN cell in the + returned list. The i-th RNN cell will use `(base_gpu + i) % num_gpus` + as its device id. + single_cell_fn: allow for adding customized cell. + When not specified, we default to model_helper._single_cell + Returns: + An `RNNCell` instance. + """ + cell_list = _cell_list( unit_type=unit_type, num_units=num_units, + num_layers=num_layers, + num_residual_layers=num_residual_layers, forget_bias=forget_bias, dropout=dropout, mode=mode, - residual_connection=(i >= num_layers - num_residual_layers), - device_str=get_device_str(i + base_gpu, num_gpus), - residual_fn=residual_fn + num_gpus=num_gpus, + base_gpu=base_gpu, + single_cell_fn=single_cell_fn, ) - utils.print_out("") - cell_list.append(single_cell) - - return cell_list - - -def create_rnn_cell(unit_type, num_units, num_layers, num_residual_layers, - forget_bias, dropout, mode, num_gpus, base_gpu=0, - single_cell_fn=None): - """Create multi-layer RNN cell. - - Args: - unit_type: string representing the unit type, i.e. "lstm". - num_units: the depth of each unit. - num_layers: number of cells. - num_residual_layers: Number of residual layers from top to bottom. 
For - example, if `num_layers=4` and `num_residual_layers=2`, the last 2 RNN - cells in the returned list will be wrapped with `ResidualWrapper`. - forget_bias: the initial forget bias of the RNNCell(s). - dropout: floating point value between 0.0 and 1.0: - the probability of dropout. this is ignored if `mode != TRAIN`. - mode: either tf.contrib.learn.TRAIN/EVAL/INFER - num_gpus: The number of gpus to use when performing round-robin - placement of layers. - base_gpu: The gpu device id to use for the first RNN cell in the - returned list. The i-th RNN cell will use `(base_gpu + i) % num_gpus` - as its device id. - single_cell_fn: allow for adding customized cell. - When not specified, we default to model_helper._single_cell - Returns: - An `RNNCell` instance. - """ - cell_list = _cell_list(unit_type=unit_type, - num_units=num_units, - num_layers=num_layers, - num_residual_layers=num_residual_layers, - forget_bias=forget_bias, - dropout=dropout, - mode=mode, - num_gpus=num_gpus, - base_gpu=base_gpu, - single_cell_fn=single_cell_fn) - - if len(cell_list) == 1: # Single layer. - return cell_list[0] - else: # Multi layers - return tf.contrib.rnn.MultiRNNCell(cell_list) + + if len(cell_list) == 1: # Single layer. + return cell_list[0] + else: # Multi layers + return tf.contrib.rnn.MultiRNNCell(cell_list) def gradient_clip(gradients, max_gradient_norm): - """Clipping gradients of a model.""" - clipped_gradients, gradient_norm = tf.clip_by_global_norm( - gradients, max_gradient_norm) - gradient_norm_summary = [tf.summary.scalar("grad_norm", gradient_norm)] - gradient_norm_summary.append( - tf.summary.scalar("clipped_gradient", tf.global_norm(clipped_gradients))) + """Clipping gradients of a model.""" + clipped_gradients, gradient_norm = tf.clip_by_global_norm( + gradients, max_gradient_norm + ) + gradient_norm_summary = [tf.summary.scalar("grad_norm", gradient_norm)] + gradient_norm_summary.append( + tf.summary.scalar( + "clipped_gradient", + tf.global_norm(clipped_gradients)) + ) - return clipped_gradients, gradient_norm_summary, gradient_norm + return clipped_gradients, gradient_norm_summary, gradient_norm def print_variables_in_ckpt(ckpt_path): - """Print a list of variables in a checkpoint together with their shapes.""" - utils.print_out("# Variables in ckpt %s" % ckpt_path) - reader = tf.train.NewCheckpointReader(ckpt_path) - variable_map = reader.get_variable_to_shape_map() - for key in sorted(variable_map.keys()): - utils.print_out(" %s: %s" % (key, variable_map[key])) + """Print a list of variables in a checkpoint together with their shapes.""" + utils.print_out("# Variables in ckpt %s" % ckpt_path) + reader = tf.train.NewCheckpointReader(ckpt_path) + variable_map = reader.get_variable_to_shape_map() + for key in sorted(variable_map.keys()): + utils.print_out(" %s: %s" % (key, variable_map[key])) def load_model(model, ckpt_path, session, name): - """Load model from a checkpoint.""" - start_time = time.time() - try: - model.saver.restore(session, ckpt_path) - except tf.errors.NotFoundError as e: - utils.print_out("Can't load checkpoint") - print_variables_in_ckpt(ckpt_path) - utils.print_out("%s" % str(e)) - - session.run(tf.tables_initializer()) - utils.print_out( - " loaded %s model parameters from %s, time %.2fs" % - (name, ckpt_path, time.time() - start_time)) - return model - - -def avg_checkpoints(model_dir, num_last_checkpoints, global_step, - global_step_name): - """Average the last N checkpoints in the model_dir.""" - checkpoint_state = tf.train.get_checkpoint_state(model_dir) - 
if not checkpoint_state: - utils.print_out("# No checkpoint file found in directory: %s" % model_dir) - return None - - # Checkpoints are ordered from oldest to newest. - checkpoints = ( - checkpoint_state.all_model_checkpoint_paths[-num_last_checkpoints:]) - - if len(checkpoints) < num_last_checkpoints: + """Load model from a checkpoint.""" + start_time = time.time() + try: + model.saver.restore(session, ckpt_path) + except tf.errors.NotFoundError as e: + utils.print_out("Can't load checkpoint") + print_variables_in_ckpt(ckpt_path) + utils.print_out("%s" % str(e)) + + session.run(tf.tables_initializer()) utils.print_out( - "# Skipping averaging checkpoints because not enough checkpoints is " - "avaliable." + " loaded %s model parameters from %s, time %.2fs" + % (name, ckpt_path, time.time() - start_time) ) - return None + return model + + +def avg_checkpoints(model_dir, num_last_checkpoints, + global_step, global_step_name): + """Average the last N checkpoints in the model_dir.""" + checkpoint_state = tf.train.get_checkpoint_state(model_dir) + if not checkpoint_state: + utils.print_out( + "# No checkpoint file found in directory: %s" % + model_dir) + return None + + # Checkpoints are ordered from oldest to newest. + checkpoints = checkpoint_state.all_model_checkpoint_paths[-num_last_checkpoints:] + + if len(checkpoints) < num_last_checkpoints: + utils.print_out( + "# Skipping averaging checkpoints because not enough checkpoints is " + "avaliable." + ) + return None + + avg_model_dir = os.path.join(model_dir, "avg_checkpoints") + if not tf.gfile.Exists(avg_model_dir): + utils.print_out( + "# Creating new directory %s for saving averaged checkpoints." + % avg_model_dir + ) + tf.gfile.MakeDirs(avg_model_dir) + + utils.print_out("# Reading and averaging variables in checkpoints:") + var_list = tf.contrib.framework.list_variables(checkpoints[0]) + var_values, var_dtypes = {}, {} + for name, shape in var_list: + if name != global_step_name: + var_values[name] = np.zeros(shape) + + for checkpoint in checkpoints: + utils.print_out(" %s" % checkpoint) + reader = tf.contrib.framework.load_checkpoint(checkpoint) + for name in var_values: + tensor = reader.get_tensor(name) + var_dtypes[name] = tensor.dtype + var_values[name] += tensor - avg_model_dir = os.path.join(model_dir, "avg_checkpoints") - if not tf.gfile.Exists(avg_model_dir): - utils.print_out( - "# Creating new directory %s for saving averaged checkpoints." % - avg_model_dir) - tf.gfile.MakeDirs(avg_model_dir) - - utils.print_out("# Reading and averaging variables in checkpoints:") - var_list = tf.contrib.framework.list_variables(checkpoints[0]) - var_values, var_dtypes = {}, {} - for (name, shape) in var_list: - if name != global_step_name: - var_values[name] = np.zeros(shape) - - for checkpoint in checkpoints: - utils.print_out(" %s" % checkpoint) - reader = tf.contrib.framework.load_checkpoint(checkpoint) for name in var_values: - tensor = reader.get_tensor(name) - var_dtypes[name] = tensor.dtype - var_values[name] += tensor - - for name in var_values: - var_values[name] /= len(checkpoints) - - # Build a graph with same variables in the checkpoints, and save the averaged - # variables into the avg_model_dir. 
- with tf.Graph().as_default(): - tf_vars = [ - tf.get_variable(v, shape=var_values[v].shape, dtype=var_dtypes[name]) - for v in var_values - ] - - placeholders = [tf.placeholder(v.dtype, shape=v.shape) for v in tf_vars] - assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)] - global_step_var = tf.Variable( - global_step, name=global_step_name, trainable=False) - saver = tf.train.Saver(tf.all_variables()) - - with tf.Session() as sess: - sess.run(tf.initialize_all_variables()) - for p, assign_op, (name, value) in zip(placeholders, assign_ops, - six.iteritems(var_values)): - sess.run(assign_op, {p: value}) - - # Use the built saver to save the averaged checkpoint. Only keep 1 - # checkpoint and the best checkpoint will be moved to avg_best_metric_dir. - saver.save( - sess, - os.path.join(avg_model_dir, "translate.ckpt")) - - return avg_model_dir + var_values[name] /= len(checkpoints) + + # Build a graph with same variables in the checkpoints, and save the averaged + # variables into the avg_model_dir. + with tf.Graph().as_default(): + tf_vars = [ + tf.get_variable( + v, + shape=var_values[v].shape, + dtype=var_dtypes[name]) + for v in var_values + ] + + placeholders = [ + tf.placeholder( + v.dtype, + shape=v.shape) for v in tf_vars] + assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)] + global_step_var = tf.Variable( + global_step, name=global_step_name, trainable=False + ) + saver = tf.train.Saver(tf.all_variables()) + + with tf.Session() as sess: + sess.run(tf.initialize_all_variables()) + for p, assign_op, (name, value) in zip( + placeholders, assign_ops, six.iteritems(var_values) + ): + sess.run(assign_op, {p: value}) + + # Use the built saver to save the averaged checkpoint. Only keep 1 + # checkpoint and the best checkpoint will be moved to + # avg_best_metric_dir. + saver.save(sess, os.path.join(avg_model_dir, "translate.ckpt")) + + return avg_model_dir def create_or_load_model(model, model_dir, session, name): - """Create translation model and initialize or load parameters in session.""" - latest_ckpt = tf.train.latest_checkpoint(model_dir) - if latest_ckpt: - model = load_model(model, latest_ckpt, session, name) - else: - start_time = time.time() - session.run(tf.global_variables_initializer()) - session.run(tf.tables_initializer()) - utils.print_out(" created %s model with fresh parameters, time %.2fs" % - (name, time.time() - start_time)) + """Create translation model and initialize or load parameters in session.""" + latest_ckpt = tf.train.latest_checkpoint(model_dir) + if latest_ckpt: + model = load_model(model, latest_ckpt, session, name) + else: + start_time = time.time() + session.run(tf.global_variables_initializer()) + session.run(tf.tables_initializer()) + utils.print_out( + " created %s model with fresh parameters, time %.2fs" + % (name, time.time() - start_time) + ) - global_step = model.global_step.eval(session=session) - return model, global_step + global_step = model.global_step.eval(session=session) + return model, global_step def compute_perplexity(model, sess, name): - """Compute perplexity of the output of the model. - - Args: - model: model for compute perplexity. - sess: tensorflow session to use. - name: name of the batch. - - Returns: - The perplexity of the eval outputs. - """ - total_loss = 0 - total_predict_count = 0 - start_time = time.time() + """Compute perplexity of the output of the model. + + Args: + model: model for compute perplexity. + sess: tensorflow session to use. + name: name of the batch. 
+ + Returns: + The perplexity of the eval outputs. + """ + total_loss = 0 + total_predict_count = 0 + start_time = time.time() - while True: - try: - output_tuple = model.eval(sess) - total_loss += output_tuple.eval_loss * output_tuple.batch_size - total_predict_count += output_tuple.predict_count - except tf.errors.OutOfRangeError: - break - - perplexity = utils.safe_exp(total_loss / total_predict_count) - utils.print_time(" eval %s: perplexity %.2f" % (name, perplexity), - start_time) - return perplexity + while True: + try: + output_tuple = model.eval(sess) + total_loss += output_tuple.eval_loss * output_tuple.batch_size + total_predict_count += output_tuple.predict_count + except tf.errors.OutOfRangeError: + break + + perplexity = utils.safe_exp(total_loss / total_predict_count) + utils.print_time( + " eval %s: perplexity %.2f" % + (name, perplexity), start_time) + return perplexity diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/model_test.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/model_test.py index 168895844..b753981f3 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/model_test.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/model_test.py @@ -34,1001 +34,1113 @@ int32 = np.int32 array = np.array -SOS = '' -EOS = '' +SOS = "" +EOS = "" class ModelTest(tf.test.TestCase): - @classmethod - def setUpClass(cls): - cls.actual_vars_values = {} - cls.expected_vars_values = { - 'AttentionMechanismBahdanau/att_layer_weight/shape': (10, 5), - 'AttentionMechanismBahdanau/att_layer_weight/sum': - -0.64981574, - 'AttentionMechanismBahdanau/last_dec_weight/shape': (10, 20), - 'AttentionMechanismBahdanau/last_dec_weight/sum': - 0.058069646, - 'AttentionMechanismBahdanau/last_enc_weight/shape': (10, 20), - 'AttentionMechanismBahdanau/last_enc_weight/sum': - 0.058028102, - 'AttentionMechanismLuong/att_layer_weight/shape': (10, 5), - 'AttentionMechanismLuong/att_layer_weight/sum': - -0.64981574, - 'AttentionMechanismLuong/last_dec_weight/shape': (10, 20), - 'AttentionMechanismLuong/last_dec_weight/sum': - 0.058069646, - 'AttentionMechanismLuong/last_enc_weight/shape': (10, 20), - 'AttentionMechanismLuong/last_enc_weight/sum': - 0.058028102, - 'AttentionMechanismNormedBahdanau/att_layer_weight/shape': (10, 5), - 'AttentionMechanismNormedBahdanau/att_layer_weight/sum': - -0.64981973, - 'AttentionMechanismNormedBahdanau/last_dec_weight/shape': (10, 20), - 'AttentionMechanismNormedBahdanau/last_dec_weight/sum': - 0.058067322, - 'AttentionMechanismNormedBahdanau/last_enc_weight/shape': (10, 20), - 'AttentionMechanismNormedBahdanau/last_enc_weight/sum': - 0.058022559, - 'AttentionMechanismScaledLuong/att_layer_weight/shape': (10, 5), - 'AttentionMechanismScaledLuong/att_layer_weight/sum': - -0.64981574, - 'AttentionMechanismScaledLuong/last_dec_weight/shape': (10, 20), - 'AttentionMechanismScaledLuong/last_dec_weight/sum': - 0.058069646, - 'AttentionMechanismScaledLuong/last_enc_weight/shape': (10, 20), - 'AttentionMechanismScaledLuong/last_enc_weight/sum': - 0.058028102, - 'GNMTModel_gnmt/last_dec_weight/shape': (15, 20), - 'GNMTModel_gnmt/last_dec_weight/sum': - -0.48634407, - 'GNMTModel_gnmt/last_enc_weight/shape': (10, 20), - 'GNMTModel_gnmt/last_enc_weight/sum': - 0.058025002, - 'GNMTModel_gnmt/mem_layer_weight/shape': (5, 5), - 'GNMTModel_gnmt/mem_layer_weight/sum': - -0.44815454, - 'GNMTModel_gnmt_v2/last_dec_weight/shape': (15, 20), - 'GNMTModel_gnmt_v2/last_dec_weight/sum': - -0.48634392, - 'GNMTModel_gnmt_v2/last_enc_weight/shape': (10, 20), 
- 'GNMTModel_gnmt_v2/last_enc_weight/sum': - 0.058024824, - 'GNMTModel_gnmt_v2/mem_layer_weight/shape': (5, 5), - 'GNMTModel_gnmt_v2/mem_layer_weight/sum': - -0.44815454, - 'NoAttentionNoResidualUniEncoder/last_dec_weight/shape': (10, 20), - 'NoAttentionNoResidualUniEncoder/last_dec_weight/sum': - 0.057424068, - 'NoAttentionNoResidualUniEncoder/last_enc_weight/shape': (10, 20), - 'NoAttentionNoResidualUniEncoder/last_enc_weight/sum': - 0.058453858, - 'NoAttentionResidualBiEncoder/last_dec_weight/shape': (10, 20), - 'NoAttentionResidualBiEncoder/last_dec_weight/sum': - 0.058025062, - 'NoAttentionResidualBiEncoder/last_enc_weight/shape': (10, 20), - 'NoAttentionResidualBiEncoder/last_enc_weight/sum': - 0.058053195, - 'UniEncoderBottomAttentionArchitecture/last_dec_weight/shape': (10, 20), - 'UniEncoderBottomAttentionArchitecture/last_dec_weight/sum': - 0.058024943, - 'UniEncoderBottomAttentionArchitecture/last_enc_weight/shape': (10, 20), - 'UniEncoderBottomAttentionArchitecture/last_enc_weight/sum': - 0.058025122, - 'UniEncoderBottomAttentionArchitecture/mem_layer_weight/shape': (5, 5), - 'UniEncoderBottomAttentionArchitecture/mem_layer_weight/sum': - -0.44815454, - 'UniEncoderStandardAttentionArchitecture/last_dec_weight/shape': (10, - 20), - 'UniEncoderStandardAttentionArchitecture/last_dec_weight/sum': - 0.058025002, - 'UniEncoderStandardAttentionArchitecture/last_enc_weight/shape': (10, - 20), - 'UniEncoderStandardAttentionArchitecture/last_enc_weight/sum': - 0.058024883, - 'UniEncoderStandardAttentionArchitecture/mem_layer_weight/shape': (5, - 5), - 'UniEncoderStandardAttentionArchitecture/mem_layer_weight/sum': - -0.44815454, - } - - cls.actual_train_values = {} - cls.expected_train_values = { - 'AttentionMechanismBahdanau/loss': 8.8519039, - 'AttentionMechanismLuong/loss': 8.8519039, - 'AttentionMechanismNormedBahdanau/loss': 8.851902, - 'AttentionMechanismScaledLuong/loss': 8.8519039, - 'GNMTModel_gnmt/loss': 8.8519087, - 'GNMTModel_gnmt_v2/loss': 8.8519087, - 'NoAttentionNoResidualUniEncoder/loss': 8.8516064, - 'NoAttentionResidualBiEncoder/loss': 8.851984, - 'UniEncoderStandardAttentionArchitecture/loss': 8.8519087, - 'InitializerGlorotNormal/loss': 8.9779415, - 'InitializerGlorotUniform/loss': 8.7643699, - 'SampledSoftmaxLoss/loss': 5.83928, - } - - cls.actual_eval_values = {} - cls.expected_eval_values = { - 'AttentionMechanismBahdanau/loss': 8.8517132, - 'AttentionMechanismBahdanau/predict_count': 11.0, - 'AttentionMechanismLuong/loss': 8.8517132, - 'AttentionMechanismLuong/predict_count': 11.0, - 'AttentionMechanismNormedBahdanau/loss': 8.8517132, - 'AttentionMechanismNormedBahdanau/predict_count': 11.0, - 'AttentionMechanismScaledLuong/loss': 8.8517132, - 'AttentionMechanismScaledLuong/predict_count': 11.0, - 'GNMTModel_gnmt/loss': 8.8443403, - 'GNMTModel_gnmt/predict_count': 11.0, - 'GNMTModel_gnmt_v2/loss': 8.8443756, - 'GNMTModel_gnmt_v2/predict_count': 11.0, - 'NoAttentionNoResidualUniEncoder/loss': 8.8440113, - 'NoAttentionNoResidualUniEncoder/predict_count': 11.0, - 'NoAttentionResidualBiEncoder/loss': 8.8291245, - 'NoAttentionResidualBiEncoder/predict_count': 11.0, - 'UniEncoderBottomAttentionArchitecture/loss': 8.844492, - 'UniEncoderBottomAttentionArchitecture/predict_count': 11.0, - 'UniEncoderStandardAttentionArchitecture/loss': 8.8517151, - 'UniEncoderStandardAttentionArchitecture/predict_count': 11.0 - } - - cls.actual_infer_values = {} - cls.expected_infer_values = { - 'AttentionMechanismBahdanau/logits_sum': -0.026374687, - 'AttentionMechanismLuong/logits_sum': 
-0.026374735, - 'AttentionMechanismNormedBahdanau/logits_sum': -0.026376063, - 'AttentionMechanismScaledLuong/logits_sum': -0.026374735, - 'GNMTModel_gnmt/logits_sum': -1.10848486, - 'GNMTModel_gnmt_v2/logits_sum': -1.10950875, - 'NoAttentionNoResidualUniEncoder/logits_sum': -1.0808625, - 'NoAttentionResidualBiEncoder/logits_sum': -2.8147559, - 'UniEncoderBottomAttentionArchitecture/logits_sum': -0.97026241, - 'UniEncoderStandardAttentionArchitecture/logits_sum': -0.02665353 - } - - cls.actual_beam_sentences = {} - cls.expected_beam_sentences = { - 'BeamSearchAttentionModel: batch 0 of beam 0': '', - 'BeamSearchAttentionModel: batch 0 of beam 1': '%s a %s a' % (SOS, SOS), - 'BeamSearchAttentionModel: batch 1 of beam 0': '', - 'BeamSearchAttentionModel: batch 1 of beam 1': 'b', - 'BeamSearchBasicModel: batch 0 of beam 0': 'b b b b', - 'BeamSearchBasicModel: batch 0 of beam 1': 'b b b %s' % SOS, - 'BeamSearchBasicModel: batch 0 of beam 2': 'b b b c', - 'BeamSearchBasicModel: batch 1 of beam 0': 'b b b b', - 'BeamSearchBasicModel: batch 1 of beam 1': 'a b b b', - 'BeamSearchBasicModel: batch 1 of beam 2': 'b b b %s' % SOS, - 'BeamSearchGNMTModel: batch 0 of beam 0': '', - 'BeamSearchGNMTModel: batch 1 of beam 0': '', - } - - @classmethod - def tearDownClass(cls): - print('ModelTest - actual_vars_values: ') - pprint.pprint(cls.actual_vars_values) - sys.stdout.flush() - - print('ModelTest - actual_train_values: ') - pprint.pprint(cls.actual_train_values) - sys.stdout.flush() - - print('ModelTest - actual_eval_values: ') - pprint.pprint(cls.actual_eval_values) - sys.stdout.flush() - - print('ModelTest - actual_infer_values: ') - pprint.pprint(cls.actual_infer_values) - sys.stdout.flush() - - print('ModelTest - actual_beam_sentences: ') - pprint.pprint(cls.actual_beam_sentences) - sys.stdout.flush() - - def assertAllClose(self, *args, **kwargs): - kwargs['atol'] = 5e-2 - kwargs['rtol'] = 5e-2 - return super(ModelTest, self).assertAllClose(*args, **kwargs) - - def _assertModelVariableNames(self, expected_var_names, model_var_names, - name): - - print('{} variable names are: '.format(name), model_var_names) - - self.assertEqual(len(expected_var_names), len(model_var_names)) - self.assertEqual(sorted(expected_var_names), sorted(model_var_names)) - - def _assertModelVariable(self, variable, sess, name): - var_shape = tuple(variable.get_shape().as_list()) - var_res = sess.run(variable) - var_weight_sum = np.sum(var_res) - - print('{} weight sum is: '.format(name), var_weight_sum) - expected_sum = self.expected_vars_values[name + '/sum'] - expected_shape = self.expected_vars_values[name + '/shape'] - self.actual_vars_values[name + '/sum'] = var_weight_sum - self.actual_vars_values[name + '/shape'] = var_shape - - self.assertEqual(expected_shape, var_shape) - self.assertAllClose(expected_sum, var_weight_sum) - - def _assertTrainStepsLoss(self, m, sess, name, num_steps=1): - for _ in range(num_steps): - _, output_tuple = m.train(sess) - loss = output_tuple.train_loss - print('{} {}-th step loss is: '.format(name, num_steps), loss) - expected_loss = self.expected_train_values[name + '/loss'] - self.actual_train_values[name + '/loss'] = loss - - self.assertAllClose(expected_loss, loss) - - def _assertEvalLossAndPredictCount(self, m, sess, name): - output_tuple = m.eval(sess) - loss = output_tuple.eval_loss - predict_count = output_tuple.predict_count - print('{} eval loss is: '.format(name), loss) - print('{} predict count is: '.format(name), predict_count) - expected_loss = self.expected_eval_values[name 
+ '/loss'] - expected_predict_count = self.expected_eval_values[name + '/predict_count'] - self.actual_eval_values[name + '/loss'] = loss - self.actual_eval_values[name + '/predict_count'] = predict_count - - self.assertAllClose(expected_loss, loss) - self.assertAllClose(expected_predict_count, predict_count) - - def _assertInferLogits(self, m, sess, name): - output_tuple = m.infer(sess) - logits_sum = np.sum(output_tuple.infer_logits) - - print('{} infer logits sum is: '.format(name), logits_sum) - expected_logits_sum = self.expected_infer_values[name + '/logits_sum'] - self.actual_infer_values[name + '/logits_sum'] = logits_sum - - self.assertAllClose(expected_logits_sum, logits_sum) - - def _assertBeamSearchOutputs(self, m, sess, assert_top_k_sentence, name): - nmt_outputs, _ = m.decode(sess) - - for i in range(assert_top_k_sentence): - output_words = nmt_outputs[i] - for j in range(output_words.shape[0]): - sentence = nmt_utils.get_translation( - output_words, j, tgt_eos=EOS, subword_option='') - sentence_key = ('%s: batch %d of beam %d' % (name, j, i)) - self.actual_beam_sentences[sentence_key] = sentence - expected_sentence = self.expected_beam_sentences[sentence_key] - self.assertEqual(expected_sentence, sentence) - - def _createTestTrainModel(self, m_creator, hparams, sess): - train_mode = tf.contrib.learn.ModeKeys.TRAIN - train_iterator, src_vocab_table, tgt_vocab_table = ( - common_test_utils.create_test_iterator(hparams, train_mode)) - train_m = m_creator( - hparams, - train_mode, - train_iterator, - src_vocab_table, - tgt_vocab_table, - scope='dynamic_seq2seq') - sess.run(tf.global_variables_initializer()) - sess.run(tf.tables_initializer()) - sess.run(train_iterator.initializer) - return train_m - - def _createTestEvalModel(self, m_creator, hparams, sess): - eval_mode = tf.contrib.learn.ModeKeys.EVAL - eval_iterator, src_vocab_table, tgt_vocab_table = ( - common_test_utils.create_test_iterator(hparams, eval_mode)) - eval_m = m_creator( - hparams, - eval_mode, - eval_iterator, - src_vocab_table, - tgt_vocab_table, - scope='dynamic_seq2seq') - sess.run(tf.tables_initializer()) - sess.run(eval_iterator.initializer) - return eval_m - - def _createTestInferModel( - self, m_creator, hparams, sess, init_global_vars=False): - infer_mode = tf.contrib.learn.ModeKeys.INFER - (infer_iterator, src_vocab_table, - tgt_vocab_table, reverse_tgt_vocab_table) = ( - common_test_utils.create_test_iterator(hparams, infer_mode)) - infer_m = m_creator( - hparams, - infer_mode, - infer_iterator, - src_vocab_table, - tgt_vocab_table, - reverse_tgt_vocab_table, - scope='dynamic_seq2seq') - if init_global_vars: - sess.run(tf.global_variables_initializer()) - sess.run(tf.tables_initializer()) - sess.run(infer_iterator.initializer) - return infer_m - - def _get_session_config(self): - config = tf.ConfigProto() - config.allow_soft_placement = True - return config - - ## Testing 3 encoders: - # uni: no attention, no residual, 1 layers - # bi: no attention, with residual, 4 layers - def testNoAttentionNoResidualUniEncoder(self): - hparams = common_test_utils.create_test_hparams( - encoder_type='uni', - num_layers=1, - attention='', - attention_architecture='', - use_residual=False,) - - workers, _ = tf.test.create_local_cluster(1, 0) - worker = workers[0] - - # pylint: disable=line-too-long - expected_var_names = [ - 'dynamic_seq2seq/encoder/embedding_encoder:0', - 'dynamic_seq2seq/decoder/embedding_decoder:0', - 'dynamic_seq2seq/encoder/rnn/basic_lstm_cell/kernel:0', - 
'dynamic_seq2seq/encoder/rnn/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/output_projection/kernel:0' - ] - # pylint: enable=line-too-long - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - train_m = self._createTestTrainModel(model.Model, hparams, sess) - - m_vars = tf.trainable_variables() - self._assertModelVariableNames(expected_var_names, - [v.name for v in m_vars], - 'NoAttentionNoResidualUniEncoder') - - with tf.variable_scope('dynamic_seq2seq', reuse=True): - last_enc_weight = tf.get_variable( - 'encoder/rnn/basic_lstm_cell/kernel') - last_dec_weight = tf.get_variable('decoder/basic_lstm_cell/kernel') - self._assertTrainStepsLoss(train_m, sess, - 'NoAttentionNoResidualUniEncoder') - self._assertModelVariable( - last_enc_weight, sess, - 'NoAttentionNoResidualUniEncoder/last_enc_weight') - self._assertModelVariable( - last_dec_weight, sess, - 'NoAttentionNoResidualUniEncoder/last_dec_weight') - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - eval_m = self._createTestEvalModel(model.Model, hparams, sess) - self._assertEvalLossAndPredictCount(eval_m, sess, - 'NoAttentionNoResidualUniEncoder') - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - infer_m = self._createTestInferModel(model.Model, hparams, sess) - self._assertInferLogits(infer_m, sess, - 'NoAttentionNoResidualUniEncoder') - - def testNoAttentionResidualBiEncoder(self): - hparams = common_test_utils.create_test_hparams( - encoder_type='bi', - num_layers=4, - attention='', - attention_architecture='', - use_residual=True,) - - workers, _ = tf.test.create_local_cluster(1, 0) - worker = workers[0] - - # pylint: disable=line-too-long - expected_var_names = [ - 'dynamic_seq2seq/encoder/embedding_encoder:0', - 'dynamic_seq2seq/decoder/embedding_decoder:0', - 'dynamic_seq2seq/encoder/bidirectional_rnn/fw/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/bidirectional_rnn/fw/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/encoder/bidirectional_rnn/fw/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/bidirectional_rnn/fw/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/encoder/bidirectional_rnn/bw/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/bidirectional_rnn/bw/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/encoder/bidirectional_rnn/bw/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/bidirectional_rnn/bw/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_2/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_2/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_3/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_3/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/output_projection/kernel:0' - ] - # pylint: enable=line-too-long - - with tf.Graph().as_default(): - with 
tf.Session(worker.target, config=self._get_session_config()) as sess: - train_m = self._createTestTrainModel(model.Model, hparams, sess) - - m_vars = tf.trainable_variables() - self._assertModelVariableNames(expected_var_names, - [v.name for v in m_vars], - 'NoAttentionResidualBiEncoder') - with tf.variable_scope('dynamic_seq2seq', reuse=True): - last_enc_weight = tf.get_variable( - 'encoder/bidirectional_rnn/bw/multi_rnn_cell/cell_1/basic_lstm_cell/kernel' - ) - last_dec_weight = tf.get_variable( - 'decoder/multi_rnn_cell/cell_3/basic_lstm_cell/kernel') - self._assertTrainStepsLoss(train_m, sess, - 'NoAttentionResidualBiEncoder') - self._assertModelVariable( - last_enc_weight, sess, - 'NoAttentionResidualBiEncoder/last_enc_weight') - self._assertModelVariable( - last_dec_weight, sess, - 'NoAttentionResidualBiEncoder/last_dec_weight') - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - eval_m = self._createTestEvalModel(model.Model, hparams, sess) - self._assertEvalLossAndPredictCount(eval_m, sess, - 'NoAttentionResidualBiEncoder') - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - infer_m = self._createTestInferModel(model.Model, hparams, sess) - self._assertInferLogits(infer_m, sess, 'NoAttentionResidualBiEncoder') - - ## Test attention mechanisms: luong, scaled_luong, bahdanau, normed_bahdanau - def testAttentionMechanismLuong(self): - hparams = common_test_utils.create_test_hparams( - encoder_type='uni', - attention='luong', - attention_architecture='standard', - num_layers=2, - use_residual=False,) - - workers, _ = tf.test.create_local_cluster(1, 0) - worker = workers[0] - - # pylint: disable=line-too-long - expected_var_names = [ - 'dynamic_seq2seq/encoder/embedding_encoder:0', - 'dynamic_seq2seq/decoder/embedding_decoder:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/memory_layer/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/attention/attention_layer/kernel:0', - 'dynamic_seq2seq/decoder/output_projection/kernel:0' - ] - # pylint: enable=line-too-long - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - train_m = self._createTestTrainModel(attention_model.AttentionModel, - hparams, sess) - - m_vars = tf.trainable_variables() - self._assertModelVariableNames( - expected_var_names, [v.name - for v in m_vars], 'AttentionMechanismLuong') - - with tf.variable_scope('dynamic_seq2seq', reuse=True): - # pylint: disable=line-too-long - last_enc_weight = tf.get_variable( - 'encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel') - last_dec_weight = tf.get_variable( - 'decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel') - att_layer_weight = tf.get_variable( - 'decoder/attention/attention_layer/kernel') - # pylint: enable=line-too-long - self._assertTrainStepsLoss(train_m, sess, 
'AttentionMechanismLuong') - self._assertModelVariable(last_enc_weight, sess, - 'AttentionMechanismLuong/last_enc_weight') - self._assertModelVariable(last_dec_weight, sess, - 'AttentionMechanismLuong/last_dec_weight') - self._assertModelVariable(att_layer_weight, sess, - 'AttentionMechanismLuong/att_layer_weight') - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - eval_m = self._createTestEvalModel(attention_model.AttentionModel, - hparams, sess) - self._assertEvalLossAndPredictCount(eval_m, sess, - 'AttentionMechanismLuong') - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - infer_m = self._createTestInferModel(attention_model.AttentionModel, - hparams, sess) - self._assertInferLogits(infer_m, sess, 'AttentionMechanismLuong') - - def testAttentionMechanismScaledLuong(self): - hparams = common_test_utils.create_test_hparams( - encoder_type='uni', - attention='scaled_luong', - attention_architecture='standard', - num_layers=2, - use_residual=False,) - - workers, _ = tf.test.create_local_cluster(1, 0) - worker = workers[0] - - # pylint: disable=line-too-long - expected_var_names = [ - 'dynamic_seq2seq/encoder/embedding_encoder:0', - 'dynamic_seq2seq/decoder/embedding_decoder:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/memory_layer/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/attention/luong_attention/attention_g:0', - 'dynamic_seq2seq/decoder/attention/attention_layer/kernel:0', - 'dynamic_seq2seq/decoder/output_projection/kernel:0' - ] - # pylint: enable=line-too-long - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - train_m = self._createTestTrainModel(attention_model.AttentionModel, - hparams, sess) - - m_vars = tf.trainable_variables() - self._assertModelVariableNames(expected_var_names, - [v.name for v in m_vars], - 'AttentionMechanismScaledLuong') - - with tf.variable_scope('dynamic_seq2seq', reuse=True): - # pylint: disable=line-too-long - last_enc_weight = tf.get_variable( - 'encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel') - last_dec_weight = tf.get_variable( - 'decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel') - att_layer_weight = tf.get_variable( - 'decoder/attention/attention_layer/kernel') - # pylint: enable=line-too-long - - self._assertTrainStepsLoss(train_m, sess, - 'AttentionMechanismScaledLuong') - self._assertModelVariable( - last_enc_weight, sess, - 'AttentionMechanismScaledLuong/last_enc_weight') - self._assertModelVariable( - last_dec_weight, sess, - 'AttentionMechanismScaledLuong/last_dec_weight') - self._assertModelVariable( - att_layer_weight, sess, - 'AttentionMechanismScaledLuong/att_layer_weight') - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - eval_m = 
self._createTestEvalModel(attention_model.AttentionModel, - hparams, sess) - self._assertEvalLossAndPredictCount(eval_m, sess, - 'AttentionMechanismScaledLuong') - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - infer_m = self._createTestInferModel(attention_model.AttentionModel, - hparams, sess) - self._assertInferLogits(infer_m, sess, 'AttentionMechanismScaledLuong') - - def testAttentionMechanismBahdanau(self): - hparams = common_test_utils.create_test_hparams( - encoder_type='uni', - attention='bahdanau', - attention_architecture='standard', - num_layers=2, - use_residual=False,) - - workers, _ = tf.test.create_local_cluster(1, 0) - worker = workers[0] - - # pylint: disable=line-too-long - expected_var_names = [ - 'dynamic_seq2seq/encoder/embedding_encoder:0', - 'dynamic_seq2seq/decoder/embedding_decoder:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/memory_layer/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/attention/bahdanau_attention/query_layer/kernel:0', - 'dynamic_seq2seq/decoder/attention/bahdanau_attention/attention_v:0', - 'dynamic_seq2seq/decoder/attention/attention_layer/kernel:0', - 'dynamic_seq2seq/decoder/output_projection/kernel:0' - ] - # pylint: enable=line-too-long - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - train_m = self._createTestTrainModel(attention_model.AttentionModel, - hparams, sess) - - m_vars = tf.trainable_variables() - self._assertModelVariableNames( - expected_var_names, [v.name - for v in m_vars], 'AttentionMechanismBahdanau') - - with tf.variable_scope('dynamic_seq2seq', reuse=True): - # pylint: disable=line-too-long - last_enc_weight = tf.get_variable( - 'encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel') - last_dec_weight = tf.get_variable( - 'decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel') - att_layer_weight = tf.get_variable( - 'decoder/attention/attention_layer/kernel') - # pylint: enable=line-too-long - self._assertTrainStepsLoss(train_m, sess, 'AttentionMechanismBahdanau') - self._assertModelVariable(last_enc_weight, sess, - 'AttentionMechanismBahdanau/last_enc_weight') - self._assertModelVariable(last_dec_weight, sess, - 'AttentionMechanismBahdanau/last_dec_weight') - self._assertModelVariable(att_layer_weight, sess, - 'AttentionMechanismBahdanau/att_layer_weight') - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - eval_m = self._createTestEvalModel(attention_model.AttentionModel, - hparams, sess) - self._assertEvalLossAndPredictCount(eval_m, sess, - 'AttentionMechanismBahdanau') - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - infer_m = self._createTestInferModel(attention_model.AttentionModel, - hparams, sess) - self._assertInferLogits(infer_m, 
sess, 'AttentionMechanismBahdanau') - - def testAttentionMechanismNormedBahdanau(self): - hparams = common_test_utils.create_test_hparams( - encoder_type='uni', - attention='normed_bahdanau', - attention_architecture='standard', - num_layers=2, - use_residual=False,) - - workers, _ = tf.test.create_local_cluster(1, 0) - worker = workers[0] - - # pylint: disable=line-too-long - expected_var_names = [ - 'dynamic_seq2seq/encoder/embedding_encoder:0', - 'dynamic_seq2seq/decoder/embedding_decoder:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/memory_layer/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/attention/bahdanau_attention/query_layer/kernel:0', - 'dynamic_seq2seq/decoder/attention/bahdanau_attention/attention_v:0', - 'dynamic_seq2seq/decoder/attention/bahdanau_attention/attention_g:0', - 'dynamic_seq2seq/decoder/attention/bahdanau_attention/attention_b:0', - 'dynamic_seq2seq/decoder/attention/attention_layer/kernel:0', - 'dynamic_seq2seq/decoder/output_projection/kernel:0' - ] - # pylint: enable=line-too-long - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - train_m = self._createTestTrainModel(attention_model.AttentionModel, - hparams, sess) - - m_vars = tf.trainable_variables() - self._assertModelVariableNames(expected_var_names, - [v.name for v in m_vars], - 'AttentionMechanismNormedBahdanau') - - with tf.variable_scope('dynamic_seq2seq', reuse=True): - # pylint: disable=line-too-long - last_enc_weight = tf.get_variable( - 'encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel') - last_dec_weight = tf.get_variable( - 'decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel') - att_layer_weight = tf.get_variable( - 'decoder/attention/attention_layer/kernel') - # pylint: enable=line-too-long - self._assertTrainStepsLoss(train_m, sess, - 'AttentionMechanismNormedBahdanau') - self._assertModelVariable( - last_enc_weight, sess, - 'AttentionMechanismNormedBahdanau/last_enc_weight') - self._assertModelVariable( - last_dec_weight, sess, - 'AttentionMechanismNormedBahdanau/last_dec_weight') - self._assertModelVariable( - att_layer_weight, sess, - 'AttentionMechanismNormedBahdanau/att_layer_weight') - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - eval_m = self._createTestEvalModel(attention_model.AttentionModel, - hparams, sess) - self._assertEvalLossAndPredictCount(eval_m, sess, - 'AttentionMechanismNormedBahdanau') - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - infer_m = self._createTestInferModel(attention_model.AttentionModel, - hparams, sess) - self._assertInferLogits(infer_m, sess, - 'AttentionMechanismNormedBahdanau') - - ## Test encoder vs. 
attention (all use residual): - # uni encoder, standard attention - def testUniEncoderStandardAttentionArchitecture(self): - hparams = common_test_utils.create_test_hparams( - encoder_type='uni', - num_layers=4, - attention='scaled_luong', - attention_architecture='standard',) - - workers, _ = tf.test.create_local_cluster(1, 0) - worker = workers[0] - - # pylint: disable=line-too-long - expected_var_names = [ - 'dynamic_seq2seq/encoder/embedding_encoder:0', - 'dynamic_seq2seq/decoder/embedding_decoder:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_2/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_2/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_3/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_3/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/memory_layer/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_2/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_2/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_3/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_3/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/attention/luong_attention/attention_g:0', - 'dynamic_seq2seq/decoder/attention/attention_layer/kernel:0', - 'dynamic_seq2seq/decoder/output_projection/kernel:0' - ] - # pylint: enable=line-too-long - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - train_m = self._createTestTrainModel(attention_model.AttentionModel, - hparams, sess) - - m_vars = tf.trainable_variables() - self._assertModelVariableNames(expected_var_names, [ - v.name for v in m_vars - ], 'UniEncoderStandardAttentionArchitecture') - with tf.variable_scope('dynamic_seq2seq', reuse=True): - last_enc_weight = tf.get_variable( - 'encoder/rnn/multi_rnn_cell/cell_3/basic_lstm_cell/kernel') - last_dec_weight = tf.get_variable( - 'decoder/attention/multi_rnn_cell/cell_3/basic_lstm_cell/kernel') - mem_layer_weight = tf.get_variable('decoder/memory_layer/kernel') - self._assertTrainStepsLoss(train_m, sess, - 'UniEncoderStandardAttentionArchitecture') - self._assertModelVariable( - last_enc_weight, sess, - 'UniEncoderStandardAttentionArchitecture/last_enc_weight') - self._assertModelVariable( - last_dec_weight, sess, - 'UniEncoderStandardAttentionArchitecture/last_dec_weight') - self._assertModelVariable( - mem_layer_weight, sess, - 'UniEncoderStandardAttentionArchitecture/mem_layer_weight') - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - eval_m = self._createTestEvalModel(attention_model.AttentionModel, - hparams, sess) - self._assertEvalLossAndPredictCount( - eval_m, sess, 'UniEncoderStandardAttentionArchitecture') - - with 
tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - infer_m = self._createTestInferModel(attention_model.AttentionModel, - hparams, sess) - self._assertInferLogits(infer_m, sess, - 'UniEncoderStandardAttentionArchitecture') - - # Test gnmt model. - def _testGNMTModel(self, architecture): - hparams = common_test_utils.create_test_hparams( - encoder_type='gnmt', - num_layers=4, - attention='scaled_luong', - attention_architecture=architecture) - - workers, _ = tf.test.create_local_cluster(1, 0) - worker = workers[0] - - # pylint: disable=line-too-long - expected_var_names = [ - 'dynamic_seq2seq/encoder/embedding_encoder:0', - 'dynamic_seq2seq/decoder/embedding_decoder:0', - 'dynamic_seq2seq/encoder/bidirectional_rnn/fw/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/bidirectional_rnn/fw/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/encoder/bidirectional_rnn/bw/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/bidirectional_rnn/bw/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_2/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_2/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/memory_layer/kernel:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_0_attention/attention/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_0_attention/attention/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_0_attention/attention/luong_attention/attention_g:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_2/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_2/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_3/basic_lstm_cell/kernel:0', - 'dynamic_seq2seq/decoder/multi_rnn_cell/cell_3/basic_lstm_cell/bias:0', - 'dynamic_seq2seq/decoder/output_projection/kernel:0' - ] - # pylint: enable=line-too-long - - test_prefix = 'GNMTModel_%s' % architecture - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - train_m = self._createTestTrainModel(gnmt_model.GNMTModel, hparams, - sess) - - m_vars = tf.trainable_variables() - self._assertModelVariableNames(expected_var_names, - [v.name for v in m_vars], test_prefix) - with tf.variable_scope('dynamic_seq2seq', reuse=True): - last_enc_weight = tf.get_variable( - 'encoder/rnn/multi_rnn_cell/cell_2/basic_lstm_cell/kernel') - last_dec_weight = tf.get_variable( - 'decoder/multi_rnn_cell/cell_3/basic_lstm_cell/kernel') - mem_layer_weight = tf.get_variable('decoder/memory_layer/kernel') - self._assertTrainStepsLoss(train_m, sess, test_prefix) - - self._assertModelVariable(last_enc_weight, sess, - '%s/last_enc_weight' % test_prefix) - self._assertModelVariable(last_dec_weight, sess, - '%s/last_dec_weight' % test_prefix) - self._assertModelVariable(mem_layer_weight, sess, - '%s/mem_layer_weight' % test_prefix) - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - eval_m = 
self._createTestEvalModel(gnmt_model.GNMTModel, hparams, sess) - self._assertEvalLossAndPredictCount(eval_m, sess, test_prefix) - - with tf.Graph().as_default(): - with tf.Session(worker.target, config=self._get_session_config()) as sess: - infer_m = self._createTestInferModel(gnmt_model.GNMTModel, hparams, - sess) - self._assertInferLogits(infer_m, sess, test_prefix) - - def testGNMTModel(self): - self._testGNMTModel('gnmt') - - def testGNMTModelV2(self): - self._testGNMTModel('gnmt_v2') - - # Test beam search. - def testBeamSearchBasicModel(self): - hparams = common_test_utils.create_test_hparams( - encoder_type='uni', - num_layers=1, - attention='', - attention_architecture='', - use_residual=False,) - hparams.beam_width = 3 - hparams.infer_mode = "beam_search" - hparams.tgt_max_len_infer = 4 - assert_top_k_sentence = 3 - - with self.test_session() as sess: - infer_m = self._createTestInferModel( - model.Model, hparams, sess, True) - self._assertBeamSearchOutputs( - infer_m, sess, assert_top_k_sentence, 'BeamSearchBasicModel') - - def testBeamSearchAttentionModel(self): - hparams = common_test_utils.create_test_hparams( - encoder_type='uni', - attention='scaled_luong', - attention_architecture='standard', - num_layers=2, - use_residual=False,) - hparams.beam_width = 3 - hparams.infer_mode = "beam_search" - hparams.tgt_max_len_infer = 4 - assert_top_k_sentence = 2 - - with self.test_session() as sess: - infer_m = self._createTestInferModel( - attention_model.AttentionModel, hparams, sess, True) - self._assertBeamSearchOutputs( - infer_m, sess, assert_top_k_sentence, 'BeamSearchAttentionModel') - - def testBeamSearchGNMTModel(self): - hparams = common_test_utils.create_test_hparams( - encoder_type='gnmt', - num_layers=4, - attention='scaled_luong', - attention_architecture='gnmt') - hparams.beam_width = 3 - hparams.infer_mode = "beam_search" - hparams.tgt_max_len_infer = 4 - assert_top_k_sentence = 1 - - with self.test_session() as sess: - infer_m = self._createTestInferModel( - gnmt_model.GNMTModel, hparams, sess, True) - self._assertBeamSearchOutputs( - infer_m, sess, assert_top_k_sentence, 'BeamSearchGNMTModel') - - def testInitializerGlorotNormal(self): - hparams = common_test_utils.create_test_hparams( - encoder_type='uni', - num_layers=1, - attention='', - attention_architecture='', - use_residual=False, - init_op='glorot_normal') - - with self.test_session() as sess: - train_m = self._createTestTrainModel(model.Model, hparams, sess) - self._assertTrainStepsLoss(train_m, sess, - 'InitializerGlorotNormal') - - def testInitializerGlorotUniform(self): - hparams = common_test_utils.create_test_hparams( - encoder_type='uni', - num_layers=1, - attention='', - attention_architecture='', - use_residual=False, - init_op='glorot_uniform') - - with self.test_session() as sess: - train_m = self._createTestTrainModel(model.Model, hparams, sess) - self._assertTrainStepsLoss(train_m, sess, - 'InitializerGlorotUniform') - - def testSampledSoftmaxLoss(self): - hparams = common_test_utils.create_test_hparams( - encoder_type='gnmt', - num_layers=4, - attention='scaled_luong', - attention_architecture='gnmt') - hparams.num_sampled_softmax = 3 - - with self.test_session() as sess: - train_m = self._createTestTrainModel(gnmt_model.GNMTModel, hparams, sess) - self._assertTrainStepsLoss(train_m, sess, - 'SampledSoftmaxLoss') - -if __name__ == '__main__': - tf.test.main() + @classmethod + def setUpClass(cls): + cls.actual_vars_values = {} + cls.expected_vars_values = { + 
"AttentionMechanismBahdanau/att_layer_weight/shape": (10, 5), + "AttentionMechanismBahdanau/att_layer_weight/sum": -0.64981574, + "AttentionMechanismBahdanau/last_dec_weight/shape": (10, 20), + "AttentionMechanismBahdanau/last_dec_weight/sum": 0.058069646, + "AttentionMechanismBahdanau/last_enc_weight/shape": (10, 20), + "AttentionMechanismBahdanau/last_enc_weight/sum": 0.058028102, + "AttentionMechanismLuong/att_layer_weight/shape": (10, 5), + "AttentionMechanismLuong/att_layer_weight/sum": -0.64981574, + "AttentionMechanismLuong/last_dec_weight/shape": (10, 20), + "AttentionMechanismLuong/last_dec_weight/sum": 0.058069646, + "AttentionMechanismLuong/last_enc_weight/shape": (10, 20), + "AttentionMechanismLuong/last_enc_weight/sum": 0.058028102, + "AttentionMechanismNormedBahdanau/att_layer_weight/shape": (10, 5), + "AttentionMechanismNormedBahdanau/att_layer_weight/sum": -0.64981973, + "AttentionMechanismNormedBahdanau/last_dec_weight/shape": (10, 20), + "AttentionMechanismNormedBahdanau/last_dec_weight/sum": 0.058067322, + "AttentionMechanismNormedBahdanau/last_enc_weight/shape": (10, 20), + "AttentionMechanismNormedBahdanau/last_enc_weight/sum": 0.058022559, + "AttentionMechanismScaledLuong/att_layer_weight/shape": (10, 5), + "AttentionMechanismScaledLuong/att_layer_weight/sum": -0.64981574, + "AttentionMechanismScaledLuong/last_dec_weight/shape": (10, 20), + "AttentionMechanismScaledLuong/last_dec_weight/sum": 0.058069646, + "AttentionMechanismScaledLuong/last_enc_weight/shape": (10, 20), + "AttentionMechanismScaledLuong/last_enc_weight/sum": 0.058028102, + "GNMTModel_gnmt/last_dec_weight/shape": (15, 20), + "GNMTModel_gnmt/last_dec_weight/sum": -0.48634407, + "GNMTModel_gnmt/last_enc_weight/shape": (10, 20), + "GNMTModel_gnmt/last_enc_weight/sum": 0.058025002, + "GNMTModel_gnmt/mem_layer_weight/shape": (5, 5), + "GNMTModel_gnmt/mem_layer_weight/sum": -0.44815454, + "GNMTModel_gnmt_v2/last_dec_weight/shape": (15, 20), + "GNMTModel_gnmt_v2/last_dec_weight/sum": -0.48634392, + "GNMTModel_gnmt_v2/last_enc_weight/shape": (10, 20), + "GNMTModel_gnmt_v2/last_enc_weight/sum": 0.058024824, + "GNMTModel_gnmt_v2/mem_layer_weight/shape": (5, 5), + "GNMTModel_gnmt_v2/mem_layer_weight/sum": -0.44815454, + "NoAttentionNoResidualUniEncoder/last_dec_weight/shape": (10, 20), + "NoAttentionNoResidualUniEncoder/last_dec_weight/sum": 0.057424068, + "NoAttentionNoResidualUniEncoder/last_enc_weight/shape": (10, 20), + "NoAttentionNoResidualUniEncoder/last_enc_weight/sum": 0.058453858, + "NoAttentionResidualBiEncoder/last_dec_weight/shape": (10, 20), + "NoAttentionResidualBiEncoder/last_dec_weight/sum": 0.058025062, + "NoAttentionResidualBiEncoder/last_enc_weight/shape": (10, 20), + "NoAttentionResidualBiEncoder/last_enc_weight/sum": 0.058053195, + "UniEncoderBottomAttentionArchitecture/last_dec_weight/shape": (10, 20), + "UniEncoderBottomAttentionArchitecture/last_dec_weight/sum": 0.058024943, + "UniEncoderBottomAttentionArchitecture/last_enc_weight/shape": (10, 20), + "UniEncoderBottomAttentionArchitecture/last_enc_weight/sum": 0.058025122, + "UniEncoderBottomAttentionArchitecture/mem_layer_weight/shape": (5, 5), + "UniEncoderBottomAttentionArchitecture/mem_layer_weight/sum": -0.44815454, + "UniEncoderStandardAttentionArchitecture/last_dec_weight/shape": (10, 20), + "UniEncoderStandardAttentionArchitecture/last_dec_weight/sum": 0.058025002, + "UniEncoderStandardAttentionArchitecture/last_enc_weight/shape": (10, 20), + "UniEncoderStandardAttentionArchitecture/last_enc_weight/sum": 0.058024883, + 
"UniEncoderStandardAttentionArchitecture/mem_layer_weight/shape": (5, 5), + "UniEncoderStandardAttentionArchitecture/mem_layer_weight/sum": -0.44815454, + } + + cls.actual_train_values = {} + cls.expected_train_values = { + "AttentionMechanismBahdanau/loss": 8.8519039, + "AttentionMechanismLuong/loss": 8.8519039, + "AttentionMechanismNormedBahdanau/loss": 8.851902, + "AttentionMechanismScaledLuong/loss": 8.8519039, + "GNMTModel_gnmt/loss": 8.8519087, + "GNMTModel_gnmt_v2/loss": 8.8519087, + "NoAttentionNoResidualUniEncoder/loss": 8.8516064, + "NoAttentionResidualBiEncoder/loss": 8.851984, + "UniEncoderStandardAttentionArchitecture/loss": 8.8519087, + "InitializerGlorotNormal/loss": 8.9779415, + "InitializerGlorotUniform/loss": 8.7643699, + "SampledSoftmaxLoss/loss": 5.83928, + } + + cls.actual_eval_values = {} + cls.expected_eval_values = { + "AttentionMechanismBahdanau/loss": 8.8517132, + "AttentionMechanismBahdanau/predict_count": 11.0, + "AttentionMechanismLuong/loss": 8.8517132, + "AttentionMechanismLuong/predict_count": 11.0, + "AttentionMechanismNormedBahdanau/loss": 8.8517132, + "AttentionMechanismNormedBahdanau/predict_count": 11.0, + "AttentionMechanismScaledLuong/loss": 8.8517132, + "AttentionMechanismScaledLuong/predict_count": 11.0, + "GNMTModel_gnmt/loss": 8.8443403, + "GNMTModel_gnmt/predict_count": 11.0, + "GNMTModel_gnmt_v2/loss": 8.8443756, + "GNMTModel_gnmt_v2/predict_count": 11.0, + "NoAttentionNoResidualUniEncoder/loss": 8.8440113, + "NoAttentionNoResidualUniEncoder/predict_count": 11.0, + "NoAttentionResidualBiEncoder/loss": 8.8291245, + "NoAttentionResidualBiEncoder/predict_count": 11.0, + "UniEncoderBottomAttentionArchitecture/loss": 8.844492, + "UniEncoderBottomAttentionArchitecture/predict_count": 11.0, + "UniEncoderStandardAttentionArchitecture/loss": 8.8517151, + "UniEncoderStandardAttentionArchitecture/predict_count": 11.0, + } + + cls.actual_infer_values = {} + cls.expected_infer_values = { + "AttentionMechanismBahdanau/logits_sum": -0.026374687, + "AttentionMechanismLuong/logits_sum": -0.026374735, + "AttentionMechanismNormedBahdanau/logits_sum": -0.026376063, + "AttentionMechanismScaledLuong/logits_sum": -0.026374735, + "GNMTModel_gnmt/logits_sum": -1.10848486, + "GNMTModel_gnmt_v2/logits_sum": -1.10950875, + "NoAttentionNoResidualUniEncoder/logits_sum": -1.0808625, + "NoAttentionResidualBiEncoder/logits_sum": -2.8147559, + "UniEncoderBottomAttentionArchitecture/logits_sum": -0.97026241, + "UniEncoderStandardAttentionArchitecture/logits_sum": -0.02665353, + } + + cls.actual_beam_sentences = {} + cls.expected_beam_sentences = { + "BeamSearchAttentionModel: batch 0 of beam 0": "", + "BeamSearchAttentionModel: batch 0 of beam 1": "%s a %s a" % (SOS, SOS), + "BeamSearchAttentionModel: batch 1 of beam 0": "", + "BeamSearchAttentionModel: batch 1 of beam 1": "b", + "BeamSearchBasicModel: batch 0 of beam 0": "b b b b", + "BeamSearchBasicModel: batch 0 of beam 1": "b b b %s" % SOS, + "BeamSearchBasicModel: batch 0 of beam 2": "b b b c", + "BeamSearchBasicModel: batch 1 of beam 0": "b b b b", + "BeamSearchBasicModel: batch 1 of beam 1": "a b b b", + "BeamSearchBasicModel: batch 1 of beam 2": "b b b %s" % SOS, + "BeamSearchGNMTModel: batch 0 of beam 0": "", + "BeamSearchGNMTModel: batch 1 of beam 0": "", + } + + @classmethod + def tearDownClass(cls): + print("ModelTest - actual_vars_values: ") + pprint.pprint(cls.actual_vars_values) + sys.stdout.flush() + + print("ModelTest - actual_train_values: ") + pprint.pprint(cls.actual_train_values) + sys.stdout.flush() + + 
print("ModelTest - actual_eval_values: ") + pprint.pprint(cls.actual_eval_values) + sys.stdout.flush() + + print("ModelTest - actual_infer_values: ") + pprint.pprint(cls.actual_infer_values) + sys.stdout.flush() + + print("ModelTest - actual_beam_sentences: ") + pprint.pprint(cls.actual_beam_sentences) + sys.stdout.flush() + + def assertAllClose(self, *args, **kwargs): + kwargs["atol"] = 5e-2 + kwargs["rtol"] = 5e-2 + return super(ModelTest, self).assertAllClose(*args, **kwargs) + + def _assertModelVariableNames( + self, expected_var_names, model_var_names, name): + + print("{} variable names are: ".format(name), model_var_names) + + self.assertEqual(len(expected_var_names), len(model_var_names)) + self.assertEqual(sorted(expected_var_names), sorted(model_var_names)) + + def _assertModelVariable(self, variable, sess, name): + var_shape = tuple(variable.get_shape().as_list()) + var_res = sess.run(variable) + var_weight_sum = np.sum(var_res) + + print("{} weight sum is: ".format(name), var_weight_sum) + expected_sum = self.expected_vars_values[name + "/sum"] + expected_shape = self.expected_vars_values[name + "/shape"] + self.actual_vars_values[name + "/sum"] = var_weight_sum + self.actual_vars_values[name + "/shape"] = var_shape + + self.assertEqual(expected_shape, var_shape) + self.assertAllClose(expected_sum, var_weight_sum) + + def _assertTrainStepsLoss(self, m, sess, name, num_steps=1): + for _ in range(num_steps): + _, output_tuple = m.train(sess) + loss = output_tuple.train_loss + print("{} {}-th step loss is: ".format(name, num_steps), loss) + expected_loss = self.expected_train_values[name + "/loss"] + self.actual_train_values[name + "/loss"] = loss + + self.assertAllClose(expected_loss, loss) + + def _assertEvalLossAndPredictCount(self, m, sess, name): + output_tuple = m.eval(sess) + loss = output_tuple.eval_loss + predict_count = output_tuple.predict_count + print("{} eval loss is: ".format(name), loss) + print("{} predict count is: ".format(name), predict_count) + expected_loss = self.expected_eval_values[name + "/loss"] + expected_predict_count = self.expected_eval_values[name + + "/predict_count"] + self.actual_eval_values[name + "/loss"] = loss + self.actual_eval_values[name + "/predict_count"] = predict_count + + self.assertAllClose(expected_loss, loss) + self.assertAllClose(expected_predict_count, predict_count) + + def _assertInferLogits(self, m, sess, name): + output_tuple = m.infer(sess) + logits_sum = np.sum(output_tuple.infer_logits) + + print("{} infer logits sum is: ".format(name), logits_sum) + expected_logits_sum = self.expected_infer_values[name + "/logits_sum"] + self.actual_infer_values[name + "/logits_sum"] = logits_sum + + self.assertAllClose(expected_logits_sum, logits_sum) + + def _assertBeamSearchOutputs(self, m, sess, assert_top_k_sentence, name): + nmt_outputs, _ = m.decode(sess) + + for i in range(assert_top_k_sentence): + output_words = nmt_outputs[i] + for j in range(output_words.shape[0]): + sentence = nmt_utils.get_translation( + output_words, j, tgt_eos=EOS, subword_option="" + ) + sentence_key = "%s: batch %d of beam %d" % (name, j, i) + self.actual_beam_sentences[sentence_key] = sentence + expected_sentence = self.expected_beam_sentences[sentence_key] + self.assertEqual(expected_sentence, sentence) + + def _createTestTrainModel(self, m_creator, hparams, sess): + train_mode = tf.contrib.learn.ModeKeys.TRAIN + train_iterator, src_vocab_table, tgt_vocab_table = ( + common_test_utils.create_test_iterator(hparams, train_mode) + ) + train_m = m_creator( 
+ hparams, + train_mode, + train_iterator, + src_vocab_table, + tgt_vocab_table, + scope="dynamic_seq2seq", + ) + sess.run(tf.global_variables_initializer()) + sess.run(tf.tables_initializer()) + sess.run(train_iterator.initializer) + return train_m + + def _createTestEvalModel(self, m_creator, hparams, sess): + eval_mode = tf.contrib.learn.ModeKeys.EVAL + eval_iterator, src_vocab_table, tgt_vocab_table = ( + common_test_utils.create_test_iterator(hparams, eval_mode) + ) + eval_m = m_creator( + hparams, + eval_mode, + eval_iterator, + src_vocab_table, + tgt_vocab_table, + scope="dynamic_seq2seq", + ) + sess.run(tf.tables_initializer()) + sess.run(eval_iterator.initializer) + return eval_m + + def _createTestInferModel(self, m_creator, hparams, + sess, init_global_vars=False): + infer_mode = tf.contrib.learn.ModeKeys.INFER + (infer_iterator, src_vocab_table, tgt_vocab_table, reverse_tgt_vocab_table) = ( + common_test_utils.create_test_iterator(hparams, infer_mode) + ) + infer_m = m_creator( + hparams, + infer_mode, + infer_iterator, + src_vocab_table, + tgt_vocab_table, + reverse_tgt_vocab_table, + scope="dynamic_seq2seq", + ) + if init_global_vars: + sess.run(tf.global_variables_initializer()) + sess.run(tf.tables_initializer()) + sess.run(infer_iterator.initializer) + return infer_m + + def _get_session_config(self): + config = tf.ConfigProto() + config.allow_soft_placement = True + return config + + # Testing 3 encoders: + # uni: no attention, no residual, 1 layers + # bi: no attention, with residual, 4 layers + def testNoAttentionNoResidualUniEncoder(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="uni", + num_layers=1, + attention="", + attention_architecture="", + use_residual=False, + ) + + workers, _ = tf.test.create_local_cluster(1, 0) + worker = workers[0] + + # pylint: disable=line-too-long + expected_var_names = [ + "dynamic_seq2seq/encoder/embedding_encoder:0", + "dynamic_seq2seq/decoder/embedding_decoder:0", + "dynamic_seq2seq/encoder/rnn/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/rnn/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/output_projection/kernel:0", + ] + # pylint: enable=line-too-long + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + train_m = self._createTestTrainModel( + model.Model, hparams, sess) + + m_vars = tf.trainable_variables() + self._assertModelVariableNames( + expected_var_names, + [v.name for v in m_vars], + "NoAttentionNoResidualUniEncoder", + ) + + with tf.variable_scope("dynamic_seq2seq", reuse=True): + last_enc_weight = tf.get_variable( + "encoder/rnn/basic_lstm_cell/kernel" + ) + last_dec_weight = tf.get_variable( + "decoder/basic_lstm_cell/kernel") + self._assertTrainStepsLoss( + train_m, sess, "NoAttentionNoResidualUniEncoder" + ) + self._assertModelVariable( + last_enc_weight, + sess, + "NoAttentionNoResidualUniEncoder/last_enc_weight", + ) + self._assertModelVariable( + last_dec_weight, + sess, + "NoAttentionNoResidualUniEncoder/last_dec_weight", + ) + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + eval_m = self._createTestEvalModel(model.Model, hparams, sess) + self._assertEvalLossAndPredictCount( + eval_m, sess, "NoAttentionNoResidualUniEncoder" + ) + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + infer_m = 
self._createTestInferModel( + model.Model, hparams, sess) + self._assertInferLogits( + infer_m, sess, "NoAttentionNoResidualUniEncoder" + ) + + def testNoAttentionResidualBiEncoder(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="bi", + num_layers=4, + attention="", + attention_architecture="", + use_residual=True, + ) + + workers, _ = tf.test.create_local_cluster(1, 0) + worker = workers[0] + + # pylint: disable=line-too-long + expected_var_names = [ + "dynamic_seq2seq/encoder/embedding_encoder:0", + "dynamic_seq2seq/decoder/embedding_decoder:0", + "dynamic_seq2seq/encoder/bidirectional_rnn/fw/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/bidirectional_rnn/fw/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0", + "dynamic_seq2seq/encoder/bidirectional_rnn/fw/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/bidirectional_rnn/fw/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0", + "dynamic_seq2seq/encoder/bidirectional_rnn/bw/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/bidirectional_rnn/bw/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0", + "dynamic_seq2seq/encoder/bidirectional_rnn/bw/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/bidirectional_rnn/bw/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_2/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_2/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_3/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_3/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/output_projection/kernel:0", + ] + # pylint: enable=line-too-long + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + train_m = self._createTestTrainModel( + model.Model, hparams, sess) + + m_vars = tf.trainable_variables() + self._assertModelVariableNames( + expected_var_names, + [v.name for v in m_vars], + "NoAttentionResidualBiEncoder", + ) + with tf.variable_scope("dynamic_seq2seq", reuse=True): + last_enc_weight = tf.get_variable( + "encoder/bidirectional_rnn/bw/multi_rnn_cell/cell_1/basic_lstm_cell/kernel" + ) + last_dec_weight = tf.get_variable( + "decoder/multi_rnn_cell/cell_3/basic_lstm_cell/kernel" + ) + self._assertTrainStepsLoss( + train_m, sess, "NoAttentionResidualBiEncoder" + ) + self._assertModelVariable( + last_enc_weight, + sess, + "NoAttentionResidualBiEncoder/last_enc_weight", + ) + self._assertModelVariable( + last_dec_weight, + sess, + "NoAttentionResidualBiEncoder/last_dec_weight", + ) + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + eval_m = self._createTestEvalModel(model.Model, hparams, sess) + self._assertEvalLossAndPredictCount( + eval_m, sess, "NoAttentionResidualBiEncoder" + ) + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + infer_m = self._createTestInferModel( + model.Model, hparams, sess) + self._assertInferLogits( + infer_m, sess, "NoAttentionResidualBiEncoder") + + # Test attention mechanisms: luong, scaled_luong, bahdanau, 
normed_bahdanau + def testAttentionMechanismLuong(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="uni", + attention="luong", + attention_architecture="standard", + num_layers=2, + use_residual=False, + ) + + workers, _ = tf.test.create_local_cluster(1, 0) + worker = workers[0] + + # pylint: disable=line-too-long + expected_var_names = [ + "dynamic_seq2seq/encoder/embedding_encoder:0", + "dynamic_seq2seq/decoder/embedding_decoder:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/memory_layer/kernel:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/attention/attention_layer/kernel:0", + "dynamic_seq2seq/decoder/output_projection/kernel:0", + ] + # pylint: enable=line-too-long + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + train_m = self._createTestTrainModel( + attention_model.AttentionModel, hparams, sess + ) + + m_vars = tf.trainable_variables() + self._assertModelVariableNames( + expected_var_names, + [v.name for v in m_vars], + "AttentionMechanismLuong", + ) + + with tf.variable_scope("dynamic_seq2seq", reuse=True): + # pylint: disable=line-too-long + last_enc_weight = tf.get_variable( + "encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel" + ) + last_dec_weight = tf.get_variable( + "decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel" + ) + att_layer_weight = tf.get_variable( + "decoder/attention/attention_layer/kernel" + ) + # pylint: enable=line-too-long + self._assertTrainStepsLoss( + train_m, sess, "AttentionMechanismLuong") + self._assertModelVariable( + last_enc_weight, sess, "AttentionMechanismLuong/last_enc_weight" + ) + self._assertModelVariable( + last_dec_weight, sess, "AttentionMechanismLuong/last_dec_weight" + ) + self._assertModelVariable( + att_layer_weight, sess, "AttentionMechanismLuong/att_layer_weight" + ) + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + eval_m = self._createTestEvalModel( + attention_model.AttentionModel, hparams, sess + ) + self._assertEvalLossAndPredictCount( + eval_m, sess, "AttentionMechanismLuong" + ) + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + infer_m = self._createTestInferModel( + attention_model.AttentionModel, hparams, sess + ) + self._assertInferLogits( + infer_m, sess, "AttentionMechanismLuong") + + def testAttentionMechanismScaledLuong(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="uni", + attention="scaled_luong", + attention_architecture="standard", + num_layers=2, + use_residual=False, + ) + + workers, _ = tf.test.create_local_cluster(1, 0) + worker = workers[0] + + # pylint: disable=line-too-long + expected_var_names = [ + "dynamic_seq2seq/encoder/embedding_encoder:0", + "dynamic_seq2seq/decoder/embedding_decoder:0", + 
"dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/memory_layer/kernel:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/attention/luong_attention/attention_g:0", + "dynamic_seq2seq/decoder/attention/attention_layer/kernel:0", + "dynamic_seq2seq/decoder/output_projection/kernel:0", + ] + # pylint: enable=line-too-long + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + train_m = self._createTestTrainModel( + attention_model.AttentionModel, hparams, sess + ) + + m_vars = tf.trainable_variables() + self._assertModelVariableNames( + expected_var_names, + [v.name for v in m_vars], + "AttentionMechanismScaledLuong", + ) + + with tf.variable_scope("dynamic_seq2seq", reuse=True): + # pylint: disable=line-too-long + last_enc_weight = tf.get_variable( + "encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel" + ) + last_dec_weight = tf.get_variable( + "decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel" + ) + att_layer_weight = tf.get_variable( + "decoder/attention/attention_layer/kernel" + ) + # pylint: enable=line-too-long + + self._assertTrainStepsLoss( + train_m, sess, "AttentionMechanismScaledLuong" + ) + self._assertModelVariable( + last_enc_weight, + sess, + "AttentionMechanismScaledLuong/last_enc_weight", + ) + self._assertModelVariable( + last_dec_weight, + sess, + "AttentionMechanismScaledLuong/last_dec_weight", + ) + self._assertModelVariable( + att_layer_weight, + sess, + "AttentionMechanismScaledLuong/att_layer_weight", + ) + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + eval_m = self._createTestEvalModel( + attention_model.AttentionModel, hparams, sess + ) + self._assertEvalLossAndPredictCount( + eval_m, sess, "AttentionMechanismScaledLuong" + ) + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + infer_m = self._createTestInferModel( + attention_model.AttentionModel, hparams, sess + ) + self._assertInferLogits( + infer_m, sess, "AttentionMechanismScaledLuong") + + def testAttentionMechanismBahdanau(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="uni", + attention="bahdanau", + attention_architecture="standard", + num_layers=2, + use_residual=False, + ) + + workers, _ = tf.test.create_local_cluster(1, 0) + worker = workers[0] + + # pylint: disable=line-too-long + expected_var_names = [ + "dynamic_seq2seq/encoder/embedding_encoder:0", + "dynamic_seq2seq/decoder/embedding_decoder:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/memory_layer/kernel:0", + 
"dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/attention/bahdanau_attention/query_layer/kernel:0", + "dynamic_seq2seq/decoder/attention/bahdanau_attention/attention_v:0", + "dynamic_seq2seq/decoder/attention/attention_layer/kernel:0", + "dynamic_seq2seq/decoder/output_projection/kernel:0", + ] + # pylint: enable=line-too-long + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + train_m = self._createTestTrainModel( + attention_model.AttentionModel, hparams, sess + ) + + m_vars = tf.trainable_variables() + self._assertModelVariableNames( + expected_var_names, + [v.name for v in m_vars], + "AttentionMechanismBahdanau", + ) + + with tf.variable_scope("dynamic_seq2seq", reuse=True): + # pylint: disable=line-too-long + last_enc_weight = tf.get_variable( + "encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel" + ) + last_dec_weight = tf.get_variable( + "decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel" + ) + att_layer_weight = tf.get_variable( + "decoder/attention/attention_layer/kernel" + ) + # pylint: enable=line-too-long + self._assertTrainStepsLoss( + train_m, sess, "AttentionMechanismBahdanau") + self._assertModelVariable( + last_enc_weight, sess, "AttentionMechanismBahdanau/last_enc_weight" + ) + self._assertModelVariable( + last_dec_weight, sess, "AttentionMechanismBahdanau/last_dec_weight" + ) + self._assertModelVariable( + att_layer_weight, + sess, + "AttentionMechanismBahdanau/att_layer_weight", + ) + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + eval_m = self._createTestEvalModel( + attention_model.AttentionModel, hparams, sess + ) + self._assertEvalLossAndPredictCount( + eval_m, sess, "AttentionMechanismBahdanau" + ) + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + infer_m = self._createTestInferModel( + attention_model.AttentionModel, hparams, sess + ) + self._assertInferLogits( + infer_m, sess, "AttentionMechanismBahdanau") + + def testAttentionMechanismNormedBahdanau(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="uni", + attention="normed_bahdanau", + attention_architecture="standard", + num_layers=2, + use_residual=False, + ) + + workers, _ = tf.test.create_local_cluster(1, 0) + worker = workers[0] + + # pylint: disable=line-too-long + expected_var_names = [ + "dynamic_seq2seq/encoder/embedding_encoder:0", + "dynamic_seq2seq/decoder/embedding_decoder:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/memory_layer/kernel:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0", + 
"dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/attention/bahdanau_attention/query_layer/kernel:0", + "dynamic_seq2seq/decoder/attention/bahdanau_attention/attention_v:0", + "dynamic_seq2seq/decoder/attention/bahdanau_attention/attention_g:0", + "dynamic_seq2seq/decoder/attention/bahdanau_attention/attention_b:0", + "dynamic_seq2seq/decoder/attention/attention_layer/kernel:0", + "dynamic_seq2seq/decoder/output_projection/kernel:0", + ] + # pylint: enable=line-too-long + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + train_m = self._createTestTrainModel( + attention_model.AttentionModel, hparams, sess + ) + + m_vars = tf.trainable_variables() + self._assertModelVariableNames( + expected_var_names, + [v.name for v in m_vars], + "AttentionMechanismNormedBahdanau", + ) + + with tf.variable_scope("dynamic_seq2seq", reuse=True): + # pylint: disable=line-too-long + last_enc_weight = tf.get_variable( + "encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel" + ) + last_dec_weight = tf.get_variable( + "decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel" + ) + att_layer_weight = tf.get_variable( + "decoder/attention/attention_layer/kernel" + ) + # pylint: enable=line-too-long + self._assertTrainStepsLoss( + train_m, sess, "AttentionMechanismNormedBahdanau" + ) + self._assertModelVariable( + last_enc_weight, + sess, + "AttentionMechanismNormedBahdanau/last_enc_weight", + ) + self._assertModelVariable( + last_dec_weight, + sess, + "AttentionMechanismNormedBahdanau/last_dec_weight", + ) + self._assertModelVariable( + att_layer_weight, + sess, + "AttentionMechanismNormedBahdanau/att_layer_weight", + ) + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + eval_m = self._createTestEvalModel( + attention_model.AttentionModel, hparams, sess + ) + self._assertEvalLossAndPredictCount( + eval_m, sess, "AttentionMechanismNormedBahdanau" + ) + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + infer_m = self._createTestInferModel( + attention_model.AttentionModel, hparams, sess + ) + self._assertInferLogits( + infer_m, sess, "AttentionMechanismNormedBahdanau" + ) + + # Test encoder vs. 
attention (all use residual): + # uni encoder, standard attention + def testUniEncoderStandardAttentionArchitecture(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="uni", + num_layers=4, + attention="scaled_luong", + attention_architecture="standard", + ) + + workers, _ = tf.test.create_local_cluster(1, 0) + worker = workers[0] + + # pylint: disable=line-too-long + expected_var_names = [ + "dynamic_seq2seq/encoder/embedding_encoder:0", + "dynamic_seq2seq/decoder/embedding_decoder:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_2/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_2/basic_lstm_cell/bias:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_3/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_3/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/memory_layer/kernel:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_2/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_2/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_3/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/attention/multi_rnn_cell/cell_3/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/attention/luong_attention/attention_g:0", + "dynamic_seq2seq/decoder/attention/attention_layer/kernel:0", + "dynamic_seq2seq/decoder/output_projection/kernel:0", + ] + # pylint: enable=line-too-long + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + train_m = self._createTestTrainModel( + attention_model.AttentionModel, hparams, sess + ) + + m_vars = tf.trainable_variables() + self._assertModelVariableNames( + expected_var_names, + [v.name for v in m_vars], + "UniEncoderStandardAttentionArchitecture", + ) + with tf.variable_scope("dynamic_seq2seq", reuse=True): + last_enc_weight = tf.get_variable( + "encoder/rnn/multi_rnn_cell/cell_3/basic_lstm_cell/kernel" + ) + last_dec_weight = tf.get_variable( + "decoder/attention/multi_rnn_cell/cell_3/basic_lstm_cell/kernel" + ) + mem_layer_weight = tf.get_variable( + "decoder/memory_layer/kernel") + self._assertTrainStepsLoss( + train_m, sess, "UniEncoderStandardAttentionArchitecture" + ) + self._assertModelVariable( + last_enc_weight, + sess, + "UniEncoderStandardAttentionArchitecture/last_enc_weight", + ) + self._assertModelVariable( + last_dec_weight, + sess, + "UniEncoderStandardAttentionArchitecture/last_dec_weight", + ) + self._assertModelVariable( + mem_layer_weight, + sess, + "UniEncoderStandardAttentionArchitecture/mem_layer_weight", + ) + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + eval_m = self._createTestEvalModel( + attention_model.AttentionModel, hparams, sess + ) + self._assertEvalLossAndPredictCount( + eval_m, sess, 
"UniEncoderStandardAttentionArchitecture" + ) + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + infer_m = self._createTestInferModel( + attention_model.AttentionModel, hparams, sess + ) + self._assertInferLogits( + infer_m, sess, "UniEncoderStandardAttentionArchitecture" + ) + + # Test gnmt model. + def _testGNMTModel(self, architecture): + hparams = common_test_utils.create_test_hparams( + encoder_type="gnmt", + num_layers=4, + attention="scaled_luong", + attention_architecture=architecture, + ) + + workers, _ = tf.test.create_local_cluster(1, 0) + worker = workers[0] + + # pylint: disable=line-too-long + expected_var_names = [ + "dynamic_seq2seq/encoder/embedding_encoder:0", + "dynamic_seq2seq/decoder/embedding_decoder:0", + "dynamic_seq2seq/encoder/bidirectional_rnn/fw/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/bidirectional_rnn/fw/basic_lstm_cell/bias:0", + "dynamic_seq2seq/encoder/bidirectional_rnn/bw/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/bidirectional_rnn/bw/basic_lstm_cell/bias:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_2/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/encoder/rnn/multi_rnn_cell/cell_2/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/memory_layer/kernel:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_0_attention/attention/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_0_attention/attention/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_0_attention/attention/luong_attention/attention_g:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_2/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_2/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_3/basic_lstm_cell/kernel:0", + "dynamic_seq2seq/decoder/multi_rnn_cell/cell_3/basic_lstm_cell/bias:0", + "dynamic_seq2seq/decoder/output_projection/kernel:0", + ] + # pylint: enable=line-too-long + + test_prefix = "GNMTModel_%s" % architecture + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + train_m = self._createTestTrainModel( + gnmt_model.GNMTModel, hparams, sess + ) + + m_vars = tf.trainable_variables() + self._assertModelVariableNames( + expected_var_names, [v.name for v in m_vars], test_prefix + ) + with tf.variable_scope("dynamic_seq2seq", reuse=True): + last_enc_weight = tf.get_variable( + "encoder/rnn/multi_rnn_cell/cell_2/basic_lstm_cell/kernel" + ) + last_dec_weight = tf.get_variable( + "decoder/multi_rnn_cell/cell_3/basic_lstm_cell/kernel" + ) + mem_layer_weight = tf.get_variable( + "decoder/memory_layer/kernel") + self._assertTrainStepsLoss(train_m, sess, test_prefix) + + self._assertModelVariable( + last_enc_weight, sess, "%s/last_enc_weight" % test_prefix + ) + self._assertModelVariable( + last_dec_weight, sess, "%s/last_dec_weight" % test_prefix + ) + self._assertModelVariable( + mem_layer_weight, sess, "%s/mem_layer_weight" % test_prefix + ) + + with tf.Graph().as_default(): + with 
tf.Session(worker.target, config=self._get_session_config()) as sess: + eval_m = self._createTestEvalModel( + gnmt_model.GNMTModel, hparams, sess) + self._assertEvalLossAndPredictCount(eval_m, sess, test_prefix) + + with tf.Graph().as_default(): + with tf.Session(worker.target, config=self._get_session_config()) as sess: + infer_m = self._createTestInferModel( + gnmt_model.GNMTModel, hparams, sess + ) + self._assertInferLogits(infer_m, sess, test_prefix) + + def testGNMTModel(self): + self._testGNMTModel("gnmt") + + def testGNMTModelV2(self): + self._testGNMTModel("gnmt_v2") + + # Test beam search. + def testBeamSearchBasicModel(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="uni", + num_layers=1, + attention="", + attention_architecture="", + use_residual=False, + ) + hparams.beam_width = 3 + hparams.infer_mode = "beam_search" + hparams.tgt_max_len_infer = 4 + assert_top_k_sentence = 3 + + with self.test_session() as sess: + infer_m = self._createTestInferModel( + model.Model, hparams, sess, True) + self._assertBeamSearchOutputs( + infer_m, sess, assert_top_k_sentence, "BeamSearchBasicModel" + ) + + def testBeamSearchAttentionModel(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="uni", + attention="scaled_luong", + attention_architecture="standard", + num_layers=2, + use_residual=False, + ) + hparams.beam_width = 3 + hparams.infer_mode = "beam_search" + hparams.tgt_max_len_infer = 4 + assert_top_k_sentence = 2 + + with self.test_session() as sess: + infer_m = self._createTestInferModel( + attention_model.AttentionModel, hparams, sess, True + ) + self._assertBeamSearchOutputs( + infer_m, sess, assert_top_k_sentence, "BeamSearchAttentionModel" + ) + + def testBeamSearchGNMTModel(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="gnmt", + num_layers=4, + attention="scaled_luong", + attention_architecture="gnmt", + ) + hparams.beam_width = 3 + hparams.infer_mode = "beam_search" + hparams.tgt_max_len_infer = 4 + assert_top_k_sentence = 1 + + with self.test_session() as sess: + infer_m = self._createTestInferModel( + gnmt_model.GNMTModel, hparams, sess, True + ) + self._assertBeamSearchOutputs( + infer_m, sess, assert_top_k_sentence, "BeamSearchGNMTModel" + ) + + def testInitializerGlorotNormal(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="uni", + num_layers=1, + attention="", + attention_architecture="", + use_residual=False, + init_op="glorot_normal", + ) + + with self.test_session() as sess: + train_m = self._createTestTrainModel(model.Model, hparams, sess) + self._assertTrainStepsLoss( + train_m, sess, "InitializerGlorotNormal") + + def testInitializerGlorotUniform(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="uni", + num_layers=1, + attention="", + attention_architecture="", + use_residual=False, + init_op="glorot_uniform", + ) + + with self.test_session() as sess: + train_m = self._createTestTrainModel(model.Model, hparams, sess) + self._assertTrainStepsLoss( + train_m, sess, "InitializerGlorotUniform") + + def testSampledSoftmaxLoss(self): + hparams = common_test_utils.create_test_hparams( + encoder_type="gnmt", + num_layers=4, + attention="scaled_luong", + attention_architecture="gnmt", + ) + hparams.num_sampled_softmax = 3 + + with self.test_session() as sess: + train_m = self._createTestTrainModel( + gnmt_model.GNMTModel, hparams, sess) + self._assertTrainStepsLoss(train_m, sess, "SampledSoftmaxLoss") + + +if __name__ == "__main__": + tf.test.main() diff 
--git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/nmt.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/nmt.py index ce9e683e1..d5c05bd64 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/nmt.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/nmt.py @@ -35,85 +35,160 @@ FLAGS = None -INFERENCE_KEYS = ["src_max_len_infer", "tgt_max_len_infer", "subword_option", - "infer_batch_size", "beam_width", - "length_penalty_weight", "sampling_temperature", - "num_translations_per_input", "infer_mode"] +INFERENCE_KEYS = [ + "src_max_len_infer", + "tgt_max_len_infer", + "subword_option", + "infer_batch_size", + "beam_width", + "length_penalty_weight", + "sampling_temperature", + "num_translations_per_input", + "infer_mode", +] def add_arguments(parser): - """Build ArgumentParser.""" - parser.register("type", "bool", lambda v: v.lower() == "true") - - # network - parser.add_argument("--num_units", type=int, default=32, help="Network size.") - parser.add_argument("--num_layers", type=int, default=2, - help="Network depth.") - parser.add_argument("--num_encoder_layers", type=int, default=None, - help="Encoder depth, equal to num_layers if None.") - parser.add_argument("--num_decoder_layers", type=int, default=None, - help="Decoder depth, equal to num_layers if None.") - parser.add_argument("--encoder_type", type=str, default="uni", help="""\ + """Build ArgumentParser.""" + parser.register("type", "bool", lambda v: v.lower() == "true") + + # network + parser.add_argument( + "--num_units", + type=int, + default=32, + help="Network size.") + parser.add_argument( + "--num_layers", + type=int, + default=2, + help="Network depth.") + parser.add_argument( + "--num_encoder_layers", + type=int, + default=None, + help="Encoder depth, equal to num_layers if None.", + ) + parser.add_argument( + "--num_decoder_layers", + type=int, + default=None, + help="Decoder depth, equal to num_layers if None.", + ) + parser.add_argument( + "--encoder_type", + type=str, + default="uni", + help="""\ uni | bi | gnmt. For bi, we build num_encoder_layers/2 bi-directional layers. 
For gnmt, we build 1 bi-directional layer, and (num_encoder_layers - 1) uni-directional layers.\ - """) - parser.add_argument("--residual", type="bool", nargs="?", const=True, - default=False, - help="Whether to add residual connections.") - parser.add_argument("--time_major", type="bool", nargs="?", const=True, - default=True, - help="Whether to use time-major mode for dynamic RNN.") - parser.add_argument("--num_embeddings_partitions", type=int, default=0, - help="Number of partitions for embedding vars.") - - # attention mechanisms - parser.add_argument("--attention", type=str, default="", help="""\ + """, + ) + parser.add_argument( + "--residual", + type="bool", + nargs="?", + const=True, + default=False, + help="Whether to add residual connections.", + ) + parser.add_argument( + "--time_major", + type="bool", + nargs="?", + const=True, + default=True, + help="Whether to use time-major mode for dynamic RNN.", + ) + parser.add_argument( + "--num_embeddings_partitions", + type=int, + default=0, + help="Number of partitions for embedding vars.", + ) + + # attention mechanisms + parser.add_argument( + "--attention", + type=str, + default="", + help="""\ luong | scaled_luong | bahdanau | normed_bahdanau or set to "" for no attention\ - """) - parser.add_argument( - "--attention_architecture", - type=str, - default="standard", - help="""\ + """, + ) + parser.add_argument( + "--attention_architecture", + type=str, + default="standard", + help="""\ standard | gnmt | gnmt_v2. standard: use top layer to compute attention. gnmt: GNMT style of computing attention, use previous bottom layer to compute attention. gnmt_v2: similar to gnmt, but use current bottom layer to compute attention.\ - """) - parser.add_argument( - "--output_attention", type="bool", nargs="?", const=True, - default=True, - help="""\ + """, + ) + parser.add_argument( + "--output_attention", + type="bool", + nargs="?", + const=True, + default=True, + help="""\ Only used in standard attention_architecture. Whether use attention as the cell output at each timestep. .\ - """) - parser.add_argument( - "--pass_hidden_state", type="bool", nargs="?", const=True, - default=True, - help="""\ + """, + ) + parser.add_argument( + "--pass_hidden_state", + type="bool", + nargs="?", + const=True, + default=True, + help="""\ Whether to pass encoder's hidden state to decoder when using an attention based model.\ - """) - - # optimizer - parser.add_argument("--optimizer", type=str, default="sgd", help="sgd | adam") - parser.add_argument("--learning_rate", type=float, default=1.0, - help="Learning rate. Adam: 0.001 | 0.0001") - parser.add_argument("--warmup_steps", type=int, default=0, - help="How many steps we inverse-decay learning.") - parser.add_argument("--warmup_scheme", type=str, default="t2t", help="""\ + """, + ) + + # optimizer + parser.add_argument( + "--optimizer", + type=str, + default="sgd", + help="sgd | adam") + parser.add_argument( + "--learning_rate", + type=float, + default=1.0, + help="Learning rate. Adam: 0.001 | 0.0001", + ) + parser.add_argument( + "--warmup_steps", + type=int, + default=0, + help="How many steps we inverse-decay learning.", + ) + parser.add_argument( + "--warmup_scheme", + type=str, + default="t2t", + help="""\ How to warmup learning rates. 
Options include: t2t: Tensor2Tensor's way, start with lr 100 times smaller, then exponentiate until the specified lr.\ - """) - parser.add_argument( - "--decay_scheme", type=str, default="", help="""\ + """, + ) + parser.add_argument( + "--decay_scheme", + type=str, + default="", + help="""\ How we decay learning rate. Options include: luong234: after 2/3 num train steps, we start halving the learning rate for 4 times before finishing. @@ -121,597 +196,862 @@ def add_arguments(parser): for 5 times before finishing.\ luong10: after 1/2 num train steps, we start halving the learning rate for 10 times before finishing.\ - """) - - parser.add_argument( - "--num_train_steps", type=int, default=12000, help="Num steps to train.") - parser.add_argument("--colocate_gradients_with_ops", type="bool", nargs="?", - const=True, - default=True, - help=("Whether try colocating gradients with " - "corresponding op")) - - # initializer - parser.add_argument("--init_op", type=str, default="uniform", - help="uniform | glorot_normal | glorot_uniform") - parser.add_argument("--init_weight", type=float, default=0.1, - help=("for uniform init_op, initialize weights " - "between [-this, this].")) - - # data - parser.add_argument("--src", type=str, default=None, - help="Source suffix, e.g., en.") - parser.add_argument("--tgt", type=str, default=None, - help="Target suffix, e.g., de.") - parser.add_argument("--train_prefix", type=str, default=None, - help="Train prefix, expect files with src/tgt suffixes.") - parser.add_argument("--dev_prefix", type=str, default=None, - help="Dev prefix, expect files with src/tgt suffixes.") - parser.add_argument("--test_prefix", type=str, default=None, - help="Test prefix, expect files with src/tgt suffixes.") - parser.add_argument("--out_dir", type=str, default=None, - help="Store log/model files.") - - # Vocab - parser.add_argument("--vocab_prefix", type=str, default=None, help="""\ + """, + ) + + parser.add_argument( + "--num_train_steps", type=int, default=12000, help="Num steps to train." + ) + parser.add_argument( + "--colocate_gradients_with_ops", + type="bool", + nargs="?", + const=True, + default=True, + help=("Whether try colocating gradients with " "corresponding op"), + ) + + # initializer + parser.add_argument( + "--init_op", + type=str, + default="uniform", + help="uniform | glorot_normal | glorot_uniform", + ) + parser.add_argument( + "--init_weight", + type=float, + default=0.1, + help=( + "for uniform init_op, initialize weights " + "between [-this, this]."), + ) + + # data + parser.add_argument( + "--src", type=str, default=None, help="Source suffix, e.g., en." + ) + parser.add_argument( + "--tgt", type=str, default=None, help="Target suffix, e.g., de." + ) + parser.add_argument( + "--train_prefix", + type=str, + default=None, + help="Train prefix, expect files with src/tgt suffixes.", + ) + parser.add_argument( + "--dev_prefix", + type=str, + default=None, + help="Dev prefix, expect files with src/tgt suffixes.", + ) + parser.add_argument( + "--test_prefix", + type=str, + default=None, + help="Test prefix, expect files with src/tgt suffixes.", + ) + parser.add_argument( + "--out_dir", type=str, default=None, help="Store log/model files." 
+ ) + + # Vocab + parser.add_argument( + "--vocab_prefix", + type=str, + default=None, + help="""\ Vocab prefix, expect files with src/tgt suffixes.\ - """) - parser.add_argument("--embed_prefix", type=str, default=None, help="""\ + """, + ) + parser.add_argument( + "--embed_prefix", + type=str, + default=None, + help="""\ Pretrained embedding prefix, expect files with src/tgt suffixes. The embedding files should be Glove formated txt files.\ - """) - parser.add_argument("--sos", type=str, default="", - help="Start-of-sentence symbol.") - parser.add_argument("--eos", type=str, default="", - help="End-of-sentence symbol.") - parser.add_argument("--share_vocab", type="bool", nargs="?", const=True, - default=False, - help="""\ + """, + ) + parser.add_argument( + "--sos", type=str, default="", help="Start-of-sentence symbol." + ) + parser.add_argument( + "--eos", type=str, default="", help="End-of-sentence symbol." + ) + parser.add_argument( + "--share_vocab", + type="bool", + nargs="?", + const=True, + default=False, + help="""\ Whether to use the source vocab and embeddings for both source and target.\ - """) - parser.add_argument("--check_special_token", type="bool", default=True, - help="""\ + """, + ) + parser.add_argument( + "--check_special_token", + type="bool", + default=True, + help="""\ Whether check special sos, eos, unk tokens exist in the vocab files.\ - """) - - # Sequence lengths - parser.add_argument("--src_max_len", type=int, default=50, - help="Max length of src sequences during training.") - parser.add_argument("--tgt_max_len", type=int, default=50, - help="Max length of tgt sequences during training.") - parser.add_argument("--src_max_len_infer", type=int, default=None, - help="Max length of src sequences during inference.") - parser.add_argument("--tgt_max_len_infer", type=int, default=None, - help="""\ + """, + ) + + # Sequence lengths + parser.add_argument( + "--src_max_len", + type=int, + default=50, + help="Max length of src sequences during training.", + ) + parser.add_argument( + "--tgt_max_len", + type=int, + default=50, + help="Max length of tgt sequences during training.", + ) + parser.add_argument( + "--src_max_len_infer", + type=int, + default=None, + help="Max length of src sequences during inference.", + ) + parser.add_argument( + "--tgt_max_len_infer", + type=int, + default=None, + help="""\ Max length of tgt sequences during inference. Also use to restrict the maximum decoding length.\ - """) - - # Default settings works well (rarely need to change) - parser.add_argument("--unit_type", type=str, default="lstm", - help="lstm | gru | layer_norm_lstm | nas") - parser.add_argument("--forget_bias", type=float, default=1.0, - help="Forget bias for BasicLSTMCell.") - parser.add_argument("--dropout", type=float, default=0.2, - help="Dropout rate (not keep_prob)") - parser.add_argument("--max_gradient_norm", type=float, default=5.0, - help="Clip gradients to this norm.") - parser.add_argument("--batch_size", type=int, default=128, help="Batch size.") - - parser.add_argument("--steps_per_stats", type=int, default=100, - help=("How many training steps to do per stats logging." - "Save checkpoint every 10x steps_per_stats")) - parser.add_argument("--max_train", type=int, default=0, - help="Limit on the size of training data (0: no limit).") - parser.add_argument("--num_buckets", type=int, default=5, - help="Put data into similar-length buckets.") - parser.add_argument("--num_sampled_softmax", type=int, default=0, - help=("Use sampled_softmax_loss if > 0." 
- "Otherwise, use full softmax loss.")) - - # SPM - parser.add_argument("--subword_option", type=str, default="", - choices=["", "bpe", "spm"], - help="""\ + """, + ) + + # Default settings works well (rarely need to change) + parser.add_argument( + "--unit_type", + type=str, + default="lstm", + help="lstm | gru | layer_norm_lstm | nas", + ) + parser.add_argument( + "--forget_bias", type=float, default=1.0, help="Forget bias for BasicLSTMCell." + ) + parser.add_argument( + "--dropout", type=float, default=0.2, help="Dropout rate (not keep_prob)" + ) + parser.add_argument( + "--max_gradient_norm", + type=float, + default=5.0, + help="Clip gradients to this norm.", + ) + parser.add_argument( + "--batch_size", + type=int, + default=128, + help="Batch size.") + + parser.add_argument( + "--steps_per_stats", + type=int, + default=100, + help=( + "How many training steps to do per stats logging." + "Save checkpoint every 10x steps_per_stats" + ), + ) + parser.add_argument( + "--max_train", + type=int, + default=0, + help="Limit on the size of training data (0: no limit).", + ) + parser.add_argument( + "--num_buckets", + type=int, + default=5, + help="Put data into similar-length buckets.", + ) + parser.add_argument( + "--num_sampled_softmax", + type=int, + default=0, + help=( + "Use sampled_softmax_loss if > 0." + "Otherwise, use full softmax loss."), + ) + + # SPM + parser.add_argument( + "--subword_option", + type=str, + default="", + choices=["", "bpe", "spm"], + help="""\ Set to bpe or spm to activate subword desegmentation.\ - """) - - # Experimental encoding feature. - parser.add_argument("--use_char_encode", type="bool", default=False, - help="""\ + """, + ) + + # Experimental encoding feature. + parser.add_argument( + "--use_char_encode", + type="bool", + default=False, + help="""\ Whether to split each word or bpe into character, and then generate the word-level representation from the character reprentation. - """) - - # Misc - parser.add_argument("--num_gpus", type=int, default=1, - help="Number of gpus in each worker.") - parser.add_argument("--log_device_placement", type="bool", nargs="?", - const=True, default=False, help="Debug GPU allocation.") - parser.add_argument("--metrics", type=str, default="bleu", - help=("Comma-separated list of evaluations " - "metrics (bleu,rouge,accuracy)")) - parser.add_argument("--steps_per_external_eval", type=int, default=None, - help="""\ + """, + ) + + # Misc + parser.add_argument( + "--num_gpus", type=int, default=1, help="Number of gpus in each worker." + ) + parser.add_argument( + "--log_device_placement", + type="bool", + nargs="?", + const=True, + default=False, + help="Debug GPU allocation.", + ) + parser.add_argument( + "--metrics", + type=str, + default="bleu", + help=( + "Comma-separated list of evaluations " + "metrics (bleu,rouge,accuracy)"), + ) + parser.add_argument( + "--steps_per_external_eval", + type=int, + default=None, + help="""\ How many training steps to do per external evaluation. 
Automatically set based on data if None.\ - """) - parser.add_argument("--scope", type=str, default=None, - help="scope to put variables under") - parser.add_argument("--hparams_path", type=str, default=None, - help=("Path to standard hparams json file that overrides" - "hparams values from FLAGS.")) - parser.add_argument("--random_seed", type=int, default=None, - help="Random seed (>0, set a specific seed).") - parser.add_argument("--override_loaded_hparams", type="bool", nargs="?", - const=True, default=False, - help="Override loaded hparams with values specified") - parser.add_argument("--num_keep_ckpts", type=int, default=5, - help="Max number of checkpoints to keep.") - parser.add_argument("--avg_ckpts", type="bool", nargs="?", - const=True, default=False, help=("""\ + """, + ) + parser.add_argument( + "--scope", type=str, default=None, help="scope to put variables under" + ) + parser.add_argument( + "--hparams_path", + type=str, + default=None, + help=( + "Path to standard hparams json file that overrides" + "hparams values from FLAGS." + ), + ) + parser.add_argument( + "--random_seed", + type=int, + default=None, + help="Random seed (>0, set a specific seed).", + ) + parser.add_argument( + "--override_loaded_hparams", + type="bool", + nargs="?", + const=True, + default=False, + help="Override loaded hparams with values specified", + ) + parser.add_argument( + "--num_keep_ckpts", + type=int, + default=5, + help="Max number of checkpoints to keep.", + ) + parser.add_argument( + "--avg_ckpts", + type="bool", + nargs="?", + const=True, + default=False, + help=( + """\ Average the last N checkpoints for external evaluation. N can be controlled by setting --num_keep_ckpts.\ - """)) - parser.add_argument("--language_model", type="bool", nargs="?", - const=True, default=False, - help="True to train a language model, ignoring encoder") - - # Inference - parser.add_argument("--ckpt", type=str, default="", - help="Checkpoint file to load a model for inference.") - parser.add_argument("--inference_input_file", type=str, default=None, - help="Set to the text to decode.") - parser.add_argument("--inference_list", type=str, default=None, - help=("A comma-separated list of sentence indices " - "(0-based) to decode.")) - parser.add_argument("--infer_batch_size", type=int, default=None, - help="Batch size for inference mode.") - parser.add_argument("--inference_output_file", type=str, default=None, - help="Output file to store decoding results.") - parser.add_argument("--inference_ref_file", type=str, default=None, - help=("""\ + """ + ), + ) + parser.add_argument( + "--language_model", + type="bool", + nargs="?", + const=True, + default=False, + help="True to train a language model, ignoring encoder", + ) + + # Inference + parser.add_argument( + "--ckpt", + type=str, + default="", + help="Checkpoint file to load a model for inference.", + ) + parser.add_argument( + "--inference_input_file", + type=str, + default=None, + help="Set to the text to decode.", + ) + parser.add_argument( + "--inference_list", + type=str, + default=None, + help=( + "A comma-separated list of sentence indices " + "(0-based) to decode."), + ) + parser.add_argument( + "--infer_batch_size", + type=int, + default=None, + help="Batch size for inference mode.", + ) + parser.add_argument( + "--inference_output_file", + type=str, + default=None, + help="Output file to store decoding results.", + ) + parser.add_argument( + "--inference_ref_file", + type=str, + default=None, + help=( + """\ Reference file to compute evaluation scores 
(if provided).\ - """)) - - # Advanced inference arguments - parser.add_argument("--infer_mode", type=str, default="greedy", - choices=["greedy", "sample", "beam_search"], - help="Which type of decoder to use during inference.") - parser.add_argument("--beam_width", type=int, default=0, - help=("""\ + """ + ), + ) + + # Advanced inference arguments + parser.add_argument( + "--infer_mode", + type=str, + default="greedy", + choices=["greedy", "sample", "beam_search"], + help="Which type of decoder to use during inference.", + ) + parser.add_argument( + "--beam_width", + type=int, + default=0, + help=( + """\ beam width when using beam search decoder. If 0 (default), use standard decoder with greedy helper.\ - """)) - parser.add_argument("--length_penalty_weight", type=float, default=0.0, - help="Length penalty for beam search.") - parser.add_argument("--sampling_temperature", type=float, - default=0.0, - help=("""\ + """ + ), + ) + parser.add_argument( + "--length_penalty_weight", + type=float, + default=0.0, + help="Length penalty for beam search.", + ) + parser.add_argument( + "--sampling_temperature", + type=float, + default=0.0, + help=( + """\ Softmax sampling temperature for inference decoding, 0.0 means greedy decoding. This option is ignored when using beam search.\ - """)) - parser.add_argument("--num_translations_per_input", type=int, default=1, - help=("""\ + """ + ), + ) + parser.add_argument( + "--num_translations_per_input", + type=int, + default=1, + help=( + """\ Number of translations generated for each sentence. This is only used for inference.\ - """)) - - # Job info - parser.add_argument("--jobid", type=int, default=0, - help="Task id of the worker.") - parser.add_argument("--num_workers", type=int, default=1, - help="Number of workers (inference only).") - parser.add_argument("--num_inter_threads", type=int, default=0, - help="number of inter_op_parallelism_threads") - parser.add_argument("--num_intra_threads", type=int, default=0, - help="number of intra_op_parallelism_threads") - parser.add_argument("--iterations", type=int, default=1, - help="number of iterations") - parser.add_argument("--workloadName", type=str, default="", - help="name of workload") - parser.add_argument("--run", type=str, default='performance', - help="Determine criteria run for infernece") + """ + ), + ) + + # Job info + parser.add_argument( + "--jobid", + type=int, + default=0, + help="Task id of the worker.") + parser.add_argument( + "--num_workers", type=int, default=1, help="Number of workers (inference only)." 
+ ) + parser.add_argument( + "--num_inter_threads", + type=int, + default=0, + help="number of inter_op_parallelism_threads", + ) + parser.add_argument( + "--num_intra_threads", + type=int, + default=0, + help="number of intra_op_parallelism_threads", + ) + parser.add_argument( + "--iterations", type=int, default=1, help="number of iterations" + ) + parser.add_argument( + "--workloadName", + type=str, + default="", + help="name of workload") + parser.add_argument( + "--run", + type=str, + default="performance", + help="Determine criteria run for infernece", + ) def create_hparams(flags): - """Create training hparams.""" - return tf.contrib.training.HParams( - # Data - src=flags.src, - tgt=flags.tgt, - train_prefix=flags.train_prefix, - dev_prefix=flags.dev_prefix, - test_prefix=flags.test_prefix, - vocab_prefix=flags.vocab_prefix, - embed_prefix=flags.embed_prefix, - out_dir=flags.out_dir, - - # Networks - num_units=flags.num_units, - num_encoder_layers=(flags.num_encoder_layers or flags.num_layers), - num_decoder_layers=(flags.num_decoder_layers or flags.num_layers), - dropout=flags.dropout, - unit_type=flags.unit_type, - encoder_type=flags.encoder_type, - residual=flags.residual, - time_major=flags.time_major, - num_embeddings_partitions=flags.num_embeddings_partitions, - - # Attention mechanisms - attention=flags.attention, - attention_architecture=flags.attention_architecture, - output_attention=flags.output_attention, - pass_hidden_state=flags.pass_hidden_state, - - # Train - optimizer=flags.optimizer, - num_train_steps=flags.num_train_steps, - batch_size=flags.batch_size, - init_op=flags.init_op, - init_weight=flags.init_weight, - max_gradient_norm=flags.max_gradient_norm, - learning_rate=flags.learning_rate, - warmup_steps=flags.warmup_steps, - warmup_scheme=flags.warmup_scheme, - decay_scheme=flags.decay_scheme, - colocate_gradients_with_ops=flags.colocate_gradients_with_ops, - num_sampled_softmax=flags.num_sampled_softmax, - - # Data constraints - num_buckets=flags.num_buckets, - max_train=flags.max_train, - src_max_len=flags.src_max_len, - tgt_max_len=flags.tgt_max_len, - - # Inference - src_max_len_infer=flags.src_max_len_infer, - tgt_max_len_infer=flags.tgt_max_len_infer, - infer_batch_size=flags.infer_batch_size, - - # Advanced inference arguments - infer_mode=flags.infer_mode, - beam_width=flags.beam_width, - length_penalty_weight=flags.length_penalty_weight, - sampling_temperature=flags.sampling_temperature, - num_translations_per_input=flags.num_translations_per_input, - - # Vocab - sos=flags.sos if flags.sos else vocab_utils.SOS, - eos=flags.eos if flags.eos else vocab_utils.EOS, - subword_option=flags.subword_option, - check_special_token=flags.check_special_token, - use_char_encode=flags.use_char_encode, - - # Misc - forget_bias=flags.forget_bias, - num_gpus=flags.num_gpus, - epoch_step=0, # record where we were within an epoch. 
- steps_per_stats=flags.steps_per_stats, - steps_per_external_eval=flags.steps_per_external_eval, - share_vocab=flags.share_vocab, - metrics=flags.metrics.split(","), - log_device_placement=flags.log_device_placement, - random_seed=flags.random_seed, - override_loaded_hparams=flags.override_loaded_hparams, - num_keep_ckpts=flags.num_keep_ckpts, - avg_ckpts=flags.avg_ckpts, - language_model=flags.language_model, - num_intra_threads=flags.num_intra_threads, - num_inter_threads=flags.num_inter_threads, - iterations=flags.iterations, - run = flags.run, - ) + """Create training hparams.""" + return tf.contrib.training.HParams( + # Data + src=flags.src, + tgt=flags.tgt, + train_prefix=flags.train_prefix, + dev_prefix=flags.dev_prefix, + test_prefix=flags.test_prefix, + vocab_prefix=flags.vocab_prefix, + embed_prefix=flags.embed_prefix, + out_dir=flags.out_dir, + # Networks + num_units=flags.num_units, + num_encoder_layers=(flags.num_encoder_layers or flags.num_layers), + num_decoder_layers=(flags.num_decoder_layers or flags.num_layers), + dropout=flags.dropout, + unit_type=flags.unit_type, + encoder_type=flags.encoder_type, + residual=flags.residual, + time_major=flags.time_major, + num_embeddings_partitions=flags.num_embeddings_partitions, + # Attention mechanisms + attention=flags.attention, + attention_architecture=flags.attention_architecture, + output_attention=flags.output_attention, + pass_hidden_state=flags.pass_hidden_state, + # Train + optimizer=flags.optimizer, + num_train_steps=flags.num_train_steps, + batch_size=flags.batch_size, + init_op=flags.init_op, + init_weight=flags.init_weight, + max_gradient_norm=flags.max_gradient_norm, + learning_rate=flags.learning_rate, + warmup_steps=flags.warmup_steps, + warmup_scheme=flags.warmup_scheme, + decay_scheme=flags.decay_scheme, + colocate_gradients_with_ops=flags.colocate_gradients_with_ops, + num_sampled_softmax=flags.num_sampled_softmax, + # Data constraints + num_buckets=flags.num_buckets, + max_train=flags.max_train, + src_max_len=flags.src_max_len, + tgt_max_len=flags.tgt_max_len, + # Inference + src_max_len_infer=flags.src_max_len_infer, + tgt_max_len_infer=flags.tgt_max_len_infer, + infer_batch_size=flags.infer_batch_size, + # Advanced inference arguments + infer_mode=flags.infer_mode, + beam_width=flags.beam_width, + length_penalty_weight=flags.length_penalty_weight, + sampling_temperature=flags.sampling_temperature, + num_translations_per_input=flags.num_translations_per_input, + # Vocab + sos=flags.sos if flags.sos else vocab_utils.SOS, + eos=flags.eos if flags.eos else vocab_utils.EOS, + subword_option=flags.subword_option, + check_special_token=flags.check_special_token, + use_char_encode=flags.use_char_encode, + # Misc + forget_bias=flags.forget_bias, + num_gpus=flags.num_gpus, + epoch_step=0, # record where we were within an epoch. 
+ steps_per_stats=flags.steps_per_stats, + steps_per_external_eval=flags.steps_per_external_eval, + share_vocab=flags.share_vocab, + metrics=flags.metrics.split(","), + log_device_placement=flags.log_device_placement, + random_seed=flags.random_seed, + override_loaded_hparams=flags.override_loaded_hparams, + num_keep_ckpts=flags.num_keep_ckpts, + avg_ckpts=flags.avg_ckpts, + language_model=flags.language_model, + num_intra_threads=flags.num_intra_threads, + num_inter_threads=flags.num_inter_threads, + iterations=flags.iterations, + run=flags.run, + ) def _add_argument(hparams, key, value, update=True): - """Add an argument to hparams; if exists, change the value if update==True.""" - if hasattr(hparams, key): - if update: - setattr(hparams, key, value) - else: - hparams.add_hparam(key, value) + """Add an argument to hparams; if exists, change the value if update==True.""" + if hasattr(hparams, key): + if update: + setattr(hparams, key, value) + else: + hparams.add_hparam(key, value) def extend_hparams(hparams): - """Add new arguments to hparams.""" - # Sanity checks - if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0: - raise ValueError("For bi, num_encoder_layers %d should be even" % - hparams.num_encoder_layers) - if (hparams.attention_architecture in ["gnmt"] and - hparams.num_encoder_layers < 2): - raise ValueError("For gnmt attention architecture, " - "num_encoder_layers %d should be >= 2" % - hparams.num_encoder_layers) - if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]: - raise ValueError("subword option must be either spm, or bpe") - if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0: - raise ValueError("beam_width must greater than 0 when using beam_search" - "decoder.") - if hparams.infer_mode == "sample" and hparams.sampling_temperature <= 0.0: - raise ValueError("sampling_temperature must greater than 0.0 when using" - "sample decoder.") - - # Different number of encoder / decoder layers - assert hparams.num_encoder_layers and hparams.num_decoder_layers - if hparams.num_encoder_layers != hparams.num_decoder_layers: - hparams.pass_hidden_state = False - utils.print_out("Num encoder layer %d is different from num decoder layer" - " %d, so set pass_hidden_state to False" % ( - hparams.num_encoder_layers, - hparams.num_decoder_layers)) - - # Set residual layers - num_encoder_residual_layers = 0 - num_decoder_residual_layers = 0 - if hparams.residual: - if hparams.num_encoder_layers > 1: - num_encoder_residual_layers = hparams.num_encoder_layers - 1 - if hparams.num_decoder_layers > 1: - num_decoder_residual_layers = hparams.num_decoder_layers - 1 - - if hparams.encoder_type == "gnmt": - # The first unidirectional layer (after the bi-directional layer) in - # the GNMT encoder can't have residual connection due to the input is - # the concatenation of fw_cell and bw_cell's outputs. 
- num_encoder_residual_layers = hparams.num_encoder_layers - 2 - - # Compatible for GNMT models - if hparams.num_encoder_layers == hparams.num_decoder_layers: - num_decoder_residual_layers = num_encoder_residual_layers - _add_argument(hparams, "num_encoder_residual_layers", - num_encoder_residual_layers) - _add_argument(hparams, "num_decoder_residual_layers", - num_decoder_residual_layers) - - # Language modeling - if getattr(hparams, "language_model", None): - hparams.attention = "normed_bahdanau" - hparams.attention_architecture = "gnmt_v2" - hparams.pass_hidden_state = False - hparams.share_vocab = True - hparams.src = hparams.tgt - utils.print_out("For language modeling, we turn off attention and " - "pass_hidden_state; turn on share_vocab; set src to tgt.") - - ## Vocab - # Get vocab file names first - if hparams.vocab_prefix: - src_vocab_file = hparams.vocab_prefix + "." + hparams.src - tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt - else: - raise ValueError("hparams.vocab_prefix must be provided.") - - # Source vocab - check_special_token = getattr(hparams, "check_special_token", True) - src_vocab_size, src_vocab_file = vocab_utils.check_vocab( - src_vocab_file, - hparams.out_dir, - check_special_token=check_special_token, - sos=hparams.sos, - eos=hparams.eos, - unk=vocab_utils.UNK) - - # Target vocab - if hparams.share_vocab: - utils.print_out(" using source vocab for target") - tgt_vocab_file = src_vocab_file - tgt_vocab_size = src_vocab_size - else: - tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab( - tgt_vocab_file, + """Add new arguments to hparams.""" + # Sanity checks + if hparams.encoder_type == "bi" and hparams.num_encoder_layers % 2 != 0: + raise ValueError( + "For bi, num_encoder_layers %d should be even" % hparams.num_encoder_layers + ) + if hparams.attention_architecture in [ + "gnmt"] and hparams.num_encoder_layers < 2: + raise ValueError( + "For gnmt attention architecture, " + "num_encoder_layers %d should be >= 2" % hparams.num_encoder_layers + ) + if hparams.subword_option and hparams.subword_option not in ["spm", "bpe"]: + raise ValueError("subword option must be either spm, or bpe") + if hparams.infer_mode == "beam_search" and hparams.beam_width <= 0: + raise ValueError( + "beam_width must greater than 0 when using beam_search" "decoder." + ) + if hparams.infer_mode == "sample" and hparams.sampling_temperature <= 0.0: + raise ValueError( + "sampling_temperature must greater than 0.0 when using" "sample decoder." + ) + + # Different number of encoder / decoder layers + assert hparams.num_encoder_layers and hparams.num_decoder_layers + if hparams.num_encoder_layers != hparams.num_decoder_layers: + hparams.pass_hidden_state = False + utils.print_out( + "Num encoder layer %d is different from num decoder layer" + " %d, so set pass_hidden_state to False" + % (hparams.num_encoder_layers, hparams.num_decoder_layers) + ) + + # Set residual layers + num_encoder_residual_layers = 0 + num_decoder_residual_layers = 0 + if hparams.residual: + if hparams.num_encoder_layers > 1: + num_encoder_residual_layers = hparams.num_encoder_layers - 1 + if hparams.num_decoder_layers > 1: + num_decoder_residual_layers = hparams.num_decoder_layers - 1 + + if hparams.encoder_type == "gnmt": + # The first unidirectional layer (after the bi-directional layer) in + # the GNMT encoder can't have residual connection due to the input is + # the concatenation of fw_cell and bw_cell's outputs. 
+ num_encoder_residual_layers = hparams.num_encoder_layers - 2 + + # Compatible for GNMT models + if hparams.num_encoder_layers == hparams.num_decoder_layers: + num_decoder_residual_layers = num_encoder_residual_layers + _add_argument( + hparams, + "num_encoder_residual_layers", + num_encoder_residual_layers) + _add_argument( + hparams, + "num_decoder_residual_layers", + num_decoder_residual_layers) + + # Language modeling + if getattr(hparams, "language_model", None): + hparams.attention = "normed_bahdanau" + hparams.attention_architecture = "gnmt_v2" + hparams.pass_hidden_state = False + hparams.share_vocab = True + hparams.src = hparams.tgt + utils.print_out( + "For language modeling, we turn off attention and " + "pass_hidden_state; turn on share_vocab; set src to tgt." + ) + + # Vocab + # Get vocab file names first + if hparams.vocab_prefix: + src_vocab_file = hparams.vocab_prefix + "." + hparams.src + tgt_vocab_file = hparams.vocab_prefix + "." + hparams.tgt + else: + raise ValueError("hparams.vocab_prefix must be provided.") + + # Source vocab + check_special_token = getattr(hparams, "check_special_token", True) + src_vocab_size, src_vocab_file = vocab_utils.check_vocab( + src_vocab_file, hparams.out_dir, check_special_token=check_special_token, sos=hparams.sos, eos=hparams.eos, - unk=vocab_utils.UNK) - _add_argument(hparams, "src_vocab_size", src_vocab_size) - _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size) - _add_argument(hparams, "src_vocab_file", src_vocab_file) - _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file) - - # Num embedding partitions - num_embeddings_partitions = getattr(hparams, "num_embeddings_partitions", 0) - _add_argument(hparams, "num_enc_emb_partitions", num_embeddings_partitions) - _add_argument(hparams, "num_dec_emb_partitions", num_embeddings_partitions) - - # Pretrained Embeddings - _add_argument(hparams, "src_embed_file", "") - _add_argument(hparams, "tgt_embed_file", "") - if getattr(hparams, "embed_prefix", None): - src_embed_file = hparams.embed_prefix + "." + hparams.src - tgt_embed_file = hparams.embed_prefix + "." 
+ hparams.tgt - - if tf.gfile.Exists(src_embed_file): - utils.print_out(" src_embed_file %s exist" % src_embed_file) - hparams.src_embed_file = src_embed_file - - utils.print_out( - "For pretrained embeddings, set num_enc_emb_partitions to 1") - hparams.num_enc_emb_partitions = 1 + unk=vocab_utils.UNK, + ) + + # Target vocab + if hparams.share_vocab: + utils.print_out(" using source vocab for target") + tgt_vocab_file = src_vocab_file + tgt_vocab_size = src_vocab_size else: - utils.print_out(" src_embed_file %s doesn't exist" % src_embed_file) - - if tf.gfile.Exists(tgt_embed_file): - utils.print_out(" tgt_embed_file %s exist" % tgt_embed_file) - hparams.tgt_embed_file = tgt_embed_file - - utils.print_out( - "For pretrained embeddings, set num_dec_emb_partitions to 1") - hparams.num_dec_emb_partitions = 1 - else: - utils.print_out(" tgt_embed_file %s doesn't exist" % tgt_embed_file) - - # Evaluation - for metric in hparams.metrics: - best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric) - tf.gfile.MakeDirs(best_metric_dir) - _add_argument(hparams, "best_" + metric, 0, update=False) - _add_argument(hparams, "best_" + metric + "_dir", best_metric_dir) - - if getattr(hparams, "avg_ckpts", None): - best_metric_dir = os.path.join(hparams.out_dir, "avg_best_" + metric) - tf.gfile.MakeDirs(best_metric_dir) - _add_argument(hparams, "avg_best_" + metric, 0, update=False) - _add_argument(hparams, "avg_best_" + metric + "_dir", best_metric_dir) - - return hparams - + tgt_vocab_size, tgt_vocab_file = vocab_utils.check_vocab( + tgt_vocab_file, + hparams.out_dir, + check_special_token=check_special_token, + sos=hparams.sos, + eos=hparams.eos, + unk=vocab_utils.UNK, + ) + _add_argument(hparams, "src_vocab_size", src_vocab_size) + _add_argument(hparams, "tgt_vocab_size", tgt_vocab_size) + _add_argument(hparams, "src_vocab_file", src_vocab_file) + _add_argument(hparams, "tgt_vocab_file", tgt_vocab_file) + + # Num embedding partitions + num_embeddings_partitions = getattr( + hparams, "num_embeddings_partitions", 0) + _add_argument(hparams, "num_enc_emb_partitions", num_embeddings_partitions) + _add_argument(hparams, "num_dec_emb_partitions", num_embeddings_partitions) + + # Pretrained Embeddings + _add_argument(hparams, "src_embed_file", "") + _add_argument(hparams, "tgt_embed_file", "") + if getattr(hparams, "embed_prefix", None): + src_embed_file = hparams.embed_prefix + "." + hparams.src + tgt_embed_file = hparams.embed_prefix + "." 
+ hparams.tgt + + if tf.gfile.Exists(src_embed_file): + utils.print_out(" src_embed_file %s exist" % src_embed_file) + hparams.src_embed_file = src_embed_file + + utils.print_out( + "For pretrained embeddings, set num_enc_emb_partitions to 1" + ) + hparams.num_enc_emb_partitions = 1 + else: + utils.print_out( + " src_embed_file %s doesn't exist" % + src_embed_file) + + if tf.gfile.Exists(tgt_embed_file): + utils.print_out(" tgt_embed_file %s exist" % tgt_embed_file) + hparams.tgt_embed_file = tgt_embed_file + + utils.print_out( + "For pretrained embeddings, set num_dec_emb_partitions to 1" + ) + hparams.num_dec_emb_partitions = 1 + else: + utils.print_out( + " tgt_embed_file %s doesn't exist" % + tgt_embed_file) -def ensure_compatible_hparams(hparams, default_hparams, hparams_path=""): - """Make sure the loaded hparams is compatible with new changes.""" - default_hparams = utils.maybe_parse_standard_hparams( - default_hparams, hparams_path) - - # Set num encoder/decoder layers (for old checkpoints) - if hasattr(hparams, "num_layers"): - if not hasattr(hparams, "num_encoder_layers"): - hparams.add_hparam("num_encoder_layers", hparams.num_layers) - if not hasattr(hparams, "num_decoder_layers"): - hparams.add_hparam("num_decoder_layers", hparams.num_layers) - - # For compatible reason, if there are new fields in default_hparams, - # we add them to the current hparams - default_config = default_hparams.values() - config = hparams.values() - for key in default_config: - if key not in config: - hparams.add_hparam(key, default_config[key]) - - # Update all hparams' keys if override_loaded_hparams=True - if getattr(default_hparams, "override_loaded_hparams", None): - overwritten_keys = default_config.keys() - else: - # For inference - overwritten_keys = INFERENCE_KEYS - - for key in overwritten_keys: - if getattr(hparams, key) != default_config[key]: - utils.print_out("# Updating hparams.%s: %s -> %s" % - (key, str(getattr(hparams, key)), - str(default_config[key]))) - setattr(hparams, key, default_config[key]) - return hparams - - -def create_or_load_hparams( - out_dir, default_hparams, hparams_path, save_hparams=True): - """Create hparams or load hparams from out_dir.""" - hparams = utils.load_hparams(out_dir) - if not hparams: - hparams = default_hparams - hparams = utils.maybe_parse_standard_hparams( - hparams, hparams_path) - else: - hparams = ensure_compatible_hparams(hparams, default_hparams, hparams_path) - hparams = extend_hparams(hparams) - - # Save HParams - if save_hparams: - utils.save_hparams(out_dir, hparams) + # Evaluation for metric in hparams.metrics: - utils.save_hparams(getattr(hparams, "best_" + metric + "_dir"), hparams) - - # Print HParams - utils.print_hparams(hparams) - return hparams - - -def run_main(flags, default_hparams, train_fn, inference_fn, target_session=""): - """Run main.""" - # Job - jobid = flags.jobid - num_workers = flags.num_workers - utils.print_out("# Job id %d" % jobid) - - # Random - random_seed = flags.random_seed - if random_seed is not None and random_seed > 0: - utils.print_out("# Set random seed to %d" % random_seed) - random.seed(random_seed + jobid) - np.random.seed(random_seed + jobid) - - # Model output directory - out_dir = flags.out_dir - if out_dir and not tf.gfile.Exists(out_dir): - utils.print_out("# Creating output directory %s ..." % out_dir) - tf.gfile.MakeDirs(out_dir) - - # Load hparams. 
- loaded_hparams = False - if flags.ckpt: # Try to load hparams from the same directory as ckpt - ckpt_dir = os.path.dirname(flags.ckpt) - ckpt_hparams_file = os.path.join(ckpt_dir, "hparams") - if tf.gfile.Exists(ckpt_hparams_file) or flags.hparams_path: - hparams = create_or_load_hparams( - ckpt_dir, default_hparams, flags.hparams_path, - save_hparams=False) - loaded_hparams = True - if not loaded_hparams: # Try to load from out_dir - assert out_dir - hparams = create_or_load_hparams( - out_dir, default_hparams, flags.hparams_path, - save_hparams=(jobid == 0)) - - # GPU device - config_proto = utils.get_config_proto( - allow_soft_placement=True, - num_intra_threads=hparams.num_intra_threads, - num_inter_threads=hparams.num_inter_threads) - utils.print_out( - "# Devices visible to TensorFlow: %s" - % repr(tf.Session(config=config_proto).list_devices())) - - ## Train / Decode - if flags.inference_input_file: - # Inference output directory - trans_file = flags.inference_output_file - assert trans_file - trans_dir = os.path.dirname(trans_file) - if not tf.gfile.Exists(trans_dir): tf.gfile.MakeDirs(trans_dir) - - # Inference indices - hparams.inference_indices = None - if flags.inference_list: - (hparams.inference_indices) = ( - [int(token) for token in flags.inference_list.split(",")]) + best_metric_dir = os.path.join(hparams.out_dir, "best_" + metric) + tf.gfile.MakeDirs(best_metric_dir) + _add_argument(hparams, "best_" + metric, 0, update=False) + _add_argument(hparams, "best_" + metric + "_dir", best_metric_dir) + + if getattr(hparams, "avg_ckpts", None): + best_metric_dir = os.path.join( + hparams.out_dir, "avg_best_" + metric) + tf.gfile.MakeDirs(best_metric_dir) + _add_argument(hparams, "avg_best_" + metric, 0, update=False) + _add_argument( + hparams, + "avg_best_" + + metric + + "_dir", + best_metric_dir) + + return hparams - # Inference - ckpt = flags.ckpt - if not ckpt: - ckpt = tf.train.latest_checkpoint(out_dir) - inference_fn(flags.run,flags.iterations,ckpt, flags.inference_input_file, - trans_file, hparams, num_workers, jobid) - # Evaluation - if flags.run == 'accuracy': - ref_file = flags.inference_ref_file - if ref_file and tf.gfile.Exists(trans_file): - for metric in hparams.metrics: - score = evaluation_utils.evaluate( - ref_file, - trans_file, - metric, - hparams.subword_option) - utils.print_out(" %s: %.1f" % (metric, score)) - else: - # Train - train_fn(hparams, target_session=target_session) +def ensure_compatible_hparams(hparams, default_hparams, hparams_path=""): + """Make sure the loaded hparams is compatible with new changes.""" + default_hparams = utils.maybe_parse_standard_hparams( + default_hparams, hparams_path) + + # Set num encoder/decoder layers (for old checkpoints) + if hasattr(hparams, "num_layers"): + if not hasattr(hparams, "num_encoder_layers"): + hparams.add_hparam("num_encoder_layers", hparams.num_layers) + if not hasattr(hparams, "num_decoder_layers"): + hparams.add_hparam("num_decoder_layers", hparams.num_layers) + + # For compatible reason, if there are new fields in default_hparams, + # we add them to the current hparams + default_config = default_hparams.values() + config = hparams.values() + for key in default_config: + if key not in config: + hparams.add_hparam(key, default_config[key]) + + # Update all hparams' keys if override_loaded_hparams=True + if getattr(default_hparams, "override_loaded_hparams", None): + overwritten_keys = default_config.keys() + else: + # For inference + overwritten_keys = INFERENCE_KEYS + + for key in 
overwritten_keys: + if getattr(hparams, key) != default_config[key]: + utils.print_out( + "# Updating hparams.%s: %s -> %s" + % (key, str(getattr(hparams, key)), str(default_config[key])) + ) + setattr(hparams, key, default_config[key]) + return hparams + + +def create_or_load_hparams(out_dir, default_hparams, + hparams_path, save_hparams=True): + """Create hparams or load hparams from out_dir.""" + hparams = utils.load_hparams(out_dir) + if not hparams: + hparams = default_hparams + hparams = utils.maybe_parse_standard_hparams(hparams, hparams_path) + else: + hparams = ensure_compatible_hparams( + hparams, default_hparams, hparams_path) + hparams = extend_hparams(hparams) + + # Save HParams + if save_hparams: + utils.save_hparams(out_dir, hparams) + for metric in hparams.metrics: + utils.save_hparams( + getattr( + hparams, + "best_" + + metric + + "_dir"), + hparams) + + # Print HParams + utils.print_hparams(hparams) + return hparams + + +def run_main(flags, default_hparams, train_fn, + inference_fn, target_session=""): + """Run main.""" + # Job + jobid = flags.jobid + num_workers = flags.num_workers + utils.print_out("# Job id %d" % jobid) + + # Random + random_seed = flags.random_seed + if random_seed is not None and random_seed > 0: + utils.print_out("# Set random seed to %d" % random_seed) + random.seed(random_seed + jobid) + np.random.seed(random_seed + jobid) + + # Model output directory + out_dir = flags.out_dir + if out_dir and not tf.gfile.Exists(out_dir): + utils.print_out("# Creating output directory %s ..." % out_dir) + tf.gfile.MakeDirs(out_dir) + + # Load hparams. + loaded_hparams = False + if flags.ckpt: # Try to load hparams from the same directory as ckpt + ckpt_dir = os.path.dirname(flags.ckpt) + ckpt_hparams_file = os.path.join(ckpt_dir, "hparams") + if tf.gfile.Exists(ckpt_hparams_file) or flags.hparams_path: + hparams = create_or_load_hparams( + ckpt_dir, default_hparams, flags.hparams_path, save_hparams=False + ) + loaded_hparams = True + if not loaded_hparams: # Try to load from out_dir + assert out_dir + hparams = create_or_load_hparams( + out_dir, default_hparams, flags.hparams_path, save_hparams=( + jobid == 0) + ) + + # GPU device + config_proto = utils.get_config_proto( + allow_soft_placement=True, + num_intra_threads=hparams.num_intra_threads, + num_inter_threads=hparams.num_inter_threads, + ) + utils.print_out( + "# Devices visible to TensorFlow: %s" + % repr(tf.Session(config=config_proto).list_devices()) + ) + + # Train / Decode + if flags.inference_input_file: + # Inference output directory + trans_file = flags.inference_output_file + assert trans_file + trans_dir = os.path.dirname(trans_file) + if not tf.gfile.Exists(trans_dir): + tf.gfile.MakeDirs(trans_dir) + + # Inference indices + hparams.inference_indices = None + if flags.inference_list: + (hparams.inference_indices) = [ + int(token) for token in flags.inference_list.split(",") + ] + + # Inference + ckpt = flags.ckpt + if not ckpt: + ckpt = tf.train.latest_checkpoint(out_dir) + inference_fn( + flags.run, + flags.iterations, + ckpt, + flags.inference_input_file, + trans_file, + hparams, + num_workers, + jobid, + ) + + # Evaluation + if flags.run == "accuracy": + ref_file = flags.inference_ref_file + if ref_file and tf.gfile.Exists(trans_file): + for metric in hparams.metrics: + score = evaluation_utils.evaluate( + ref_file, trans_file, metric, hparams.subword_option + ) + utils.print_out(" %s: %.1f" % (metric, score)) + else: + # Train + train_fn(hparams, target_session=target_session) def 
main(unused_argv): - default_hparams = create_hparams(FLAGS) - train_fn = train.train - inference_fn = inference.inference - run_main(FLAGS, default_hparams, train_fn, inference_fn) + default_hparams = create_hparams(FLAGS) + train_fn = train.train + inference_fn = inference.inference + run_main(FLAGS, default_hparams, train_fn, inference_fn) if __name__ == "__main__": - nmt_parser = argparse.ArgumentParser() - add_arguments(nmt_parser) - FLAGS, unparsed = nmt_parser.parse_known_args() - tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) + nmt_parser = argparse.ArgumentParser() + add_arguments(nmt_parser) + FLAGS, unparsed = nmt_parser.parse_known_args() + tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/nmt_test.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/nmt_test.py index c12179d5a..45328f151 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/nmt_test.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/nmt_test.py @@ -29,79 +29,71 @@ def _update_flags(flags, test_name): - """Update flags for basic training.""" - flags.num_train_steps = 100 - flags.steps_per_stats = 5 - flags.src = "en" - flags.tgt = "vi" - flags.train_prefix = ("nmt/testdata/" - "iwslt15.tst2013.100") - flags.vocab_prefix = ("nmt/testdata/" - "iwslt15.vocab.100") - flags.dev_prefix = ("nmt/testdata/" - "iwslt15.tst2013.100") - flags.test_prefix = ("nmt/testdata/" - "iwslt15.tst2013.100") - flags.out_dir = os.path.join(tf.test.get_temp_dir(), test_name) + """Update flags for basic training.""" + flags.num_train_steps = 100 + flags.steps_per_stats = 5 + flags.src = "en" + flags.tgt = "vi" + flags.train_prefix = "nmt/testdata/" "iwslt15.tst2013.100" + flags.vocab_prefix = "nmt/testdata/" "iwslt15.vocab.100" + flags.dev_prefix = "nmt/testdata/" "iwslt15.tst2013.100" + flags.test_prefix = "nmt/testdata/" "iwslt15.tst2013.100" + flags.out_dir = os.path.join(tf.test.get_temp_dir(), test_name) class NMTTest(tf.test.TestCase): - def testTrain(self): - """Test the training loop is functional with basic hparams.""" - nmt_parser = argparse.ArgumentParser() - nmt.add_arguments(nmt_parser) - FLAGS, unparsed = nmt_parser.parse_known_args() + def testTrain(self): + """Test the training loop is functional with basic hparams.""" + nmt_parser = argparse.ArgumentParser() + nmt.add_arguments(nmt_parser) + FLAGS, unparsed = nmt_parser.parse_known_args() - _update_flags(FLAGS, "nmt_train_test") + _update_flags(FLAGS, "nmt_train_test") - default_hparams = nmt.create_hparams(FLAGS) + default_hparams = nmt.create_hparams(FLAGS) - train_fn = train.train - nmt.run_main(FLAGS, default_hparams, train_fn, None) + train_fn = train.train + nmt.run_main(FLAGS, default_hparams, train_fn, None) + def testTrainWithAvgCkpts(self): + """Test the training loop is functional with basic hparams.""" + nmt_parser = argparse.ArgumentParser() + nmt.add_arguments(nmt_parser) + FLAGS, unparsed = nmt_parser.parse_known_args() - def testTrainWithAvgCkpts(self): - """Test the training loop is functional with basic hparams.""" - nmt_parser = argparse.ArgumentParser() - nmt.add_arguments(nmt_parser) - FLAGS, unparsed = nmt_parser.parse_known_args() + _update_flags(FLAGS, "nmt_train_test_avg_ckpts") + FLAGS.avg_ckpts = True - _update_flags(FLAGS, "nmt_train_test_avg_ckpts") - FLAGS.avg_ckpts = True + default_hparams = nmt.create_hparams(FLAGS) - default_hparams = nmt.create_hparams(FLAGS) + train_fn = train.train + nmt.run_main(FLAGS, default_hparams, train_fn, None) - 
train_fn = train.train - nmt.run_main(FLAGS, default_hparams, train_fn, None) + def testInference(self): + """Test inference is function with basic hparams.""" + nmt_parser = argparse.ArgumentParser() + nmt.add_arguments(nmt_parser) + FLAGS, unparsed = nmt_parser.parse_known_args() + _update_flags(FLAGS, "nmt_train_infer") - def testInference(self): - """Test inference is function with basic hparams.""" - nmt_parser = argparse.ArgumentParser() - nmt.add_arguments(nmt_parser) - FLAGS, unparsed = nmt_parser.parse_known_args() + # Train one step so we have a checkpoint. + FLAGS.num_train_steps = 1 + default_hparams = nmt.create_hparams(FLAGS) + train_fn = train.train + nmt.run_main(FLAGS, default_hparams, train_fn, None) - _update_flags(FLAGS, "nmt_train_infer") + # Update FLAGS for inference. + FLAGS.inference_input_file = "nmt/testdata/" "iwslt15.tst2013.100.en" + FLAGS.inference_output_file = os.path.join(FLAGS.out_dir, "output") + FLAGS.inference_ref_file = "nmt/testdata/" "iwslt15.tst2013.100.vi" - # Train one step so we have a checkpoint. - FLAGS.num_train_steps = 1 - default_hparams = nmt.create_hparams(FLAGS) - train_fn = train.train - nmt.run_main(FLAGS, default_hparams, train_fn, None) + default_hparams = nmt.create_hparams(FLAGS) - # Update FLAGS for inference. - FLAGS.inference_input_file = ("nmt/testdata/" - "iwslt15.tst2013.100.en") - FLAGS.inference_output_file = os.path.join(FLAGS.out_dir, "output") - FLAGS.inference_ref_file = ("nmt/testdata/" - "iwslt15.tst2013.100.vi") - - default_hparams = nmt.create_hparams(FLAGS) - - inference_fn = inference.inference - nmt.run_main(FLAGS, default_hparams, None, inference_fn) + inference_fn = inference.inference + nmt.run_main(FLAGS, default_hparams, None, inference_fn) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/scripts/bleu.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/scripts/bleu.py index 494e39c6c..c677b9db6 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/scripts/bleu.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/scripts/bleu.py @@ -27,146 +27,164 @@ import collections import math + ## # @brief Class to compute running BLEU scores # @detail BLEU scores can be computed in a non-linear way, # or without having access to the full translated corpus in time. 
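# Usage sketch (illustrative only; the helper name `_example_running_bleu` is
# introduced here for demonstration and is not part of the benchmark code).
# It assumes pre-tokenized word lists and that the RunningBLEUScorer class
# defined below is in scope by the time the helper is called: statistics are
# accumulated sentence by sentence, then finalized with calc_BLEU_score().
def _example_running_bleu():
    scorer = RunningBLEUScorer(max_order=4, smooth=False)
    # Single reference for the first sentence pair.
    scorer.add_sentence(["the", "cat", "sat"], ["the", "cat", "sat"])
    # Multiple references for the second sentence pair.
    scorer.add_sentence_with_multiple_refs(
        [["a", "dog", "barked"], ["one", "dog", "barked"]],
        ["a", "dog", "ran"],
    )
    # calc_BLEU_score() returns (bleu, precisions, bp, ratio,
    # translation_length, reference_length), matching the class below.
    bleu, precisions, bp, ratio, trans_len, ref_len = scorer.calc_BLEU_score()
    print("BLEU: %.4f (brevity penalty %.3f)" % (bleu, bp))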
class RunningBLEUScorer: - def __init__(self, max_order=4, smooth=False): - self.max_order = max_order - self.smooth = smooth - self.reset() - - ## - # @brief Reset all variables (none of the previus sentences will be taken into account) - def reset(self): - self.matches_by_order = [0] * self.max_order - self.possible_matches_by_order = [0] * self.max_order - self.reference_length = 0 - self.translation_length = 0 - - ## - # @brief Add a single sentence - # @param reference list of words for a reference sentence - # @param translation list of words for its corresponding translated sentence - # @post Updates internal structures to take this sentence's translation - # result into account in final BLEU score - def add_sentence(self, reference, translation): - self.add_sentence_with_multiple_refs([reference], translation) - - ## - # @brief Add a single reference, with potentially multiple references - # @param reference list of list of words for a reference sentence - # @note That we could have multiple sentences serving as a reference - # @param translation (single) list of words for its corresponding translated sentence - # @post Updates internal structures to take this sentence's translation - # result into account in final BLEU score - def add_sentence_with_multiple_refs(self, references, translation): - self.reference_length += min(len(r) for r in references) - self.translation_length += len(translation) - - merged_ref_ngram_counts = collections.Counter() - for reference in references: - merged_ref_ngram_counts |= self._get_ngrams(reference) - - translation_ngram_counts = self._get_ngrams(translation) - - new_matches_by_order, new_possible_matches_by_order = self._get_ngram_match_values(merged_ref_ngram_counts, translation_ngram_counts, len(translation)) - - for i in range(self.max_order): - self.matches_by_order[i] += new_matches_by_order[i] - self.possible_matches_by_order[i] += new_possible_matches_by_order[i] - - ## - # @brief Calculate final BLEU score - def calc_BLEU_score(self): - precisions = [0] * self.max_order - for i in range(0, self.max_order): - if self.smooth: - precisions[i] = ((self.matches_by_order[i] + 1.) 
/ - (self.possible_matches_by_order[i] + 1.)) - else: - if self.possible_matches_by_order[i] > 0: - precisions[i] = (float(self.matches_by_order[i]) / - self.possible_matches_by_order[i]) + def __init__(self, max_order=4, smooth=False): + self.max_order = max_order + self.smooth = smooth + self.reset() + + ## + # @brief Reset all variables (none of the previus sentences will be taken into account) + def reset(self): + self.matches_by_order = [0] * self.max_order + self.possible_matches_by_order = [0] * self.max_order + self.reference_length = 0 + self.translation_length = 0 + + ## + # @brief Add a single sentence + # @param reference list of words for a reference sentence + # @param translation list of words for its corresponding translated sentence + # @post Updates internal structures to take this sentence's translation + # result into account in final BLEU score + def add_sentence(self, reference, translation): + self.add_sentence_with_multiple_refs([reference], translation) + + ## + # @brief Add a single reference, with potentially multiple references + # @param reference list of list of words for a reference sentence + # @note That we could have multiple sentences serving as a reference + # @param translation (single) list of words for its corresponding translated sentence + # @post Updates internal structures to take this sentence's translation + # result into account in final BLEU score + def add_sentence_with_multiple_refs(self, references, translation): + self.reference_length += min(len(r) for r in references) + self.translation_length += len(translation) + + merged_ref_ngram_counts = collections.Counter() + for reference in references: + merged_ref_ngram_counts |= self._get_ngrams(reference) + + translation_ngram_counts = self._get_ngrams(translation) + + new_matches_by_order, new_possible_matches_by_order = ( + self._get_ngram_match_values( + merged_ref_ngram_counts, translation_ngram_counts, len( + translation) + ) + ) + + for i in range(self.max_order): + self.matches_by_order[i] += new_matches_by_order[i] + self.possible_matches_by_order[i] += new_possible_matches_by_order[i] + + ## + # @brief Calculate final BLEU score + def calc_BLEU_score(self): + precisions = [0] * self.max_order + for i in range(0, self.max_order): + if self.smooth: + precisions[i] = (self.matches_by_order[i] + 1.0) / ( + self.possible_matches_by_order[i] + 1.0 + ) + else: + if self.possible_matches_by_order[i] > 0: + precisions[i] = ( + float(self.matches_by_order[i]) + / self.possible_matches_by_order[i] + ) + else: + precisions[i] = 0.0 + + if min(precisions) > 0: + p_log_sum = sum((1.0 / self.max_order) * math.log(p) + for p in precisions) + geo_mean = math.exp(p_log_sum) else: - precisions[i] = 0.0 - - if min(precisions) > 0: - p_log_sum = sum((1. / self.max_order) * math.log(p) for p in precisions) - geo_mean = math.exp(p_log_sum) - else: - geo_mean = 0 - - ratio = float(self.translation_length) / self.reference_length - - if ratio > 1.0: - bp = 1. - else: - bp = math.exp(1 - 1. 
/ ratio) - - bleu = geo_mean * bp - - return (bleu, precisions, bp, ratio, self.translation_length, self.reference_length) + geo_mean = 0 - ## - # @brief Internal function to compute matching percentages for different order ngrams - def _get_ngram_match_values(self, ref_ngram_counts, translation_ngram_counts, translation_length): - new_matches_by_order = [0] * self.max_order - new_possible_matches_by_order = [0] * self.max_order + ratio = float(self.translation_length) / self.reference_length - overlap = translation_ngram_counts & ref_ngram_counts - for ngram in overlap: - new_matches_by_order[len(ngram)-1] += overlap[ngram] - for order in range(1, self.max_order+1): - possible_matches = translation_length - order + 1 - new_possible_matches_by_order[order-1] = max(0, possible_matches) - - return (new_matches_by_order, new_possible_matches_by_order) - - def _get_ngrams(self, segment): - """Internal function to extract all n-grams upto a given maximum order from an input segment. + if ratio > 1.0: + bp = 1.0 + else: + bp = math.exp(1 - 1.0 / ratio) + + bleu = geo_mean * bp + + return ( + bleu, + precisions, + bp, + ratio, + self.translation_length, + self.reference_length, + ) + + ## + # @brief Internal function to compute matching percentages for different order ngrams + def _get_ngram_match_values( + self, ref_ngram_counts, translation_ngram_counts, translation_length + ): + new_matches_by_order = [0] * self.max_order + new_possible_matches_by_order = [0] * self.max_order + + overlap = translation_ngram_counts & ref_ngram_counts + for ngram in overlap: + new_matches_by_order[len(ngram) - 1] += overlap[ngram] + for order in range(1, self.max_order + 1): + possible_matches = translation_length - order + 1 + new_possible_matches_by_order[order - 1] = max(0, possible_matches) + + return (new_matches_by_order, new_possible_matches_by_order) + + def _get_ngrams(self, segment): + """Internal function to extract all n-grams upto a given maximum order from an input segment. + + Args: + segment: text segment from which n-grams will be extracted. + + Returns: + The Counter containing all n-grams upto max_order in segment + with a count of how many times each n-gram occurred. + """ + ngram_counts = collections.Counter() + for order in range(1, self.max_order + 1): + for i in range(0, len(segment) - order + 1): + ngram = tuple(segment[i: i + order]) + ngram_counts[ngram] += 1 + return ngram_counts + + +def compute_bleu(reference_corpus, translation_corpus, + max_order=4, smooth=False): + """Computes BLEU score of translated segments against one or more references. + This is the most common usage when calculating BLEU scores. Args: - segment: text segment from which n-grams will be extracted. + reference_corpus: list of lists of references for each translation. Each + reference should be tokenized into a list of tokens. + reference_corpus[i][j][k] represents the k'th word of the i'th sentence + for the j'th reference text + translation_corpus: list of translated sentences to score. Each sentence + should be tokenized into a list of tokens. + translation_corpus[i][j] represents the j'th word for the i'th sentence + max_order: Maximum n-gram order to use when computing BLEU score. + smooth: Whether or not to apply Lin et al. 2004 smoothing. Returns: - The Counter containing all n-grams upto max_order in segment - with a count of how many times each n-gram occurred. + 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram + precisions and brevity penalty. 
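# Minimal usage sketch of the scorer above (the `from bleu import ...` path is
# an assumption; point it at wherever this module actually lives):
from bleu import RunningBLEUScorer, compute_bleu

references = [[["the", "cat", "sat", "on", "the", "mat"]]]  # per sentence: a list of tokenized references
translations = [["the", "cat", "sat", "on", "the", "mat"]]  # per sentence: one tokenized hypothesis

# compute_bleu returns the 6-tuple produced by calc_BLEU_score()
bleu_score, precisions, bp, ratio, trans_len, ref_len = compute_bleu(references, translations)
print(bleu_score)  # 1.0 for an exact match

# Equivalent streaming use, adding one sentence at a time:
scorer = RunningBLEUScorer(max_order=4, smooth=False)
for refs, hyp in zip(references, translations):
    scorer.add_sentence_with_multiple_refs(refs, hyp)
print(scorer.calc_BLEU_score()[0])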
""" - ngram_counts = collections.Counter() - for order in range(1, self.max_order + 1): - for i in range(0, len(segment) - order + 1): - ngram = tuple(segment[i:i+order]) - ngram_counts[ngram] += 1 - return ngram_counts - -def compute_bleu(reference_corpus, translation_corpus, max_order=4, - smooth=False): - """Computes BLEU score of translated segments against one or more references. - This is the most common usage when calculating BLEU scores. - - Args: - reference_corpus: list of lists of references for each translation. Each - reference should be tokenized into a list of tokens. - reference_corpus[i][j][k] represents the k'th word of the i'th sentence - for the j'th reference text - translation_corpus: list of translated sentences to score. Each sentence - should be tokenized into a list of tokens. - translation_corpus[i][j] represents the j'th word for the i'th sentence - max_order: Maximum n-gram order to use when computing BLEU score. - smooth: Whether or not to apply Lin et al. 2004 smoothing. - - Returns: - 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram - precisions and brevity penalty. - """ - runningBLEU = RunningBLEUScorer(max_order=max_order, smooth=smooth) - - - for (references, translation) in zip(reference_corpus, - translation_corpus): - runningBLEU.add_sentence_with_multiple_refs(references, translation) - - return runningBLEU.calc_BLEU_score() \ No newline at end of file + runningBLEU = RunningBLEUScorer(max_order=max_order, smooth=smooth) + + for references, translation in zip(reference_corpus, translation_corpus): + runningBLEU.add_sentence_with_multiple_refs(references, translation) + + return runningBLEU.calc_BLEU_score() diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/scripts/rouge.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/scripts/rouge.py index e0269b9c1..505ab7f65 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/scripts/rouge.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/scripts/rouge.py @@ -13,340 +13,336 @@ import itertools import numpy as np -#pylint: disable=C0103 +# pylint: disable=C0103 def _get_ngrams(n, text): - """Calcualtes n-grams. + """Calcualtes n-grams. - Args: - n: which n-grams to calculate - text: An array of tokens + Args: + n: which n-grams to calculate + text: An array of tokens - Returns: - A set of n-grams - """ - ngram_set = set() - text_length = len(text) - max_index_ngram_start = text_length - n - for i in range(max_index_ngram_start + 1): - ngram_set.add(tuple(text[i:i + n])) - return ngram_set + Returns: + A set of n-grams + """ + ngram_set = set() + text_length = len(text) + max_index_ngram_start = text_length - n + for i in range(max_index_ngram_start + 1): + ngram_set.add(tuple(text[i: i + n])) + return ngram_set def _split_into_words(sentences): - """Splits multiple sentences into words and flattens the result""" - return list(itertools.chain(*[_.split(" ") for _ in sentences])) + """Splits multiple sentences into words and flattens the result""" + return list(itertools.chain(*[_.split(" ") for _ in sentences])) def _get_word_ngrams(n, sentences): - """Calculates word n-grams for multiple sentences. 
- """ - assert len(sentences) > 0 - assert n > 0 + """Calculates word n-grams for multiple sentences.""" + assert len(sentences) > 0 + assert n > 0 - words = _split_into_words(sentences) - return _get_ngrams(n, words) + words = _split_into_words(sentences) + return _get_ngrams(n, words) def _len_lcs(x, y): - """ - Returns the length of the Longest Common Subsequence between sequences x - and y. - Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + """ + Returns the length of the Longest Common Subsequence between sequences x + and y. + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence - Args: - x: sequence of words - y: sequence of words + Args: + x: sequence of words + y: sequence of words - Returns - integer: Length of LCS between x and y - """ - table = _lcs(x, y) - n, m = len(x), len(y) - return table[n, m] + Returns + integer: Length of LCS between x and y + """ + table = _lcs(x, y) + n, m = len(x), len(y) + return table[n, m] def _lcs(x, y): - """ - Computes the length of the longest common subsequence (lcs) between two - strings. The implementation below uses a DP programming algorithm and runs - in O(nm) time where n = len(x) and m = len(y). - Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence - - Args: - x: collection of words - y: collection of words - - Returns: - Table of dictionary of coord and len lcs - """ - n, m = len(x), len(y) - table = dict() - for i in range(n + 1): - for j in range(m + 1): - if i == 0 or j == 0: - table[i, j] = 0 - elif x[i - 1] == y[j - 1]: - table[i, j] = table[i - 1, j - 1] + 1 - else: - table[i, j] = max(table[i - 1, j], table[i, j - 1]) - return table + """ + Computes the length of the longest common subsequence (lcs) between two + strings. The implementation below uses a DP programming algorithm and runs + in O(nm) time where n = len(x) and m = len(y). + Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + Args: + x: collection of words + y: collection of words + + Returns: + Table of dictionary of coord and len lcs + """ + n, m = len(x), len(y) + table = dict() + for i in range(n + 1): + for j in range(m + 1): + if i == 0 or j == 0: + table[i, j] = 0 + elif x[i - 1] == y[j - 1]: + table[i, j] = table[i - 1, j - 1] + 1 + else: + table[i, j] = max(table[i - 1, j], table[i, j - 1]) + return table def _recon_lcs(x, y): - """ - Returns the Longest Subsequence between x and y. - Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence - - Args: - x: sequence of words - y: sequence of words - - Returns: - sequence: LCS of x and y - """ - i, j = len(x), len(y) - table = _lcs(x, y) - - def _recon(i, j): - """private recon calculation""" - if i == 0 or j == 0: - return [] - elif x[i - 1] == y[j - 1]: - return _recon(i - 1, j - 1) + [(x[i - 1], i)] - elif table[i - 1, j] > table[i, j - 1]: - return _recon(i - 1, j) - else: - return _recon(i, j - 1) - - recon_tuple = tuple(map(lambda x: x[0], _recon(i, j))) - return recon_tuple + """ + Returns the Longest Subsequence between x and y. 
+ Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + + Args: + x: sequence of words + y: sequence of words + + Returns: + sequence: LCS of x and y + """ + i, j = len(x), len(y) + table = _lcs(x, y) + + def _recon(i, j): + """private recon calculation""" + if i == 0 or j == 0: + return [] + elif x[i - 1] == y[j - 1]: + return _recon(i - 1, j - 1) + [(x[i - 1], i)] + elif table[i - 1, j] > table[i, j - 1]: + return _recon(i - 1, j) + else: + return _recon(i, j - 1) + + recon_tuple = tuple(map(lambda x: x[0], _recon(i, j))) + return recon_tuple def rouge_n(evaluated_sentences, reference_sentences, n=2): - """ - Computes ROUGE-N of two text collections of sentences. - Sourece: http://research.microsoft.com/en-us/um/people/cyl/download/ - papers/rouge-working-note-v1.3.1.pdf - - Args: - evaluated_sentences: The sentences that have been picked by the summarizer - reference_sentences: The sentences from the referene set - n: Size of ngram. Defaults to 2. - - Returns: - A tuple (f1, precision, recall) for ROUGE-N - - Raises: - ValueError: raises exception if a param has len <= 0 - """ - if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: - raise ValueError("Collections must contain at least 1 sentence.") - - evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences) - reference_ngrams = _get_word_ngrams(n, reference_sentences) - reference_count = len(reference_ngrams) - evaluated_count = len(evaluated_ngrams) - - # Gets the overlapping ngrams between evaluated and reference - overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams) - overlapping_count = len(overlapping_ngrams) - - # Handle edge case. This isn't mathematically correct, but it's good enough - if evaluated_count == 0: - precision = 0.0 - else: - precision = overlapping_count / evaluated_count + """ + Computes ROUGE-N of two text collections of sentences. + Sourece: http://research.microsoft.com/en-us/um/people/cyl/download/ + papers/rouge-working-note-v1.3.1.pdf + + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentences: The sentences from the referene set + n: Size of ngram. Defaults to 2. + + Returns: + A tuple (f1, precision, recall) for ROUGE-N + + Raises: + ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + + evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences) + reference_ngrams = _get_word_ngrams(n, reference_sentences) + reference_count = len(reference_ngrams) + evaluated_count = len(evaluated_ngrams) + + # Gets the overlapping ngrams between evaluated and reference + overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams) + overlapping_count = len(overlapping_ngrams) + + # Handle edge case. 
This isn't mathematically correct, but it's good enough + if evaluated_count == 0: + precision = 0.0 + else: + precision = overlapping_count / evaluated_count - if reference_count == 0: - recall = 0.0 - else: - recall = overlapping_count / reference_count + if reference_count == 0: + recall = 0.0 + else: + recall = overlapping_count / reference_count - f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8)) + f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8)) - # return overlapping_count / reference_count - return f1_score, precision, recall + # return overlapping_count / reference_count + return f1_score, precision, recall def _f_p_r_lcs(llcs, m, n): - """ - Computes the LCS-based F-measure score - Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ - rouge-working-note-v1.3.1.pdf - - Args: - llcs: Length of LCS - m: number of words in reference summary - n: number of words in candidate summary - - Returns: - Float. LCS-based F-measure score - """ - r_lcs = llcs / m - p_lcs = llcs / n - beta = p_lcs / (r_lcs + 1e-12) - num = (1 + (beta**2)) * r_lcs * p_lcs - denom = r_lcs + ((beta**2) * p_lcs) - f_lcs = num / (denom + 1e-12) - return f_lcs, p_lcs, r_lcs + """ + Computes the LCS-based F-measure score + Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + Args: + llcs: Length of LCS + m: number of words in reference summary + n: number of words in candidate summary + + Returns: + Float. LCS-based F-measure score + """ + r_lcs = llcs / m + p_lcs = llcs / n + beta = p_lcs / (r_lcs + 1e-12) + num = (1 + (beta**2)) * r_lcs * p_lcs + denom = r_lcs + ((beta**2) * p_lcs) + f_lcs = num / (denom + 1e-12) + return f_lcs, p_lcs, r_lcs def rouge_l_sentence_level(evaluated_sentences, reference_sentences): - """ - Computes ROUGE-L (sentence level) of two text collections of sentences. - http://research.microsoft.com/en-us/um/people/cyl/download/papers/ - rouge-working-note-v1.3.1.pdf - - Calculated according to: - R_lcs = LCS(X,Y)/m - P_lcs = LCS(X,Y)/n - F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) - - where: - X = reference summary - Y = Candidate summary - m = length of reference summary - n = length of candidate summary - - Args: - evaluated_sentences: The sentences that have been picked by the summarizer - reference_sentences: The sentences from the referene set - - Returns: - A float: F_lcs - - Raises: - ValueError: raises exception if a param has len <= 0 - """ - if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: - raise ValueError("Collections must contain at least 1 sentence.") - reference_words = _split_into_words(reference_sentences) - evaluated_words = _split_into_words(evaluated_sentences) - m = len(reference_words) - n = len(evaluated_words) - lcs = _len_lcs(evaluated_words, reference_words) - return _f_p_r_lcs(lcs, m, n) + """ + Computes ROUGE-L (sentence level) of two text collections of sentences. 
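# A standalone, hedged check of the ROUGE-N arithmetic implemented by rouge_n()
# above (bigram case; uses sets of n-grams, exactly as _get_word_ngrams does):
def _bigram_set(tokens):
    return {tuple(tokens[i:i + 2]) for i in range(len(tokens) - 1)}

hyp = "the cat sat on the mat".split()
ref = "the cat lay on the mat".split()

overlap = len(_bigram_set(hyp) & _bigram_set(ref))  # 3 shared bigrams
precision = overlap / len(_bigram_set(hyp))         # 3 / 5
recall = overlap / len(_bigram_set(ref))            # 3 / 5
f1 = 2.0 * (precision * recall) / (precision + recall + 1e-8)
print(precision, recall, round(f1, 4))              # 0.6 0.6 0.6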
+ http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + Calculated according to: + R_lcs = LCS(X,Y)/m + P_lcs = LCS(X,Y)/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + + where: + X = reference summary + Y = Candidate summary + m = length of reference summary + n = length of candidate summary + + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentences: The sentences from the referene set + + Returns: + A float: F_lcs + + Raises: + ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + reference_words = _split_into_words(reference_sentences) + evaluated_words = _split_into_words(evaluated_sentences) + m = len(reference_words) + n = len(evaluated_words) + lcs = _len_lcs(evaluated_words, reference_words) + return _f_p_r_lcs(lcs, m, n) def _union_lcs(evaluated_sentences, reference_sentence): - """ - Returns LCS_u(r_i, C) which is the LCS score of the union longest common - subsequence between reference sentence ri and candidate summary C. For example - if r_i= w1 w2 w3 w4 w5, and C contains two sentences: c1 = w1 w2 w6 w7 w8 and - c2 = w1 w3 w8 w9 w5, then the longest common subsequence of r_i and c1 is - "w1 w2" and the longest common subsequence of r_i and c2 is "w1 w3 w5". The - union longest common subsequence of r_i, c1, and c2 is "w1 w2 w3 w5" and - LCS_u(r_i, C) = 4/5. - - Args: - evaluated_sentences: The sentences that have been picked by the summarizer - reference_sentence: One of the sentences in the reference summaries - - Returns: - float: LCS_u(r_i, C) - - ValueError: - Raises exception if a param has len <= 0 - """ - if len(evaluated_sentences) <= 0: - raise ValueError("Collections must contain at least 1 sentence.") - - lcs_union = set() - reference_words = _split_into_words([reference_sentence]) - combined_lcs_length = 0 - for eval_s in evaluated_sentences: - evaluated_words = _split_into_words([eval_s]) - lcs = set(_recon_lcs(reference_words, evaluated_words)) - combined_lcs_length += len(lcs) - lcs_union = lcs_union.union(lcs) - - union_lcs_count = len(lcs_union) - union_lcs_value = union_lcs_count / combined_lcs_length - return union_lcs_value + """ + Returns LCS_u(r_i, C) which is the LCS score of the union longest common + subsequence between reference sentence ri and candidate summary C. For example + if r_i= w1 w2 w3 w4 w5, and C contains two sentences: c1 = w1 w2 w6 w7 w8 and + c2 = w1 w3 w8 w9 w5, then the longest common subsequence of r_i and c1 is + "w1 w2" and the longest common subsequence of r_i and c2 is "w1 w3 w5". The + union longest common subsequence of r_i, c1, and c2 is "w1 w2 w3 w5" and + LCS_u(r_i, C) = 4/5. 
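# A standalone sketch (own helpers, not this module's _recon_lcs) that
# reproduces the LCS_u(r_i, C) = 4/5 example given above:
def lcs_token_set(x, y):
    # Classic O(len(x) * len(y)) DP table, then a backtrack to recover one LCS.
    n, m = len(x), len(y)
    table = {}
    for i in range(n + 1):
        for j in range(m + 1):
            if i == 0 or j == 0:
                table[i, j] = 0
            elif x[i - 1] == y[j - 1]:
                table[i, j] = table[i - 1, j - 1] + 1
            else:
                table[i, j] = max(table[i - 1, j], table[i, j - 1])
    out, i, j = [], n, m
    while i and j:
        if x[i - 1] == y[j - 1]:
            out.append(x[i - 1])
            i -= 1
            j -= 1
        elif table[i - 1, j] >= table[i, j - 1]:
            i -= 1
        else:
            j -= 1
    return set(out)

r_i = "w1 w2 w3 w4 w5".split()
c1 = "w1 w2 w6 w7 w8".split()
c2 = "w1 w3 w8 w9 w5".split()
union = lcs_token_set(r_i, c1) | lcs_token_set(r_i, c2)
print(sorted(union), len(union) / len(r_i))  # ['w1', 'w2', 'w3', 'w5'] 0.8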
+ + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentence: One of the sentences in the reference summaries + + Returns: + float: LCS_u(r_i, C) + + ValueError: + Raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + + lcs_union = set() + reference_words = _split_into_words([reference_sentence]) + combined_lcs_length = 0 + for eval_s in evaluated_sentences: + evaluated_words = _split_into_words([eval_s]) + lcs = set(_recon_lcs(reference_words, evaluated_words)) + combined_lcs_length += len(lcs) + lcs_union = lcs_union.union(lcs) + + union_lcs_count = len(lcs_union) + union_lcs_value = union_lcs_count / combined_lcs_length + return union_lcs_value def rouge_l_summary_level(evaluated_sentences, reference_sentences): - """ - Computes ROUGE-L (summary level) of two text collections of sentences. - http://research.microsoft.com/en-us/um/people/cyl/download/papers/ - rouge-working-note-v1.3.1.pdf + """ + Computes ROUGE-L (summary level) of two text collections of sentences. + http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf - Calculated according to: - R_lcs = SUM(1, u)[LCS(r_i,C)]/m - P_lcs = SUM(1, u)[LCS(r_i,C)]/n - F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + Calculated according to: + R_lcs = SUM(1, u)[LCS(r_i,C)]/m + P_lcs = SUM(1, u)[LCS(r_i,C)]/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) - where: - SUM(i,u) = SUM from i through u - u = number of sentences in reference summary - C = Candidate summary made up of v sentences - m = number of words in reference summary - n = number of words in candidate summary + where: + SUM(i,u) = SUM from i through u + u = number of sentences in reference summary + C = Candidate summary made up of v sentences + m = number of words in reference summary + n = number of words in candidate summary - Args: - evaluated_sentences: The sentences that have been picked by the summarizer - reference_sentence: One of the sentences in the reference summaries + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentence: One of the sentences in the reference summaries - Returns: - A float: F_lcs + Returns: + A float: F_lcs - Raises: - ValueError: raises exception if a param has len <= 0 - """ - if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: - raise ValueError("Collections must contain at least 1 sentence.") + Raises: + ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") - # total number of words in reference sentences - m = len(_split_into_words(reference_sentences)) + # total number of words in reference sentences + m = len(_split_into_words(reference_sentences)) - # total number of words in evaluated sentences - n = len(_split_into_words(evaluated_sentences)) + # total number of words in evaluated sentences + n = len(_split_into_words(evaluated_sentences)) - union_lcs_sum_across_all_references = 0 - for ref_s in reference_sentences: - union_lcs_sum_across_all_references += _union_lcs(evaluated_sentences, - ref_s) - return _f_p_r_lcs(union_lcs_sum_across_all_references, m, n) + union_lcs_sum_across_all_references = 0 + for ref_s in reference_sentences: + union_lcs_sum_across_all_references += _union_lcs( + evaluated_sentences, 
ref_s) + return _f_p_r_lcs(union_lcs_sum_across_all_references, m, n) def rouge(hypotheses, references): - """Calculates average rouge scores for a list of hypotheses and - references""" - - # Filter out hyps that are of 0 length - # hyps_and_refs = zip(hypotheses, references) - # hyps_and_refs = [_ for _ in hyps_and_refs if len(_[0]) > 0] - # hypotheses, references = zip(*hyps_and_refs) - - # Calculate ROUGE-1 F1, precision, recall scores - rouge_1 = [ - rouge_n([hyp], [ref], 1) for hyp, ref in zip(hypotheses, references) - ] - rouge_1_f, rouge_1_p, rouge_1_r = map(np.mean, zip(*rouge_1)) - - # Calculate ROUGE-2 F1, precision, recall scores - rouge_2 = [ - rouge_n([hyp], [ref], 2) for hyp, ref in zip(hypotheses, references) - ] - rouge_2_f, rouge_2_p, rouge_2_r = map(np.mean, zip(*rouge_2)) - - # Calculate ROUGE-L F1, precision, recall scores - rouge_l = [ - rouge_l_sentence_level([hyp], [ref]) - for hyp, ref in zip(hypotheses, references) - ] - rouge_l_f, rouge_l_p, rouge_l_r = map(np.mean, zip(*rouge_l)) - - return { - "rouge_1/f_score": rouge_1_f, - "rouge_1/r_score": rouge_1_r, - "rouge_1/p_score": rouge_1_p, - "rouge_2/f_score": rouge_2_f, - "rouge_2/r_score": rouge_2_r, - "rouge_2/p_score": rouge_2_p, - "rouge_l/f_score": rouge_l_f, - "rouge_l/r_score": rouge_l_r, - "rouge_l/p_score": rouge_l_p, - } + """Calculates average rouge scores for a list of hypotheses and + references""" + + # Filter out hyps that are of 0 length + # hyps_and_refs = zip(hypotheses, references) + # hyps_and_refs = [_ for _ in hyps_and_refs if len(_[0]) > 0] + # hypotheses, references = zip(*hyps_and_refs) + + # Calculate ROUGE-1 F1, precision, recall scores + rouge_1 = [rouge_n([hyp], [ref], 1) + for hyp, ref in zip(hypotheses, references)] + rouge_1_f, rouge_1_p, rouge_1_r = map(np.mean, zip(*rouge_1)) + + # Calculate ROUGE-2 F1, precision, recall scores + rouge_2 = [rouge_n([hyp], [ref], 2) + for hyp, ref in zip(hypotheses, references)] + rouge_2_f, rouge_2_p, rouge_2_r = map(np.mean, zip(*rouge_2)) + + # Calculate ROUGE-L F1, precision, recall scores + rouge_l = [ + rouge_l_sentence_level([hyp], [ref]) for hyp, ref in zip(hypotheses, references) + ] + rouge_l_f, rouge_l_p, rouge_l_r = map(np.mean, zip(*rouge_l)) + + return { + "rouge_1/f_score": rouge_1_f, + "rouge_1/r_score": rouge_1_r, + "rouge_1/p_score": rouge_1_p, + "rouge_2/f_score": rouge_2_f, + "rouge_2/r_score": rouge_2_r, + "rouge_2/p_score": rouge_2_p, + "rouge_l/f_score": rouge_l_f, + "rouge_l/r_score": rouge_l_r, + "rouge_l/p_score": rouge_l_p, + } diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/train.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/train.py index 1f061486b..a440b1a12 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/train.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/train.py @@ -33,718 +33,931 @@ utils.check_tensorflow_version() __all__ = [ - "run_sample_decode", "run_internal_eval", "run_external_eval", - "run_avg_external_eval", "run_full_eval", "init_stats", "update_stats", - "print_step_info", "process_stats", "train", "get_model_creator", - "add_info_summaries", "get_best_results" + "run_sample_decode", + "run_internal_eval", + "run_external_eval", + "run_avg_external_eval", + "run_full_eval", + "init_stats", + "update_stats", + "print_step_info", + "process_stats", + "train", + "get_model_creator", + "add_info_summaries", + "get_best_results", ] -def run_sample_decode(infer_model, infer_sess, model_dir, hparams, - summary_writer, src_data, tgt_data): - """Sample 
decode a random sentence from src_data.""" - with infer_model.graph.as_default(): - loaded_infer_model, global_step = model_helper.create_or_load_model( - infer_model.model, model_dir, infer_sess, "infer") - - _sample_decode(loaded_infer_model, global_step, infer_sess, hparams, - infer_model.iterator, src_data, tgt_data, - infer_model.src_placeholder, - infer_model.batch_size_placeholder, summary_writer) - - -def run_internal_eval(eval_model, - eval_sess, - model_dir, - hparams, - summary_writer, - use_test_set=True, - dev_eval_iterator_feed_dict=None, - test_eval_iterator_feed_dict=None): - """Compute internal evaluation (perplexity) for both dev / test. - - Computes development and testing perplexities for given model. - - Args: - eval_model: Evaluation model for which to compute perplexities. - eval_sess: Evaluation TensorFlow session. - model_dir: Directory from which to load evaluation model from. - hparams: Model hyper-parameters. - summary_writer: Summary writer for logging metrics to TensorBoard. - use_test_set: Computes testing perplexity if true; does not otherwise. - Note that the development perplexity is always computed regardless of - value of this parameter. - dev_eval_iterator_feed_dict: Feed dictionary for a TensorFlow session. - Can be used to pass in additional inputs necessary for running the - development evaluation. - test_eval_iterator_feed_dict: Feed dictionary for a TensorFlow session. - Can be used to pass in additional inputs necessary for running the - testing evaluation. - Returns: - Pair containing development perplexity and testing perplexity, in this - order. - """ - if dev_eval_iterator_feed_dict is None: - dev_eval_iterator_feed_dict = {} - if test_eval_iterator_feed_dict is None: - test_eval_iterator_feed_dict = {} - with eval_model.graph.as_default(): - loaded_eval_model, global_step = model_helper.create_or_load_model( - eval_model.model, model_dir, eval_sess, "eval") - - dev_src_file = "%s.%s" % (hparams.dev_prefix, hparams.src) - dev_tgt_file = "%s.%s" % (hparams.dev_prefix, hparams.tgt) - dev_eval_iterator_feed_dict[eval_model.src_file_placeholder] = dev_src_file - dev_eval_iterator_feed_dict[eval_model.tgt_file_placeholder] = dev_tgt_file - - dev_ppl = _internal_eval(loaded_eval_model, global_step, eval_sess, - eval_model.iterator, dev_eval_iterator_feed_dict, - summary_writer, "dev") - test_ppl = None - if use_test_set and hparams.test_prefix: - test_src_file = "%s.%s" % (hparams.test_prefix, hparams.src) - test_tgt_file = "%s.%s" % (hparams.test_prefix, hparams.tgt) - test_eval_iterator_feed_dict[ - eval_model.src_file_placeholder] = test_src_file - test_eval_iterator_feed_dict[ - eval_model.tgt_file_placeholder] = test_tgt_file - test_ppl = _internal_eval(loaded_eval_model, global_step, eval_sess, - eval_model.iterator, test_eval_iterator_feed_dict, - summary_writer, "test") - return dev_ppl, test_ppl - - -def run_external_eval(infer_model, - infer_sess, - model_dir, - hparams, - summary_writer, - save_best_dev=True, - use_test_set=True, - avg_ckpts=False, - dev_infer_iterator_feed_dict=None, - test_infer_iterator_feed_dict=None): - """Compute external evaluation for both dev / test. - - Computes development and testing external evaluation (e.g. bleu, rouge) for - given model. - - Args: - infer_model: Inference model for which to compute perplexities. - infer_sess: Inference TensorFlow session. - model_dir: Directory from which to load inference model from. - hparams: Model hyper-parameters. 
- summary_writer: Summary writer for logging metrics to TensorBoard. - use_test_set: Computes testing external evaluation if true; does not - otherwise. Note that the development external evaluation is always - computed regardless of value of this parameter. - dev_infer_iterator_feed_dict: Feed dictionary for a TensorFlow session. - Can be used to pass in additional inputs necessary for running the - development external evaluation. - test_infer_iterator_feed_dict: Feed dictionary for a TensorFlow session. - Can be used to pass in additional inputs necessary for running the - testing external evaluation. - Returns: - Triple containing development scores, testing scores and the TensorFlow - Variable for the global step number, in this order. - """ - if dev_infer_iterator_feed_dict is None: - dev_infer_iterator_feed_dict = {} - if test_infer_iterator_feed_dict is None: - test_infer_iterator_feed_dict = {} - with infer_model.graph.as_default(): - loaded_infer_model, global_step = model_helper.create_or_load_model( - infer_model.model, model_dir, infer_sess, "infer") - - dev_src_file = "%s.%s" % (hparams.dev_prefix, hparams.src) - dev_tgt_file = "%s.%s" % (hparams.dev_prefix, hparams.tgt) - dev_infer_iterator_feed_dict[ - infer_model.src_placeholder] = inference.load_data(dev_src_file) - dev_infer_iterator_feed_dict[ - infer_model.batch_size_placeholder] = hparams.infer_batch_size - dev_scores = _external_eval( - loaded_infer_model, - global_step, - infer_sess, - hparams, - infer_model.iterator, - dev_infer_iterator_feed_dict, - dev_tgt_file, - "dev", - summary_writer, - save_on_best=save_best_dev, - avg_ckpts=avg_ckpts) - - test_scores = None - if use_test_set and hparams.test_prefix: - test_src_file = "%s.%s" % (hparams.test_prefix, hparams.src) - test_tgt_file = "%s.%s" % (hparams.test_prefix, hparams.tgt) - test_infer_iterator_feed_dict[ - infer_model.src_placeholder] = inference.load_data(test_src_file) - test_infer_iterator_feed_dict[ - infer_model.batch_size_placeholder] = hparams.infer_batch_size - test_scores = _external_eval( +def run_sample_decode( + infer_model, infer_sess, model_dir, hparams, summary_writer, src_data, tgt_data +): + """Sample decode a random sentence from src_data.""" + with infer_model.graph.as_default(): + loaded_infer_model, global_step = model_helper.create_or_load_model( + infer_model.model, model_dir, infer_sess, "infer" + ) + + _sample_decode( + loaded_infer_model, + global_step, + infer_sess, + hparams, + infer_model.iterator, + src_data, + tgt_data, + infer_model.src_placeholder, + infer_model.batch_size_placeholder, + summary_writer, + ) + + +def run_internal_eval( + eval_model, + eval_sess, + model_dir, + hparams, + summary_writer, + use_test_set=True, + dev_eval_iterator_feed_dict=None, + test_eval_iterator_feed_dict=None, +): + """Compute internal evaluation (perplexity) for both dev / test. + + Computes development and testing perplexities for given model. + + Args: + eval_model: Evaluation model for which to compute perplexities. + eval_sess: Evaluation TensorFlow session. + model_dir: Directory from which to load evaluation model from. + hparams: Model hyper-parameters. + summary_writer: Summary writer for logging metrics to TensorBoard. + use_test_set: Computes testing perplexity if true; does not otherwise. + Note that the development perplexity is always computed regardless of + value of this parameter. + dev_eval_iterator_feed_dict: Feed dictionary for a TensorFlow session. 
+ Can be used to pass in additional inputs necessary for running the + development evaluation. + test_eval_iterator_feed_dict: Feed dictionary for a TensorFlow session. + Can be used to pass in additional inputs necessary for running the + testing evaluation. + Returns: + Pair containing development perplexity and testing perplexity, in this + order. + """ + if dev_eval_iterator_feed_dict is None: + dev_eval_iterator_feed_dict = {} + if test_eval_iterator_feed_dict is None: + test_eval_iterator_feed_dict = {} + with eval_model.graph.as_default(): + loaded_eval_model, global_step = model_helper.create_or_load_model( + eval_model.model, model_dir, eval_sess, "eval" + ) + + dev_src_file = "%s.%s" % (hparams.dev_prefix, hparams.src) + dev_tgt_file = "%s.%s" % (hparams.dev_prefix, hparams.tgt) + dev_eval_iterator_feed_dict[eval_model.src_file_placeholder] = dev_src_file + dev_eval_iterator_feed_dict[eval_model.tgt_file_placeholder] = dev_tgt_file + + dev_ppl = _internal_eval( + loaded_eval_model, + global_step, + eval_sess, + eval_model.iterator, + dev_eval_iterator_feed_dict, + summary_writer, + "dev", + ) + test_ppl = None + if use_test_set and hparams.test_prefix: + test_src_file = "%s.%s" % (hparams.test_prefix, hparams.src) + test_tgt_file = "%s.%s" % (hparams.test_prefix, hparams.tgt) + test_eval_iterator_feed_dict[eval_model.src_file_placeholder] = test_src_file + test_eval_iterator_feed_dict[eval_model.tgt_file_placeholder] = test_tgt_file + test_ppl = _internal_eval( + loaded_eval_model, + global_step, + eval_sess, + eval_model.iterator, + test_eval_iterator_feed_dict, + summary_writer, + "test", + ) + return dev_ppl, test_ppl + + +def run_external_eval( + infer_model, + infer_sess, + model_dir, + hparams, + summary_writer, + save_best_dev=True, + use_test_set=True, + avg_ckpts=False, + dev_infer_iterator_feed_dict=None, + test_infer_iterator_feed_dict=None, +): + """Compute external evaluation for both dev / test. + + Computes development and testing external evaluation (e.g. bleu, rouge) for + given model. + + Args: + infer_model: Inference model for which to compute perplexities. + infer_sess: Inference TensorFlow session. + model_dir: Directory from which to load inference model from. + hparams: Model hyper-parameters. + summary_writer: Summary writer for logging metrics to TensorBoard. + use_test_set: Computes testing external evaluation if true; does not + otherwise. Note that the development external evaluation is always + computed regardless of value of this parameter. + dev_infer_iterator_feed_dict: Feed dictionary for a TensorFlow session. + Can be used to pass in additional inputs necessary for running the + development external evaluation. + test_infer_iterator_feed_dict: Feed dictionary for a TensorFlow session. + Can be used to pass in additional inputs necessary for running the + testing external evaluation. + Returns: + Triple containing development scores, testing scores and the TensorFlow + Variable for the global step number, in this order. 
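# For concreteness, the "%s.%s" file naming used by these eval helpers resolves
# like this (the prefix and language codes below are hypothetical):
dev_prefix, src, tgt = "newstest2013", "en", "de"  # stand-ins for hparams.dev_prefix / .src / .tgt
dev_src_file = "%s.%s" % (dev_prefix, src)  # -> "newstest2013.en"
dev_tgt_file = "%s.%s" % (dev_prefix, tgt)  # -> "newstest2013.de"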
+ """ + if dev_infer_iterator_feed_dict is None: + dev_infer_iterator_feed_dict = {} + if test_infer_iterator_feed_dict is None: + test_infer_iterator_feed_dict = {} + with infer_model.graph.as_default(): + loaded_infer_model, global_step = model_helper.create_or_load_model( + infer_model.model, model_dir, infer_sess, "infer" + ) + + dev_src_file = "%s.%s" % (hparams.dev_prefix, hparams.src) + dev_tgt_file = "%s.%s" % (hparams.dev_prefix, hparams.tgt) + dev_infer_iterator_feed_dict[infer_model.src_placeholder] = inference.load_data( + dev_src_file + ) + dev_infer_iterator_feed_dict[infer_model.batch_size_placeholder] = ( + hparams.infer_batch_size + ) + dev_scores = _external_eval( loaded_infer_model, global_step, infer_sess, hparams, infer_model.iterator, - test_infer_iterator_feed_dict, - test_tgt_file, - "test", + dev_infer_iterator_feed_dict, + dev_tgt_file, + "dev", + summary_writer, + save_on_best=save_best_dev, + avg_ckpts=avg_ckpts, + ) + + test_scores = None + if use_test_set and hparams.test_prefix: + test_src_file = "%s.%s" % (hparams.test_prefix, hparams.src) + test_tgt_file = "%s.%s" % (hparams.test_prefix, hparams.tgt) + test_infer_iterator_feed_dict[infer_model.src_placeholder] = ( + inference.load_data(test_src_file) + ) + test_infer_iterator_feed_dict[infer_model.batch_size_placeholder] = ( + hparams.infer_batch_size + ) + test_scores = _external_eval( + loaded_infer_model, + global_step, + infer_sess, + hparams, + infer_model.iterator, + test_infer_iterator_feed_dict, + test_tgt_file, + "test", + summary_writer, + save_on_best=False, + avg_ckpts=avg_ckpts, + ) + return dev_scores, test_scores, global_step + + +def run_avg_external_eval( + infer_model, infer_sess, model_dir, hparams, summary_writer, global_step +): + """Creates an averaged checkpoint and run external eval with it.""" + avg_dev_scores, avg_test_scores = None, None + if hparams.avg_ckpts: + # Convert VariableName:0 to VariableName. + global_step_name = infer_model.model.global_step.name.split(":")[0] + avg_model_dir = model_helper.avg_checkpoints( + model_dir, hparams.num_keep_ckpts, global_step, global_step_name + ) + + if avg_model_dir: + avg_dev_scores, avg_test_scores, _ = run_external_eval( + infer_model, + infer_sess, + avg_model_dir, + hparams, + summary_writer, + avg_ckpts=True, + ) + + return avg_dev_scores, avg_test_scores + + +def run_internal_and_external_eval( + model_dir, + infer_model, + infer_sess, + eval_model, + eval_sess, + hparams, + summary_writer, + avg_ckpts=False, + dev_eval_iterator_feed_dict=None, + test_eval_iterator_feed_dict=None, + dev_infer_iterator_feed_dict=None, + test_infer_iterator_feed_dict=None, +): + """Compute internal evaluation (perplexity) for both dev / test. + + Computes development and testing perplexities for given model. + + Args: + model_dir: Directory from which to load models from. + infer_model: Inference model for which to compute perplexities. + infer_sess: Inference TensorFlow session. + eval_model: Evaluation model for which to compute perplexities. + eval_sess: Evaluation TensorFlow session. + hparams: Model hyper-parameters. + summary_writer: Summary writer for logging metrics to TensorBoard. + avg_ckpts: Whether to compute average external evaluation scores. + dev_eval_iterator_feed_dict: Feed dictionary for a TensorFlow session. + Can be used to pass in additional inputs necessary for running the + internal development evaluation. + test_eval_iterator_feed_dict: Feed dictionary for a TensorFlow session. 
+ Can be used to pass in additional inputs necessary for running the + internal testing evaluation. + dev_infer_iterator_feed_dict: Feed dictionary for a TensorFlow session. + Can be used to pass in additional inputs necessary for running the + external development evaluation. + test_infer_iterator_feed_dict: Feed dictionary for a TensorFlow session. + Can be used to pass in additional inputs necessary for running the + external testing evaluation. + Returns: + Triple containing results summary, global step Tensorflow Variable and + metrics in this order. + """ + dev_ppl, test_ppl = run_internal_eval( + eval_model, + eval_sess, + model_dir, + hparams, + summary_writer, + dev_eval_iterator_feed_dict=dev_eval_iterator_feed_dict, + test_eval_iterator_feed_dict=test_eval_iterator_feed_dict, + ) + dev_scores, test_scores, global_step = run_external_eval( + infer_model, + infer_sess, + model_dir, + hparams, + summary_writer, + dev_infer_iterator_feed_dict=dev_infer_iterator_feed_dict, + test_infer_iterator_feed_dict=test_infer_iterator_feed_dict, + ) + + metrics = { + "dev_ppl": dev_ppl, + "test_ppl": test_ppl, + "dev_scores": dev_scores, + "test_scores": test_scores, + } + + avg_dev_scores, avg_test_scores = None, None + if avg_ckpts: + avg_dev_scores, avg_test_scores = run_avg_external_eval( + infer_model, infer_sess, model_dir, hparams, summary_writer, global_step + ) + metrics["avg_dev_scores"] = avg_dev_scores + metrics["avg_test_scores"] = avg_test_scores + + result_summary = _format_results( + "dev", dev_ppl, dev_scores, hparams.metrics) + if avg_dev_scores: + result_summary += ", " + _format_results( + "avg_dev", None, avg_dev_scores, hparams.metrics + ) + if hparams.test_prefix: + result_summary += ", " + _format_results( + "test", test_ppl, test_scores, hparams.metrics + ) + if avg_test_scores: + result_summary += ", " + _format_results( + "avg_test", None, avg_test_scores, hparams.metrics + ) + + return result_summary, global_step, metrics + + +def run_full_eval( + model_dir, + infer_model, + infer_sess, + eval_model, + eval_sess, + hparams, + summary_writer, + sample_src_data, + sample_tgt_data, + avg_ckpts=False, +): + """Wrapper for running sample_decode, internal_eval and external_eval. + + Args: + model_dir: Directory from which to load models from. + infer_model: Inference model for which to compute perplexities. + infer_sess: Inference TensorFlow session. + eval_model: Evaluation model for which to compute perplexities. + eval_sess: Evaluation TensorFlow session. + hparams: Model hyper-parameters. + summary_writer: Summary writer for logging metrics to TensorBoard. + sample_src_data: sample of source data for sample decoding. + sample_tgt_data: sample of target data for sample decoding. + avg_ckpts: Whether to compute average external evaluation scores. + Returns: + Triple containing results summary, global step Tensorflow Variable and + metrics in this order. + """ + run_sample_decode( + infer_model, + infer_sess, + model_dir, + hparams, summary_writer, - save_on_best=False, - avg_ckpts=avg_ckpts) - return dev_scores, test_scores, global_step - - -def run_avg_external_eval(infer_model, infer_sess, model_dir, hparams, - summary_writer, global_step): - """Creates an averaged checkpoint and run external eval with it.""" - avg_dev_scores, avg_test_scores = None, None - if hparams.avg_ckpts: - # Convert VariableName:0 to VariableName. 
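# What the VariableName:0 -> VariableName conversion does (standalone sketch):
tensor_name = "global_step:0"  # hypothetical value of infer_model.model.global_step.name
bare_name = tensor_name.split(":")[0]  # -> "global_step", the form expected by avg_checkpoints
print(bare_name)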
- global_step_name = infer_model.model.global_step.name.split(":")[0] - avg_model_dir = model_helper.avg_checkpoints( - model_dir, hparams.num_keep_ckpts, global_step, global_step_name) - - if avg_model_dir: - avg_dev_scores, avg_test_scores, _ = run_external_eval( - infer_model, - infer_sess, - avg_model_dir, - hparams, - summary_writer, - avg_ckpts=True) - - return avg_dev_scores, avg_test_scores - - -def run_internal_and_external_eval(model_dir, - infer_model, - infer_sess, - eval_model, - eval_sess, - hparams, - summary_writer, - avg_ckpts=False, - dev_eval_iterator_feed_dict=None, - test_eval_iterator_feed_dict=None, - dev_infer_iterator_feed_dict=None, - test_infer_iterator_feed_dict=None): - """Compute internal evaluation (perplexity) for both dev / test. - - Computes development and testing perplexities for given model. - - Args: - model_dir: Directory from which to load models from. - infer_model: Inference model for which to compute perplexities. - infer_sess: Inference TensorFlow session. - eval_model: Evaluation model for which to compute perplexities. - eval_sess: Evaluation TensorFlow session. - hparams: Model hyper-parameters. - summary_writer: Summary writer for logging metrics to TensorBoard. - avg_ckpts: Whether to compute average external evaluation scores. - dev_eval_iterator_feed_dict: Feed dictionary for a TensorFlow session. - Can be used to pass in additional inputs necessary for running the - internal development evaluation. - test_eval_iterator_feed_dict: Feed dictionary for a TensorFlow session. - Can be used to pass in additional inputs necessary for running the - internal testing evaluation. - dev_infer_iterator_feed_dict: Feed dictionary for a TensorFlow session. - Can be used to pass in additional inputs necessary for running the - external development evaluation. - test_infer_iterator_feed_dict: Feed dictionary for a TensorFlow session. - Can be used to pass in additional inputs necessary for running the - external testing evaluation. - Returns: - Triple containing results summary, global step Tensorflow Variable and - metrics in this order. 
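# Hedged sketch of consuming the triple described above; the metrics keys come
# from this file, while the models, sessions and summary_writer are assumed to
# have been built as in train() further below.
result_summary, global_step, metrics = run_internal_and_external_eval(
    model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams, summary_writer)
# metrics == {"dev_ppl": ..., "test_ppl": ..., "dev_scores": ..., "test_scores": ...}
print(result_summary, global_step)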
- """ - dev_ppl, test_ppl = run_internal_eval( - eval_model, - eval_sess, - model_dir, - hparams, - summary_writer, - dev_eval_iterator_feed_dict=dev_eval_iterator_feed_dict, - test_eval_iterator_feed_dict=test_eval_iterator_feed_dict) - dev_scores, test_scores, global_step = run_external_eval( - infer_model, - infer_sess, - model_dir, - hparams, - summary_writer, - dev_infer_iterator_feed_dict=dev_infer_iterator_feed_dict, - test_infer_iterator_feed_dict=test_infer_iterator_feed_dict) - - metrics = { - "dev_ppl": dev_ppl, - "test_ppl": test_ppl, - "dev_scores": dev_scores, - "test_scores": test_scores, - } - - avg_dev_scores, avg_test_scores = None, None - if avg_ckpts: - avg_dev_scores, avg_test_scores = run_avg_external_eval( - infer_model, infer_sess, model_dir, hparams, summary_writer, - global_step) - metrics["avg_dev_scores"] = avg_dev_scores - metrics["avg_test_scores"] = avg_test_scores - - result_summary = _format_results("dev", dev_ppl, dev_scores, hparams.metrics) - if avg_dev_scores: - result_summary += ", " + _format_results("avg_dev", None, avg_dev_scores, - hparams.metrics) - if hparams.test_prefix: - result_summary += ", " + _format_results("test", test_ppl, test_scores, - hparams.metrics) - if avg_test_scores: - result_summary += ", " + _format_results("avg_test", None, - avg_test_scores, hparams.metrics) - - return result_summary, global_step, metrics - - -def run_full_eval(model_dir, - infer_model, - infer_sess, - eval_model, - eval_sess, - hparams, - summary_writer, - sample_src_data, - sample_tgt_data, - avg_ckpts=False): - """Wrapper for running sample_decode, internal_eval and external_eval. - - Args: - model_dir: Directory from which to load models from. - infer_model: Inference model for which to compute perplexities. - infer_sess: Inference TensorFlow session. - eval_model: Evaluation model for which to compute perplexities. - eval_sess: Evaluation TensorFlow session. - hparams: Model hyper-parameters. - summary_writer: Summary writer for logging metrics to TensorBoard. - sample_src_data: sample of source data for sample decoding. - sample_tgt_data: sample of target data for sample decoding. - avg_ckpts: Whether to compute average external evaluation scores. - Returns: - Triple containing results summary, global step Tensorflow Variable and - metrics in this order. 
- """ - run_sample_decode(infer_model, infer_sess, model_dir, hparams, summary_writer, - sample_src_data, sample_tgt_data) - return run_internal_and_external_eval(model_dir, infer_model, infer_sess, - eval_model, eval_sess, hparams, - summary_writer, avg_ckpts) + sample_src_data, + sample_tgt_data, + ) + return run_internal_and_external_eval( + model_dir, + infer_model, + infer_sess, + eval_model, + eval_sess, + hparams, + summary_writer, + avg_ckpts, + ) def init_stats(): - """Initialize statistics that we want to accumulate.""" - return {"step_time": 0.0, "train_loss": 0.0, - "predict_count": 0.0, # word count on the target side - "word_count": 0.0, # word counts for both source and target - "sequence_count": 0.0, # number of training examples processed - "grad_norm": 0.0} + """Initialize statistics that we want to accumulate.""" + return { + "step_time": 0.0, + "train_loss": 0.0, + "predict_count": 0.0, # word count on the target side + "word_count": 0.0, # word counts for both source and target + "sequence_count": 0.0, # number of training examples processed + "grad_norm": 0.0, + } def update_stats(stats, start_time, step_result): - """Update stats: write summary and accumulate statistics.""" - _, output_tuple = step_result + """Update stats: write summary and accumulate statistics.""" + _, output_tuple = step_result - # Update statistics - batch_size = output_tuple.batch_size - stats["step_time"] += time.time() - start_time - stats["train_loss"] += output_tuple.train_loss * batch_size - stats["grad_norm"] += output_tuple.grad_norm - stats["predict_count"] += output_tuple.predict_count - stats["word_count"] += output_tuple.word_count - stats["sequence_count"] += batch_size + # Update statistics + batch_size = output_tuple.batch_size + stats["step_time"] += time.time() - start_time + stats["train_loss"] += output_tuple.train_loss * batch_size + stats["grad_norm"] += output_tuple.grad_norm + stats["predict_count"] += output_tuple.predict_count + stats["word_count"] += output_tuple.word_count + stats["sequence_count"] += batch_size - return (output_tuple.global_step, output_tuple.learning_rate, - output_tuple.train_summary) + return ( + output_tuple.global_step, + output_tuple.learning_rate, + output_tuple.train_summary, + ) def print_step_info(prefix, global_step, info, result_summary, log_f): - """Print all info at the current global step.""" - utils.print_out( - "%sstep %d lr %g step-time %.2fs wps %.2fK ppl %.2f gN %.2f %s, %s" % - (prefix, global_step, info["learning_rate"], info["avg_step_time"], - info["speed"], info["train_ppl"], info["avg_grad_norm"], result_summary, - time.ctime()), - log_f) + """Print all info at the current global step.""" + utils.print_out( + "%sstep %d lr %g step-time %.2fs wps %.2fK ppl %.2f gN %.2f %s, %s" + % ( + prefix, + global_step, + info["learning_rate"], + info["avg_step_time"], + info["speed"], + info["train_ppl"], + info["avg_grad_norm"], + result_summary, + time.ctime(), + ), + log_f, + ) def add_info_summaries(summary_writer, global_step, info): - """Add stuffs in info to summaries.""" - excluded_list = ["learning_rate"] - for key in info: - if key not in excluded_list: - utils.add_summary(summary_writer, global_step, key, info[key]) + """Add stuffs in info to summaries.""" + excluded_list = ["learning_rate"] + for key in info: + if key not in excluded_list: + utils.add_summary(summary_writer, global_step, key, info[key]) def process_stats(stats, info, global_step, steps_per_stats, log_f): - """Update info and check for overflow.""" - # Per-step 
info - info["avg_step_time"] = stats["step_time"] / steps_per_stats - info["avg_grad_norm"] = stats["grad_norm"] / steps_per_stats - info["avg_sequence_count"] = stats["sequence_count"] / steps_per_stats - info["speed"] = stats["word_count"] / (1000 * stats["step_time"]) - - # Per-predict info - info["train_ppl"] = ( - utils.safe_exp(stats["train_loss"] / stats["predict_count"])) - - # Check for overflow - is_overflow = False - train_ppl = info["train_ppl"] - if math.isnan(train_ppl) or math.isinf(train_ppl) or train_ppl > 1e20: - utils.print_out(" step %d overflow, stop early" % global_step, - log_f) - is_overflow = True - - return is_overflow - - -def before_train(loaded_train_model, train_model, train_sess, global_step, - hparams, log_f): - """Misc tasks to do before training.""" - stats = init_stats() - info = {"train_ppl": 0.0, "speed": 0.0, - "avg_step_time": 0.0, - "avg_grad_norm": 0.0, - "avg_sequence_count": 0.0, - "learning_rate": loaded_train_model.learning_rate.eval( - session=train_sess)} - start_train_time = time.time() - utils.print_out("# Start step %d, lr %g, %s" % - (global_step, info["learning_rate"], time.ctime()), log_f) - - # Initialize all of the iterators - skip_count = hparams.batch_size * hparams.epoch_step - utils.print_out("# Init train iterator, skipping %d elements" % skip_count) - train_sess.run( - train_model.iterator.initializer, - feed_dict={train_model.skip_count_placeholder: skip_count}) - - return stats, info, start_train_time + """Update info and check for overflow.""" + # Per-step info + info["avg_step_time"] = stats["step_time"] / steps_per_stats + info["avg_grad_norm"] = stats["grad_norm"] / steps_per_stats + info["avg_sequence_count"] = stats["sequence_count"] / steps_per_stats + info["speed"] = stats["word_count"] / (1000 * stats["step_time"]) + + # Per-predict info + info["train_ppl"] = utils.safe_exp( + stats["train_loss"] / stats["predict_count"]) + + # Check for overflow + is_overflow = False + train_ppl = info["train_ppl"] + if math.isnan(train_ppl) or math.isinf(train_ppl) or train_ppl > 1e20: + utils.print_out(" step %d overflow, stop early" % global_step, log_f) + is_overflow = True + + return is_overflow + + +def before_train( + loaded_train_model, train_model, train_sess, global_step, hparams, log_f +): + """Misc tasks to do before training.""" + stats = init_stats() + info = { + "train_ppl": 0.0, + "speed": 0.0, + "avg_step_time": 0.0, + "avg_grad_norm": 0.0, + "avg_sequence_count": 0.0, + "learning_rate": loaded_train_model.learning_rate.eval(session=train_sess), + } + start_train_time = time.time() + utils.print_out( + "# Start step %d, lr %g, %s" + % (global_step, info["learning_rate"], time.ctime()), + log_f, + ) + + # Initialize all of the iterators + skip_count = hparams.batch_size * hparams.epoch_step + utils.print_out("# Init train iterator, skipping %d elements" % skip_count) + train_sess.run( + train_model.iterator.initializer, + feed_dict={train_model.skip_count_placeholder: skip_count}, + ) + + return stats, info, start_train_time def get_model_creator(hparams): - """Get the right model class depending on configuration.""" - if (hparams.encoder_type == "gnmt" or - hparams.attention_architecture in ["gnmt", "gnmt_v2"]): - model_creator = gnmt_model.GNMTModel - elif hparams.attention_architecture == "standard": - model_creator = attention_model.AttentionModel - elif not hparams.attention: - model_creator = nmt_model.Model - else: - raise ValueError("Unknown attention architecture %s" % - hparams.attention_architecture) - 
return model_creator + """Get the right model class depending on configuration.""" + if hparams.encoder_type == "gnmt" or hparams.attention_architecture in [ + "gnmt", + "gnmt_v2", + ]: + model_creator = gnmt_model.GNMTModel + elif hparams.attention_architecture == "standard": + model_creator = attention_model.AttentionModel + elif not hparams.attention: + model_creator = nmt_model.Model + else: + raise ValueError( + "Unknown attention architecture %s" % hparams.attention_architecture + ) + return model_creator def train(hparams, scope=None, target_session=""): - """Train a translation model.""" - log_device_placement = hparams.log_device_placement - out_dir = hparams.out_dir - num_train_steps = hparams.num_train_steps - steps_per_stats = hparams.steps_per_stats - steps_per_external_eval = hparams.steps_per_external_eval - steps_per_eval = 10 * steps_per_stats - avg_ckpts = hparams.avg_ckpts - - if not steps_per_external_eval: - steps_per_external_eval = 5 * steps_per_eval - - # Create model - model_creator = get_model_creator(hparams) - train_model = model_helper.create_train_model(model_creator, hparams, scope) - eval_model = model_helper.create_eval_model(model_creator, hparams, scope) - infer_model = model_helper.create_infer_model(model_creator, hparams, scope) - - # Preload data for sample decoding. - dev_src_file = "%s.%s" % (hparams.dev_prefix, hparams.src) - dev_tgt_file = "%s.%s" % (hparams.dev_prefix, hparams.tgt) - sample_src_data = inference.load_data(dev_src_file) - sample_tgt_data = inference.load_data(dev_tgt_file) - - summary_name = "train_log" - model_dir = hparams.out_dir - - # Log and output files - log_file = os.path.join(out_dir, "log_%d" % time.time()) - log_f = tf.gfile.GFile(log_file, mode="a") - utils.print_out("# log_file=%s" % log_file, log_f) - - # TensorFlow model - config_proto = utils.get_config_proto( - log_device_placement=log_device_placement, - num_intra_threads=hparams.num_intra_threads, - num_inter_threads=hparams.num_inter_threads) - train_sess = tf.Session( - target=target_session, config=config_proto, graph=train_model.graph) - eval_sess = tf.Session( - target=target_session, config=config_proto, graph=eval_model.graph) - infer_sess = tf.Session( - target=target_session, config=config_proto, graph=infer_model.graph) - - with train_model.graph.as_default(): - loaded_train_model, global_step = model_helper.create_or_load_model( - train_model.model, model_dir, train_sess, "train") - - # Summary writer - summary_writer = tf.summary.FileWriter( - os.path.join(out_dir, summary_name), train_model.graph) - - # First evaluation - run_full_eval( - model_dir, infer_model, infer_sess, - eval_model, eval_sess, hparams, - summary_writer, sample_src_data, - sample_tgt_data, avg_ckpts) - - last_stats_step = global_step - last_eval_step = global_step - last_external_eval_step = global_step - - # This is the training loop. - stats, info, start_train_time = before_train( - loaded_train_model, train_model, train_sess, global_step, hparams, log_f) - while global_step < num_train_steps: - ### Run a step ### - start_time = time.time() - try: - step_result = loaded_train_model.train(train_sess) - hparams.epoch_step += 1 - except tf.errors.OutOfRangeError: - # Finished going through the training dataset. Go to next epoch. - hparams.epoch_step = 0 - utils.print_out( - "# Finished an epoch, step %d. 
Perform external evaluation" % - global_step) - run_sample_decode(infer_model, infer_sess, model_dir, hparams, - summary_writer, sample_src_data, sample_tgt_data) - run_external_eval(infer_model, infer_sess, model_dir, hparams, - summary_writer) - - if avg_ckpts: - run_avg_external_eval(infer_model, infer_sess, model_dir, hparams, - summary_writer, global_step) - - train_sess.run( - train_model.iterator.initializer, - feed_dict={train_model.skip_count_placeholder: 0}) - continue - - # Process step_result, accumulate stats, and write summary - global_step, info["learning_rate"], step_summary = update_stats( - stats, start_time, step_result) - summary_writer.add_summary(step_summary, global_step) - - # Once in a while, we print statistics. - if global_step - last_stats_step >= steps_per_stats: - last_stats_step = global_step - is_overflow = process_stats( - stats, info, global_step, steps_per_stats, log_f) - print_step_info(" ", global_step, info, get_best_results(hparams), - log_f) - if is_overflow: - break - - # Reset statistics - stats = init_stats() - - if global_step - last_eval_step >= steps_per_eval: - last_eval_step = global_step - utils.print_out("# Save eval, global step %d" % global_step) - add_info_summaries(summary_writer, global_step, info) - - # Save checkpoint - loaded_train_model.saver.save( - train_sess, - os.path.join(out_dir, "translate.ckpt"), - global_step=global_step) - - # Evaluate on dev/test - run_sample_decode(infer_model, infer_sess, - model_dir, hparams, summary_writer, sample_src_data, - sample_tgt_data) - run_internal_eval( - eval_model, eval_sess, model_dir, hparams, summary_writer) - - if global_step - last_external_eval_step >= steps_per_external_eval: - last_external_eval_step = global_step - - # Save checkpoint - loaded_train_model.saver.save( - train_sess, - os.path.join(out_dir, "translate.ckpt"), - global_step=global_step) - run_sample_decode(infer_model, infer_sess, - model_dir, hparams, summary_writer, sample_src_data, - sample_tgt_data) - run_external_eval( - infer_model, infer_sess, model_dir, - hparams, summary_writer) - - if avg_ckpts: - run_avg_external_eval(infer_model, infer_sess, model_dir, hparams, - summary_writer, global_step) - - # Done training - loaded_train_model.saver.save( - train_sess, - os.path.join(out_dir, "translate.ckpt"), - global_step=global_step) - - (result_summary, _, final_eval_metrics) = ( - run_full_eval( - model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams, - summary_writer, sample_src_data, sample_tgt_data, avg_ckpts)) - print_step_info("# Final, ", global_step, info, result_summary, log_f) - utils.print_time("# Done training!", start_train_time) - - summary_writer.close() - - utils.print_out("# Start evaluating saved best models.") - for metric in hparams.metrics: - best_model_dir = getattr(hparams, "best_" + metric + "_dir") + """Train a translation model.""" + log_device_placement = hparams.log_device_placement + out_dir = hparams.out_dir + num_train_steps = hparams.num_train_steps + steps_per_stats = hparams.steps_per_stats + steps_per_external_eval = hparams.steps_per_external_eval + steps_per_eval = 10 * steps_per_stats + avg_ckpts = hparams.avg_ckpts + + if not steps_per_external_eval: + steps_per_external_eval = 5 * steps_per_eval + + # Create model + model_creator = get_model_creator(hparams) + train_model = model_helper.create_train_model( + model_creator, hparams, scope) + eval_model = model_helper.create_eval_model(model_creator, hparams, scope) + infer_model = 
model_helper.create_infer_model( + model_creator, hparams, scope) + + # Preload data for sample decoding. + dev_src_file = "%s.%s" % (hparams.dev_prefix, hparams.src) + dev_tgt_file = "%s.%s" % (hparams.dev_prefix, hparams.tgt) + sample_src_data = inference.load_data(dev_src_file) + sample_tgt_data = inference.load_data(dev_tgt_file) + + summary_name = "train_log" + model_dir = hparams.out_dir + + # Log and output files + log_file = os.path.join(out_dir, "log_%d" % time.time()) + log_f = tf.gfile.GFile(log_file, mode="a") + utils.print_out("# log_file=%s" % log_file, log_f) + + # TensorFlow model + config_proto = utils.get_config_proto( + log_device_placement=log_device_placement, + num_intra_threads=hparams.num_intra_threads, + num_inter_threads=hparams.num_inter_threads, + ) + train_sess = tf.Session( + target=target_session, config=config_proto, graph=train_model.graph + ) + eval_sess = tf.Session( + target=target_session, config=config_proto, graph=eval_model.graph + ) + infer_sess = tf.Session( + target=target_session, config=config_proto, graph=infer_model.graph + ) + + with train_model.graph.as_default(): + loaded_train_model, global_step = model_helper.create_or_load_model( + train_model.model, model_dir, train_sess, "train" + ) + + # Summary writer summary_writer = tf.summary.FileWriter( - os.path.join(best_model_dir, summary_name), infer_model.graph) - result_summary, best_global_step, _ = run_full_eval( - best_model_dir, infer_model, infer_sess, eval_model, eval_sess, hparams, - summary_writer, sample_src_data, sample_tgt_data) - print_step_info("# Best %s, " % metric, best_global_step, info, - result_summary, log_f) - summary_writer.close() + os.path.join(out_dir, summary_name), train_model.graph + ) - if avg_ckpts: - best_model_dir = getattr(hparams, "avg_best_" + metric + "_dir") - summary_writer = tf.summary.FileWriter( - os.path.join(best_model_dir, summary_name), infer_model.graph) - result_summary, best_global_step, _ = run_full_eval( - best_model_dir, infer_model, infer_sess, eval_model, eval_sess, - hparams, summary_writer, sample_src_data, sample_tgt_data) - print_step_info("# Averaged Best %s, " % metric, best_global_step, info, - result_summary, log_f) - summary_writer.close() + # First evaluation + run_full_eval( + model_dir, + infer_model, + infer_sess, + eval_model, + eval_sess, + hparams, + summary_writer, + sample_src_data, + sample_tgt_data, + avg_ckpts, + ) + + last_stats_step = global_step + last_eval_step = global_step + last_external_eval_step = global_step + + # This is the training loop. + stats, info, start_train_time = before_train( + loaded_train_model, train_model, train_sess, global_step, hparams, log_f + ) + while global_step < num_train_steps: + ### Run a step ### + start_time = time.time() + try: + step_result = loaded_train_model.train(train_sess) + hparams.epoch_step += 1 + except tf.errors.OutOfRangeError: + # Finished going through the training dataset. Go to next epoch. + hparams.epoch_step = 0 + utils.print_out( + "# Finished an epoch, step %d. 
Perform external evaluation" + % global_step + ) + run_sample_decode( + infer_model, + infer_sess, + model_dir, + hparams, + summary_writer, + sample_src_data, + sample_tgt_data, + ) + run_external_eval( + infer_model, infer_sess, model_dir, hparams, summary_writer + ) + + if avg_ckpts: + run_avg_external_eval( + infer_model, + infer_sess, + model_dir, + hparams, + summary_writer, + global_step, + ) + + train_sess.run( + train_model.iterator.initializer, + feed_dict={train_model.skip_count_placeholder: 0}, + ) + continue + + # Process step_result, accumulate stats, and write summary + global_step, info["learning_rate"], step_summary = update_stats( + stats, start_time, step_result + ) + summary_writer.add_summary(step_summary, global_step) + + # Once in a while, we print statistics. + if global_step - last_stats_step >= steps_per_stats: + last_stats_step = global_step + is_overflow = process_stats( + stats, info, global_step, steps_per_stats, log_f + ) + print_step_info( + " ", + global_step, + info, + get_best_results(hparams), + log_f) + if is_overflow: + break + + # Reset statistics + stats = init_stats() + + if global_step - last_eval_step >= steps_per_eval: + last_eval_step = global_step + utils.print_out("# Save eval, global step %d" % global_step) + add_info_summaries(summary_writer, global_step, info) + + # Save checkpoint + loaded_train_model.saver.save( + train_sess, + os.path.join(out_dir, "translate.ckpt"), + global_step=global_step, + ) + + # Evaluate on dev/test + run_sample_decode( + infer_model, + infer_sess, + model_dir, + hparams, + summary_writer, + sample_src_data, + sample_tgt_data, + ) + run_internal_eval( + eval_model, + eval_sess, + model_dir, + hparams, + summary_writer) + + if global_step - last_external_eval_step >= steps_per_external_eval: + last_external_eval_step = global_step + + # Save checkpoint + loaded_train_model.saver.save( + train_sess, + os.path.join(out_dir, "translate.ckpt"), + global_step=global_step, + ) + run_sample_decode( + infer_model, + infer_sess, + model_dir, + hparams, + summary_writer, + sample_src_data, + sample_tgt_data, + ) + run_external_eval( + infer_model, infer_sess, model_dir, hparams, summary_writer + ) + + if avg_ckpts: + run_avg_external_eval( + infer_model, + infer_sess, + model_dir, + hparams, + summary_writer, + global_step, + ) + + # Done training + loaded_train_model.saver.save( + train_sess, os.path.join(out_dir, "translate.ckpt"), global_step=global_step + ) + + (result_summary, _, final_eval_metrics) = run_full_eval( + model_dir, + infer_model, + infer_sess, + eval_model, + eval_sess, + hparams, + summary_writer, + sample_src_data, + sample_tgt_data, + avg_ckpts, + ) + print_step_info("# Final, ", global_step, info, result_summary, log_f) + utils.print_time("# Done training!", start_train_time) + + summary_writer.close() - return final_eval_metrics, global_step + utils.print_out("# Start evaluating saved best models.") + for metric in hparams.metrics: + best_model_dir = getattr(hparams, "best_" + metric + "_dir") + summary_writer = tf.summary.FileWriter( + os.path.join(best_model_dir, summary_name), infer_model.graph + ) + result_summary, best_global_step, _ = run_full_eval( + best_model_dir, + infer_model, + infer_sess, + eval_model, + eval_sess, + hparams, + summary_writer, + sample_src_data, + sample_tgt_data, + ) + print_step_info( + "# Best %s, " % metric, best_global_step, info, result_summary, log_f + ) + summary_writer.close() + + if avg_ckpts: + best_model_dir = getattr(hparams, "avg_best_" + metric + "_dir") + 
summary_writer = tf.summary.FileWriter( + os.path.join(best_model_dir, summary_name), infer_model.graph + ) + result_summary, best_global_step, _ = run_full_eval( + best_model_dir, + infer_model, + infer_sess, + eval_model, + eval_sess, + hparams, + summary_writer, + sample_src_data, + sample_tgt_data, + ) + print_step_info( + "# Averaged Best %s, " % metric, + best_global_step, + info, + result_summary, + log_f, + ) + summary_writer.close() + + return final_eval_metrics, global_step def _format_results(name, ppl, scores, metrics): - """Format results.""" - result_str = "" - if ppl: - result_str = "%s ppl %.2f" % (name, ppl) - if scores: - for metric in metrics: - if result_str: - result_str += ", %s %s %.1f" % (name, metric, scores[metric]) - else: - result_str = "%s %s %.1f" % (name, metric, scores[metric]) - return result_str + """Format results.""" + result_str = "" + if ppl: + result_str = "%s ppl %.2f" % (name, ppl) + if scores: + for metric in metrics: + if result_str: + result_str += ", %s %s %.1f" % (name, metric, scores[metric]) + else: + result_str = "%s %s %.1f" % (name, metric, scores[metric]) + return result_str def get_best_results(hparams): - """Summary of the current best results.""" - tokens = [] - for metric in hparams.metrics: - tokens.append("%s %.2f" % (metric, getattr(hparams, "best_" + metric))) - return ", ".join(tokens) - - -def _internal_eval(model, global_step, sess, iterator, iterator_feed_dict, - summary_writer, label): - """Computing perplexity.""" - sess.run(iterator.initializer, feed_dict=iterator_feed_dict) - ppl = model_helper.compute_perplexity(model, sess, label) - utils.add_summary(summary_writer, global_step, "%s_ppl" % label, ppl) - return ppl - - -def _sample_decode(model, global_step, sess, hparams, iterator, src_data, - tgt_data, iterator_src_placeholder, - iterator_batch_size_placeholder, summary_writer): - """Pick a sentence and decode.""" - decode_id = random.randint(0, len(src_data) - 1) - utils.print_out(" # %d" % decode_id) - - iterator_feed_dict = { - iterator_src_placeholder: [src_data[decode_id]], - iterator_batch_size_placeholder: 1, - } - sess.run(iterator.initializer, feed_dict=iterator_feed_dict) - - nmt_outputs, attention_summary = model.decode(sess) - - if hparams.infer_mode == "beam_search": - # get the top translation. 
- nmt_outputs = nmt_outputs[0] - - translation = nmt_utils.get_translation( - nmt_outputs, - sent_id=0, - tgt_eos=hparams.eos, - subword_option=hparams.subword_option) - utils.print_out(" src: %s" % src_data[decode_id]) - utils.print_out(" ref: %s" % tgt_data[decode_id]) - utils.print_out(b" nmt: " + translation) - - # Summary - if attention_summary is not None: - summary_writer.add_summary(attention_summary, global_step) - - -def _external_eval(model, global_step, sess, hparams, iterator, - iterator_feed_dict, tgt_file, label, summary_writer, - save_on_best, avg_ckpts=False): - """External evaluation such as BLEU and ROUGE scores.""" - out_dir = hparams.out_dir - decode = global_step > 0 - - if avg_ckpts: - label = "avg_" + label - - if decode: - utils.print_out("# External evaluation, global step %d" % global_step) - - sess.run(iterator.initializer, feed_dict=iterator_feed_dict) - - output = os.path.join(out_dir, "output_%s" % label) - scores = nmt_utils.decode_and_evaluate( - label, - model, - sess, - output, - ref_file=tgt_file, - metrics=hparams.metrics, - subword_option=hparams.subword_option, - beam_width=hparams.beam_width, - tgt_eos=hparams.eos, - decode=decode, - infer_mode=hparams.infer_mode) - # Save on best metrics - if decode: + """Summary of the current best results.""" + tokens = [] for metric in hparams.metrics: - if avg_ckpts: - best_metric_label = "avg_best_" + metric - else: - best_metric_label = "best_" + metric - - utils.add_summary(summary_writer, global_step, "%s_%s" % (label, metric), - scores[metric]) - # metric: larger is better - if save_on_best and scores[metric] > getattr(hparams, best_metric_label): - setattr(hparams, best_metric_label, scores[metric]) - model.saver.save( - sess, - os.path.join( - getattr(hparams, best_metric_label + "_dir"), "translate.ckpt"), - global_step=model.global_step) - utils.save_hparams(out_dir, hparams) - return scores + tokens.append("%s %.2f" % (metric, getattr(hparams, "best_" + metric))) + return ", ".join(tokens) + + +def _internal_eval( + model, global_step, sess, iterator, iterator_feed_dict, summary_writer, label +): + """Computing perplexity.""" + sess.run(iterator.initializer, feed_dict=iterator_feed_dict) + ppl = model_helper.compute_perplexity(model, sess, label) + utils.add_summary(summary_writer, global_step, "%s_ppl" % label, ppl) + return ppl + + +def _sample_decode( + model, + global_step, + sess, + hparams, + iterator, + src_data, + tgt_data, + iterator_src_placeholder, + iterator_batch_size_placeholder, + summary_writer, +): + """Pick a sentence and decode.""" + decode_id = random.randint(0, len(src_data) - 1) + utils.print_out(" # %d" % decode_id) + + iterator_feed_dict = { + iterator_src_placeholder: [src_data[decode_id]], + iterator_batch_size_placeholder: 1, + } + sess.run(iterator.initializer, feed_dict=iterator_feed_dict) + + nmt_outputs, attention_summary = model.decode(sess) + + if hparams.infer_mode == "beam_search": + # get the top translation. 
+ nmt_outputs = nmt_outputs[0] + + translation = nmt_utils.get_translation( + nmt_outputs, + sent_id=0, + tgt_eos=hparams.eos, + subword_option=hparams.subword_option, + ) + utils.print_out(" src: %s" % src_data[decode_id]) + utils.print_out(" ref: %s" % tgt_data[decode_id]) + utils.print_out(b" nmt: " + translation) + + # Summary + if attention_summary is not None: + summary_writer.add_summary(attention_summary, global_step) + + +def _external_eval( + model, + global_step, + sess, + hparams, + iterator, + iterator_feed_dict, + tgt_file, + label, + summary_writer, + save_on_best, + avg_ckpts=False, +): + """External evaluation such as BLEU and ROUGE scores.""" + out_dir = hparams.out_dir + decode = global_step > 0 + + if avg_ckpts: + label = "avg_" + label + + if decode: + utils.print_out("# External evaluation, global step %d" % global_step) + + sess.run(iterator.initializer, feed_dict=iterator_feed_dict) + + output = os.path.join(out_dir, "output_%s" % label) + scores = nmt_utils.decode_and_evaluate( + label, + model, + sess, + output, + ref_file=tgt_file, + metrics=hparams.metrics, + subword_option=hparams.subword_option, + beam_width=hparams.beam_width, + tgt_eos=hparams.eos, + decode=decode, + infer_mode=hparams.infer_mode, + ) + # Save on best metrics + if decode: + for metric in hparams.metrics: + if avg_ckpts: + best_metric_label = "avg_best_" + metric + else: + best_metric_label = "best_" + metric + + utils.add_summary( + summary_writer, global_step, "%s_%s" % ( + label, metric), scores[metric] + ) + # metric: larger is better + if save_on_best and scores[metric] > getattr( + hparams, best_metric_label): + setattr(hparams, best_metric_label, scores[metric]) + model.saver.save( + sess, + os.path.join( + getattr( + hparams, best_metric_label + "_dir"), "translate.ckpt" + ), + global_step=model.global_step, + ) + utils.save_hparams(out_dir, hparams) + return scores diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/common_test_utils.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/common_test_utils.py index 68ff209f9..57918a1fc 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/common_test_utils.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/common_test_utils.py @@ -27,110 +27,120 @@ from ..utils import standard_hparams_utils -def create_test_hparams(unit_type="lstm", - encoder_type="uni", - num_layers=4, - attention="", - attention_architecture=None, - use_residual=False, - inference_indices=None, - num_translations_per_input=1, - beam_width=0, - init_op="uniform"): - """Create training and inference test hparams.""" - num_residual_layers = 0 - if use_residual: - # TODO(rzhao): Put num_residual_layers computation logic into - # `model_utils.py`, so we can also test it here. 
- num_residual_layers = 2 - - standard_hparams = standard_hparams_utils.create_standard_hparams() - - # Networks - standard_hparams.num_units = 5 - standard_hparams.num_encoder_layers = num_layers - standard_hparams.num_decoder_layers = num_layers - standard_hparams.dropout = 0.5 - standard_hparams.unit_type = unit_type - standard_hparams.encoder_type = encoder_type - standard_hparams.residual = use_residual - standard_hparams.num_residual_layers = num_residual_layers - - # Attention mechanisms - standard_hparams.attention = attention - standard_hparams.attention_architecture = attention_architecture - - # Train - standard_hparams.init_op = init_op - standard_hparams.num_train_steps = 1 - standard_hparams.decay_scheme = "" - - # Infer - standard_hparams.tgt_max_len_infer = 100 - standard_hparams.beam_width = beam_width - standard_hparams.num_translations_per_input = num_translations_per_input - - # Misc - standard_hparams.forget_bias = 0.0 - standard_hparams.random_seed = 3 - standard_hparams.language_model = False - - # Vocab - standard_hparams.src_vocab_size = 5 - standard_hparams.tgt_vocab_size = 5 - standard_hparams.eos = "
" - standard_hparams.sos = "" - standard_hparams.src_vocab_file = "" - standard_hparams.tgt_vocab_file = "" - standard_hparams.src_embed_file = "" - standard_hparams.tgt_embed_file = "" - - # For inference.py test - standard_hparams.subword_option = "bpe" - standard_hparams.src = "src" - standard_hparams.tgt = "tgt" - standard_hparams.src_max_len = 400 - standard_hparams.tgt_eos_id = 0 - standard_hparams.inference_indices = inference_indices - return standard_hparams +def create_test_hparams( + unit_type="lstm", + encoder_type="uni", + num_layers=4, + attention="", + attention_architecture=None, + use_residual=False, + inference_indices=None, + num_translations_per_input=1, + beam_width=0, + init_op="uniform", +): + """Create training and inference test hparams.""" + num_residual_layers = 0 + if use_residual: + # TODO(rzhao): Put num_residual_layers computation logic into + # `model_utils.py`, so we can also test it here. + num_residual_layers = 2 + + standard_hparams = standard_hparams_utils.create_standard_hparams() + + # Networks + standard_hparams.num_units = 5 + standard_hparams.num_encoder_layers = num_layers + standard_hparams.num_decoder_layers = num_layers + standard_hparams.dropout = 0.5 + standard_hparams.unit_type = unit_type + standard_hparams.encoder_type = encoder_type + standard_hparams.residual = use_residual + standard_hparams.num_residual_layers = num_residual_layers + + # Attention mechanisms + standard_hparams.attention = attention + standard_hparams.attention_architecture = attention_architecture + + # Train + standard_hparams.init_op = init_op + standard_hparams.num_train_steps = 1 + standard_hparams.decay_scheme = "" + + # Infer + standard_hparams.tgt_max_len_infer = 100 + standard_hparams.beam_width = beam_width + standard_hparams.num_translations_per_input = num_translations_per_input + + # Misc + standard_hparams.forget_bias = 0.0 + standard_hparams.random_seed = 3 + standard_hparams.language_model = False + + # Vocab + standard_hparams.src_vocab_size = 5 + standard_hparams.tgt_vocab_size = 5 + standard_hparams.eos = "" + standard_hparams.sos = "" + standard_hparams.src_vocab_file = "" + standard_hparams.tgt_vocab_file = "" + standard_hparams.src_embed_file = "" + standard_hparams.tgt_embed_file = "" + + # For inference.py test + standard_hparams.subword_option = "bpe" + standard_hparams.src = "src" + standard_hparams.tgt = "tgt" + standard_hparams.src_max_len = 400 + standard_hparams.tgt_eos_id = 0 + standard_hparams.inference_indices = inference_indices + return standard_hparams def create_test_iterator(hparams, mode): - """Create test iterator.""" - src_vocab_table = lookup_ops.index_table_from_tensor( - tf.constant([hparams.eos, "a", "b", "c", "d"])) - tgt_vocab_mapping = tf.constant([hparams.sos, hparams.eos, "a", "b", "c"]) - tgt_vocab_table = lookup_ops.index_table_from_tensor(tgt_vocab_mapping) - if mode == tf.contrib.learn.ModeKeys.INFER: - reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_tensor( - tgt_vocab_mapping) - - src_dataset = tf.data.Dataset.from_tensor_slices( - tf.constant(["a a b b c", "a b b"])) - - if mode != tf.contrib.learn.ModeKeys.INFER: - tgt_dataset = tf.data.Dataset.from_tensor_slices( - tf.constant(["a b c b c", "a b c b"])) - return ( - iterator_utils.get_iterator( - src_dataset=src_dataset, - tgt_dataset=tgt_dataset, - src_vocab_table=src_vocab_table, - tgt_vocab_table=tgt_vocab_table, - batch_size=hparams.batch_size, - sos=hparams.sos, - eos=hparams.eos, - random_seed=hparams.random_seed, - 
num_buckets=hparams.num_buckets), - src_vocab_table, - tgt_vocab_table) - else: - return ( - iterator_utils.get_infer_iterator( - src_dataset=src_dataset, - src_vocab_table=src_vocab_table, - eos=hparams.eos, - batch_size=hparams.batch_size), - src_vocab_table, - tgt_vocab_table, - reverse_tgt_vocab_table) + """Create test iterator.""" + src_vocab_table = lookup_ops.index_table_from_tensor( + tf.constant([hparams.eos, "a", "b", "c", "d"]) + ) + tgt_vocab_mapping = tf.constant([hparams.sos, hparams.eos, "a", "b", "c"]) + tgt_vocab_table = lookup_ops.index_table_from_tensor(tgt_vocab_mapping) + if mode == tf.contrib.learn.ModeKeys.INFER: + reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_tensor( + tgt_vocab_mapping + ) + + src_dataset = tf.data.Dataset.from_tensor_slices( + tf.constant(["a a b b c", "a b b"]) + ) + + if mode != tf.contrib.learn.ModeKeys.INFER: + tgt_dataset = tf.data.Dataset.from_tensor_slices( + tf.constant(["a b c b c", "a b c b"]) + ) + return ( + iterator_utils.get_iterator( + src_dataset=src_dataset, + tgt_dataset=tgt_dataset, + src_vocab_table=src_vocab_table, + tgt_vocab_table=tgt_vocab_table, + batch_size=hparams.batch_size, + sos=hparams.sos, + eos=hparams.eos, + random_seed=hparams.random_seed, + num_buckets=hparams.num_buckets, + ), + src_vocab_table, + tgt_vocab_table, + ) + else: + return ( + iterator_utils.get_infer_iterator( + src_dataset=src_dataset, + src_vocab_table=src_vocab_table, + eos=hparams.eos, + batch_size=hparams.batch_size, + ), + src_vocab_table, + tgt_vocab_table, + reverse_tgt_vocab_table, + ) diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/evaluation_utils.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/evaluation_utils.py index e567e119f..5d563bec7 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/evaluation_utils.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/evaluation_utils.py @@ -29,156 +29,158 @@ def evaluate(ref_file, trans_file, metric, subword_option=None): - """Pick a metric and evaluate depending on task.""" - # BLEU scores for translation task - if metric.lower() == "bleu": - evaluation_score = _bleu(ref_file, trans_file, - subword_option=subword_option) - # ROUGE scores for summarization tasks - elif metric.lower() == "rouge": - evaluation_score = _rouge(ref_file, trans_file, - subword_option=subword_option) - elif metric.lower() == "accuracy": - evaluation_score = _accuracy(ref_file, trans_file) - elif metric.lower() == "word_accuracy": - evaluation_score = _word_accuracy(ref_file, trans_file) - else: - raise ValueError("Unknown metric %s" % metric) - - return evaluation_score + """Pick a metric and evaluate depending on task.""" + # BLEU scores for translation task + if metric.lower() == "bleu": + evaluation_score = _bleu( + ref_file, + trans_file, + subword_option=subword_option) + # ROUGE scores for summarization tasks + elif metric.lower() == "rouge": + evaluation_score = _rouge( + ref_file, trans_file, subword_option=subword_option) + elif metric.lower() == "accuracy": + evaluation_score = _accuracy(ref_file, trans_file) + elif metric.lower() == "word_accuracy": + evaluation_score = _word_accuracy(ref_file, trans_file) + else: + raise ValueError("Unknown metric %s" % metric) + + return evaluation_score def _clean(sentence, subword_option): - """Clean and handle BPE or SPM outputs.""" - sentence = sentence.strip() + """Clean and handle BPE or SPM outputs.""" + sentence = sentence.strip() - # BPE - if subword_option == "bpe": - sentence = 
re.sub("@@ ", "", sentence) + # BPE + if subword_option == "bpe": + sentence = re.sub("@@ ", "", sentence) - # SPM - elif subword_option == "spm": - sentence = u"".join(sentence.split()).replace(u"\u2581", u" ").lstrip() + # SPM + elif subword_option == "spm": + sentence = "".join(sentence.split()).replace("\u2581", " ").lstrip() - return sentence + return sentence # Follow //transconsole/localization/machine_translation/metrics/bleu_calc.py def _bleu(ref_file, trans_file, subword_option=None): - """Compute BLEU scores and handling BPE.""" - max_order = 4 - smooth = False - - ref_files = [ref_file] - reference_text = [] - for reference_filename in ref_files: - with codecs.getreader("utf-8")( - tf.gfile.GFile(reference_filename, "rb")) as fh: - reference_text.append(fh.readlines()) - - per_segment_references = [] - for references in zip(*reference_text): - reference_list = [] - for reference in references: - reference = _clean(reference, subword_option) - reference_list.append(reference.split(" ")) - per_segment_references.append(reference_list) - - translations = [] - with codecs.getreader("utf-8")(tf.gfile.GFile(trans_file, "rb")) as fh: - for line in fh: - line = _clean(line, subword_option=None) - translations.append(line.split(" ")) - - # bleu_score, precisions, bp, ratio, translation_length, reference_length - bleu_score, _, _, _, _, _ = bleu.compute_bleu( - per_segment_references, translations, max_order, smooth) - return 100 * bleu_score + """Compute BLEU scores and handling BPE.""" + max_order = 4 + smooth = False + + ref_files = [ref_file] + reference_text = [] + for reference_filename in ref_files: + with codecs.getreader("utf-8")(tf.gfile.GFile(reference_filename, "rb")) as fh: + reference_text.append(fh.readlines()) + + per_segment_references = [] + for references in zip(*reference_text): + reference_list = [] + for reference in references: + reference = _clean(reference, subword_option) + reference_list.append(reference.split(" ")) + per_segment_references.append(reference_list) + + translations = [] + with codecs.getreader("utf-8")(tf.gfile.GFile(trans_file, "rb")) as fh: + for line in fh: + line = _clean(line, subword_option=None) + translations.append(line.split(" ")) + + # bleu_score, precisions, bp, ratio, translation_length, reference_length + bleu_score, _, _, _, _, _ = bleu.compute_bleu( + per_segment_references, translations, max_order, smooth + ) + return 100 * bleu_score def _rouge(ref_file, summarization_file, subword_option=None): - """Compute ROUGE scores and handling BPE.""" + """Compute ROUGE scores and handling BPE.""" - references = [] - with codecs.getreader("utf-8")(tf.gfile.GFile(ref_file, "rb")) as fh: - for line in fh: - references.append(_clean(line, subword_option)) + references = [] + with codecs.getreader("utf-8")(tf.gfile.GFile(ref_file, "rb")) as fh: + for line in fh: + references.append(_clean(line, subword_option)) - hypotheses = [] - with codecs.getreader("utf-8")( - tf.gfile.GFile(summarization_file, "rb")) as fh: - for line in fh: - hypotheses.append(_clean(line, subword_option=None)) + hypotheses = [] + with codecs.getreader("utf-8")(tf.gfile.GFile(summarization_file, "rb")) as fh: + for line in fh: + hypotheses.append(_clean(line, subword_option=None)) - rouge_score_map = rouge.rouge(hypotheses, references) - return 100 * rouge_score_map["rouge_l/f_score"] + rouge_score_map = rouge.rouge(hypotheses, references) + return 100 * rouge_score_map["rouge_l/f_score"] def _accuracy(label_file, pred_file): - """Compute accuracy, each line contains a 
label.""" + """Compute accuracy, each line contains a label.""" - with codecs.getreader("utf-8")(tf.gfile.GFile(label_file, "rb")) as label_fh: - with codecs.getreader("utf-8")(tf.gfile.GFile(pred_file, "rb")) as pred_fh: - count = 0.0 - match = 0.0 - for label in label_fh: - label = label.strip() - pred = pred_fh.readline().strip() - if label == pred: - match += 1 - count += 1 - return 100 * match / count + with codecs.getreader("utf-8")(tf.gfile.GFile(label_file, "rb")) as label_fh: + with codecs.getreader("utf-8")(tf.gfile.GFile(pred_file, "rb")) as pred_fh: + count = 0.0 + match = 0.0 + for label in label_fh: + label = label.strip() + pred = pred_fh.readline().strip() + if label == pred: + match += 1 + count += 1 + return 100 * match / count def _word_accuracy(label_file, pred_file): - """Compute accuracy on per word basis.""" - - with codecs.getreader("utf-8")(tf.gfile.GFile(label_file, "r")) as label_fh: - with codecs.getreader("utf-8")(tf.gfile.GFile(pred_file, "r")) as pred_fh: - total_acc, total_count = 0., 0. - for sentence in label_fh: - labels = sentence.strip().split(" ") - preds = pred_fh.readline().strip().split(" ") - match = 0.0 - for pos in range(min(len(labels), len(preds))): - label = labels[pos] - pred = preds[pos] - if label == pred: - match += 1 - total_acc += 100 * match / max(len(labels), len(preds)) - total_count += 1 - return total_acc / total_count + """Compute accuracy on per word basis.""" + + with codecs.getreader("utf-8")(tf.gfile.GFile(label_file, "r")) as label_fh: + with codecs.getreader("utf-8")(tf.gfile.GFile(pred_file, "r")) as pred_fh: + total_acc, total_count = 0.0, 0.0 + for sentence in label_fh: + labels = sentence.strip().split(" ") + preds = pred_fh.readline().strip().split(" ") + match = 0.0 + for pos in range(min(len(labels), len(preds))): + label = labels[pos] + pred = preds[pos] + if label == pred: + match += 1 + total_acc += 100 * match / max(len(labels), len(preds)) + total_count += 1 + return total_acc / total_count def _moses_bleu(multi_bleu_script, tgt_test, trans_file, subword_option=None): - """Compute BLEU scores using Moses multi-bleu.perl script.""" - - # TODO(thangluong): perform rewrite using python - # BPE - if subword_option == "bpe": - debpe_tgt_test = tgt_test + ".debpe" - if not os.path.exists(debpe_tgt_test): - # TODO(thangluong): not use shell=True, can be a security hazard - subprocess.call("cp %s %s" % (tgt_test, debpe_tgt_test), shell=True) - subprocess.call("sed s/@@ //g %s" % (debpe_tgt_test), - shell=True) - tgt_test = debpe_tgt_test - elif subword_option == "spm": - despm_tgt_test = tgt_test + ".despm" - if not os.path.exists(despm_tgt_test): - subprocess.call("cp %s %s" % (tgt_test, despm_tgt_test)) - subprocess.call("sed s/ //g %s" % (despm_tgt_test)) - subprocess.call(u"sed s/^\u2581/g %s" % (despm_tgt_test)) - subprocess.call(u"sed s/\u2581/ /g %s" % (despm_tgt_test)) - tgt_test = despm_tgt_test - cmd = "%s %s < %s" % (multi_bleu_script, tgt_test, trans_file) - - # subprocess - # TODO(thangluong): not use shell=True, can be a security hazard - bleu_output = subprocess.check_output(cmd, shell=True) - - # extract BLEU score - m = re.search("BLEU = (.+?),", bleu_output) - bleu_score = float(m.group(1)) - - return bleu_score + """Compute BLEU scores using Moses multi-bleu.perl script.""" + + # TODO(thangluong): perform rewrite using python + # BPE + if subword_option == "bpe": + debpe_tgt_test = tgt_test + ".debpe" + if not os.path.exists(debpe_tgt_test): + # TODO(thangluong): not use shell=True, can be a security 
hazard + subprocess.call( + "cp %s %s" % + (tgt_test, debpe_tgt_test), shell=True) + subprocess.call("sed s/@@ //g %s" % (debpe_tgt_test), shell=True) + tgt_test = debpe_tgt_test + elif subword_option == "spm": + despm_tgt_test = tgt_test + ".despm" + if not os.path.exists(despm_tgt_test): + subprocess.call("cp %s %s" % (tgt_test, despm_tgt_test)) + subprocess.call("sed s/ //g %s" % (despm_tgt_test)) + subprocess.call("sed s/^\u2581/g %s" % (despm_tgt_test)) + subprocess.call("sed s/\u2581/ /g %s" % (despm_tgt_test)) + tgt_test = despm_tgt_test + cmd = "%s %s < %s" % (multi_bleu_script, tgt_test, trans_file) + + # subprocess + # TODO(thangluong): not use shell=True, can be a security hazard + bleu_output = subprocess.check_output(cmd, shell=True) + + # extract BLEU score + m = re.search("BLEU = (.+?),", bleu_output) + bleu_score = float(m.group(1)) + + return bleu_score diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/evaluation_utils_test.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/evaluation_utils_test.py index f4a5e10ec..9512d673a 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/evaluation_utils_test.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/evaluation_utils_test.py @@ -26,50 +26,53 @@ class EvaluationUtilsTest(tf.test.TestCase): - def testEvaluate(self): - output = "nmt/testdata/deen_output" - ref_bpe = "nmt/testdata/deen_ref_bpe" - ref_spm = "nmt/testdata/deen_ref_spm" + def testEvaluate(self): + output = "nmt/testdata/deen_output" + ref_bpe = "nmt/testdata/deen_ref_bpe" + ref_spm = "nmt/testdata/deen_ref_spm" - expected_bleu_score = 22.5855084573 - expected_rouge_score = 50.8429782599 + expected_bleu_score = 22.5855084573 + expected_rouge_score = 50.8429782599 - bpe_bleu_score = evaluation_utils.evaluate( - ref_bpe, output, "bleu", "bpe") - bpe_rouge_score = evaluation_utils.evaluate( - ref_bpe, output, "rouge", "bpe") + bpe_bleu_score = evaluation_utils.evaluate( + ref_bpe, output, "bleu", "bpe") + bpe_rouge_score = evaluation_utils.evaluate( + ref_bpe, output, "rouge", "bpe") - self.assertAlmostEqual(expected_bleu_score, bpe_bleu_score) - self.assertAlmostEqual(expected_rouge_score, bpe_rouge_score) + self.assertAlmostEqual(expected_bleu_score, bpe_bleu_score) + self.assertAlmostEqual(expected_rouge_score, bpe_rouge_score) - spm_bleu_score = evaluation_utils.evaluate( - ref_spm, output, "bleu", "spm") - spm_rouge_score = evaluation_utils.evaluate( - ref_spm, output, "rouge", "spm") + spm_bleu_score = evaluation_utils.evaluate( + ref_spm, output, "bleu", "spm") + spm_rouge_score = evaluation_utils.evaluate( + ref_spm, output, "rouge", "spm") - self.assertAlmostEqual(expected_rouge_score, spm_rouge_score) - self.assertAlmostEqual(expected_bleu_score, spm_bleu_score) + self.assertAlmostEqual(expected_rouge_score, spm_rouge_score) + self.assertAlmostEqual(expected_bleu_score, spm_bleu_score) - def testAccuracy(self): - pred_output = "nmt/testdata/pred_output" - label_ref = "nmt/testdata/label_ref" + def testAccuracy(self): + pred_output = "nmt/testdata/pred_output" + label_ref = "nmt/testdata/label_ref" - expected_accuracy_score = 60.00 + expected_accuracy_score = 60.00 - accuracy_score = evaluation_utils.evaluate( - label_ref, pred_output, "accuracy") - self.assertAlmostEqual(expected_accuracy_score, accuracy_score) + accuracy_score = evaluation_utils.evaluate( + label_ref, pred_output, "accuracy") + self.assertAlmostEqual(expected_accuracy_score, accuracy_score) - def testWordAccuracy(self): - 
pred_output = "nmt/testdata/pred_output" - label_ref = "nmt/testdata/label_ref" + def testWordAccuracy(self): + pred_output = "nmt/testdata/pred_output" + label_ref = "nmt/testdata/label_ref" - expected_word_accuracy_score = 60.00 + expected_word_accuracy_score = 60.00 - word_accuracy_score = evaluation_utils.evaluate( - label_ref, pred_output, "word_accuracy") - self.assertAlmostEqual(expected_word_accuracy_score, word_accuracy_score) + word_accuracy_score = evaluation_utils.evaluate( + label_ref, pred_output, "word_accuracy" + ) + self.assertAlmostEqual( + expected_word_accuracy_score, + word_accuracy_score) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/iterator_utils.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/iterator_utils.py index 31efb11ff..c6f57a3c0 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/iterator_utils.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/iterator_utils.py @@ -27,222 +27,282 @@ # NOTE(ebrevdo): When we subclass this, instances' __dict__ becomes empty. class BatchedInput( - collections.namedtuple("BatchedInput", - ("initializer", "source", "target_input", - "target_output", "source_sequence_length", - "target_sequence_length"))): - pass - - -def get_infer_iterator(src_dataset, - src_vocab_table, - batch_size, - eos, - src_max_len=None, - use_char_encode=False): - if use_char_encode: - src_eos_id = vocab_utils.EOS_CHAR_ID - else: - src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)), tf.int32) - src_dataset = src_dataset.map(lambda src: tf.string_split([src]).values) - - if src_max_len: - src_dataset = src_dataset.map(lambda src: src[:src_max_len]) - - if use_char_encode: - # Convert the word strings to character ids - src_dataset = src_dataset.map( - lambda src: tf.reshape(vocab_utils.tokens_to_bytes(src), [-1])) - else: - # Convert the word strings to ids - src_dataset = src_dataset.map( - lambda src: tf.cast(src_vocab_table.lookup(src), tf.int32)) - - # Add in the word counts. - if use_char_encode: - src_dataset = src_dataset.map( - lambda src: (src, - tf.to_int32( - tf.size(src) / vocab_utils.DEFAULT_CHAR_MAXLEN))) - else: - src_dataset = src_dataset.map(lambda src: (src, tf.size(src))) - - def batching_func(x): - return x.padded_batch( - batch_size, - # The entry is the source line rows; - # this has unknown-length vectors. The last entry is - # the source row size; this is a scalar. - padded_shapes=( - tf.TensorShape([None]), # src - tf.TensorShape([])), # src_len - # Pad the source sequences with eos tokens. - # (Though notice we don't generally need to do this since - # later on we will be masking out calculations past the true sequence. 
- padding_values=( - src_eos_id, # src - 0)) # src_len -- unused - - batched_dataset = batching_func(src_dataset) - batched_iter = batched_dataset.make_initializable_iterator() - (src_ids, src_seq_len) = batched_iter.get_next() - return BatchedInput( - initializer=batched_iter.initializer, - source=src_ids, - target_input=None, - target_output=None, - source_sequence_length=src_seq_len, - target_sequence_length=None) - - -def get_iterator(src_dataset, - tgt_dataset, - src_vocab_table, - tgt_vocab_table, - batch_size, - sos, - eos, - random_seed, - num_buckets, - src_max_len=None, - tgt_max_len=None, - num_parallel_calls=4, - output_buffer_size=None, - skip_count=None, - num_shards=1, - shard_index=0, - reshuffle_each_iteration=True, - use_char_encode=False): - if not output_buffer_size: - output_buffer_size = batch_size * 1000 - - if use_char_encode: - src_eos_id = vocab_utils.EOS_CHAR_ID - else: - src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)), tf.int32) - - tgt_sos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(sos)), tf.int32) - tgt_eos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(eos)), tf.int32) - - src_tgt_dataset = tf.data.Dataset.zip((src_dataset, tgt_dataset)) - - src_tgt_dataset = src_tgt_dataset.shard(num_shards, shard_index) - if skip_count is not None: - src_tgt_dataset = src_tgt_dataset.skip(skip_count) - - src_tgt_dataset = src_tgt_dataset.shuffle( - output_buffer_size, random_seed, reshuffle_each_iteration) - - src_tgt_dataset = src_tgt_dataset.map( - lambda src, tgt: ( - tf.string_split([src]).values, tf.string_split([tgt]).values), - num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) - - # Filter zero length input sequences. - src_tgt_dataset = src_tgt_dataset.filter( - lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0)) - - if src_max_len: - src_tgt_dataset = src_tgt_dataset.map( - lambda src, tgt: (src[:src_max_len], tgt), - num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) - if tgt_max_len: - src_tgt_dataset = src_tgt_dataset.map( - lambda src, tgt: (src, tgt[:tgt_max_len]), - num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) + collections.namedtuple( + "BatchedInput", + ( + "initializer", + "source", + "target_input", + "target_output", + "source_sequence_length", + "target_sequence_length", + ), + ) +): + pass + + +def get_infer_iterator( + src_dataset, + src_vocab_table, + batch_size, + eos, + src_max_len=None, + use_char_encode=False, +): + if use_char_encode: + src_eos_id = vocab_utils.EOS_CHAR_ID + else: + src_eos_id = tf.cast( + src_vocab_table.lookup( + tf.constant(eos)), tf.int32) + src_dataset = src_dataset.map(lambda src: tf.string_split([src]).values) + + if src_max_len: + src_dataset = src_dataset.map(lambda src: src[:src_max_len]) + + if use_char_encode: + # Convert the word strings to character ids + src_dataset = src_dataset.map( + lambda src: tf.reshape(vocab_utils.tokens_to_bytes(src), [-1]) + ) + else: + # Convert the word strings to ids + src_dataset = src_dataset.map( + lambda src: tf.cast(src_vocab_table.lookup(src), tf.int32) + ) + + # Add in the word counts. + if use_char_encode: + src_dataset = src_dataset.map( + lambda src: ( + src, + tf.to_int32(tf.size(src) / vocab_utils.DEFAULT_CHAR_MAXLEN), + ) + ) + else: + src_dataset = src_dataset.map(lambda src: (src, tf.size(src))) + + def batching_func(x): + return x.padded_batch( + batch_size, + # The entry is the source line rows; + # this has unknown-length vectors. 
The last entry is + # the source row size; this is a scalar. + padded_shapes=( + tf.TensorShape([None]), # src + tf.TensorShape([]), + ), # src_len + # Pad the source sequences with eos tokens. + # (Though notice we don't generally need to do this since + # later on we will be masking out calculations past the true + # sequence. + padding_values=(src_eos_id, 0), # src + ) # src_len -- unused + + batched_dataset = batching_func(src_dataset) + batched_iter = batched_dataset.make_initializable_iterator() + (src_ids, src_seq_len) = batched_iter.get_next() + return BatchedInput( + initializer=batched_iter.initializer, + source=src_ids, + target_input=None, + target_output=None, + source_sequence_length=src_seq_len, + target_sequence_length=None, + ) + + +def get_iterator( + src_dataset, + tgt_dataset, + src_vocab_table, + tgt_vocab_table, + batch_size, + sos, + eos, + random_seed, + num_buckets, + src_max_len=None, + tgt_max_len=None, + num_parallel_calls=4, + output_buffer_size=None, + skip_count=None, + num_shards=1, + shard_index=0, + reshuffle_each_iteration=True, + use_char_encode=False, +): + if not output_buffer_size: + output_buffer_size = batch_size * 1000 + + if use_char_encode: + src_eos_id = vocab_utils.EOS_CHAR_ID + else: + src_eos_id = tf.cast( + src_vocab_table.lookup( + tf.constant(eos)), tf.int32) + + tgt_sos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(sos)), tf.int32) + tgt_eos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(eos)), tf.int32) + + src_tgt_dataset = tf.data.Dataset.zip((src_dataset, tgt_dataset)) + + src_tgt_dataset = src_tgt_dataset.shard(num_shards, shard_index) + if skip_count is not None: + src_tgt_dataset = src_tgt_dataset.skip(skip_count) + + src_tgt_dataset = src_tgt_dataset.shuffle( + output_buffer_size, random_seed, reshuffle_each_iteration + ) - # Convert the word strings to ids. Word strings that are not in the - # vocab get the lookup table's default_value integer. - if use_char_encode: - src_tgt_dataset = src_tgt_dataset.map( - lambda src, tgt: (tf.reshape(vocab_utils.tokens_to_bytes(src), [-1]), - tf.cast(tgt_vocab_table.lookup(tgt), tf.int32)), - num_parallel_calls=num_parallel_calls) - else: - src_tgt_dataset = src_tgt_dataset.map( - lambda src, tgt: (tf.cast(src_vocab_table.lookup(src), tf.int32), - tf.cast(tgt_vocab_table.lookup(tgt), tf.int32)), - num_parallel_calls=num_parallel_calls) - - src_tgt_dataset = src_tgt_dataset.prefetch(output_buffer_size) - # Create a tgt_input prefixed with and a tgt_output suffixed with . - src_tgt_dataset = src_tgt_dataset.map( - lambda src, tgt: (src, - tf.concat(([tgt_sos_id], tgt), 0), - tf.concat((tgt, [tgt_eos_id]), 0)), - num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size) - # Add in sequence lengths. - if use_char_encode: src_tgt_dataset = src_tgt_dataset.map( - lambda src, tgt_in, tgt_out: ( - src, tgt_in, tgt_out, - tf.to_int32(tf.size(src) / vocab_utils.DEFAULT_CHAR_MAXLEN), - tf.size(tgt_in)), - num_parallel_calls=num_parallel_calls) - else: + lambda src, tgt: ( + tf.string_split( + [src]).values, tf.string_split( + [tgt]).values), + num_parallel_calls=num_parallel_calls, + ).prefetch(output_buffer_size) + + # Filter zero length input sequences. 
+ src_tgt_dataset = src_tgt_dataset.filter( + lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0) + ) + + if src_max_len: + src_tgt_dataset = src_tgt_dataset.map( + lambda src, tgt: (src[:src_max_len], tgt), + num_parallel_calls=num_parallel_calls, + ).prefetch(output_buffer_size) + if tgt_max_len: + src_tgt_dataset = src_tgt_dataset.map( + lambda src, tgt: (src, tgt[:tgt_max_len]), + num_parallel_calls=num_parallel_calls, + ).prefetch(output_buffer_size) + + # Convert the word strings to ids. Word strings that are not in the + # vocab get the lookup table's default_value integer. + if use_char_encode: + src_tgt_dataset = src_tgt_dataset.map( + lambda src, tgt: ( + tf.reshape(vocab_utils.tokens_to_bytes(src), [-1]), + tf.cast(tgt_vocab_table.lookup(tgt), tf.int32), + ), + num_parallel_calls=num_parallel_calls, + ) + else: + src_tgt_dataset = src_tgt_dataset.map( + lambda src, tgt: ( + tf.cast(src_vocab_table.lookup(src), tf.int32), + tf.cast(tgt_vocab_table.lookup(tgt), tf.int32), + ), + num_parallel_calls=num_parallel_calls, + ) + + src_tgt_dataset = src_tgt_dataset.prefetch(output_buffer_size) + # Create a tgt_input prefixed with and a tgt_output suffixed with + # . src_tgt_dataset = src_tgt_dataset.map( - lambda src, tgt_in, tgt_out: ( - src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in)), - num_parallel_calls=num_parallel_calls) - - src_tgt_dataset = src_tgt_dataset.prefetch(output_buffer_size) - - # Bucket by source sequence length (buckets for lengths 0-9, 10-19, ...) - def batching_func(x): - return x.padded_batch( - batch_size, - # The first three entries are the source and target line rows; - # these have unknown-length vectors. The last two entries are - # the source and target row sizes; these are scalars. - padded_shapes=( - tf.TensorShape([None]), # src - tf.TensorShape([None]), # tgt_input - tf.TensorShape([None]), # tgt_output - tf.TensorShape([]), # src_len - tf.TensorShape([])), # tgt_len - # Pad the source and target sequences with eos tokens. - # (Though notice we don't generally need to do this since - # later on we will be masking out calculations past the true sequence. - padding_values=( - src_eos_id, # src - tgt_eos_id, # tgt_input - tgt_eos_id, # tgt_output - 0, # src_len -- unused - 0)) # tgt_len -- unused - - if num_buckets > 1: - - def key_func(unused_1, unused_2, unused_3, src_len, tgt_len): - # Calculate bucket_width by maximum source sequence length. - # Pairs with length [0, bucket_width) go to bucket 0, length - # [bucket_width, 2 * bucket_width) go to bucket 1, etc. Pairs with length - # over ((num_bucket-1) * bucket_width) words all go into the last bucket. - if src_max_len: - bucket_width = (src_max_len + num_buckets - 1) // num_buckets - else: - bucket_width = 10 - - # Bucket sentence pairs by the length of their source sentence and target - # sentence. 
- bucket_id = tf.maximum(src_len // bucket_width, tgt_len // bucket_width) - return tf.to_int64(tf.minimum(num_buckets, bucket_id)) - - def reduce_func(unused_key, windowed_data): - return batching_func(windowed_data) - - batched_dataset = src_tgt_dataset.apply( - tf.contrib.data.group_by_window( - key_func=key_func, reduce_func=reduce_func, window_size=batch_size)) - - else: - batched_dataset = batching_func(src_tgt_dataset) - batched_iter = batched_dataset.make_initializable_iterator() - (src_ids, tgt_input_ids, tgt_output_ids, src_seq_len, - tgt_seq_len) = (batched_iter.get_next()) - return BatchedInput( - initializer=batched_iter.initializer, - source=src_ids, - target_input=tgt_input_ids, - target_output=tgt_output_ids, - source_sequence_length=src_seq_len, - target_sequence_length=tgt_seq_len) + lambda src, tgt: ( + src, + tf.concat(([tgt_sos_id], tgt), 0), + tf.concat((tgt, [tgt_eos_id]), 0), + ), + num_parallel_calls=num_parallel_calls, + ).prefetch(output_buffer_size) + # Add in sequence lengths. + if use_char_encode: + src_tgt_dataset = src_tgt_dataset.map( + lambda src, tgt_in, tgt_out: ( + src, + tgt_in, + tgt_out, + tf.to_int32(tf.size(src) / vocab_utils.DEFAULT_CHAR_MAXLEN), + tf.size(tgt_in), + ), + num_parallel_calls=num_parallel_calls, + ) + else: + src_tgt_dataset = src_tgt_dataset.map( + lambda src, tgt_in, tgt_out: ( + src, + tgt_in, + tgt_out, + tf.size(src), + tf.size(tgt_in), + ), + num_parallel_calls=num_parallel_calls, + ) + + src_tgt_dataset = src_tgt_dataset.prefetch(output_buffer_size) + + # Bucket by source sequence length (buckets for lengths 0-9, 10-19, ...) + def batching_func(x): + return x.padded_batch( + batch_size, + # The first three entries are the source and target line rows; + # these have unknown-length vectors. The last two entries are + # the source and target row sizes; these are scalars. + padded_shapes=( + tf.TensorShape([None]), # src + tf.TensorShape([None]), # tgt_input + tf.TensorShape([None]), # tgt_output + tf.TensorShape([]), # src_len + tf.TensorShape([]), + ), # tgt_len + # Pad the source and target sequences with eos tokens. + # (Though notice we don't generally need to do this since + # later on we will be masking out calculations past the true + # sequence. + padding_values=( + src_eos_id, # src + tgt_eos_id, # tgt_input + tgt_eos_id, # tgt_output + 0, # src_len -- unused + 0, + ), + ) # tgt_len -- unused + + if num_buckets > 1: + + def key_func(unused_1, unused_2, unused_3, src_len, tgt_len): + # Calculate bucket_width by maximum source sequence length. + # Pairs with length [0, bucket_width) go to bucket 0, length + # [bucket_width, 2 * bucket_width) go to bucket 1, etc. Pairs with length + # over ((num_bucket-1) * bucket_width) words all go into the last + # bucket. + if src_max_len: + bucket_width = (src_max_len + num_buckets - 1) // num_buckets + else: + bucket_width = 10 + + # Bucket sentence pairs by the length of their source sentence and target + # sentence. 
+ bucket_id = tf.maximum( + src_len // bucket_width, + tgt_len // bucket_width) + return tf.to_int64(tf.minimum(num_buckets, bucket_id)) + + def reduce_func(unused_key, windowed_data): + return batching_func(windowed_data) + + batched_dataset = src_tgt_dataset.apply( + tf.contrib.data.group_by_window( + key_func=key_func, reduce_func=reduce_func, window_size=batch_size + ) + ) + + else: + batched_dataset = batching_func(src_tgt_dataset) + batched_iter = batched_dataset.make_initializable_iterator() + (src_ids, tgt_input_ids, tgt_output_ids, src_seq_len, tgt_seq_len) = ( + batched_iter.get_next() + ) + return BatchedInput( + initializer=batched_iter.initializer, + source=src_ids, + target_input=tgt_input_ids, + target_output=tgt_output_ids, + source_sequence_length=src_seq_len, + target_sequence_length=tgt_seq_len, + ) diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/iterator_utils_test.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/iterator_utils_test.py index 3ef1dc8d9..7ba426c7e 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/iterator_utils_test.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/iterator_utils_test.py @@ -28,294 +28,300 @@ class IteratorUtilsTest(tf.test.TestCase): - def testGetIterator(self): - tf.set_random_seed(1) - tgt_vocab_table = src_vocab_table = lookup_ops.index_table_from_tensor( - tf.constant(["a", "b", "c", "eos", "sos"])) - src_dataset = tf.data.Dataset.from_tensor_slices( - tf.constant(["f e a g", "c c a", "d", "c a"])) - tgt_dataset = tf.data.Dataset.from_tensor_slices( - tf.constant(["c c", "a b", "", "b c"])) - hparams = tf.contrib.training.HParams( - random_seed=3, - num_buckets=5, - eos="eos", - sos="sos") - batch_size = 2 - src_max_len = 3 - iterator = iterator_utils.get_iterator( - src_dataset=src_dataset, - tgt_dataset=tgt_dataset, - src_vocab_table=src_vocab_table, - tgt_vocab_table=tgt_vocab_table, - batch_size=batch_size, - sos=hparams.sos, - eos=hparams.eos, - random_seed=hparams.random_seed, - num_buckets=hparams.num_buckets, - src_max_len=src_max_len, - reshuffle_each_iteration=False) - table_initializer = tf.tables_initializer() - source = iterator.source - target_input = iterator.target_input - target_output = iterator.target_output - src_seq_len = iterator.source_sequence_length - tgt_seq_len = iterator.target_sequence_length - self.assertEqual([None, None], source.shape.as_list()) - self.assertEqual([None, None], target_input.shape.as_list()) - self.assertEqual([None, None], target_output.shape.as_list()) - self.assertEqual([None], src_seq_len.shape.as_list()) - self.assertEqual([None], tgt_seq_len.shape.as_list()) - with self.test_session() as sess: - sess.run(table_initializer) - sess.run(iterator.initializer) + def testGetIterator(self): + tf.set_random_seed(1) + tgt_vocab_table = src_vocab_table = lookup_ops.index_table_from_tensor( + tf.constant(["a", "b", "c", "eos", "sos"]) + ) + src_dataset = tf.data.Dataset.from_tensor_slices( + tf.constant(["f e a g", "c c a", "d", "c a"]) + ) + tgt_dataset = tf.data.Dataset.from_tensor_slices( + tf.constant(["c c", "a b", "", "b c"]) + ) + hparams = tf.contrib.training.HParams( + random_seed=3, num_buckets=5, eos="eos", sos="sos" + ) + batch_size = 2 + src_max_len = 3 + iterator = iterator_utils.get_iterator( + src_dataset=src_dataset, + tgt_dataset=tgt_dataset, + src_vocab_table=src_vocab_table, + tgt_vocab_table=tgt_vocab_table, + batch_size=batch_size, + sos=hparams.sos, + eos=hparams.eos, + 
random_seed=hparams.random_seed, + num_buckets=hparams.num_buckets, + src_max_len=src_max_len, + reshuffle_each_iteration=False, + ) + table_initializer = tf.tables_initializer() + source = iterator.source + target_input = iterator.target_input + target_output = iterator.target_output + src_seq_len = iterator.source_sequence_length + tgt_seq_len = iterator.target_sequence_length + self.assertEqual([None, None], source.shape.as_list()) + self.assertEqual([None, None], target_input.shape.as_list()) + self.assertEqual([None, None], target_output.shape.as_list()) + self.assertEqual([None], src_seq_len.shape.as_list()) + self.assertEqual([None], tgt_seq_len.shape.as_list()) + with self.test_session() as sess: + sess.run(table_initializer) + sess.run(iterator.initializer) - (source_v, src_len_v, target_input_v, target_output_v, tgt_len_v) = ( - sess.run((source, src_seq_len, target_input, target_output, - tgt_seq_len))) - self.assertAllEqual( - [[-1, -1, 0], # "f" == unknown, "e" == unknown, a - [2, 0, 3]], # c a eos -- eos is padding - source_v) - self.assertAllEqual([3, 2], src_len_v) - self.assertAllEqual( - [[4, 2, 2], # sos c c - [4, 1, 2]], # sos b c - target_input_v) - self.assertAllEqual( - [[2, 2, 3], # c c eos - [1, 2, 3]], # b c eos - target_output_v) - self.assertAllEqual([3, 3], tgt_len_v) + (source_v, src_len_v, target_input_v, target_output_v, tgt_len_v) = ( + sess.run( + (source, src_seq_len, target_input, target_output, tgt_seq_len) + ) + ) + self.assertAllEqual( + [ + [-1, -1, 0], # "f" == unknown, "e" == unknown, a + [2, 0, 3], + ], # c a eos -- eos is padding + source_v, + ) + self.assertAllEqual([3, 2], src_len_v) + self.assertAllEqual( + [[4, 2, 2], [4, 1, 2]], target_input_v # sos c c # sos b c + ) + self.assertAllEqual( + [[2, 2, 3], [1, 2, 3]], target_output_v # c c eos # b c eos + ) + self.assertAllEqual([3, 3], tgt_len_v) - (source_v, src_len_v, target_input_v, target_output_v, tgt_len_v) = ( - sess.run((source, src_seq_len, target_input, target_output, - tgt_seq_len))) - self.assertAllEqual( - [[2, 2, 0]], # c c a - source_v) - self.assertAllEqual([3], src_len_v) - self.assertAllEqual( - [[4, 0, 1]], # sos a b - target_input_v) - self.assertAllEqual( - [[0, 1, 3]], # a b eos - target_output_v) - self.assertAllEqual([3], tgt_len_v) + (source_v, src_len_v, target_input_v, target_output_v, tgt_len_v) = ( + sess.run( + (source, src_seq_len, target_input, target_output, tgt_seq_len) + ) + ) + self.assertAllEqual([[2, 2, 0]], source_v) # c c a + self.assertAllEqual([3], src_len_v) + self.assertAllEqual([[4, 0, 1]], target_input_v) # sos a b + self.assertAllEqual([[0, 1, 3]], target_output_v) # a b eos + self.assertAllEqual([3], tgt_len_v) - with self.assertRaisesOpError("End of sequence"): - sess.run(source) + with self.assertRaisesOpError("End of sequence"): + sess.run(source) - def testGetIteratorWithShard(self): - tf.set_random_seed(1) - tgt_vocab_table = src_vocab_table = lookup_ops.index_table_from_tensor( - tf.constant(["a", "b", "c", "eos", "sos"])) - src_dataset = tf.data.Dataset.from_tensor_slices( - tf.constant(["c c a", "f e a g", "d", "c a"])) - tgt_dataset = tf.data.Dataset.from_tensor_slices( - tf.constant(["a b", "c c", "", "b c"])) - hparams = tf.contrib.training.HParams( - random_seed=3, - num_buckets=5, - eos="eos", - sos="sos") - batch_size = 2 - src_max_len = 3 - iterator = iterator_utils.get_iterator( - src_dataset=src_dataset, - tgt_dataset=tgt_dataset, - src_vocab_table=src_vocab_table, - tgt_vocab_table=tgt_vocab_table, - batch_size=batch_size, - 
sos=hparams.sos, - eos=hparams.eos, - random_seed=hparams.random_seed, - num_buckets=hparams.num_buckets, - src_max_len=src_max_len, - num_shards=2, - shard_index=1, - reshuffle_each_iteration=False) - table_initializer = tf.tables_initializer() - source = iterator.source - target_input = iterator.target_input - target_output = iterator.target_output - src_seq_len = iterator.source_sequence_length - tgt_seq_len = iterator.target_sequence_length - self.assertEqual([None, None], source.shape.as_list()) - self.assertEqual([None, None], target_input.shape.as_list()) - self.assertEqual([None, None], target_output.shape.as_list()) - self.assertEqual([None], src_seq_len.shape.as_list()) - self.assertEqual([None], tgt_seq_len.shape.as_list()) - with self.test_session() as sess: - sess.run(table_initializer) - sess.run(iterator.initializer) + def testGetIteratorWithShard(self): + tf.set_random_seed(1) + tgt_vocab_table = src_vocab_table = lookup_ops.index_table_from_tensor( + tf.constant(["a", "b", "c", "eos", "sos"]) + ) + src_dataset = tf.data.Dataset.from_tensor_slices( + tf.constant(["c c a", "f e a g", "d", "c a"]) + ) + tgt_dataset = tf.data.Dataset.from_tensor_slices( + tf.constant(["a b", "c c", "", "b c"]) + ) + hparams = tf.contrib.training.HParams( + random_seed=3, num_buckets=5, eos="eos", sos="sos" + ) + batch_size = 2 + src_max_len = 3 + iterator = iterator_utils.get_iterator( + src_dataset=src_dataset, + tgt_dataset=tgt_dataset, + src_vocab_table=src_vocab_table, + tgt_vocab_table=tgt_vocab_table, + batch_size=batch_size, + sos=hparams.sos, + eos=hparams.eos, + random_seed=hparams.random_seed, + num_buckets=hparams.num_buckets, + src_max_len=src_max_len, + num_shards=2, + shard_index=1, + reshuffle_each_iteration=False, + ) + table_initializer = tf.tables_initializer() + source = iterator.source + target_input = iterator.target_input + target_output = iterator.target_output + src_seq_len = iterator.source_sequence_length + tgt_seq_len = iterator.target_sequence_length + self.assertEqual([None, None], source.shape.as_list()) + self.assertEqual([None, None], target_input.shape.as_list()) + self.assertEqual([None, None], target_output.shape.as_list()) + self.assertEqual([None], src_seq_len.shape.as_list()) + self.assertEqual([None], tgt_seq_len.shape.as_list()) + with self.test_session() as sess: + sess.run(table_initializer) + sess.run(iterator.initializer) - (source_v, src_len_v, target_input_v, target_output_v, tgt_len_v) = ( - sess.run((source, src_seq_len, target_input, target_output, - tgt_seq_len))) - self.assertAllEqual( - [[-1, -1, 0], # "f" == unknown, "e" == unknown, a - [2, 0, 3]], # c a eos -- eos is padding - source_v) - self.assertAllEqual([3, 2], src_len_v) - self.assertAllEqual( - [[4, 2, 2], # sos c c - [4, 1, 2]], # sos b c - target_input_v) - self.assertAllEqual( - [[2, 2, 3], # c c eos - [1, 2, 3]], # b c eos - target_output_v) - self.assertAllEqual([3, 3], tgt_len_v) + (source_v, src_len_v, target_input_v, target_output_v, tgt_len_v) = ( + sess.run( + (source, src_seq_len, target_input, target_output, tgt_seq_len) + ) + ) + self.assertAllEqual( + [ + [-1, -1, 0], # "f" == unknown, "e" == unknown, a + [2, 0, 3], + ], # c a eos -- eos is padding + source_v, + ) + self.assertAllEqual([3, 2], src_len_v) + self.assertAllEqual( + [[4, 2, 2], [4, 1, 2]], target_input_v # sos c c # sos b c + ) + self.assertAllEqual( + [[2, 2, 3], [1, 2, 3]], target_output_v # c c eos # b c eos + ) + self.assertAllEqual([3, 3], tgt_len_v) - with self.assertRaisesOpError("End of sequence"): 
- sess.run(source) + with self.assertRaisesOpError("End of sequence"): + sess.run(source) - def testGetIteratorWithSkipCount(self): - tf.set_random_seed(1) - tgt_vocab_table = src_vocab_table = lookup_ops.index_table_from_tensor( - tf.constant(["a", "b", "c", "eos", "sos"])) - src_dataset = tf.data.Dataset.from_tensor_slices( - tf.constant(["c a", "c c a", "d", "f e a g"])) - tgt_dataset = tf.data.Dataset.from_tensor_slices( - tf.constant(["b c", "a b", "", "c c"])) - hparams = tf.contrib.training.HParams( - random_seed=3, - num_buckets=5, - eos="eos", - sos="sos") - batch_size = 2 - src_max_len = 3 - skip_count = tf.placeholder(shape=(), dtype=tf.int64) - iterator = iterator_utils.get_iterator( - src_dataset=src_dataset, - tgt_dataset=tgt_dataset, - src_vocab_table=src_vocab_table, - tgt_vocab_table=tgt_vocab_table, - batch_size=batch_size, - sos=hparams.sos, - eos=hparams.eos, - random_seed=hparams.random_seed, - num_buckets=hparams.num_buckets, - src_max_len=src_max_len, - skip_count=skip_count, - reshuffle_each_iteration=False) - table_initializer = tf.tables_initializer() - source = iterator.source - target_input = iterator.target_input - target_output = iterator.target_output - src_seq_len = iterator.source_sequence_length - tgt_seq_len = iterator.target_sequence_length - self.assertEqual([None, None], source.shape.as_list()) - self.assertEqual([None, None], target_input.shape.as_list()) - self.assertEqual([None, None], target_output.shape.as_list()) - self.assertEqual([None], src_seq_len.shape.as_list()) - self.assertEqual([None], tgt_seq_len.shape.as_list()) - with self.test_session() as sess: - sess.run(table_initializer) - sess.run(iterator.initializer, feed_dict={skip_count: 3}) + def testGetIteratorWithSkipCount(self): + tf.set_random_seed(1) + tgt_vocab_table = src_vocab_table = lookup_ops.index_table_from_tensor( + tf.constant(["a", "b", "c", "eos", "sos"]) + ) + src_dataset = tf.data.Dataset.from_tensor_slices( + tf.constant(["c a", "c c a", "d", "f e a g"]) + ) + tgt_dataset = tf.data.Dataset.from_tensor_slices( + tf.constant(["b c", "a b", "", "c c"]) + ) + hparams = tf.contrib.training.HParams( + random_seed=3, num_buckets=5, eos="eos", sos="sos" + ) + batch_size = 2 + src_max_len = 3 + skip_count = tf.placeholder(shape=(), dtype=tf.int64) + iterator = iterator_utils.get_iterator( + src_dataset=src_dataset, + tgt_dataset=tgt_dataset, + src_vocab_table=src_vocab_table, + tgt_vocab_table=tgt_vocab_table, + batch_size=batch_size, + sos=hparams.sos, + eos=hparams.eos, + random_seed=hparams.random_seed, + num_buckets=hparams.num_buckets, + src_max_len=src_max_len, + skip_count=skip_count, + reshuffle_each_iteration=False, + ) + table_initializer = tf.tables_initializer() + source = iterator.source + target_input = iterator.target_input + target_output = iterator.target_output + src_seq_len = iterator.source_sequence_length + tgt_seq_len = iterator.target_sequence_length + self.assertEqual([None, None], source.shape.as_list()) + self.assertEqual([None, None], target_input.shape.as_list()) + self.assertEqual([None, None], target_output.shape.as_list()) + self.assertEqual([None], src_seq_len.shape.as_list()) + self.assertEqual([None], tgt_seq_len.shape.as_list()) + with self.test_session() as sess: + sess.run(table_initializer) + sess.run(iterator.initializer, feed_dict={skip_count: 3}) - (source_v, src_len_v, target_input_v, target_output_v, tgt_len_v) = ( - sess.run((source, src_seq_len, target_input, target_output, - tgt_seq_len))) - self.assertAllEqual( - [[-1, -1, 0]], # "f" 
== unknown, "e" == unknown, a - source_v) - self.assertAllEqual([3], src_len_v) - self.assertAllEqual( - [[4, 2, 2]], # sos c c - target_input_v) - self.assertAllEqual( - [[2, 2, 3]], # c c eos - target_output_v) - self.assertAllEqual([3], tgt_len_v) + (source_v, src_len_v, target_input_v, target_output_v, tgt_len_v) = ( + sess.run( + (source, src_seq_len, target_input, target_output, tgt_seq_len) + ) + ) + self.assertAllEqual( + [[-1, -1, 0]], source_v # "f" == unknown, "e" == unknown, a + ) + self.assertAllEqual([3], src_len_v) + self.assertAllEqual([[4, 2, 2]], target_input_v) # sos c c + self.assertAllEqual([[2, 2, 3]], target_output_v) # c c eos + self.assertAllEqual([3], tgt_len_v) - with self.assertRaisesOpError("End of sequence"): - sess.run(source) + with self.assertRaisesOpError("End of sequence"): + sess.run(source) - # Re-init iterator with skip_count=0. - sess.run(iterator.initializer, feed_dict={skip_count: 0}) + # Re-init iterator with skip_count=0. + sess.run(iterator.initializer, feed_dict={skip_count: 0}) - (source_v, src_len_v, target_input_v, target_output_v, tgt_len_v) = ( - sess.run((source, src_seq_len, target_input, target_output, - tgt_seq_len))) - self.assertAllEqual( - [[2, 0, 3], # c a eos -- eos is padding - [-1, -1, 0]], # "f" == unknown, "e" == unknown, a - source_v) - self.assertAllEqual([2, 3], src_len_v) - self.assertAllEqual( - [[4, 1, 2], # sos b c - [4, 2, 2]], # sos c c - target_input_v) - self.assertAllEqual( - [[1, 2, 3], # b c eos - [2, 2, 3]], # c c eos - target_output_v) - self.assertAllEqual([3, 3], tgt_len_v) + (source_v, src_len_v, target_input_v, target_output_v, tgt_len_v) = ( + sess.run( + (source, src_seq_len, target_input, target_output, tgt_seq_len) + ) + ) + self.assertAllEqual( + [ + [2, 0, 3], # c a eos -- eos is padding + [-1, -1, 0], + ], # "f" == unknown, "e" == unknown, a + source_v, + ) + self.assertAllEqual([2, 3], src_len_v) + self.assertAllEqual( + [[4, 1, 2], [4, 2, 2]], target_input_v # sos b c # sos c c + ) + self.assertAllEqual( + [[1, 2, 3], [2, 2, 3]], target_output_v # b c eos # c c eos + ) + self.assertAllEqual([3, 3], tgt_len_v) - (source_v, src_len_v, target_input_v, target_output_v, tgt_len_v) = ( - sess.run((source, src_seq_len, target_input, target_output, - tgt_seq_len))) - self.assertAllEqual( - [[2, 2, 0]], # c c a - source_v) - self.assertAllEqual([3], src_len_v) - self.assertAllEqual( - [[4, 0, 1]], # sos a b - target_input_v) - self.assertAllEqual( - [[0, 1, 3]], # a b eos - target_output_v) - self.assertAllEqual([3], tgt_len_v) + (source_v, src_len_v, target_input_v, target_output_v, tgt_len_v) = ( + sess.run( + (source, src_seq_len, target_input, target_output, tgt_seq_len) + ) + ) + self.assertAllEqual([[2, 2, 0]], source_v) # c c a + self.assertAllEqual([3], src_len_v) + self.assertAllEqual([[4, 0, 1]], target_input_v) # sos a b + self.assertAllEqual([[0, 1, 3]], target_output_v) # a b eos + self.assertAllEqual([3], tgt_len_v) - with self.assertRaisesOpError("End of sequence"): - sess.run(source) + with self.assertRaisesOpError("End of sequence"): + sess.run(source) + def testGetInferIterator(self): + src_vocab_table = lookup_ops.index_table_from_tensor( + tf.constant(["a", "b", "c", "eos", "sos"]) + ) + src_dataset = tf.data.Dataset.from_tensor_slices( + tf.constant(["c c a", "c a", "d", "f e a g"]) + ) + hparams = tf.contrib.training.HParams( + random_seed=3, eos="eos", sos="sos") + batch_size = 2 + src_max_len = 3 + iterator = iterator_utils.get_infer_iterator( + src_dataset=src_dataset, + 
src_vocab_table=src_vocab_table, + batch_size=batch_size, + eos=hparams.eos, + src_max_len=src_max_len, + ) + table_initializer = tf.tables_initializer() + source = iterator.source + seq_len = iterator.source_sequence_length + self.assertEqual([None, None], source.shape.as_list()) + self.assertEqual([None], seq_len.shape.as_list()) + with self.test_session() as sess: + sess.run(table_initializer) + sess.run(iterator.initializer) - def testGetInferIterator(self): - src_vocab_table = lookup_ops.index_table_from_tensor( - tf.constant(["a", "b", "c", "eos", "sos"])) - src_dataset = tf.data.Dataset.from_tensor_slices( - tf.constant(["c c a", "c a", "d", "f e a g"])) - hparams = tf.contrib.training.HParams( - random_seed=3, - eos="eos", - sos="sos") - batch_size = 2 - src_max_len = 3 - iterator = iterator_utils.get_infer_iterator( - src_dataset=src_dataset, - src_vocab_table=src_vocab_table, - batch_size=batch_size, - eos=hparams.eos, - src_max_len=src_max_len) - table_initializer = tf.tables_initializer() - source = iterator.source - seq_len = iterator.source_sequence_length - self.assertEqual([None, None], source.shape.as_list()) - self.assertEqual([None], seq_len.shape.as_list()) - with self.test_session() as sess: - sess.run(table_initializer) - sess.run(iterator.initializer) + (source_v, seq_len_v) = sess.run((source, seq_len)) + self.assertAllEqual([[2, 2, 0], [2, 0, 3]], + source_v) # c c a # c a eos + self.assertAllEqual([3, 2], seq_len_v) - (source_v, seq_len_v) = sess.run((source, seq_len)) - self.assertAllEqual( - [[2, 2, 0], # c c a - [2, 0, 3]], # c a eos - source_v) - self.assertAllEqual([3, 2], seq_len_v) + (source_v, seq_len_v) = sess.run((source, seq_len)) + self.assertAllEqual( + [ + [-1, 3, 3], # "d" == unknown, eos eos + [-1, -1, 0], + ], # "f" == unknown, "e" == unknown, a + source_v, + ) + self.assertAllEqual([1, 3], seq_len_v) - (source_v, seq_len_v) = sess.run((source, seq_len)) - self.assertAllEqual( - [[-1, 3, 3], # "d" == unknown, eos eos - [-1, -1, 0]], # "f" == unknown, "e" == unknown, a - source_v) - self.assertAllEqual([1, 3], seq_len_v) - - with self.assertRaisesOpError("End of sequence"): - sess.run((source, seq_len)) + with self.assertRaisesOpError("End of sequence"): + sess.run((source, seq_len)) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/misc_utils.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/misc_utils.py index 63dc5a69c..508211c9f 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/misc_utils.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/misc_utils.py @@ -30,152 +30,167 @@ def check_tensorflow_version(): - min_tf_version = "1.4.0-dev20171024" - if (version.LooseVersion(tf.__version__) < - version.LooseVersion(min_tf_version)): - raise EnvironmentError("Tensorflow version must >= %s" % min_tf_version) + min_tf_version = "1.4.0-dev20171024" + if version.LooseVersion( + tf.__version__) < version.LooseVersion(min_tf_version): + raise EnvironmentError( + "Tensorflow version must >= %s" % + min_tf_version) def safe_exp(value): - """Exponentiation with catching of overflow error.""" - try: - ans = math.exp(value) - except OverflowError: - ans = float("inf") - return ans + """Exponentiation with catching of overflow error.""" + try: + ans = math.exp(value) + except OverflowError: + ans = float("inf") + return ans def print_time(s, start_time): - """Take a start time, print elapsed duration, and return a new time.""" - 
print("%s, time %ds, %s." % (s, (time.time() - start_time), time.ctime())) - sys.stdout.flush() - return time.time() + """Take a start time, print elapsed duration, and return a new time.""" + print("%s, time %ds, %s." % (s, (time.time() - start_time), time.ctime())) + sys.stdout.flush() + return time.time() def print_out(s, f=None, new_line=True): - """Similar to print but with support to flush and output to a file.""" - if isinstance(s, bytes): - s = s.decode("utf-8") + """Similar to print but with support to flush and output to a file.""" + if isinstance(s, bytes): + s = s.decode("utf-8") - if f: - f.write(s.encode("utf-8")) - if new_line: - f.write(b"\n") + if f: + f.write(s.encode("utf-8")) + if new_line: + f.write(b"\n") - # stdout - out_s = s.encode("utf-8") - if not isinstance(out_s, str): - out_s = out_s.decode("utf-8") - print(out_s, end="", file=sys.stdout) + # stdout + out_s = s.encode("utf-8") + if not isinstance(out_s, str): + out_s = out_s.decode("utf-8") + print(out_s, end="", file=sys.stdout) - if new_line: - sys.stdout.write("\n") - sys.stdout.flush() + if new_line: + sys.stdout.write("\n") + sys.stdout.flush() def print_hparams(hparams, skip_patterns=None, header=None): - """Print hparams, can skip keys based on pattern.""" - if header: print_out("%s" % header) - values = hparams.values() - for key in sorted(values.keys()): - if not skip_patterns or all( - [skip_pattern not in key for skip_pattern in skip_patterns]): - print_out(" %s=%s" % (key, str(values[key]))) + """Print hparams, can skip keys based on pattern.""" + if header: + print_out("%s" % header) + values = hparams.values() + for key in sorted(values.keys()): + if not skip_patterns or all( + [skip_pattern not in key for skip_pattern in skip_patterns] + ): + print_out(" %s=%s" % (key, str(values[key]))) def load_hparams(model_dir): - """Load hparams from an existing model directory.""" - hparams_file = os.path.join(model_dir, "hparams") - if tf.gfile.Exists(hparams_file): - print_out("# Loading hparams from %s" % hparams_file) - with codecs.getreader("utf-8")(tf.gfile.GFile(hparams_file, "rb")) as f: - try: - hparams_values = json.load(f) - hparams = tf.contrib.training.HParams(**hparams_values) - except ValueError: - print_out(" can't load hparams file") + """Load hparams from an existing model directory.""" + hparams_file = os.path.join(model_dir, "hparams") + if tf.gfile.Exists(hparams_file): + print_out("# Loading hparams from %s" % hparams_file) + with codecs.getreader("utf-8")(tf.gfile.GFile(hparams_file, "rb")) as f: + try: + hparams_values = json.load(f) + hparams = tf.contrib.training.HParams(**hparams_values) + except ValueError: + print_out(" can't load hparams file") + return None + return hparams + else: return None - return hparams - else: - return None def maybe_parse_standard_hparams(hparams, hparams_path): - """Override hparams values with existing standard hparams config.""" - if hparams_path and tf.gfile.Exists(hparams_path): - print_out("# Loading standard hparams from %s" % hparams_path) - with codecs.getreader("utf-8")(tf.gfile.GFile(hparams_path, "rb")) as f: - hparams.parse_json(f.read()) - return hparams + """Override hparams values with existing standard hparams config.""" + if hparams_path and tf.gfile.Exists(hparams_path): + print_out("# Loading standard hparams from %s" % hparams_path) + with codecs.getreader("utf-8")(tf.gfile.GFile(hparams_path, "rb")) as f: + hparams.parse_json(f.read()) + return hparams def save_hparams(out_dir, hparams): - """Save hparams.""" - hparams_file = 
os.path.join(out_dir, "hparams") - print_out(" saving hparams to %s" % hparams_file) - with codecs.getwriter("utf-8")(tf.gfile.GFile(hparams_file, "wb")) as f: - f.write(hparams.to_json(indent=4, sort_keys=True)) + """Save hparams.""" + hparams_file = os.path.join(out_dir, "hparams") + print_out(" saving hparams to %s" % hparams_file) + with codecs.getwriter("utf-8")(tf.gfile.GFile(hparams_file, "wb")) as f: + f.write(hparams.to_json(indent=4, sort_keys=True)) def debug_tensor(s, msg=None, summarize=10): - """Print the shape and value of a tensor at test time. Return a new tensor.""" - if not msg: - msg = s.name - return tf.Print(s, [tf.shape(s), s], msg + " ", summarize=summarize) + """Print the shape and value of a tensor at test time. Return a new tensor.""" + if not msg: + msg = s.name + return tf.Print(s, [tf.shape(s), s], msg + " ", summarize=summarize) def add_summary(summary_writer, global_step, tag, value): - """Add a new summary to the current summary_writer. - Useful to log things that are not part of the training graph, e.g., tag=BLEU. - """ - summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) - summary_writer.add_summary(summary, global_step) - - -def get_config_proto(log_device_placement=False, allow_soft_placement=True, - num_intra_threads=0, num_inter_threads=0): - # GPU options: - # https://www.tensorflow.org/versions/r0.10/how_tos/using_gpu/index.html - config_proto = tf.ConfigProto( - log_device_placement=log_device_placement, - allow_soft_placement=allow_soft_placement) - config_proto.gpu_options.allow_growth = True - - # CPU threads options - if num_intra_threads: - config_proto.intra_op_parallelism_threads = num_intra_threads - if num_inter_threads: - config_proto.inter_op_parallelism_threads = num_inter_threads - - return config_proto + """Add a new summary to the current summary_writer. + Useful to log things that are not part of the training graph, e.g., tag=BLEU. 
+ """ + summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) + summary_writer.add_summary(summary, global_step) + + +def get_config_proto( + log_device_placement=False, + allow_soft_placement=True, + num_intra_threads=0, + num_inter_threads=0, +): + # GPU options: + # https://www.tensorflow.org/versions/r0.10/how_tos/using_gpu/index.html + config_proto = tf.ConfigProto( + log_device_placement=log_device_placement, + allow_soft_placement=allow_soft_placement, + ) + config_proto.gpu_options.allow_growth = True + + # CPU threads options + if num_intra_threads: + config_proto.intra_op_parallelism_threads = num_intra_threads + if num_inter_threads: + config_proto.inter_op_parallelism_threads = num_inter_threads + + return config_proto def format_text(words): - """Convert a sequence words into sentence.""" - if (not hasattr(words, "__len__") and # for numpy array - not isinstance(words, collections.Iterable)): - words = [words] - return b" ".join(words) + """Convert a sequence words into sentence.""" + if not hasattr(words, "__len__") and not isinstance( # for numpy array + words, collections.Iterable + ): + words = [words] + return b" ".join(words) def format_bpe_text(symbols, delimiter=b"@@"): - """Convert a sequence of bpe words into sentence.""" - words = [] - word = b"" - if isinstance(symbols, str): - symbols = symbols.encode() - delimiter_len = len(delimiter) - for symbol in symbols: - if len(symbol) >= delimiter_len and symbol[-delimiter_len:] == delimiter: - word += symbol[:-delimiter_len] - else: # end of a word - word += symbol - words.append(word) - word = b"" - return b" ".join(words) + """Convert a sequence of bpe words into sentence.""" + words = [] + word = b"" + if isinstance(symbols, str): + symbols = symbols.encode() + delimiter_len = len(delimiter) + for symbol in symbols: + if len( + symbol) >= delimiter_len and symbol[-delimiter_len:] == delimiter: + word += symbol[:-delimiter_len] + else: # end of a word + word += symbol + words.append(word) + word = b"" + return b" ".join(words) def format_spm_text(symbols): - """Decode a text in SPM (https://github.com/google/sentencepiece) format.""" - return u"".join(format_text(symbols).decode("utf-8").split()).replace( - u"\u2581", u" ").strip().encode("utf-8") + """Decode a text in SPM (https://github.com/google/sentencepiece) format.""" + return ( + "".join(format_text(symbols).decode("utf-8").split()) + .replace("\u2581", " ") + .strip() + .encode("utf-8") + ) diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/misc_utils_test.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/misc_utils_test.py index a15551d8c..5bc177926 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/misc_utils_test.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/misc_utils_test.py @@ -26,24 +26,26 @@ class MiscUtilsTest(tf.test.TestCase): - def testFormatBpeText(self): - bpe_line = ( - b"En@@ ough to make already reluc@@ tant men hesitate to take screening" - b" tests ." - ) - expected_result = ( - b"Enough to make already reluctant men hesitate to take screening tests" - b" ." - ) - self.assertEqual(expected_result, - misc_utils.format_bpe_text(bpe_line.split(b" "))) - - def testFormatSPMText(self): - spm_line = u"\u2581This \u2581is \u2581a \u2581 te st .".encode("utf-8") - expected_result = "This is a test." 
- self.assertEqual(expected_result, - misc_utils.format_spm_text(spm_line.split(b" "))) + def testFormatBpeText(self): + bpe_line = ( + b"En@@ ough to make already reluc@@ tant men hesitate to take screening" + b" tests ." + ) + expected_result = ( + b"Enough to make already reluctant men hesitate to take screening tests" + b" ." + ) + self.assertEqual( + expected_result, misc_utils.format_bpe_text(bpe_line.split(b" ")) + ) + + def testFormatSPMText(self): + spm_line = "\u2581This \u2581is \u2581a \u2581 te st .".encode("utf-8") + expected_result = "This is a test." + self.assertEqual( + expected_result, misc_utils.format_spm_text(spm_line.split(b" ")) + ) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/nmt_utils.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/nmt_utils.py index ff417a668..857b74e6a 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/nmt_utils.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/nmt_utils.py @@ -27,113 +27,131 @@ __all__ = ["decode_and_evaluate", "get_translation"] -def decode_and_evaluate(run, - iterations, - name, - model, - sess, - trans_file, - ref_file, - metrics, - subword_option, - beam_width, - tgt_eos, - num_translations_per_input=1, - decode=True, - infer_mode="greedy"): - """Decode a test set and compute a score according to the evaluation task.""" - # Decode - if decode: - utils.print_out(" decoding to output %s" % trans_file) - - num_sentences = 0 - with codecs.getwriter("utf-8")( - tf.gfile.GFile(trans_file, mode="wb")) as trans_f: - trans_f.write("") # Write empty string to ensure file is created. - - if infer_mode == "greedy": - num_translations_per_input = 1 - elif infer_mode == "beam_search": - num_translations_per_input = min(num_translations_per_input, beam_width) - - print(" infer_mode %s, beam_width %g, num translations per input %d. " \ - % (infer_mode, beam_width, num_translations_per_input)) - print(" total iterations count %d." 
% iterations) - - # prediction time is the time for the model prediction only - # overall time is the time for data pre-processing and data post-processing - prediction_times = list() - overall_start = time.time() - - n = 0 - while n < iterations: - n += 1 - while True: - try: - start = time.time() - nmt_outputs, _ = model.decode(sess) - prediction_times.append(time.time() - start) - if infer_mode != "beam_search": - nmt_outputs = np.expand_dims(nmt_outputs, 0) - - batch_size = nmt_outputs.shape[1] - - num_sentences += batch_size - for sent_id in range(batch_size): - for beam_id in range(num_translations_per_input): - translation = get_translation( - nmt_outputs[beam_id], - sent_id, - tgt_eos=tgt_eos, - subword_option=subword_option) - if run == 'accuracy': - trans_f.write((translation + b"\n").decode("utf-8")) - - except tf.errors.OutOfRangeError: - utils.print_time( - " done, num sentences %d, num translations per input %d" % - (num_sentences, num_translations_per_input), overall_start) - break - - overall_time = (time.time() - overall_start) - if run == 'performance': - print("\nAverage Prediction Latency: {:.5f} sec per batch.".format( - sum(prediction_times)/float(len(prediction_times)))) - print("Overall Latency: {:.5f} sec for the entire test " - "dataset.".format(overall_time/float(iterations))) - print("Overall Throughput : {:.3f} sentences per sec.".format( - num_sentences/float(overall_time))) - - # Evaluation - evaluation_scores = {} - if ref_file and tf.gfile.Exists(trans_file): - for metric in metrics: - score = evaluation_utils.evaluate( - ref_file, - trans_file, - metric, - subword_option=subword_option) - evaluation_scores[metric] = score - utils.print_out(" %s %s: %.1f" % (metric, name, score)) - - return evaluation_scores +def decode_and_evaluate( + run, + iterations, + name, + model, + sess, + trans_file, + ref_file, + metrics, + subword_option, + beam_width, + tgt_eos, + num_translations_per_input=1, + decode=True, + infer_mode="greedy", +): + """Decode a test set and compute a score according to the evaluation task.""" + # Decode + if decode: + utils.print_out(" decoding to output %s" % trans_file) + + num_sentences = 0 + with codecs.getwriter("utf-8")( + tf.gfile.GFile(trans_file, mode="wb") + ) as trans_f: + trans_f.write("") # Write empty string to ensure file is created. + + if infer_mode == "greedy": + num_translations_per_input = 1 + elif infer_mode == "beam_search": + num_translations_per_input = min( + num_translations_per_input, beam_width) + + print( + " infer_mode %s, beam_width %g, num translations per input %d. " + % (infer_mode, beam_width, num_translations_per_input) + ) + print(" total iterations count %d." 
% iterations) + + # prediction time is the time for the model prediction only + # overall time is the time for data pre-processing and data + # post-processing + prediction_times = list() + overall_start = time.time() + + n = 0 + while n < iterations: + n += 1 + while True: + try: + start = time.time() + nmt_outputs, _ = model.decode(sess) + prediction_times.append(time.time() - start) + if infer_mode != "beam_search": + nmt_outputs = np.expand_dims(nmt_outputs, 0) + + batch_size = nmt_outputs.shape[1] + + num_sentences += batch_size + for sent_id in range(batch_size): + for beam_id in range(num_translations_per_input): + translation = get_translation( + nmt_outputs[beam_id], + sent_id, + tgt_eos=tgt_eos, + subword_option=subword_option, + ) + if run == "accuracy": + trans_f.write( + (translation + b"\n").decode("utf-8")) + + except tf.errors.OutOfRangeError: + utils.print_time( + " done, num sentences %d, num translations per input %d" + % (num_sentences, num_translations_per_input), + overall_start, + ) + break + + overall_time = time.time() - overall_start + if run == "performance": + print( + "\nAverage Prediction Latency: {:.5f} sec per batch.".format( + sum(prediction_times) / float(len(prediction_times)) + ) + ) + print( + "Overall Latency: {:.5f} sec for the entire test " + "dataset.".format(overall_time / float(iterations)) + ) + print( + "Overall Throughput : {:.3f} sentences per sec.".format( + num_sentences / float(overall_time) + ) + ) + + # Evaluation + evaluation_scores = {} + if ref_file and tf.gfile.Exists(trans_file): + for metric in metrics: + score = evaluation_utils.evaluate( + ref_file, trans_file, metric, subword_option=subword_option + ) + evaluation_scores[metric] = score + utils.print_out(" %s %s: %.1f" % (metric, name, score)) + + return evaluation_scores def get_translation(nmt_outputs, sent_id, tgt_eos, subword_option): - """Given batch decoding outputs, select a sentence and turn to text.""" - if tgt_eos: tgt_eos = tgt_eos.encode("utf-8") - # Select a sentence - output = nmt_outputs[sent_id, :].tolist() - - # If there is an eos symbol in outputs, cut them at that point. - if tgt_eos and tgt_eos in output: - output = output[:output.index(tgt_eos)] - - if subword_option == "bpe": # BPE - translation = utils.format_bpe_text(output) - elif subword_option == "spm": # SPM - translation = utils.format_spm_text(output) - else: - translation = utils.format_text(output) - - return translation + """Given batch decoding outputs, select a sentence and turn to text.""" + if tgt_eos: + tgt_eos = tgt_eos.encode("utf-8") + # Select a sentence + output = nmt_outputs[sent_id, :].tolist() + + # If there is an eos symbol in outputs, cut them at that point. 
+ if tgt_eos and tgt_eos in output: + output = output[: output.index(tgt_eos)] + + if subword_option == "bpe": # BPE + translation = utils.format_bpe_text(output) + elif subword_option == "spm": # SPM + translation = utils.format_spm_text(output) + else: + translation = utils.format_text(output) + + return translation diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/standard_hparams_utils.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/standard_hparams_utils.py index c47a6f6b3..1643ff0e7 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/standard_hparams_utils.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/standard_hparams_utils.py @@ -23,89 +23,81 @@ def create_standard_hparams(): - return tf.contrib.training.HParams( - # Data - src="", - tgt="", - train_prefix="", - dev_prefix="", - test_prefix="", - vocab_prefix="", - embed_prefix="", - out_dir="", - - # Networks - num_units=512, - num_encoder_layers=2, - num_decoder_layers=2, - dropout=0.2, - unit_type="lstm", - encoder_type="bi", - residual=False, - time_major=True, - num_embeddings_partitions=0, - num_enc_emb_partitions=0, - num_dec_emb_partitions=0, - - # Attention mechanisms - attention="scaled_luong", - attention_architecture="standard", - output_attention=True, - pass_hidden_state=True, - - # Train - optimizer="sgd", - batch_size=128, - init_op="uniform", - init_weight=0.1, - max_gradient_norm=5.0, - learning_rate=1.0, - warmup_steps=0, - warmup_scheme="t2t", - decay_scheme="luong234", - colocate_gradients_with_ops=True, - num_train_steps=12000, - num_sampled_softmax=0, - - # Data constraints - num_buckets=5, - max_train=0, - src_max_len=50, - tgt_max_len=50, - src_max_len_infer=0, - tgt_max_len_infer=0, - - # Data format - sos="", - eos="", - subword_option="", - use_char_encode=False, - check_special_token=True, - - # Misc - forget_bias=1.0, - num_gpus=1, - epoch_step=0, # record where we were within an epoch. - steps_per_stats=100, - steps_per_external_eval=0, - share_vocab=False, - metrics=["bleu"], - log_device_placement=False, - random_seed=None, - # only enable beam search during inference when beam_width > 0. 
- beam_width=0, - length_penalty_weight=0.0, - override_loaded_hparams=True, - num_keep_ckpts=5, - avg_ckpts=False, - - # For inference - inference_indices=None, - infer_batch_size=32, - sampling_temperature=0.0, - num_translations_per_input=1, - infer_mode="greedy", - - # Language model - language_model=False, - ) + return tf.contrib.training.HParams( + # Data + src="", + tgt="", + train_prefix="", + dev_prefix="", + test_prefix="", + vocab_prefix="", + embed_prefix="", + out_dir="", + # Networks + num_units=512, + num_encoder_layers=2, + num_decoder_layers=2, + dropout=0.2, + unit_type="lstm", + encoder_type="bi", + residual=False, + time_major=True, + num_embeddings_partitions=0, + num_enc_emb_partitions=0, + num_dec_emb_partitions=0, + # Attention mechanisms + attention="scaled_luong", + attention_architecture="standard", + output_attention=True, + pass_hidden_state=True, + # Train + optimizer="sgd", + batch_size=128, + init_op="uniform", + init_weight=0.1, + max_gradient_norm=5.0, + learning_rate=1.0, + warmup_steps=0, + warmup_scheme="t2t", + decay_scheme="luong234", + colocate_gradients_with_ops=True, + num_train_steps=12000, + num_sampled_softmax=0, + # Data constraints + num_buckets=5, + max_train=0, + src_max_len=50, + tgt_max_len=50, + src_max_len_infer=0, + tgt_max_len_infer=0, + # Data format + sos="", + eos="", + subword_option="", + use_char_encode=False, + check_special_token=True, + # Misc + forget_bias=1.0, + num_gpus=1, + epoch_step=0, # record where we were within an epoch. + steps_per_stats=100, + steps_per_external_eval=0, + share_vocab=False, + metrics=["bleu"], + log_device_placement=False, + random_seed=None, + # only enable beam search during inference when beam_width > 0. + beam_width=0, + length_penalty_weight=0.0, + override_loaded_hparams=True, + num_keep_ckpts=5, + avg_ckpts=False, + # For inference + inference_indices=None, + infer_batch_size=32, + sampling_temperature=0.0, + num_translations_per_input=1, + infer_mode="greedy", + # Language model + language_model=False, + ) diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/vocab_utils.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/vocab_utils.py index 5063bf2ef..9a3103bf2 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/vocab_utils.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/vocab_utils.py @@ -45,150 +45,162 @@ def _string_to_bytes(text, max_length): - """Given string and length, convert to byte seq of at most max_length. + """Given string and length, convert to byte seq of at most max_length. - This process mimics docqa/elmo's preprocessing: - https://github.com/allenai/document-qa/blob/master/docqa/elmo/data.py + This process mimics docqa/elmo's preprocessing: + https://github.com/allenai/document-qa/blob/master/docqa/elmo/data.py - Note that we make use of BOS_CHAR_ID and EOS_CHAR_ID in iterator_utils.py & - our usage differs from docqa/elmo. + Note that we make use of BOS_CHAR_ID and EOS_CHAR_ID in iterator_utils.py & + our usage differs from docqa/elmo. - Args: - text: tf.string tensor of shape [] - max_length: max number of chars for each word. + Args: + text: tf.string tensor of shape [] + max_length: max number of chars for each word. - Returns: - A tf.int32 tensor of the byte encoded text. 
- """ - byte_ids = tf.to_int32(tf.decode_raw(text, tf.uint8)) - byte_ids = byte_ids[:max_length - 2] - padding = tf.fill([max_length - tf.shape(byte_ids)[0] - 2], PAD_CHAR_ID) - byte_ids = tf.concat( - [[BOW_CHAR_ID], byte_ids, [EOW_CHAR_ID], padding], axis=0) - tf.logging.info(byte_ids) + Returns: + A tf.int32 tensor of the byte encoded text. + """ + byte_ids = tf.to_int32(tf.decode_raw(text, tf.uint8)) + byte_ids = byte_ids[: max_length - 2] + padding = tf.fill([max_length - tf.shape(byte_ids)[0] - 2], PAD_CHAR_ID) + byte_ids = tf.concat( + [[BOW_CHAR_ID], byte_ids, [EOW_CHAR_ID], padding], axis=0) + tf.logging.info(byte_ids) - byte_ids = tf.reshape(byte_ids, [max_length]) - tf.logging.info(byte_ids.get_shape().as_list()) - return byte_ids + 1 + byte_ids = tf.reshape(byte_ids, [max_length]) + tf.logging.info(byte_ids.get_shape().as_list()) + return byte_ids + 1 def tokens_to_bytes(tokens): - """Given a sequence of strings, map to sequence of bytes. - - Args: - tokens: A tf.string tensor - - Returns: - A tensor of shape words.shape + [bytes_per_word] containing byte versions - of each word. - """ - bytes_per_word = DEFAULT_CHAR_MAXLEN - with tf.device("/cpu:0"): - tf.assert_rank(tokens, 1) - shape = tf.shape(tokens) - tf.logging.info(tokens) - tokens_flat = tf.reshape(tokens, [-1]) - as_bytes_flat = tf.map_fn( - fn=lambda x: _string_to_bytes(x, max_length=bytes_per_word), - elems=tokens_flat, - dtype=tf.int32, - back_prop=False) - tf.logging.info(as_bytes_flat) - as_bytes = tf.reshape(as_bytes_flat, [shape[0], bytes_per_word]) - return as_bytes + """Given a sequence of strings, map to sequence of bytes. + + Args: + tokens: A tf.string tensor + + Returns: + A tensor of shape words.shape + [bytes_per_word] containing byte versions + of each word. + """ + bytes_per_word = DEFAULT_CHAR_MAXLEN + with tf.device("/cpu:0"): + tf.assert_rank(tokens, 1) + shape = tf.shape(tokens) + tf.logging.info(tokens) + tokens_flat = tf.reshape(tokens, [-1]) + as_bytes_flat = tf.map_fn( + fn=lambda x: _string_to_bytes(x, max_length=bytes_per_word), + elems=tokens_flat, + dtype=tf.int32, + back_prop=False, + ) + tf.logging.info(as_bytes_flat) + as_bytes = tf.reshape(as_bytes_flat, [shape[0], bytes_per_word]) + return as_bytes def load_vocab(vocab_file): - vocab = [] - with codecs.getreader("utf-8")(tf.gfile.GFile(vocab_file, "rb")) as f: - vocab_size = 0 - for word in f: - vocab_size += 1 - vocab.append(word.strip()) - return vocab, vocab_size - - -def check_vocab(vocab_file, out_dir, check_special_token=True, sos=None, - eos=None, unk=None): - """Check if vocab_file doesn't exist, create from corpus_file.""" - if tf.gfile.Exists(vocab_file): - utils.print_out("# Vocab file %s exists" % vocab_file) - vocab, vocab_size = load_vocab(vocab_file) - if check_special_token: - # Verify if the vocab starts with unk, sos, eos - # If not, prepend those tokens & generate a new vocab file - if not unk: unk = UNK - if not sos: sos = SOS - if not eos: eos = EOS - assert len(vocab) >= 3 - if vocab[0] != unk or vocab[1] != sos or vocab[2] != eos: - utils.print_out("The first 3 vocab words [%s, %s, %s]" - " are not [%s, %s, %s]" % - (vocab[0], vocab[1], vocab[2], unk, sos, eos)) - vocab = [unk, sos, eos] + vocab - vocab_size += 3 - new_vocab_file = os.path.join(out_dir, os.path.basename(vocab_file)) - with codecs.getwriter("utf-8")( - tf.gfile.GFile(new_vocab_file, "wb")) as f: - for word in vocab: - f.write("%s\n" % word) - vocab_file = new_vocab_file - else: - raise ValueError("vocab_file '%s' does not exist." 
% vocab_file) - - vocab_size = len(vocab) - return vocab_size, vocab_file + vocab = [] + with codecs.getreader("utf-8")(tf.gfile.GFile(vocab_file, "rb")) as f: + vocab_size = 0 + for word in f: + vocab_size += 1 + vocab.append(word.strip()) + return vocab, vocab_size + + +def check_vocab( + vocab_file, out_dir, check_special_token=True, sos=None, eos=None, unk=None +): + """Check if vocab_file doesn't exist, create from corpus_file.""" + if tf.gfile.Exists(vocab_file): + utils.print_out("# Vocab file %s exists" % vocab_file) + vocab, vocab_size = load_vocab(vocab_file) + if check_special_token: + # Verify if the vocab starts with unk, sos, eos + # If not, prepend those tokens & generate a new vocab file + if not unk: + unk = UNK + if not sos: + sos = SOS + if not eos: + eos = EOS + assert len(vocab) >= 3 + if vocab[0] != unk or vocab[1] != sos or vocab[2] != eos: + utils.print_out( + "The first 3 vocab words [%s, %s, %s]" + " are not [%s, %s, %s]" + % (vocab[0], vocab[1], vocab[2], unk, sos, eos) + ) + vocab = [unk, sos, eos] + vocab + vocab_size += 3 + new_vocab_file = os.path.join( + out_dir, os.path.basename(vocab_file)) + with codecs.getwriter("utf-8")( + tf.gfile.GFile(new_vocab_file, "wb") + ) as f: + for word in vocab: + f.write("%s\n" % word) + vocab_file = new_vocab_file + else: + raise ValueError("vocab_file '%s' does not exist." % vocab_file) + + vocab_size = len(vocab) + return vocab_size, vocab_file def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab): - """Creates vocab tables for src_vocab_file and tgt_vocab_file.""" - src_vocab_table = lookup_ops.index_table_from_file( - src_vocab_file, default_value=UNK_ID) - if share_vocab: - tgt_vocab_table = src_vocab_table - else: - tgt_vocab_table = lookup_ops.index_table_from_file( - tgt_vocab_file, default_value=UNK_ID) - return src_vocab_table, tgt_vocab_table + """Creates vocab tables for src_vocab_file and tgt_vocab_file.""" + src_vocab_table = lookup_ops.index_table_from_file( + src_vocab_file, default_value=UNK_ID + ) + if share_vocab: + tgt_vocab_table = src_vocab_table + else: + tgt_vocab_table = lookup_ops.index_table_from_file( + tgt_vocab_file, default_value=UNK_ID + ) + return src_vocab_table, tgt_vocab_table def load_embed_txt(embed_file): - """Load embed_file into a python dictionary. - - Note: the embed_file should be a Glove/word2vec formatted txt file. Assuming - Here is an exampe assuming embed_size=5: - - the -0.071549 0.093459 0.023738 -0.090339 0.056123 - to 0.57346 0.5417 -0.23477 -0.3624 0.4037 - and 0.20327 0.47348 0.050877 0.002103 0.060547 - - For word2vec format, the first line will be: . - - Args: - embed_file: file path to the embedding file. - Returns: - a dictionary that maps word to vector, and the size of embedding dimensions. - """ - emb_dict = dict() - emb_size = None - - is_first_line = True - with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, "rb")) as f: - for line in f: - tokens = line.rstrip().split(" ") - if is_first_line: - is_first_line = False - if len(tokens) == 2: # header line - emb_size = int(tokens[1]) - continue - word = tokens[0] - vec = list(map(float, tokens[1:])) - emb_dict[word] = vec - if emb_size: - if emb_size != len(vec): - utils.print_out( - "Ignoring %s since embeding size is inconsistent." % word) - del emb_dict[word] - else: - emb_size = len(vec) - return emb_dict, emb_size + """Load embed_file into a python dictionary. + + Note: the embed_file should be a Glove/word2vec formatted txt file. 
Assuming + Here is an exampe assuming embed_size=5: + + the -0.071549 0.093459 0.023738 -0.090339 0.056123 + to 0.57346 0.5417 -0.23477 -0.3624 0.4037 + and 0.20327 0.47348 0.050877 0.002103 0.060547 + + For word2vec format, the first line will be: . + + Args: + embed_file: file path to the embedding file. + Returns: + a dictionary that maps word to vector, and the size of embedding dimensions. + """ + emb_dict = dict() + emb_size = None + + is_first_line = True + with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, "rb")) as f: + for line in f: + tokens = line.rstrip().split(" ") + if is_first_line: + is_first_line = False + if len(tokens) == 2: # header line + emb_size = int(tokens[1]) + continue + word = tokens[0] + vec = list(map(float, tokens[1:])) + emb_dict[word] = vec + if emb_size: + if emb_size != len(vec): + utils.print_out( + "Ignoring %s since embeding size is inconsistent." % word + ) + del emb_dict[word] + else: + emb_size = len(vec) + return emb_dict, emb_size diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/vocab_utils_test.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/vocab_utils_test.py index 391400f5c..8dfd041f4 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/vocab_utils_test.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/vocab_utils_test.py @@ -28,30 +28,31 @@ class VocabUtilsTest(tf.test.TestCase): - def testCheckVocab(self): - # Create a vocab file - vocab_dir = os.path.join(tf.test.get_temp_dir(), "vocab_dir") - os.makedirs(vocab_dir) - vocab_file = os.path.join(vocab_dir, "vocab_file") - vocab = ["a", "b", "c"] - with codecs.getwriter("utf-8")(tf.gfile.GFile(vocab_file, "wb")) as f: - for word in vocab: - f.write("%s\n" % word) - - # Call vocab_utils - out_dir = os.path.join(tf.test.get_temp_dir(), "out_dir") - os.makedirs(out_dir) - vocab_size, new_vocab_file = vocab_utils.check_vocab( - vocab_file, out_dir) - - # Assert: we expect the code to add , , and - # create a new vocab file - self.assertEqual(len(vocab) + 3, vocab_size) - self.assertEqual(os.path.join(out_dir, "vocab_file"), new_vocab_file) - new_vocab, _ = vocab_utils.load_vocab(new_vocab_file) - self.assertEqual( - [vocab_utils.UNK, vocab_utils.SOS, vocab_utils.EOS] + vocab, new_vocab) + def testCheckVocab(self): + # Create a vocab file + vocab_dir = os.path.join(tf.test.get_temp_dir(), "vocab_dir") + os.makedirs(vocab_dir) + vocab_file = os.path.join(vocab_dir, "vocab_file") + vocab = ["a", "b", "c"] + with codecs.getwriter("utf-8")(tf.gfile.GFile(vocab_file, "wb")) as f: + for word in vocab: + f.write("%s\n" % word) + + # Call vocab_utils + out_dir = os.path.join(tf.test.get_temp_dir(), "out_dir") + os.makedirs(out_dir) + vocab_size, new_vocab_file = vocab_utils.check_vocab( + vocab_file, out_dir) + + # Assert: we expect the code to add , , and + # create a new vocab file + self.assertEqual(len(vocab) + 3, vocab_size) + self.assertEqual(os.path.join(out_dir, "vocab_file"), new_vocab_file) + new_vocab, _ = vocab_utils.load_vocab(new_vocab_file) + self.assertEqual( + [vocab_utils.UNK, vocab_utils.SOS, vocab_utils.EOS] + vocab, new_vocab + ) if __name__ == "__main__": - tf.test.main() + tf.test.main() diff --git a/retired_benchmarks/translation/gnmt/tensorflow/process_accuracy.py b/retired_benchmarks/translation/gnmt/tensorflow/process_accuracy.py index 77343f39a..53f3c7cf1 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/process_accuracy.py +++ 
b/retired_benchmarks/translation/gnmt/tensorflow/process_accuracy.py @@ -24,29 +24,44 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--reference', type=str, default=os.path.join(os.getcwd(), 'nmt', 'data', 'newstest2014.tok.bpe.32000.de'), - help="Reference text to compare accuracy against.") - parser.add_argument('--accuracy_log', type=str, default = 'mlperf_log_accuracy.json', - help="Accuracy log file") + parser.add_argument( + "--reference", + type=str, + default=os.path.join( + os.getcwd(), "nmt", "data", "newstest2014.tok.bpe.32000.de" + ), + help="Reference text to compare accuracy against.", + ) + parser.add_argument( + "--accuracy_log", + type=str, + default="mlperf_log_accuracy.json", + help="Accuracy log file", + ) args = parser.parse_args() - # Check whether reference and log files exist if not os.path.exists(args.reference): - print("Could not find reference file {}. Please specify its location".format(args.reference)) + print( + "Could not find reference file {}. Please specify its location".format( + args.reference + ) + ) sys.exit(0) if not os.path.exists(args.accuracy_log): - print("Could not find accuracy log file {}. Please specify its location".format(args.accuracy_log)) + print( + "Could not find accuracy log file {}. Please specify its location".format( + args.accuracy_log + ) + ) sys.exit(0) - ## # @note: List of lists of words from the reference # @note: ref[i][j] refers to the j'th word of sentence i ref = [] - with codecs.getreader("utf-8")( - tf.gfile.GFile(args.reference, "rb")) as ifh: + with codecs.getreader("utf-8")(tf.gfile.GFile(args.reference, "rb")) as ifh: ref_sentences = ifh.readlines() # Sanitize each sentence and convert to array of words ref = [e_utils._clean(s, "bpe").split(" ") for s in ref_sentences] @@ -81,7 +96,6 @@ # Update the Running BLEU Scorer for this sentence runningBLUE.add_sentence(ref[sent_id], trans) - (bleu, _, _, _, _, _) = runningBLUE.calc_BLEU_score() - print("BLEU: %.1f" % (bleu * 100)) \ No newline at end of file + print("BLEU: %.1f" % (bleu * 100)) diff --git a/retired_benchmarks/translation/gnmt/tensorflow/run_task.py b/retired_benchmarks/translation/gnmt/tensorflow/run_task.py index 365ce76ec..b2c0be68f 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/run_task.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/run_task.py @@ -15,73 +15,117 @@ parser = argparse.ArgumentParser() -parser.add_argument('--run', type=str, default='performance', - help="Specify either 'accuracy' for BLEU metric or " - "'performance' (default) for prediction latency and throughput" +parser.add_argument( + "--run", + type=str, + default="performance", + help="Specify either 'accuracy' for BLEU metric or " + "'performance' (default) for prediction latency and throughput", ) -parser.add_argument('--batch_size', type=str, default='32', - help="Specify inference batch size" +parser.add_argument( + "--batch_size", type=str, default="32", help="Specify inference batch size" ) -parser.add_argument('--num_inter_threads', type=str, default='0', - help="Specify inference num_inter_threads" +parser.add_argument( + "--num_inter_threads", + type=str, + default="0", + help="Specify inference num_inter_threads", ) -parser.add_argument('--num_intra_threads', type=str, default='0', - help="Specify inference num_intra_threads" +parser.add_argument( + "--num_intra_threads", + type=str, + default="0", + help="Specify inference num_intra_threads", ) -parser.add_argument('--dataset_path', type=str, - 
default=os.path.join(os.getcwd(), 'nmt', 'data'), - help="Specify dataset directory path" +parser.add_argument( + "--dataset_path", + type=str, + default=os.path.join(os.getcwd(), "nmt", "data"), + help="Specify dataset directory path", ) -parser.add_argument('--model_path', type=str, - default=os.path.join(os.getcwd(), 'ende_gnmt_model_4_layer'), - help="Specify model directory path" +parser.add_argument( + "--model_path", + type=str, + default=os.path.join(os.getcwd(), "ende_gnmt_model_4_layer"), + help="Specify model directory path", ) -parser.add_argument('--output_path', type=str, - default=os.path.join(os.getcwd(), 'nmt', 'data'), - help="Specify output directory path" +parser.add_argument( + "--output_path", + type=str, + default=os.path.join(os.getcwd(), "nmt", "data"), + help="Specify output directory path", ) - args = parser.parse_args() -cpk_path = os.path.join(args.model_path, 'translate.ckpt') +cpk_path = os.path.join(args.model_path, "translate.ckpt") -haparams_path = os.path.join(os.getcwd(), 'nmt', 'standard_hparams', - 'wmt16_gnmt_4_layer.json') +haparams_path = os.path.join( + os.getcwd(), "nmt", "standard_hparams", "wmt16_gnmt_4_layer.json" +) -vocab_prefix = os.path.join(args.dataset_path, 'vocab.bpe.32000') +vocab_prefix = os.path.join(args.dataset_path, "vocab.bpe.32000") -inference_ref_file = os.path.join(args.dataset_path, 'newstest2014.tok.bpe.32000.de') +inference_ref_file = os.path.join( + args.dataset_path, + "newstest2014.tok.bpe.32000.de") -inference_input_file = os.path.join(args.dataset_path, 'newstest2014.tok.bpe.32000.en') +inference_input_file = os.path.join( + args.dataset_path, + "newstest2014.tok.bpe.32000.en") -out_dir = os.path.join(args.output_path, 'result', 'output') +out_dir = os.path.join(args.output_path, "result", "output") -inference_output_file = os.path.join(args.output_path, 'output', 'g_nmt-out') +inference_output_file = os.path.join(args.output_path, "output", "g_nmt-out") -outpath = os.path.join(args.output_path, 'output', 'console_out_gnmt.txt') +outpath = os.path.join(args.output_path, "output", "console_out_gnmt.txt") -cmd = "python -m nmt.nmt \ +cmd = ( + "python -m nmt.nmt \ --src=en --tgt=de \ - --ckpt="+cpk_path+" \ - --hparams_path="+haparams_path+" \ - --out_dir="+out_dir+" \ - --vocab_prefix="+vocab_prefix+" \ - --inference_input_file="+inference_input_file+" \ - --inference_output_file="+inference_output_file+" \ - --inference_ref_file="+inference_ref_file+" \ - --infer_batch_size="+args.batch_size+" \ - --num_inter_threads="+args.num_inter_threads+" \ - --num_intra_threads="+args.num_intra_threads+" \ - --iterations="+str(iterations)+" \ - --run="+args.run + --ckpt=" + + cpk_path + + " \ + --hparams_path=" + + haparams_path + + " \ + --out_dir=" + + out_dir + + " \ + --vocab_prefix=" + + vocab_prefix + + " \ + --inference_input_file=" + + inference_input_file + + " \ + --inference_output_file=" + + inference_output_file + + " \ + --inference_ref_file=" + + inference_ref_file + + " \ + --infer_batch_size=" + + args.batch_size + + " \ + --num_inter_threads=" + + args.num_inter_threads + + " \ + --num_intra_threads=" + + args.num_intra_threads + + " \ + --iterations=" + + str(iterations) + + " \ + --run=" + + args.run +) return_code = subprocess.call(cmd, shell=True) diff --git a/retired_benchmarks/vision/classification_and_detection/python/backend.py b/retired_benchmarks/vision/classification_and_detection/python/backend.py index 955eddb88..6fc13454a 100755 --- 
a/retired_benchmarks/vision/classification_and_detection/python/backend.py +++ b/retired_benchmarks/vision/classification_and_detection/python/backend.py @@ -2,10 +2,10 @@ abstract backend class """ - # pylint: disable=unused-argument,missing-docstring -class Backend(): + +class Backend: def __init__(self): self.inputs = [] self.outputs = [] diff --git a/retired_benchmarks/vision/classification_and_detection/python/backend_pytorch_native.py b/retired_benchmarks/vision/classification_and_detection/python/backend_pytorch_native.py index f631ac5d3..ac79dee3a 100755 --- a/retired_benchmarks/vision/classification_and_detection/python/backend_pytorch_native.py +++ b/retired_benchmarks/vision/classification_and_detection/python/backend_pytorch_native.py @@ -1,18 +1,19 @@ """ -pytoch native backend +pytoch native backend """ + # pylint: disable=unused-argument,missing-docstring import torch # currently supports pytorch1.0 import backend - class BackendPytorchNative(backend.Backend): def __init__(self): super(BackendPytorchNative, self).__init__() self.sess = None self.model = None self.device = "cuda:0" if torch.cuda.is_available() else "cpu" + def version(self): return torch.__version__ @@ -23,7 +24,10 @@ def image_format(self): return "NCHW" def load(self, model_path, inputs=None, outputs=None): - self.model = torch.load(model_path,map_location=lambda storage, loc: storage) + self.model = torch.load( + model_path, + map_location=lambda storage, + loc: storage) self.model.eval() # find inputs from the model if not passed in by config if inputs: @@ -48,10 +52,9 @@ def load(self, model_path, inputs=None, outputs=None): self.model = self.model.to(self.device) return self - def predict(self, feed): - key=[key for key in feed.keys()][0] + key = [key for key in feed.keys()][0] feed[key] = torch.tensor(feed[key]).float().to(self.device) with torch.no_grad(): - output = self.model(feed[key]) + output = self.model(feed[key]) return output diff --git a/retired_benchmarks/vision/classification_and_detection/python/backend_tf.py b/retired_benchmarks/vision/classification_and_detection/python/backend_tf.py index 05245a454..7cf2e14e4 100755 --- a/retired_benchmarks/vision/classification_and_detection/python/backend_tf.py +++ b/retired_benchmarks/vision/classification_and_detection/python/backend_tf.py @@ -23,11 +23,13 @@ def name(self): return "tensorflow" def image_format(self): - # By default tensorflow uses NHWC (and the cpu implementation only does NHWC) + # By default tensorflow uses NHWC (and the cpu implementation only does + # NHWC) return "NHWC" def load(self, model_path, inputs=None, outputs=None): - # there is no input/output meta data i the graph so it need to come from config. + # there is no input/output meta data i the graph so it need to come + # from config. 
if not inputs: raise ValueError("BackendTensorflow needs inputs") if not outputs: @@ -36,26 +38,40 @@ def load(self, model_path, inputs=None, outputs=None): self.inputs = inputs infer_config = tf.compat.v1.ConfigProto() - infer_config.intra_op_parallelism_threads = int(os.environ['TF_INTRA_OP_PARALLELISM_THREADS']) \ - if 'TF_INTRA_OP_PARALLELISM_THREADS' in os.environ else os.cpu_count() - infer_config.inter_op_parallelism_threads = int(os.environ['TF_INTER_OP_PARALLELISM_THREADS']) \ - if 'TF_INTER_OP_PARALLELISM_THREADS' in os.environ else os.cpu_count() + infer_config.intra_op_parallelism_threads = ( + int(os.environ["TF_INTRA_OP_PARALLELISM_THREADS"]) + if "TF_INTRA_OP_PARALLELISM_THREADS" in os.environ + else os.cpu_count() + ) + infer_config.inter_op_parallelism_threads = ( + int(os.environ["TF_INTER_OP_PARALLELISM_THREADS"]) + if "TF_INTER_OP_PARALLELISM_THREADS" in os.environ + else os.cpu_count() + ) infer_config.use_per_session_threads = 1 # TODO: support checkpoint and saved_model formats? graph_def = tf.compat.v1.GraphDef() with tf.compat.v1.gfile.FastGFile(model_path, "rb") as f: graph_def.ParseFromString(f.read()) - for as_datatype_enum in [dtypes.float32.as_datatype_enum, dtypes.uint8.as_datatype_enum]: + for as_datatype_enum in [ + dtypes.float32.as_datatype_enum, + dtypes.uint8.as_datatype_enum, + ]: try: - optimized_graph_def = optimize_for_inference(graph_def, [item.split(':')[0] for item in inputs], - [item.split(':')[0] for item in outputs], as_datatype_enum, False) + optimized_graph_def = optimize_for_inference( + graph_def, + [item.split(":")[0] for item in inputs], + [item.split(":")[0] for item in outputs], + as_datatype_enum, + False, + ) graph_def = optimized_graph_def break except ValueError: pass - g = tf.compat.v1.import_graph_def(graph_def, name='') + g = tf.compat.v1.import_graph_def(graph_def, name="") self.sess = tf.compat.v1.Session(graph=g, config=infer_config) return self diff --git a/retired_benchmarks/vision/classification_and_detection/python/backend_tflite.py b/retired_benchmarks/vision/classification_and_detection/python/backend_tflite.py index 7c8c78c13..fa6cc5ba2 100755 --- a/retired_benchmarks/vision/classification_and_detection/python/backend_tflite.py +++ b/retired_benchmarks/vision/classification_and_detection/python/backend_tflite.py @@ -10,12 +10,14 @@ # try dedicated tflite package first import tflite_runtime import tflite_runtime.interpreter as tflite + _version = tflite_runtime.__version__ _git_version = tflite_runtime.__git_version__ -except: +except BaseException: # fall back to tflite bundled in tensorflow import tensorflow as tf from tensorflow.lite.python import interpreter as tflite + _version = tf.__version__ _git_version = tf.__git_version__ @@ -43,8 +45,12 @@ def load(self, model_path, inputs=None, outputs=None): self.sess = tflite.Interpreter(model_path=model_path) self.sess.allocate_tensors() # keep input/output name to index mapping - self.input2index = {i["name"]: i["index"] for i in self.sess.get_input_details()} - self.output2index = {i["name"]: i["index"] for i in self.sess.get_output_details()} + self.input2index = { + i["name"]: i["index"] for i in self.sess.get_input_details() + } + self.output2index = { + i["name"]: i["index"] for i in self.sess.get_output_details() + } # keep input/output names self.inputs = list(self.input2index.keys()) self.outputs = list(self.output2index.keys()) diff --git a/retired_benchmarks/vision/classification_and_detection/python/coco.py 
b/retired_benchmarks/vision/classification_and_detection/python/coco.py index 408015874..d11ac2c5f 100644 --- a/retired_benchmarks/vision/classification_and_detection/python/coco.py +++ b/retired_benchmarks/vision/classification_and_detection/python/coco.py @@ -20,8 +20,19 @@ class Coco(dataset.Dataset): - def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, - image_format="NHWC", pre_process=None, count=None, cache_dir=None,use_label_map=False): + def __init__( + self, + data_path, + image_list, + name, + use_cache=0, + image_size=None, + image_format="NHWC", + pre_process=None, + count=None, + cache_dir=None, + use_label_map=False, + ): super().__init__() self.image_size = image_size self.image_list = [] @@ -32,17 +43,19 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, self.use_cache = use_cache self.data_path = data_path self.pre_process = pre_process - self.use_label_map=use_label_map + self.use_label_map = use_label_map if not cache_dir: cache_dir = os.getcwd() - self.cache_dir = os.path.join(cache_dir, "preprocessed", name, image_format) + self.cache_dir = os.path.join( + cache_dir, "preprocessed", name, image_format) # input images are in HWC self.need_transpose = True if image_format == "NCHW" else False - not_found = 0 + not_found = 0 empty_80catageories = 0 if image_list is None: # by default look for val_map.txt - image_list = os.path.join(data_path, "annotations/instances_val2017.json") + image_list = os.path.join( + data_path, "annotations/instances_val2017.json") self.annotation_file = image_list if self.use_label_map: # for pytorch @@ -58,16 +71,22 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, with open(image_list, "r") as f: coco = json.load(f) for i in coco["images"]: - images[i["id"]] = {"file_name": i["file_name"], - "height": i["height"], - "width": i["width"], - "bbox": [], - "category": []} + images[i["id"]] = { + "file_name": i["file_name"], + "height": i["height"], + "width": i["width"], + "bbox": [], + "category": [], + } for a in coco["annotations"]: i = images.get(a["image_id"]) if i is None: continue - catagory_ids = label_map[a.get("category_id")] if self.use_label_map else a.get("category_id") + catagory_ids = ( + label_map[a.get("category_id")] + if self.use_label_map + else a.get("category_id") + ) i["category"].append(catagory_ids) i["bbox"].append(a.get("bbox")) @@ -78,17 +97,23 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, # if the image does not exists ignore it not_found += 1 continue - if len(img["category"])==0 and self.use_label_map: - #if an image doesn't have any of the 81 categories in it - empty_80catageories += 1 #should be 48 images - thus the validation sert has 4952 images - continue + if len(img["category"]) == 0 and self.use_label_map: + # if an image doesn't have any of the 81 categories in it + empty_80catageories += ( + 1 # should be 48 images - thus the validation sert has 4952 images + ) + continue - os.makedirs(os.path.dirname(os.path.join(self.cache_dir, image_name)), exist_ok=True) + os.makedirs( + os.path.dirname(os.path.join(self.cache_dir, image_name)), exist_ok=True + ) dst = os.path.join(self.cache_dir, image_name) if not os.path.exists(dst + ".npy"): # cache a preprocessed version of the image img_org = cv2.imread(src) - processed = self.pre_process(img_org, need_transpose=self.need_transpose, dims=self.image_size) + processed = self.pre_process( + img_org, need_transpose=self.need_transpose, 
dims=self.image_size + ) np.save(dst, processed) self.image_ids.append(image_id) @@ -107,10 +132,16 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, if not_found > 0: log.info("reduced image list, %d images not found", not_found) if empty_80catageories > 0: - log.info("reduced image list, %d images without any of the 80 categories", empty_80catageories) + log.info( + "reduced image list, %d images without any of the 80 categories", + empty_80catageories, + ) - log.info("loaded {} images, cache={}, took={:.1f}sec".format( - len(self.image_list), use_cache, time_taken)) + log.info( + "loaded {} images, cache={}, took={:.1f}sec".format( + len(self.image_list), use_cache, time_taken + ) + ) self.label_list = np.array(self.label_list) @@ -129,6 +160,7 @@ class PostProcessCoco: """ Post processing for tensorflow ssd-mobilenet style models """ + def __init__(self): self.results = [] self.good = 0 @@ -139,14 +171,22 @@ def __init__(self): def add_results(self, results): self.results.extend(results) - def __call__(self, results, ids, expected=None, result_dict=None, ): + def __call__( + self, + results, + ids, + expected=None, + result_dict=None, + ): # results come as: - # tensorflow, ssd-mobilenet: num_detections,detection_boxes,detection_scores,detection_classes + # tensorflow, ssd-mobilenet: + # num_detections,detection_boxes,detection_scores,detection_classes processed_results = [] # batch size bs = len(results[0]) for idx in range(0, bs): - # keep the content_id from loadgen to handle content_id's without results + # keep the content_id from loadgen to handle content_id's without + # results self.content_ids.append(ids[idx]) processed_results.append([]) detection_num = int(results[0][idx]) @@ -158,10 +198,17 @@ def __call__(self, results, ids, expected=None, result_dict=None, ): if detection_class in expected_classes: self.good += 1 box = detection_boxes[detection] - processed_results[idx].append([float(ids[idx]), - box[0], box[1], box[2], box[3], - results[2][idx][detection], - float(detection_class)]) + processed_results[idx].append( + [ + float(ids[idx]), + box[0], + box[1], + box[2], + box[3], + results[2][idx][detection], + float(detection_class), + ] + ) self.total += 1 return processed_results @@ -181,7 +228,7 @@ def finalize(self, result_dict, ds=None, output_dir=None): annotations = json.load(fin) for cnt, cat in enumerate(annotations["categories"]): label_map[cat["id"]] = cnt + 1 - inv_map = {v:k for k,v in label_map.items()} + inv_map = {v: k for k, v in label_map.items()} detections = [] image_indices = [] @@ -192,8 +239,13 @@ def finalize(self, result_dict, ds=None, output_dir=None): # this is the index of the coco image image_idx = int(detection[0]) if image_idx != self.content_ids[batch]: - # working with the coco index/id is error prone - extra check to make sure it is consistent - log.error("image_idx missmatch, lg={} / result={}".format(image_idx, self.content_ids[batch])) + # working with the coco index/id is error prone - extra + # check to make sure it is consistent + log.error( + "image_idx missmatch, lg={} / result={}".format( + image_idx, self.content_ids[batch] + ) + ) # map the index to the coco image id detection[0] = ds.image_ids[image_idx] height, width = ds.image_sizes[image_idx] @@ -211,16 +263,19 @@ def finalize(self, result_dict, ds=None, output_dir=None): cat_id = inv_map.get(int(detection[6]), -1) if cat_id == -1: # FIXME: - log.info("finalize can't map category {}".format(int(detection[6]))) - detection[6] = cat_id + 
log.info( + "finalize can't map category {}".format( + int(detection[6])) + ) + detection[6] = cat_id detections.append(np.array(detection)) # map indices to coco image id's - image_ids = [ds.image_ids[i] for i in image_indices] + image_ids = [ds.image_ids[i] for i in image_indices] self.results = [] cocoGt = pycoco.COCO(ds.annotation_file) cocoDt = cocoGt.loadRes(np.array(detections)) - cocoEval = COCOeval(cocoGt, cocoDt, iouType='bbox') + cocoEval = COCOeval(cocoGt, cocoDt, iouType="bbox") cocoEval.params.imgIds = image_ids cocoEval.evaluate() cocoEval.accumulate() @@ -232,11 +287,12 @@ class PostProcessCocoPt(PostProcessCoco): """ Post processing required by ssd-resnet34 / pytorch """ - def __init__(self,use_inv_map,score_threshold): + + def __init__(self, use_inv_map, score_threshold): super().__init__() self.use_inv_map = use_inv_map self.score_threshold = score_threshold - + def __call__(self, results, ids, expected=None, result_dict=None): # results come as: # detection_boxes,detection_classes,detection_scores @@ -251,7 +307,7 @@ def __call__(self, results, ids, expected=None, result_dict=None): detection_classes = results[1][idx] expected_classes = expected[idx][0] scores = results[2][idx] - #for detection in range(0, len(expected_classes)): + # for detection in range(0, len(expected_classes)): for detection in range(0, len(scores)): if scores[detection] < self.score_threshold: break @@ -260,10 +316,17 @@ def __call__(self, results, ids, expected=None, result_dict=None): self.good += 1 box = detection_boxes[detection] # comes from model as: 0=xmax 1=ymax 2=xmin 3=ymin - processed_results[idx].append([float(ids[idx]), - box[1], box[0], box[3], box[2], - scores[detection], - float(detection_class)]) + processed_results[idx].append( + [ + float(ids[idx]), + box[1], + box[0], + box[3], + box[2], + scores[detection], + float(detection_class), + ] + ) self.total += 1 return processed_results @@ -272,12 +335,14 @@ class PostProcessCocoOnnx(PostProcessCoco): """ Post processing required by ssd-resnet34 / onnx """ + def __init__(self): super().__init__() def __call__(self, results, ids, expected=None, result_dict=None): # results come as: - # onnx (from pytorch ssd-resnet34): detection_boxes,detection_classes,detection_scores + # onnx (from pytorch ssd-resnet34): + # detection_boxes,detection_classes,detection_scores processed_results = [] @@ -298,17 +363,26 @@ def __call__(self, results, ids, expected=None, result_dict=None): self.good += 1 box = detection_boxes[detection] # comes from model as: 0=xmax 1=ymax 2=xmin 3=ymin - processed_results[idx].append([float(ids[idx]), - box[1], box[0], box[3], box[2], - scores[detection], - float(detection_class)]) + processed_results[idx].append( + [ + float(ids[idx]), + box[1], + box[0], + box[3], + box[2], + scores[detection], + float(detection_class), + ] + ) self.total += 1 return processed_results + class PostProcessCocoTf(PostProcessCoco): """ Post processing required by ssd-resnet34 / pytorch """ + def __init__(self): super().__init__() self.use_inv_map = True @@ -335,9 +409,16 @@ def __call__(self, results, ids, expected=None, result_dict=None): self.good += 1 box = detection_boxes[detection] # comes from model as: 0=xmax 1=ymax 2=xmin 3=ymin - processed_results[idx].append([float(ids[idx]), - box[0], box[1], box[2], box[3], - scores[detection], - float(detection_class)]) + processed_results[idx].append( + [ + float(ids[idx]), + box[0], + box[1], + box[2], + box[3], + scores[detection], + float(detection_class), + ] + ) self.total += 1 return 
processed_results diff --git a/retired_benchmarks/vision/classification_and_detection/python/dataset.py b/retired_benchmarks/vision/classification_and_detection/python/dataset.py index dce968a3d..34ffa54b5 100755 --- a/retired_benchmarks/vision/classification_and_detection/python/dataset.py +++ b/retired_benchmarks/vision/classification_and_detection/python/dataset.py @@ -15,7 +15,8 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger("dataset") -class Item(): + +class Item: def __init__(self, label, img, idx): self.label = label self.img = img @@ -24,19 +25,25 @@ def __init__(self, label, img, idx): def usleep(sec): - if sys.platform == 'win32': + if sys.platform == "win32": # on windows time.sleep() doesn't work to well import ctypes + kernel32 = ctypes.windll.kernel32 - timer = kernel32.CreateWaitableTimerA(ctypes.c_void_p(), True, ctypes.c_void_p()) + timer = kernel32.CreateWaitableTimerA( + ctypes.c_void_p(), True, ctypes.c_void_p() + ) delay = ctypes.c_longlong(int(-1 * (10 * 1000000 * sec))) - kernel32.SetWaitableTimer(timer, ctypes.byref(delay), 0, ctypes.c_void_p(), ctypes.c_void_p(), False) - kernel32.WaitForSingleObject(timer, 0xffffffff) + kernel32.SetWaitableTimer( + timer, ctypes.byref( + delay), 0, ctypes.c_void_p(), ctypes.c_void_p(), False + ) + kernel32.WaitForSingleObject(timer, 0xFFFFFFFF) else: time.sleep(sec) -class Dataset(): +class Dataset: def __init__(self): self.arrival = None self.image_list = [] @@ -62,7 +69,7 @@ def load_query_samples(self, sample_list): def unload_query_samples(self, sample_list): if sample_list: for sample in sample_list: - if sample in self.image_list_inmemory : + if sample in self.image_list_inmemory: del self.image_list_inmemory[sample] else: self.image_list_inmemory = {} @@ -102,7 +109,7 @@ def start(self): self.good = 0 self.total = 0 - def finalize(self, results, ds=False, output_dir=None): + def finalize(self, results, ds=False, output_dir=None): results["good"] = self.good results["total"] = self.total @@ -141,6 +148,7 @@ def finalize(self, results, ds=False, output_dir=None): # pre-processing # + def center_crop(img, out_height, out_width): height, width, _ = img.shape left = int((width - out_width) / 2) @@ -151,10 +159,12 @@ def center_crop(img, out_height, out_width): return img -def resize_with_aspectratio(img, out_height, out_width, scale=87.5, inter_pol=cv2.INTER_LINEAR): +def resize_with_aspectratio( + img, out_height, out_width, scale=87.5, inter_pol=cv2.INTER_LINEAR +): height, width, _ = img.shape - new_height = int(100. * out_height / scale) - new_width = int(100. 
* out_width / scale) + new_height = int(100.0 * out_height / scale) + new_width = int(100.0 * out_width / scale) if height > width: w = new_width h = int(new_height * height / width) @@ -170,9 +180,11 @@ def pre_process_vgg(img, dims=None, need_transpose=False): output_height, output_width, _ = dims cv2_interpol = cv2.INTER_AREA - img = resize_with_aspectratio(img, output_height, output_width, inter_pol=cv2_interpol) + img = resize_with_aspectratio( + img, output_height, output_width, inter_pol=cv2_interpol + ) img = center_crop(img, output_height, output_width) - img = np.asarray(img, dtype='float32') + img = np.asarray(img, dtype="float32") # normalize image means = np.array([123.68, 116.78, 103.94], dtype=np.float32) @@ -188,9 +200,11 @@ def pre_process_mobilenet(img, dims=None, need_transpose=False): img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) output_height, output_width, _ = dims - img = resize_with_aspectratio(img, output_height, output_width, inter_pol=cv2.INTER_LINEAR) + img = resize_with_aspectratio( + img, output_height, output_width, inter_pol=cv2.INTER_LINEAR + ) img = center_crop(img, output_height, output_width) - img = np.asarray(img, dtype='float32') + img = np.asarray(img, dtype="float32") img /= 255.0 img -= 0.5 @@ -211,10 +225,12 @@ def pre_process_imagenet_pytorch(img, dims=None, need_transpose=False): img = F.resize(img, 256, Image.BILINEAR) img = F.center_crop(img, 224) img = F.to_tensor(img) - img = F.normalize(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], inplace=False) + img = F.normalize( + img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], inplace=False + ) if not need_transpose: - img = img.permute(1, 2, 0) # NHWC - img = np.asarray(img, dtype='float32') + img = img.permute(1, 2, 0) # NHWC + img = np.asarray(img, dtype="float32") return img @@ -224,9 +240,10 @@ def maybe_resize(img, dims): # some images might be grayscale img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - if dims != None: + if dims is not None: im_height, im_width, _ = dims - img = cv2.resize(img, (im_width, im_height), interpolation=cv2.INTER_LINEAR) + img = cv2.resize(img, (im_width, im_height), + interpolation=cv2.INTER_LINEAR) return img @@ -254,7 +271,7 @@ def pre_process_coco_resnet34(img, dims=None, need_transpose=False): mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) std = np.array([0.229, 0.224, 0.225], dtype=np.float32) - img = img / 255. 
- mean + img = img / 255.0 - mean img = img / std if need_transpose: diff --git a/retired_benchmarks/vision/classification_and_detection/python/imagenet.py b/retired_benchmarks/vision/classification_and_detection/python/imagenet.py index 57865c6b7..d48845ba9 100755 --- a/retired_benchmarks/vision/classification_and_detection/python/imagenet.py +++ b/retired_benchmarks/vision/classification_and_detection/python/imagenet.py @@ -20,8 +20,18 @@ class Imagenet(dataset.Dataset): - def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, - image_format="NHWC", pre_process=None, count=None, cache_dir=None): + def __init__( + self, + data_path, + image_list, + name, + use_cache=0, + image_size=None, + image_format="NHWC", + pre_process=None, + count=None, + cache_dir=None, + ): super(Imagenet, self).__init__() if image_size is None: self.image_size = [224, 224, 3] @@ -33,7 +43,8 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, self.label_list = [] self.count = count self.use_cache = use_cache - self.cache_dir = os.path.join(cache_dir, "preprocessed", name, image_format) + self.cache_dir = os.path.join( + cache_dir, "preprocessed", name, image_format) self.data_path = data_path self.pre_process = pre_process # input images are in HWC @@ -47,7 +58,7 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, os.makedirs(self.cache_dir, exist_ok=True) start = time.time() - with open(image_list, 'r') as f: + with open(image_list, "r") as f: for s in f: image_name, label = re.split(r"\s+", s.strip()) src = os.path.join(data_path, image_name) @@ -55,15 +66,22 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, # if the image does not exists ignore it not_found += 1 continue - os.makedirs(os.path.dirname(os.path.join(self.cache_dir, image_name)), exist_ok=True) + os.makedirs( + os.path.dirname(os.path.join(self.cache_dir, image_name)), + exist_ok=True, + ) dst = os.path.join(self.cache_dir, image_name) if not os.path.exists(dst + ".npy"): # cache a preprocessed version of the image # TODO: make this multi threaded ? 
img_org = cv2.imread(src) - processed = self.pre_process(img_org, need_transpose=self.need_transpose, dims=self.image_size) + processed = self.pre_process( + img_org, + need_transpose=self.need_transpose, + dims=self.image_size, + ) np.save(dst, processed) - + self.image_list.append(image_name) self.label_list.append(int(label)) @@ -78,8 +96,11 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, if not_found > 0: log.info("reduced image list, %d images not found", not_found) - log.info("loaded {} images, cache={}, took={:.1f}sec".format( - len(self.image_list), use_cache, time_taken)) + log.info( + "loaded {} images, cache={}, took={:.1f}sec".format( + len(self.image_list), use_cache, time_taken + ) + ) self.label_list = np.array(self.label_list) @@ -92,4 +113,3 @@ def get_item(self, nr): def get_item_loc(self, nr): src = os.path.join(self.data_path, self.image_list[nr]) return src - diff --git a/retired_benchmarks/vision/classification_and_detection/python/main.py b/retired_benchmarks/vision/classification_and_detection/python/main.py index 904a8b74a..23eaea9fa 100755 --- a/retired_benchmarks/vision/classification_and_detection/python/main.py +++ b/retired_benchmarks/vision/classification_and_detection/python/main.py @@ -34,33 +34,60 @@ # the datasets we support SUPPORTED_DATASETS = { - "imagenet": - (imagenet.Imagenet, dataset.pre_process_vgg, dataset.PostProcessCommon(offset=-1), - {"image_size": [224, 224, 3]}), - "imagenet_mobilenet": - (imagenet.Imagenet, dataset.pre_process_mobilenet, dataset.PostProcessArgMax(offset=-1), - {"image_size": [224, 224, 3]}), - "imagenet_pytorch": - (imagenet.Imagenet, dataset.pre_process_imagenet_pytorch, dataset.PostProcessArgMax(offset=0), - {"image_size": [224, 224, 3]}), - "coco-300": - (coco.Coco, dataset.pre_process_coco_mobilenet, coco.PostProcessCoco(), - {"image_size": [300, 300, 3]}), - "coco-300-pt": - (coco.Coco, dataset.pre_process_coco_pt_mobilenet, coco.PostProcessCocoPt(False,0.3), - {"image_size": [300, 300, 3]}), - "coco-1200": - (coco.Coco, dataset.pre_process_coco_resnet34, coco.PostProcessCoco(), - {"image_size": [1200, 1200, 3]}), - "coco-1200-onnx": - (coco.Coco, dataset.pre_process_coco_resnet34, coco.PostProcessCocoOnnx(), - {"image_size": [1200, 1200, 3]}), - "coco-1200-pt": - (coco.Coco, dataset.pre_process_coco_resnet34, coco.PostProcessCocoPt(True,0.05), - {"image_size": [1200, 1200, 3],"use_label_map": True}), - "coco-1200-tf": - (coco.Coco, dataset.pre_process_coco_resnet34, coco.PostProcessCocoTf(), - {"image_size": [1200, 1200, 3],"use_label_map": False}), + "imagenet": ( + imagenet.Imagenet, + dataset.pre_process_vgg, + dataset.PostProcessCommon(offset=-1), + {"image_size": [224, 224, 3]}, + ), + "imagenet_mobilenet": ( + imagenet.Imagenet, + dataset.pre_process_mobilenet, + dataset.PostProcessArgMax(offset=-1), + {"image_size": [224, 224, 3]}, + ), + "imagenet_pytorch": ( + imagenet.Imagenet, + dataset.pre_process_imagenet_pytorch, + dataset.PostProcessArgMax(offset=0), + {"image_size": [224, 224, 3]}, + ), + "coco-300": ( + coco.Coco, + dataset.pre_process_coco_mobilenet, + coco.PostProcessCoco(), + {"image_size": [300, 300, 3]}, + ), + "coco-300-pt": ( + coco.Coco, + dataset.pre_process_coco_pt_mobilenet, + coco.PostProcessCocoPt(False, 0.3), + {"image_size": [300, 300, 3]}, + ), + "coco-1200": ( + coco.Coco, + dataset.pre_process_coco_resnet34, + coco.PostProcessCoco(), + {"image_size": [1200, 1200, 3]}, + ), + "coco-1200-onnx": ( + coco.Coco, + dataset.pre_process_coco_resnet34, + 
coco.PostProcessCocoOnnx(), + {"image_size": [1200, 1200, 3]}, + ), + "coco-1200-pt": ( + coco.Coco, + dataset.pre_process_coco_resnet34, + coco.PostProcessCocoPt(True, 0.05), + {"image_size": [1200, 1200, 3], "use_label_map": True}, + ), + "coco-1200-tf": ( + coco.Coco, + dataset.pre_process_coco_resnet34, + coco.PostProcessCocoTf(), + {"image_size": [1200, 1200, 3], "use_label_map": False}, + ), } # pre-defined command line options so simplify things. They are used as defaults and can be @@ -73,7 +100,6 @@ "cache": 0, "max-batchsize": 32, }, - # resnet "resnet50-tf": { "inputs": "input_tensor:0", @@ -95,7 +121,6 @@ "backend": "onnxruntime", "model-name": "resnet50", }, - # mobilenet "mobilenet-tf": { "inputs": "input:0", @@ -110,7 +135,6 @@ "backend": "onnxruntime", "model-name": "mobilenet", }, - # ssd-mobilenet "ssd-mobilenet-tf": { "inputs": "image_tensor:0", @@ -133,7 +157,6 @@ "data-format": "NHWC", "model-name": "ssd-mobilenet", }, - # ssd-resnet34 "ssd-resnet34-tf": { "inputs": "image:0", @@ -182,37 +205,83 @@ def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() - parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset") - parser.add_argument("--dataset-path", required=True, help="path to the dataset") + parser.add_argument( + "--dataset", + choices=SUPPORTED_DATASETS.keys(), + help="dataset") + parser.add_argument( + "--dataset-path", + required=True, + help="path to the dataset") parser.add_argument("--dataset-list", help="path to the dataset list") - parser.add_argument("--data-format", choices=["NCHW", "NHWC"], help="data format") - parser.add_argument("--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles") - parser.add_argument("--scenario", default="SingleStream", - help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys()))) - parser.add_argument("--max-batchsize", type=int, help="max batch size in a single inference") + parser.add_argument( + "--data-format", + choices=[ + "NCHW", + "NHWC"], + help="data format") + parser.add_argument( + "--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles" + ) + parser.add_argument( + "--scenario", + default="SingleStream", + help="mlperf benchmark scenario, one of " + + str(list(SCENARIO_MAP.keys())), + ) + parser.add_argument( + "--max-batchsize", type=int, help="max batch size in a single inference" + ) parser.add_argument("--model", required=True, help="model file") parser.add_argument("--output", default="output", help="test results") parser.add_argument("--inputs", help="model inputs") parser.add_argument("--outputs", help="model outputs") parser.add_argument("--backend", help="runtime to use") - parser.add_argument("--model-name", help="name of the mlperf model, ie. resnet50") - parser.add_argument("--threads", default=os.cpu_count(), type=int, help="threads") + parser.add_argument( + "--model-name", + help="name of the mlperf model, ie. 
resnet50") + parser.add_argument( + "--threads", + default=os.cpu_count(), + type=int, + help="threads") parser.add_argument("--qps", type=int, help="target qps") parser.add_argument("--cache", type=int, default=0, help="use cache") - parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") - parser.add_argument("--find-peak-performance", action="store_true", help="enable finding peak performance pass") - parser.add_argument("--debug", action="store_true", help="debug, turn traces on") + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") + parser.add_argument( + "--find-peak-performance", + action="store_true", + help="enable finding peak performance pass", + ) + parser.add_argument( + "--debug", + action="store_true", + help="debug, turn traces on") # file to use mlperf rules compliant parameters - parser.add_argument("--mlperf_conf", default="../../mlperf.conf", help="mlperf rules config") + parser.add_argument( + "--mlperf_conf", default="../../mlperf.conf", help="mlperf rules config" + ) # file for user LoadGen settings such as target QPS - parser.add_argument("--user_conf", default="user.conf", help="user config for user LoadGen settings such as target QPS") - - # below will override mlperf rules compliant settings - don't use for official submission + parser.add_argument( + "--user_conf", + default="user.conf", + help="user config for user LoadGen settings such as target QPS", + ) + + # below will override mlperf rules compliant settings - don't use for + # official submission parser.add_argument("--time", type=int, help="time to scan in seconds") parser.add_argument("--count", type=int, help="dataset items to use") - parser.add_argument("--max-latency", type=float, help="mlperf max latency in pct tile") - parser.add_argument("--samples-per-query", type=int, help="mlperf multi-stream sample per query") + parser.add_argument( + "--max-latency", type=float, help="mlperf max latency in pct tile" + ) + parser.add_argument( + "--samples-per-query", type=int, help="mlperf multi-stream sample per query" + ) args = parser.parse_args() # don't use defaults in argparser. 
Instead we default to a dict, override that with a profile @@ -239,21 +308,27 @@ def get_args(): def get_backend(backend): if backend == "tensorflow": from backend_tf import BackendTensorflow + backend = BackendTensorflow() elif backend == "onnxruntime": from backend_onnxruntime import BackendOnnxruntime + backend = BackendOnnxruntime() elif backend == "null": from backend_null import BackendNull + backend = BackendNull() elif backend == "pytorch": from backend_pytorch import BackendPytorch + backend = BackendPytorch() elif backend == "pytorch-native": from backend_pytorch_native import BackendPytorchNative - backend = BackendPytorchNative() + + backend = BackendPytorchNative() elif backend == "tflite": from backend_tflite import BackendTflite + backend = BackendTflite() else: raise ValueError("unknown backend: " + backend) @@ -296,7 +371,9 @@ def run_one_item(self, qitem): processed_results = [] try: results = self.model.predict({self.model.inputs[0]: qitem.img}) - processed_results = self.post_process(results, qitem.content_id, qitem.label, self.result_dict) + processed_results = self.post_process( + results, qitem.content_id, qitem.label, self.result_dict + ) if self.take_accuracy: self.post_process.add_results(processed_results) self.result_timing.append(time.time() - qitem.start) @@ -309,7 +386,9 @@ def run_one_item(self, qitem): response_array_refs = [] response = [] for idx, query_id in enumerate(qitem.query_id): - response_array = array.array("B", np.array(processed_results[idx], np.float32).tobytes()) + response_array = array.array( + "B", np.array(processed_results[idx], np.float32).tobytes() + ) response_array_refs.append(response_array) bi = response_array.buffer_info() response.append(lg.QuerySampleResponse(query_id, bi[0], bi[1])) @@ -324,8 +403,10 @@ def enqueue(self, query_samples): else: bs = self.max_batchsize for i in range(0, len(idx), bs): - data, label = self.ds.get_samples(idx[i:i+bs]) - self.run_one_item(Item(query_id[i:i+bs], idx[i:i+bs], data, label)) + data, label = self.ds.get_samples(idx[i: i + bs]) + self.run_one_item( + Item(query_id[i: i + bs], idx[i: i + bs], data, label) + ) def finish(self): pass @@ -339,7 +420,9 @@ def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): self.result_dict = {} for _ in range(self.threads): - worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,)) + worker = threading.Thread( + target=self.handle_tasks, args=( + self.tasks,)) worker.daemon = True self.workers.append(worker) worker.start() @@ -376,10 +459,14 @@ def finish(self): worker.join() -def add_results(final_results, name, result_dict, result_list, took, show_accuracy=False): - percentiles = [50., 80., 90., 95., 99., 99.9] +def add_results( + final_results, name, result_dict, result_list, took, show_accuracy=False +): + percentiles = [50.0, 80.0, 90.0, 95.0, 99.0, 99.9] buckets = np.percentile(result_list, percentiles).tolist() - buckets_str = ",".join(["{}:{:.4f}".format(p, b) for p, b in zip(percentiles, buckets)]) + buckets_str = ",".join( + ["{}:{:.4f}".format(p, b) for p, b in zip(percentiles, buckets)] + ) if result_dict["total"] == 0: result_dict["total"] = len(result_list) @@ -396,19 +483,27 @@ def add_results(final_results, name, result_dict, result_list, took, show_accura } acc_str = "" if show_accuracy: - result["accuracy"] = 100. 
* result_dict["good"] / result_dict["total"] + result["accuracy"] = 100.0 * result_dict["good"] / result_dict["total"] acc_str = ", acc={:.3f}%".format(result["accuracy"]) if "mAP" in result_dict: - result["mAP"] = 100. * result_dict["mAP"] + result["mAP"] = 100.0 * result_dict["mAP"] acc_str += ", mAP={:.3f}%".format(result["mAP"]) # add the result to the result dict final_results[name] = result # to stdout - print("{} qps={:.2f}, mean={:.4f}, time={:.3f}{}, queries={}, tiles={}".format( - name, result["qps"], result["mean"], took, acc_str, - len(result_list), buckets_str)) + print( + "{} qps={:.2f}, mean={:.4f}, time={:.3f}{}, queries={}, tiles={}".format( + name, + result["qps"], + result["mean"], + took, + acc_str, + len(result_list), + buckets_str, + ) + ) def main(): @@ -432,13 +527,16 @@ def main(): # dataset to use wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset] - ds = wanted_dataset(data_path=args.dataset_path, - image_list=args.dataset_list, - name=args.dataset, - image_format=image_format, - pre_process=pre_proc, - use_cache=args.cache, - count=count, **kwargs) + ds = wanted_dataset( + data_path=args.dataset_path, + image_list=args.dataset_list, + name=args.dataset, + image_format=image_format, + pre_process=pre_proc, + use_cache=args.cache, + count=count, + **kwargs + ) # load model to backend model = backend.load(args.model, inputs=args.inputs, outputs=args.outputs) final_results = { @@ -480,9 +578,11 @@ def main(): lg.TestScenario.SingleStream: RunnerBase, lg.TestScenario.MultiStream: QueueRunner, lg.TestScenario.Server: QueueRunner, - lg.TestScenario.Offline: QueueRunner + lg.TestScenario.Offline: QueueRunner, } - runner = runner_map[scenario](model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize) + runner = runner_map[scenario]( + model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize + ) def issue_queries(query_samples): runner.enqueue(query_samples) @@ -530,10 +630,13 @@ def process_latencies(latencies_ns): settings.multi_stream_samples_per_query = args.samples_per_query if args.max_latency: settings.server_target_latency_ns = int(args.max_latency * NANO_SEC) - settings.multi_stream_target_latency_ns = int(args.max_latency * NANO_SEC) + settings.multi_stream_target_latency_ns = int( + args.max_latency * NANO_SEC) sut = lg.ConstructSUT(issue_queries, flush_queries, process_latencies) - qsl = lg.ConstructQSL(count, min(count, 500), ds.load_query_samples, ds.unload_query_samples) + qsl = lg.ConstructQSL( + count, min(count, 500), ds.load_query_samples, ds.unload_query_samples + ) log.info("starting {}".format(scenario)) result_dict = {"good": 0, "total": 0, "scenario": str(scenario)} @@ -546,8 +649,14 @@ def process_latencies(latencies_ns): if args.accuracy: post_proc.finalize(result_dict, ds, output_dir=args.output) - add_results(final_results, "{}".format(scenario), - result_dict, last_timeing, time.time() - ds.last_loaded, args.accuracy) + add_results( + final_results, + "{}".format(scenario), + result_dict, + last_timeing, + time.time() - ds.last_loaded, + args.accuracy, + ) runner.finish() lg.DestroyQSL(qsl) diff --git a/retired_benchmarks/vision/classification_and_detection/python/models/anchor_generator.py b/retired_benchmarks/vision/classification_and_detection/python/models/anchor_generator.py index 9a2d9d490..be0401268 100644 --- a/retired_benchmarks/vision/classification_and_detection/python/models/anchor_generator.py +++ 
b/retired_benchmarks/vision/classification_and_detection/python/models/anchor_generator.py @@ -2,11 +2,12 @@ import numpy as np -# The following functions were taken from +# The following functions were taken from # https://github.com/tensorflow/models/tree/master/research/object_detection # with minor modifications so that they use # torch operations instead + def expanded_shape(orig_shape, start_dim, num_dims): s = (1,) * num_dims return orig_shape[:start_dim] + s + orig_shape[start_dim:] @@ -45,350 +46,407 @@ def meshgrid(x, y): return xgrid, ygrid -def tile_anchors(grid_height, - grid_width, - scales, - aspect_ratios, - base_anchor_size, - anchor_stride, - anchor_offset): - """Create a tiled set of anchors strided along a grid in image space. - This op creates a set of anchor boxes by placing a "basis" collection of - boxes with user-specified scales and aspect ratios centered at evenly - distributed points along a grid. The basis collection is specified via the - scale and aspect_ratios arguments. For example, setting scales=[.1, .2, .2] - and aspect ratios = [2,2,1/2] means that we create three boxes: one with scale - .1, aspect ratio 2, one with scale .2, aspect ratio 2, and one with scale .2 - and aspect ratio 1/2. Each box is multiplied by "base_anchor_size" before - placing it over its respective center. - Grid points are specified via grid_height, grid_width parameters as well as - the anchor_stride and anchor_offset parameters. - Args: - grid_height: size of the grid in the y direction (int or int scalar tensor) - grid_width: size of the grid in the x direction (int or int scalar tensor) - scales: a 1-d (float) tensor representing the scale of each box in the - basis set. - aspect_ratios: a 1-d (float) tensor representing the aspect ratio of each - box in the basis set. The length of the scales and aspect_ratios tensors - must be equal. - base_anchor_size: base anchor size as [height, width] - (float tensor of shape [2]) - anchor_stride: difference in centers between base anchors for adjacent grid - positions (float tensor of shape [2]) - anchor_offset: center of the anchor with scale and aspect ratio 1 for the - upper left element of the grid, this should be zero for - feature networks with only VALID padding and even receptive - field size, but may need some additional calculation if other - padding is used (float tensor of shape [2]) - Returns: - a BoxList holding a collection of N anchor boxes - """ - aspect_ratios = torch.as_tensor(aspect_ratios, dtype=torch.float32) - scales = torch.as_tensor(scales, dtype=torch.float32) +def tile_anchors( + grid_height, + grid_width, + scales, + aspect_ratios, + base_anchor_size, + anchor_stride, + anchor_offset, +): + """Create a tiled set of anchors strided along a grid in image space. + This op creates a set of anchor boxes by placing a "basis" collection of + boxes with user-specified scales and aspect ratios centered at evenly + distributed points along a grid. The basis collection is specified via the + scale and aspect_ratios arguments. For example, setting scales=[.1, .2, .2] + and aspect ratios = [2,2,1/2] means that we create three boxes: one with scale + .1, aspect ratio 2, one with scale .2, aspect ratio 2, and one with scale .2 + and aspect ratio 1/2. Each box is multiplied by "base_anchor_size" before + placing it over its respective center. + Grid points are specified via grid_height, grid_width parameters as well as + the anchor_stride and anchor_offset parameters. 
+ Args: + grid_height: size of the grid in the y direction (int or int scalar tensor) + grid_width: size of the grid in the x direction (int or int scalar tensor) + scales: a 1-d (float) tensor representing the scale of each box in the + basis set. + aspect_ratios: a 1-d (float) tensor representing the aspect ratio of each + box in the basis set. The length of the scales and aspect_ratios tensors + must be equal. + base_anchor_size: base anchor size as [height, width] + (float tensor of shape [2]) + anchor_stride: difference in centers between base anchors for adjacent grid + positions (float tensor of shape [2]) + anchor_offset: center of the anchor with scale and aspect ratio 1 for the + upper left element of the grid, this should be zero for + feature networks with only VALID padding and even receptive + field size, but may need some additional calculation if other + padding is used (float tensor of shape [2]) + Returns: + a BoxList holding a collection of N anchor boxes + """ + aspect_ratios = torch.as_tensor(aspect_ratios, dtype=torch.float32) + scales = torch.as_tensor(scales, dtype=torch.float32) - ratio_sqrts = torch.sqrt(aspect_ratios) - heights = scales / ratio_sqrts * base_anchor_size[0] - widths = scales * ratio_sqrts * base_anchor_size[1] + ratio_sqrts = torch.sqrt(aspect_ratios) + heights = scales / ratio_sqrts * base_anchor_size[0] + widths = scales * ratio_sqrts * base_anchor_size[1] - # Get a grid of box centers - y_centers = torch.arange(grid_height, dtype=torch.float32) - y_centers = y_centers * anchor_stride[0] + anchor_offset[0] - x_centers = torch.arange(grid_width, dtype=torch.float32) - x_centers = x_centers * anchor_stride[1] + anchor_offset[1] + # Get a grid of box centers + y_centers = torch.arange(grid_height, dtype=torch.float32) + y_centers = y_centers * anchor_stride[0] + anchor_offset[0] + x_centers = torch.arange(grid_width, dtype=torch.float32) + x_centers = x_centers * anchor_stride[1] + anchor_offset[1] - x_centers, y_centers = meshgrid(x_centers, y_centers) + x_centers, y_centers = meshgrid(x_centers, y_centers) - widths_grid, x_centers_grid = meshgrid(widths, x_centers) - heights_grid, y_centers_grid = meshgrid(heights, y_centers) + widths_grid, x_centers_grid = meshgrid(widths, x_centers) + heights_grid, y_centers_grid = meshgrid(heights, y_centers) - bbox_centers = torch.stack([y_centers_grid, x_centers_grid], dim=3) - bbox_sizes = torch.stack([heights_grid, widths_grid], dim=3) - bbox_centers = torch.reshape(bbox_centers, [-1, 2]) - bbox_sizes = torch.reshape(bbox_sizes, [-1, 2]) - bbox_corners = _center_size_bbox_to_corners_bbox(bbox_centers, bbox_sizes) - return bbox_corners + bbox_centers = torch.stack([y_centers_grid, x_centers_grid], dim=3) + bbox_sizes = torch.stack([heights_grid, widths_grid], dim=3) + bbox_centers = torch.reshape(bbox_centers, [-1, 2]) + bbox_sizes = torch.reshape(bbox_sizes, [-1, 2]) + bbox_corners = _center_size_bbox_to_corners_bbox(bbox_centers, bbox_sizes) + return bbox_corners def _center_size_bbox_to_corners_bbox(centers, sizes): - """Converts bbox center-size representation to corners representation. 
- Args: - centers: a tensor with shape [N, 2] representing bounding box centers - sizes: a tensor with shape [N, 2] representing bounding boxes - Returns: - corners: tensor with shape [N, 4] representing bounding boxes in corners - representation - """ - return torch.cat([centers - .5 * sizes, centers + .5 * sizes], 1) - - -def create_ssd_anchors(num_layers=6, - min_scale=0.2, - max_scale=0.95, - scales=None, - aspect_ratios=(1.0, 2.0, 1.0 / 2, 3.0, 1.0 / 3), - interpolated_scale_aspect_ratio=1.0, - base_anchor_size=None, - anchor_strides=None, - anchor_offsets=None, - reduce_boxes_in_lowest_layer=True): - """Creates MultipleGridAnchorGenerator for SSD anchors. - This function instantiates a MultipleGridAnchorGenerator that reproduces - ``default box`` construction proposed by Liu et al in the SSD paper. - See Section 2.2 for details. Grid sizes are assumed to be passed in - at generation time from finest resolution to coarsest resolution --- this is - used to (linearly) interpolate scales of anchor boxes corresponding to the - intermediate grid sizes. - Anchors that are returned by calling the `generate` method on the returned - MultipleGridAnchorGenerator object are always in normalized coordinates - and clipped to the unit square: (i.e. all coordinates lie in [0, 1]x[0, 1]). - Args: - num_layers: integer number of grid layers to create anchors for (actual - grid sizes passed in at generation time) - min_scale: scale of anchors corresponding to finest resolution (float) - max_scale: scale of anchors corresponding to coarsest resolution (float) - scales: As list of anchor scales to use. When not None and not empty, - min_scale and max_scale are not used. - aspect_ratios: list or tuple of (float) aspect ratios to place on each - grid point. - interpolated_scale_aspect_ratio: An additional anchor is added with this - aspect ratio and a scale interpolated between the scale for a layer - and the scale for the next layer (1.0 for the last layer). - This anchor is not included if this value is 0. - base_anchor_size: base anchor size as [height, width]. - The height and width values are normalized to the minimum dimension of the - input height and width, so that when the base anchor height equals the - base anchor width, the resulting anchor is square even if the input image - is not square. - anchor_strides: list of pairs of strides in pixels (in y and x directions - respectively). For example, setting anchor_strides=[(25, 25), (50, 50)] - means that we want the anchors corresponding to the first layer to be - strided by 25 pixels and those in the second layer to be strided by 50 - pixels in both y and x directions. If anchor_strides=None, they are set to - be the reciprocal of the corresponding feature map shapes. - anchor_offsets: list of pairs of offsets in pixels (in y and x directions - respectively). The offset specifies where we want the center of the - (0, 0)-th anchor to lie for each layer. For example, setting - anchor_offsets=[(10, 10), (20, 20)]) means that we want the - (0, 0)-th anchor of the first layer to lie at (10, 10) in pixel space - and likewise that we want the (0, 0)-th anchor of the second layer to lie - at (25, 25) in pixel space. If anchor_offsets=None, then they are set to - be half of the corresponding anchor stride. - reduce_boxes_in_lowest_layer: a boolean to indicate whether the fixed 3 - boxes per location is used in the lowest layer. 
- Returns: - a MultipleGridAnchorGenerator - """ - if base_anchor_size is None: - base_anchor_size = [1.0, 1.0] - base_anchor_size = torch.tensor(base_anchor_size, dtype=torch.float32) - box_specs_list = [] - if scales is None or not scales: - scales = [min_scale + (max_scale - min_scale) * i / (num_layers - 1) - for i in range(num_layers)] + [1.0] - else: - # Add 1.0 to the end, which will only be used in scale_next below and used - # for computing an interpolated scale for the largest scale in the list. - scales += [1.0] - - for layer, scale, scale_next in zip( - range(num_layers), scales[:-1], scales[1:]): - layer_box_specs = [] - if layer == 0 and reduce_boxes_in_lowest_layer: - layer_box_specs = [(0.1, 1.0), (scale, 2.0), (scale, 0.5)] - else: - for aspect_ratio in aspect_ratios: - layer_box_specs.append((scale, aspect_ratio)) - # Add one more anchor, with a scale between the current scale, and the - # scale for the next layer, with a specified aspect ratio (1.0 by - # default). - if interpolated_scale_aspect_ratio > 0.0: - layer_box_specs.append((np.sqrt(scale*scale_next), - interpolated_scale_aspect_ratio)) - box_specs_list.append(layer_box_specs) - - return MultipleGridAnchorGenerator(box_specs_list, base_anchor_size, - anchor_strides, anchor_offsets) + """Converts bbox center-size representation to corners representation. + Args: + centers: a tensor with shape [N, 2] representing bounding box centers + sizes: a tensor with shape [N, 2] representing bounding boxes + Returns: + corners: tensor with shape [N, 4] representing bounding boxes in corners + representation + """ + return torch.cat([centers - 0.5 * sizes, centers + 0.5 * sizes], 1) -class MultipleGridAnchorGenerator(object): - """Generate a grid of anchors for multiple CNN layers.""" - def __init__(self, - box_specs_list, - base_anchor_size=None, - anchor_strides=None, - anchor_offsets=None, - clip_window=None): - """Constructs a MultipleGridAnchorGenerator. - To construct anchors, at multiple grid resolutions, one must provide a - list of feature_map_shape_list (e.g., [(8, 8), (4, 4)]), and for each grid - size, a corresponding list of (scale, aspect ratio) box specifications. - For example: - box_specs_list = [[(.1, 1.0), (.1, 2.0)], # for 8x8 grid - [(.2, 1.0), (.3, 1.0), (.2, 2.0)]] # for 4x4 grid - To support the fully convolutional setting, we pass grid sizes in at - generation time, while scale and aspect ratios are fixed at construction - time. +def create_ssd_anchors( + num_layers=6, + min_scale=0.2, + max_scale=0.95, + scales=None, + aspect_ratios=(1.0, 2.0, 1.0 / 2, 3.0, 1.0 / 3), + interpolated_scale_aspect_ratio=1.0, + base_anchor_size=None, + anchor_strides=None, + anchor_offsets=None, + reduce_boxes_in_lowest_layer=True, +): + """Creates MultipleGridAnchorGenerator for SSD anchors. + This function instantiates a MultipleGridAnchorGenerator that reproduces + ``default box`` construction proposed by Liu et al in the SSD paper. + See Section 2.2 for details. Grid sizes are assumed to be passed in + at generation time from finest resolution to coarsest resolution --- this is + used to (linearly) interpolate scales of anchor boxes corresponding to the + intermediate grid sizes. + Anchors that are returned by calling the `generate` method on the returned + MultipleGridAnchorGenerator object are always in normalized coordinates + and clipped to the unit square: (i.e. all coordinates lie in [0, 1]x[0, 1]). 
Args: - box_specs_list: list of list of (scale, aspect ratio) pairs with the - outside list having the same number of entries as feature_map_shape_list - (which is passed in at generation time). - base_anchor_size: base anchor size as [height, width] - (length-2 float tensor, default=[1.0, 1.0]). - The height and width values are normalized to the - minimum dimension of the input height and width, so that - when the base anchor height equals the base anchor - width, the resulting anchor is square even if the input - image is not square. + num_layers: integer number of grid layers to create anchors for (actual + grid sizes passed in at generation time) + min_scale: scale of anchors corresponding to finest resolution (float) + max_scale: scale of anchors corresponding to coarsest resolution (float) + scales: As list of anchor scales to use. When not None and not empty, + min_scale and max_scale are not used. + aspect_ratios: list or tuple of (float) aspect ratios to place on each + grid point. + interpolated_scale_aspect_ratio: An additional anchor is added with this + aspect ratio and a scale interpolated between the scale for a layer + and the scale for the next layer (1.0 for the last layer). + This anchor is not included if this value is 0. + base_anchor_size: base anchor size as [height, width]. + The height and width values are normalized to the minimum dimension of the + input height and width, so that when the base anchor height equals the + base anchor width, the resulting anchor is square even if the input image + is not square. anchor_strides: list of pairs of strides in pixels (in y and x directions respectively). For example, setting anchor_strides=[(25, 25), (50, 50)] means that we want the anchors corresponding to the first layer to be strided by 25 pixels and those in the second layer to be strided by 50 - pixels in both y and x directions. If anchor_strides=None, they are set - to be the reciprocal of the corresponding feature map shapes. + pixels in both y and x directions. If anchor_strides=None, they are set to + be the reciprocal of the corresponding feature map shapes. anchor_offsets: list of pairs of offsets in pixels (in y and x directions respectively). The offset specifies where we want the center of the (0, 0)-th anchor to lie for each layer. For example, setting anchor_offsets=[(10, 10), (20, 20)]) means that we want the (0, 0)-th anchor of the first layer to lie at (10, 10) in pixel space - and likewise that we want the (0, 0)-th anchor of the second layer to - lie at (25, 25) in pixel space. If anchor_offsets=None, then they are - set to be half of the corresponding anchor stride. - clip_window: a tensor of shape [4] specifying a window to which all - anchors should be clipped. If clip_window is None, then no clipping - is performed. - Raises: - ValueError: if box_specs_list is not a list of list of pairs - ValueError: if clip_window is not either None or a tensor of shape [4] + and likewise that we want the (0, 0)-th anchor of the second layer to lie + at (25, 25) in pixel space. If anchor_offsets=None, then they are set to + be half of the corresponding anchor stride. + reduce_boxes_in_lowest_layer: a boolean to indicate whether the fixed 3 + boxes per location is used in the lowest layer. 
+ Returns: + a MultipleGridAnchorGenerator """ - if isinstance(box_specs_list, list) and all( - [isinstance(list_item, list) for list_item in box_specs_list]): - self._box_specs = box_specs_list - else: - raise ValueError('box_specs_list is expected to be a ' - 'list of lists of pairs') if base_anchor_size is None: - base_anchor_size = torch.tensor([256, 256], dtype=torch.float32) - self._base_anchor_size = base_anchor_size - self._anchor_strides = anchor_strides - self._anchor_offsets = anchor_offsets - if clip_window is not None and list(clip_window.shape) != [4]: - raise ValueError('clip_window must either be None or a shape [4] tensor') - self._clip_window = clip_window - self._scales = [] - self._aspect_ratios = [] - for box_spec in self._box_specs: - if not all([isinstance(entry, tuple) and len(entry) == 2 - for entry in box_spec]): - raise ValueError('box_specs_list is expected to be a ' - 'list of lists of pairs') - scales, aspect_ratios = zip(*box_spec) - self._scales.append(scales) - self._aspect_ratios.append(aspect_ratios) + base_anchor_size = [1.0, 1.0] + base_anchor_size = torch.tensor(base_anchor_size, dtype=torch.float32) + box_specs_list = [] + if scales is None or not scales: + scales = [ + min_scale + (max_scale - min_scale) * i / (num_layers - 1) + for i in range(num_layers) + ] + [1.0] + else: + # Add 1.0 to the end, which will only be used in scale_next below and used + # for computing an interpolated scale for the largest scale in the + # list. + scales += [1.0] - for arg, arg_name in zip([self._anchor_strides, self._anchor_offsets], - ['anchor_strides', 'anchor_offsets']): - if arg and not (isinstance(arg, list) and - len(arg) == len(self._box_specs)): - raise ValueError('%s must be a list with the same length ' - 'as self._box_specs' % arg_name) - if arg and not all([ - isinstance(list_item, tuple) and len(list_item) == 2 - for list_item in arg - ]): - raise ValueError('%s must be a list of pairs.' % arg_name) + for layer, scale, scale_next in zip( + range(num_layers), scales[:-1], scales[1:]): + layer_box_specs = [] + if layer == 0 and reduce_boxes_in_lowest_layer: + layer_box_specs = [(0.1, 1.0), (scale, 2.0), (scale, 0.5)] + else: + for aspect_ratio in aspect_ratios: + layer_box_specs.append((scale, aspect_ratio)) + # Add one more anchor, with a scale between the current scale, and the + # scale for the next layer, with a specified aspect ratio (1.0 by + # default). + if interpolated_scale_aspect_ratio > 0.0: + layer_box_specs.append( + (np.sqrt(scale * scale_next), interpolated_scale_aspect_ratio) + ) + box_specs_list.append(layer_box_specs) + return MultipleGridAnchorGenerator( + box_specs_list, base_anchor_size, anchor_strides, anchor_offsets + ) - def _generate(self, feature_map_shape_list, im_height=1, im_width=1): - """Generates a collection of bounding boxes to be used as anchors. - The number of anchors generated for a single grid with shape MxM where we - place k boxes over each grid center is k*M^2 and thus the total number of - anchors is the sum over all grids. In our box_specs_list example - (see the constructor docstring), we would place two boxes over each grid - point on an 8x8 grid and three boxes over each grid point on a 4x4 grid and - thus end up with 2*8^2 + 3*4^2 = 176 anchors in total. The layout of the - output anchors follows the order of how the grid sizes and box_specs are - specified (with box_spec index varying the fastest, followed by width - index, then height index, then grid index). 
- Args: - feature_map_shape_list: list of pairs of convnet layer resolutions in the - format [(height_0, width_0), (height_1, width_1), ...]. For example, - setting feature_map_shape_list=[(8, 8), (7, 7)] asks for anchors that - correspond to an 8x8 layer followed by a 7x7 layer. - im_height: the height of the image to generate the grid for. If both - im_height and im_width are 1, the generated anchors default to - absolute coordinates, otherwise normalized coordinates are produced. - im_width: the width of the image to generate the grid for. If both - im_height and im_width are 1, the generated anchors default to - absolute coordinates, otherwise normalized coordinates are produced. - Returns: - boxes_list: a list of BoxLists each holding anchor boxes corresponding to - the input feature map shapes. - Raises: - ValueError: if feature_map_shape_list, box_specs_list do not have the same - length. - ValueError: if feature_map_shape_list does not consist of pairs of - integers - """ - if not (isinstance(feature_map_shape_list, list) - and len(feature_map_shape_list) == len(self._box_specs)): - raise ValueError('feature_map_shape_list must be a list with the same ' - 'length as self._box_specs') - if not all([isinstance(list_item, tuple) and len(list_item) == 2 - for list_item in feature_map_shape_list]): - raise ValueError('feature_map_shape_list must be a list of pairs.') - im_height = float(im_height) - im_width = float(im_width) +class MultipleGridAnchorGenerator(object): + """Generate a grid of anchors for multiple CNN layers.""" - if not self._anchor_strides: - anchor_strides = [(1.0 / float(pair[0]), 1.0 / float(pair[1])) - for pair in feature_map_shape_list] - else: - anchor_strides = [(float(stride[0]) / im_height, - float(stride[1]) / im_width) - for stride in self._anchor_strides] - if not self._anchor_offsets: - anchor_offsets = [(0.5 * stride[0], 0.5 * stride[1]) - for stride in anchor_strides] - else: - anchor_offsets = [(float(offset[0]) / im_height, - float(offset[1]) / im_width) - for offset in self._anchor_offsets] + def __init__( + self, + box_specs_list, + base_anchor_size=None, + anchor_strides=None, + anchor_offsets=None, + clip_window=None, + ): + """Constructs a MultipleGridAnchorGenerator. + To construct anchors, at multiple grid resolutions, one must provide a + list of feature_map_shape_list (e.g., [(8, 8), (4, 4)]), and for each grid + size, a corresponding list of (scale, aspect ratio) box specifications. + For example: + box_specs_list = [[(.1, 1.0), (.1, 2.0)], # for 8x8 grid + [(.2, 1.0), (.3, 1.0), (.2, 2.0)]] # for 4x4 grid + To support the fully convolutional setting, we pass grid sizes in at + generation time, while scale and aspect ratios are fixed at construction + time. + Args: + box_specs_list: list of list of (scale, aspect ratio) pairs with the + outside list having the same number of entries as feature_map_shape_list + (which is passed in at generation time). + base_anchor_size: base anchor size as [height, width] + (length-2 float tensor, default=[1.0, 1.0]). + The height and width values are normalized to the + minimum dimension of the input height and width, so that + when the base anchor height equals the base anchor + width, the resulting anchor is square even if the input + image is not square. + anchor_strides: list of pairs of strides in pixels (in y and x directions + respectively). 
For example, setting anchor_strides=[(25, 25), (50, 50)] + means that we want the anchors corresponding to the first layer to be + strided by 25 pixels and those in the second layer to be strided by 50 + pixels in both y and x directions. If anchor_strides=None, they are set + to be the reciprocal of the corresponding feature map shapes. + anchor_offsets: list of pairs of offsets in pixels (in y and x directions + respectively). The offset specifies where we want the center of the + (0, 0)-th anchor to lie for each layer. For example, setting + anchor_offsets=[(10, 10), (20, 20)]) means that we want the + (0, 0)-th anchor of the first layer to lie at (10, 10) in pixel space + and likewise that we want the (0, 0)-th anchor of the second layer to + lie at (25, 25) in pixel space. If anchor_offsets=None, then they are + set to be half of the corresponding anchor stride. + clip_window: a tensor of shape [4] specifying a window to which all + anchors should be clipped. If clip_window is None, then no clipping + is performed. + Raises: + ValueError: if box_specs_list is not a list of list of pairs + ValueError: if clip_window is not either None or a tensor of shape [4] + """ + if isinstance(box_specs_list, list) and all( + [isinstance(list_item, list) for list_item in box_specs_list] + ): + self._box_specs = box_specs_list + else: + raise ValueError( + "box_specs_list is expected to be a " "list of lists of pairs" + ) + if base_anchor_size is None: + base_anchor_size = torch.tensor([256, 256], dtype=torch.float32) + self._base_anchor_size = base_anchor_size + self._anchor_strides = anchor_strides + self._anchor_offsets = anchor_offsets + if clip_window is not None and list(clip_window.shape) != [4]: + raise ValueError( + "clip_window must either be None or a shape [4] tensor") + self._clip_window = clip_window + self._scales = [] + self._aspect_ratios = [] + for box_spec in self._box_specs: + if not all( + [isinstance(entry, tuple) and len(entry) + == 2 for entry in box_spec] + ): + raise ValueError( + "box_specs_list is expected to be a " "list of lists of pairs" + ) + scales, aspect_ratios = zip(*box_spec) + self._scales.append(scales) + self._aspect_ratios.append(aspect_ratios) + + for arg, arg_name in zip( + [self._anchor_strides, self._anchor_offsets], + ["anchor_strides", "anchor_offsets"], + ): + if arg and not (isinstance(arg, list) and len(arg) + == len(self._box_specs)): + raise ValueError( + "%s must be a list with the same length " + "as self._box_specs" % arg_name + ) + if arg and not all( + [ + isinstance(list_item, tuple) and len(list_item) == 2 + for list_item in arg + ] + ): + raise ValueError("%s must be a list of pairs." % arg_name) + + def _generate(self, feature_map_shape_list, im_height=1, im_width=1): + """Generates a collection of bounding boxes to be used as anchors. + The number of anchors generated for a single grid with shape MxM where we + place k boxes over each grid center is k*M^2 and thus the total number of + anchors is the sum over all grids. In our box_specs_list example + (see the constructor docstring), we would place two boxes over each grid + point on an 8x8 grid and three boxes over each grid point on a 4x4 grid and + thus end up with 2*8^2 + 3*4^2 = 176 anchors in total. The layout of the + output anchors follows the order of how the grid sizes and box_specs are + specified (with box_spec index varying the fastest, followed by width + index, then height index, then grid index). 
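The 176-anchor figure quoted in the docstring above is just sum(k * M^2) over the grids: 2 * 8^2 + 3 * 4^2 = 128 + 48 = 176. A minimal check using the example box_specs_list from the constructor docstring is sketched below; the helper name is illustrative and not part of the patch.

def total_anchor_count(box_specs_list, feature_map_shape_list):
    # One anchor per (scale, aspect_ratio) pair at every grid cell.
    return sum(len(specs) * height * width
               for specs, (height, width) in zip(box_specs_list,
                                                 feature_map_shape_list))

box_specs_list = [[(.1, 1.0), (.1, 2.0)],             # 8x8 grid
                  [(.2, 1.0), (.3, 1.0), (.2, 2.0)]]  # 4x4 grid
assert total_anchor_count(box_specs_list, [(8, 8), (4, 4)]) == 176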
+ Args: + feature_map_shape_list: list of pairs of convnet layer resolutions in the + format [(height_0, width_0), (height_1, width_1), ...]. For example, + setting feature_map_shape_list=[(8, 8), (7, 7)] asks for anchors that + correspond to an 8x8 layer followed by a 7x7 layer. + im_height: the height of the image to generate the grid for. If both + im_height and im_width are 1, the generated anchors default to + absolute coordinates, otherwise normalized coordinates are produced. + im_width: the width of the image to generate the grid for. If both + im_height and im_width are 1, the generated anchors default to + absolute coordinates, otherwise normalized coordinates are produced. + Returns: + boxes_list: a list of BoxLists each holding anchor boxes corresponding to + the input feature map shapes. + Raises: + ValueError: if feature_map_shape_list, box_specs_list do not have the same + length. + ValueError: if feature_map_shape_list does not consist of pairs of + integers + """ + if not ( + isinstance(feature_map_shape_list, list) + and len(feature_map_shape_list) == len(self._box_specs) + ): + raise ValueError( + "feature_map_shape_list must be a list with the same " + "length as self._box_specs" + ) + if not all( + [ + isinstance(list_item, tuple) and len(list_item) == 2 + for list_item in feature_map_shape_list + ] + ): + raise ValueError("feature_map_shape_list must be a list of pairs.") + + im_height = float(im_height) + im_width = float(im_width) + + if not self._anchor_strides: + anchor_strides = [ + (1.0 / float(pair[0]), 1.0 / float(pair[1])) + for pair in feature_map_shape_list + ] + else: + anchor_strides = [ + (float(stride[0]) / im_height, float(stride[1]) / im_width) + for stride in self._anchor_strides + ] + if not self._anchor_offsets: + anchor_offsets = [ + (0.5 * stride[0], 0.5 * stride[1]) for stride in anchor_strides + ] + else: + anchor_offsets = [ + (float(offset[0]) / im_height, float(offset[1]) / im_width) + for offset in self._anchor_offsets + ] - for arg, arg_name in zip([anchor_strides, anchor_offsets], - ['anchor_strides', 'anchor_offsets']): - if not (isinstance(arg, list) and len(arg) == len(self._box_specs)): - raise ValueError('%s must be a list with the same length ' - 'as self._box_specs' % arg_name) - if not all([isinstance(list_item, tuple) and len(list_item) == 2 - for list_item in arg]): - raise ValueError('%s must be a list of pairs.' % arg_name) + for arg, arg_name in zip( + [anchor_strides, anchor_offsets], [ + "anchor_strides", "anchor_offsets"] + ): + if not (isinstance(arg, list) and len( + arg) == len(self._box_specs)): + raise ValueError( + "%s must be a list with the same length " + "as self._box_specs" % arg_name + ) + if not all( + [ + isinstance(list_item, tuple) and len(list_item) == 2 + for list_item in arg + ] + ): + raise ValueError("%s must be a list of pairs." 
% arg_name) - anchor_grid_list = [] - min_im_shape = min(im_height, im_width) - scale_height = min_im_shape / im_height - scale_width = min_im_shape / im_width - base_anchor_size = [ - scale_height * self._base_anchor_size[0], - scale_width * self._base_anchor_size[1] - ] - for feature_map_index, (grid_size, scales, aspect_ratios, stride, - offset) in enumerate( - zip(feature_map_shape_list, self._scales, - self._aspect_ratios, anchor_strides, - anchor_offsets)): - tiled_anchors = tile_anchors( - grid_height=grid_size[0], - grid_width=grid_size[1], - scales=scales, - aspect_ratios=aspect_ratios, - base_anchor_size=base_anchor_size, - anchor_stride=stride, - anchor_offset=offset) - if self._clip_window is not None: - raise NotImplementedError("Oups!") - num_anchors_in_layer = len(tiled_anchors) - anchor_indices = feature_map_index * torch.ones(num_anchors_in_layer) - anchor_grid_list.append(tiled_anchors) + anchor_grid_list = [] + min_im_shape = min(im_height, im_width) + scale_height = min_im_shape / im_height + scale_width = min_im_shape / im_width + base_anchor_size = [ + scale_height * self._base_anchor_size[0], + scale_width * self._base_anchor_size[1], + ] + for feature_map_index, ( + grid_size, + scales, + aspect_ratios, + stride, + offset, + ) in enumerate( + zip( + feature_map_shape_list, + self._scales, + self._aspect_ratios, + anchor_strides, + anchor_offsets, + ) + ): + tiled_anchors = tile_anchors( + grid_height=grid_size[0], + grid_width=grid_size[1], + scales=scales, + aspect_ratios=aspect_ratios, + base_anchor_size=base_anchor_size, + anchor_stride=stride, + anchor_offset=offset, + ) + if self._clip_window is not None: + raise NotImplementedError("Oups!") + num_anchors_in_layer = len(tiled_anchors) + anchor_indices = feature_map_index * \ + torch.ones(num_anchors_in_layer) + anchor_grid_list.append(tiled_anchors) - return anchor_grid_list + return anchor_grid_list diff --git a/retired_benchmarks/vision/classification_and_detection/python/models/base_model_r34.py b/retired_benchmarks/vision/classification_and_detection/python/models/base_model_r34.py index ea224a7ca..9e8a2da61 100644 --- a/retired_benchmarks/vision/classification_and_detection/python/models/base_model_r34.py +++ b/retired_benchmarks/vision/classification_and_detection/python/models/base_model_r34.py @@ -2,7 +2,7 @@ Load the vgg16 weight and save it to special file """ -#from torchvision.models.vgg import vgg16 +# from torchvision.models.vgg import vgg16 import torch.nn as nn import torch.nn.functional as F import torch @@ -11,29 +11,32 @@ from torchvision.models.resnet import resnet18, resnet34, resnet50 + def _ModifyConvStrideDilation(conv, stride=(1, 1), padding=None): conv.stride = stride if padding is not None: conv.padding = padding + def _ModifyBlock(block, bottleneck=False, **kwargs): for m in list(block.children()): if bottleneck: - _ModifyConvStrideDilation(m.conv2, **kwargs) + _ModifyConvStrideDilation(m.conv2, **kwargs) else: - _ModifyConvStrideDilation(m.conv1, **kwargs) + _ModifyConvStrideDilation(m.conv1, **kwargs) if m.downsample is not None: # need to make sure no padding for the 1x1 residual connection - _ModifyConvStrideDilation(list(m.downsample.children())[0], **kwargs) + _ModifyConvStrideDilation( + list(m.downsample.children())[0], **kwargs) + class ResNet18(nn.Module): def __init__(self): super().__init__() rn18 = resnet18(pretrained=True) - # discard last Resnet block, avrpooling and classification FC # layer1 = up to and including conv3 block self.layer1 = 
nn.Sequential(*list(rn18.children())[:6]) @@ -43,7 +46,7 @@ def __init__(self): # modify conv4 if necessary # Always deal with stride in first block modulelist = list(self.layer2.children()) - _ModifyBlock(modulelist[0], stride=(1,1)) + _ModifyBlock(modulelist[0], stride=(1, 1)) def forward(self, data): layer1_activation = self.layer1(data) @@ -53,6 +56,7 @@ def forward(self, data): # Only need the output of conv4 return [layer2_activation] + class ResNet34(nn.Module): def __init__(self): super().__init__() @@ -64,8 +68,7 @@ def __init__(self): # modify conv4 if necessary # Always deal with stride in first block modulelist = list(self.layer2.children()) - _ModifyBlock(modulelist[0], stride=(1,1)) - + _ModifyBlock(modulelist[0], stride=(1, 1)) def forward(self, data): layer1_activation = self.layer1(data) @@ -74,22 +77,28 @@ def forward(self, data): return [layer2_activation] + class L2Norm(nn.Module): """ - Scale shall be learnable according to original paper - scale: initial scale number - chan_num: L2Norm channel number (norm over all channels) + Scale shall be learnable according to original paper + scale: initial scale number + chan_num: L2Norm channel number (norm over all channels) """ + def __init__(self, scale=20, chan_num=512): super(L2Norm, self).__init__() # Scale across channels - self.scale = \ - nn.Parameter(torch.Tensor([scale]*chan_num).view(1, chan_num, 1, 1)) + self.scale = nn.Parameter( + torch.Tensor([scale] * chan_num).view(1, chan_num, 1, 1) + ) def forward(self, data): # normalize accross channel - return self.scale*data*data.pow(2).sum(dim=1, keepdim=True).clamp(min=1e-12).rsqrt() - + return ( + self.scale + * data + * data.pow(2).sum(dim=1, keepdim=True).clamp(min=1e-12).rsqrt() + ) def tailor_module(src_model, src_dir, tgt_model, tgt_dir): @@ -107,22 +116,23 @@ def tailor_module(src_model, src_dir, tgt_model, tgt_dir): for k1, k2 in zip(keys1, keys2): # print(k1, k2) state[k2] = src_state[k1] - #diff_keys = state.keys() - target_model.state_dict().keys() - #print("Different Keys:", diff_keys) + # diff_keys = state.keys() - target_model.state_dict().keys() + # print("Different Keys:", diff_keys) # Remove unecessary keys - #for k in diff_keys: + # for k in diff_keys: # state.pop(k) tgt_model.load_state_dict(state) torch.save(tgt_model.state_dict(), tgt_dir) + # Default vgg16 in pytorch seems different from ssd def make_layers(cfg, batch_norm=False): layers = [] in_channels = 3 for v in cfg: - if v == 'M': + if v == "M": layers += [nn.MaxPool2d(kernel_size=2, stride=2)] - elif v == 'C': + elif v == "C": # Notice ceil_mode is true layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] else: @@ -134,42 +144,51 @@ def make_layers(cfg, batch_norm=False): in_channels = v return layers + class Loss(nn.Module): """ - Implements the loss as the sum of the followings: - 1. Confidence Loss: All labels, with hard negative mining - 2. Localization Loss: Only on positive labels - Suppose input dboxes has the shape 8732x4 + Implements the loss as the sum of the followings: + 1. Confidence Loss: All labels, with hard negative mining + 2. 
Localization Loss: Only on positive labels + Suppose input dboxes has the shape 8732x4 """ def __init__(self, dboxes): super(Loss, self).__init__() - self.scale_xy = 1.0/dboxes.scale_xy - self.scale_wh = 1.0/dboxes.scale_wh + self.scale_xy = 1.0 / dboxes.scale_xy + self.scale_wh = 1.0 / dboxes.scale_wh self.sl1_loss = nn.SmoothL1Loss(reduce=False) - self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim = 0), - requires_grad=False) + self.dboxes = nn.Parameter( + dboxes(order="xywh").transpose(0, 1).unsqueeze(dim=0), requires_grad=False + ) # Two factor are from following links # http://jany.st/post/2017-11-05-single-shot-detector-ssd-from-scratch-in-tensorflow.html self.con_loss = nn.CrossEntropyLoss(reduce=False) def _loc_vec(self, loc): """ - Generate Location Vectors + Generate Location Vectors """ - gxy = self.scale_xy*(loc[:, :2, :] - self.dboxes[:, :2, :])/self.dboxes[:, 2:, ] - gwh = self.scale_wh*(loc[:, 2:, :]/self.dboxes[:, 2:, :]).log() + gxy = ( + self.scale_xy + * (loc[:, :2, :] - self.dboxes[:, :2, :]) + / self.dboxes[ + :, + 2:, + ] + ) + gwh = self.scale_wh * (loc[:, 2:, :] / self.dboxes[:, 2:, :]).log() return torch.cat((gxy, gwh), dim=1).contiguous() def forward(self, ploc, plabel, gloc, glabel): """ - ploc, plabel: Nx4x8732, Nxlabel_numx8732 - predicted location and labels + ploc, plabel: Nx4x8732, Nxlabel_numx8732 + predicted location and labels - gloc, glabel: Nx4x8732, Nx8732 - ground truth location and labels + gloc, glabel: Nx4x8732, Nx8732 + ground truth location and labels """ mask = glabel > 0 @@ -177,7 +196,7 @@ def forward(self, ploc, plabel, gloc, glabel): vec_gd = self._loc_vec(gloc) # sum on four coordinates, and mask sl1 = self.sl1_loss(ploc, vec_gd).sum(dim=1) - sl1 = (mask.float()*sl1).sum(dim=1) + sl1 = (mask.float() * sl1).sum(dim=1) # hard negative mining con = self.con_loss(plabel, glabel) @@ -189,16 +208,15 @@ def forward(self, ploc, plabel, gloc, glabel): _, con_rank = con_idx.sort(dim=1) # number of negative three times positive - neg_num = torch.clamp(3*pos_num, max=mask.size(1)).unsqueeze(-1) + neg_num = torch.clamp(3 * pos_num, max=mask.size(1)).unsqueeze(-1) neg_mask = con_rank < neg_num - closs = (con*(mask.float() + neg_mask.float())).sum(dim=1) + closs = (con * (mask.float() + neg_mask.float())).sum(dim=1) # avoid no object detected total_loss = sl1 + closs num_mask = (pos_num > 0).float() pos_num = pos_num.float().clamp(min=1e-6) - ret = (total_loss*num_mask/pos_num).mean(dim=0) + ret = (total_loss * num_mask / pos_num).mean(dim=0) return ret - diff --git a/retired_benchmarks/vision/classification_and_detection/python/models/convert_tf_weights.py b/retired_benchmarks/vision/classification_and_detection/python/models/convert_tf_weights.py index 02c51bf1b..4d063caef 100644 --- a/retired_benchmarks/vision/classification_and_detection/python/models/convert_tf_weights.py +++ b/retired_benchmarks/vision/classification_and_detection/python/models/convert_tf_weights.py @@ -17,7 +17,7 @@ def remap_tf_base_names(orig_weights): if "batchnorm" not in k and "pointwise_" not in k } - matcher = re.compile("(.*)Conv2d_(\d+)") + matcher = re.compile("(.*)Conv2d_(\\d+)") mapping = {} for k in convs.keys(): l = matcher.match(k).group(2) @@ -52,7 +52,7 @@ def remap_tf_extras(orig_weights): } weights = {k: v for k, v in weights.items() if "pointwise_" in k} - matcher = re.compile("(.*)Conv2d_(\d+)_(\d)x(\d)") + matcher = re.compile("(.*)Conv2d_(\\d+)_(\\d)x(\\d)") mapping = {} for k in weights.keys(): m = matcher.match(k) @@ -75,7 +75,7 
@@ def remap_tf_predictors(orig_weights): weights = {k: v for k, v in orig_weights.items() if "BoxPredictor" in k} weights = {k: v for k, v in weights.items() if "BoxEncodingPredictor" in k} - matcher = re.compile("BoxPredictor_(\d+)") + matcher = re.compile("BoxPredictor_(\\d+)") for k in weights.keys(): pos = matcher.match(k).group(1) wtype = "weight" if "weights" in k else "bias" @@ -125,13 +125,15 @@ def get_state_dict(weights): def read_tf_weights(frozen_model): import tensorflow as tf from tensorflow.python.framework import tensor_util + weights = {} with tf.Session() as sess: - with tf.gfile.GFile(frozen_model, 'rb') as f: + with tf.gfile.GFile(frozen_model, "rb") as f: graph_def = tf.GraphDef() graph_def.ParseFromString(f.read()) tf.import_graph_def(graph_def) for n in graph_def.node: - if n.op == 'Const': - weights[n.name] = tensor_util.MakeNdarray(n.attr['value'].tensor) + if n.op == "Const": + weights[n.name] = tensor_util.MakeNdarray( + n.attr["value"].tensor) return weights diff --git a/retired_benchmarks/vision/classification_and_detection/python/models/ssd_mobilenet_v1.py b/retired_benchmarks/vision/classification_and_detection/python/models/ssd_mobilenet_v1.py index dc77808c7..9d7b27191 100644 --- a/retired_benchmarks/vision/classification_and_detection/python/models/ssd_mobilenet_v1.py +++ b/retired_benchmarks/vision/classification_and_detection/python/models/ssd_mobilenet_v1.py @@ -81,7 +81,8 @@ def __init__(self, in_channels, num_classes, num_anchors): self.classification = nn.Conv2d( in_channels, num_classes * num_anchors, kernel_size=1 ) - self.regression = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1) + self.regression = nn.Conv2d( + in_channels, num_anchors * 4, kernel_size=1) self.num_classes = num_classes self.num_anchors = num_anchors @@ -161,13 +162,12 @@ def ssd_model(self, x): self._feature_map_shapes = shapes self.coder_weights = self.coder_weights.to(scores) - if box_regression.dim()==2: + if box_regression.dim() == 2: box_regression = box_regression[None] boxes = decode_boxes(box_regression, self.priors, self.coder_weights) # add a batch dimension return scores, boxes - def forward(self, images): """ Arguments: @@ -175,13 +175,15 @@ def forward(self, images): """ scores, boxes = self.ssd_model(images) - list_boxes=[]; list_labels=[]; list_scores=[] + list_boxes = [] + list_labels = [] + list_scores = [] for b in range(len(scores)): bboxes, blabels, bscores = self.filter_results(scores[b], boxes[b]) list_boxes.append(bboxes) list_labels.append(blabels.long()) list_scores.append(bscores) - #boxes = self.rescale_boxes(boxes, height, width) + # boxes = self.rescale_boxes(boxes, height, width) return [list_boxes, list_labels, list_scores] def filter_results(self, scores, boxes): @@ -190,8 +192,8 @@ def filter_results(self, scores, boxes): # on python. 
This implementation is faster on the # CPU, which is why we run this part on the CPU cpu_device = torch.device("cpu") - #boxes = boxes[0] - #scores = scores[0] + # boxes = boxes[0] + # scores = scores[0] boxes = boxes.to(cpu_device) scores = scores.to(cpu_device) selected_box_probs = [] @@ -205,7 +207,8 @@ def filter_results(self, scores, boxes): box_probs = nms(box_probs, self.nms_threshold) selected_box_probs.append(box_probs) labels.append( - torch.full((box_probs.size(0),), class_index, dtype=torch.int64) + torch.full( + (box_probs.size(0),), class_index, dtype=torch.int64) ) selected_box_probs = torch.cat(selected_box_probs) labels = torch.cat(labels) diff --git a/retired_benchmarks/vision/classification_and_detection/python/models/ssd_r34.py b/retired_benchmarks/vision/classification_and_detection/python/models/ssd_r34.py index 63e596b4a..e8138e3a1 100644 --- a/retired_benchmarks/vision/classification_and_detection/python/models/ssd_r34.py +++ b/retired_benchmarks/vision/classification_and_detection/python/models/ssd_r34.py @@ -6,102 +6,114 @@ import itertools import torch.nn.functional as F -##Inspired by https://github.com/kuangliu/pytorch-ssd +# Inspired by https://github.com/kuangliu/pytorch-ssd + class Encoder(object): """ - Transform between (bboxes, lables) <-> SSD output - - dboxes: default boxes in size 8732 x 4, - encoder: input ltrb format, output xywh format - decoder: input xywh format, output ltrb format - - decode: - input : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems) - output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes) - criteria : IoU threshold of bboexes - max_output : maximum number of output bboxes + Transform between (bboxes, lables) <-> SSD output + + dboxes: default boxes in size 8732 x 4, + encoder: input ltrb format, output xywh format + decoder: input xywh format, output ltrb format + + decode: + input : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems) + output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes) + criteria : IoU threshold of bboexes + max_output : maximum number of output bboxes """ def __init__(self, dboxes): self.dboxes = dboxes(order="ltrb") self.dboxes_xywh = dboxes(order="xywh").unsqueeze(dim=0) self.nboxes = self.dboxes.size(0) - #print("# Bounding boxes: {}".format(self.nboxes)) + # print("# Bounding boxes: {}".format(self.nboxes)) self.scale_xy = torch.tensor(dboxes.scale_xy) self.scale_wh = torch.tensor(dboxes.scale_wh) - - - def decode_batch(self, bboxes_in, scores_in, criteria = 0.45, max_output=200): + + def decode_batch(self, bboxes_in, scores_in, + criteria=0.45, max_output=200): self.dboxes = self.dboxes.to(bboxes_in) self.dboxes_xywh = self.dboxes_xywh.to(bboxes_in) - bboxes, probs = scale_back_batch(bboxes_in, scores_in,self.scale_xy,self.scale_wh,self.dboxes_xywh) - boxes = []; labels=[]; scores=[] + bboxes, probs = scale_back_batch( + bboxes_in, scores_in, self.scale_xy, self.scale_wh, self.dboxes_xywh + ) + boxes = [] + labels = [] + scores = [] for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)): bbox = bbox.squeeze(0) prob = prob.squeeze(0) - dbox,dlabel,dscore=self.decode_single(bbox, prob, criteria, max_output) + dbox, dlabel, dscore = self.decode_single( + bbox, prob, criteria, max_output) boxes.append(dbox) labels.append(dlabel) scores.append(dscore) - - return [boxes,labels,scores] + + return [boxes, labels, scores] # perform non-maximum suppression - def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200): + def 
decode_single(self, bboxes_in, scores_in, + criteria, max_output, max_num=200): # Reference to https://github.com/amdegroot/ssd.pytorch - - bboxes_out = [] + + bboxes_out = [] scores_out = [] labels_out = [] for i, score in enumerate(scores_in.split(1, 1)): # skip background - if i == 0: continue - + if i == 0: + continue + score = score.squeeze(1) mask = score > 0.05 bboxes, score = bboxes_in[mask, :], score[mask] - if score.size(0) == 0: continue + if score.size(0) == 0: + continue score_sorted, score_idx_sorted = score.sort(dim=0) - + # select max_output indices score_idx_sorted = score_idx_sorted[-max_num:] candidates = [] - + while score_idx_sorted.numel() > 0: idx = score_idx_sorted[-1].item() bboxes_sorted = bboxes[score_idx_sorted, :] bboxes_idx = bboxes[idx, :].unsqueeze(dim=0) - iou_sorted = calc_iou_tensor(bboxes_sorted, bboxes_idx).squeeze() - # we only need iou < criteria + iou_sorted = calc_iou_tensor( + bboxes_sorted, bboxes_idx).squeeze() + # we only need iou < criteria score_idx_sorted = score_idx_sorted[iou_sorted < criteria] candidates.append(idx) bboxes_out.append(bboxes[candidates, :]) scores_out.append(score[candidates]) - labels_out.extend([i]*len(candidates)) - - bboxes_out, labels_out, scores_out = torch.cat(bboxes_out, dim=0), \ - torch.tensor(labels_out, dtype=torch.long), \ - torch.cat(scores_out, dim=0) + labels_out.extend([i] * len(candidates)) + bboxes_out, labels_out, scores_out = ( + torch.cat(bboxes_out, dim=0), + torch.tensor(labels_out, dtype=torch.long), + torch.cat(scores_out, dim=0), + ) _, max_ids = scores_out.sort(dim=0) max_ids = max_ids[-max_output:] return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids] + @torch.jit.script def calc_iou_tensor(box1, box2): - """ Calculation of IoU based on two boxes tensor, - Reference to https://github.com/kuangliu/pytorch-ssd - input: - box1 (N, 4) - box2 (M, 4) - output: - IoU (N, M) + """Calculation of IoU based on two boxes tensor, + Reference to https://github.com/kuangliu/pytorch-ssd + input: + box1 (N, 4) + box2 (M, 4) + output: + IoU (N, M) """ N = box1.size(0) M = box2.size(0) @@ -110,37 +122,42 @@ def calc_iou_tensor(box1, box2): be2 = box2.unsqueeze(0).expand(N, -1, -1) # Left Top & Right Bottom - lt = torch.max(be1[:,:,:2], be2[:,:,:2]) - rb = torch.min(be1[:,:,2:], be2[:,:,2:]) + lt = torch.max(be1[:, :, :2], be2[:, :, :2]) + rb = torch.min(be1[:, :, 2:], be2[:, :, 2:]) delta = rb - lt - delta.clone().masked_fill_(delta < 0,0) - intersect = delta[:,:,0]*delta[:,:,1] - delta1 = be1[:,:,2:] - be1[:,:,:2] - area1 = delta1[:,:,0]*delta1[:,:,1] - delta2 = be2[:,:,2:] - be2[:,:,:2] - area2 = delta2[:,:,0]*delta2[:,:,1] - - iou = intersect/(area1 + area2 - intersect) + delta.clone().masked_fill_(delta < 0, 0) + intersect = delta[:, :, 0] * delta[:, :, 1] + delta1 = be1[:, :, 2:] - be1[:, :, :2] + area1 = delta1[:, :, 0] * delta1[:, :, 1] + delta2 = be2[:, :, 2:] - be2[:, :, :2] + area2 = delta2[:, :, 0] * delta2[:, :, 1] + + iou = intersect / (area1 + area2 - intersect) return iou + @torch.jit.script -def scale_back_batch(bboxes_in, scores_in,scale_xy,scale_wh,dboxes_xywh): +def scale_back_batch(bboxes_in, scores_in, scale_xy, scale_wh, dboxes_xywh): + """ + Do scale and transform from xywh to ltrb + suppose input Nx4xnum_bbox Nxlabel_numxnum_bbox """ - Do scale and transform from xywh to ltrb - suppose input Nx4xnum_bbox Nxlabel_numxnum_bbox - """ bboxes_in = bboxes_in.permute(0, 2, 1) scores_in = scores_in.permute(0, 2, 1) - bboxes_in[:, :, :2] = scale_xy*bboxes_in[:, :, :2] - 
bboxes_in[:, :, 2:] = scale_wh*bboxes_in[:, :, 2:] - bboxes_in[:, :, :2] = bboxes_in[:, :, :2]*dboxes_xywh[:, :, 2:] + dboxes_xywh[:, :, :2] - bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp()*dboxes_xywh[:, :, 2:] - # Transform format to ltrb - l, t, r, b = bboxes_in[:, :, 0] - 0.5*bboxes_in[:, :, 2],\ - bboxes_in[:, :, 1] - 0.5*bboxes_in[:, :, 3],\ - bboxes_in[:, :, 0] + 0.5*bboxes_in[:, :, 2],\ - bboxes_in[:, :, 1] + 0.5*bboxes_in[:, :, 3] + bboxes_in[:, :, :2] = scale_xy * bboxes_in[:, :, :2] + bboxes_in[:, :, 2:] = scale_wh * bboxes_in[:, :, 2:] + bboxes_in[:, :, :2] = ( + bboxes_in[:, :, :2] * dboxes_xywh[:, :, 2:] + dboxes_xywh[:, :, :2] + ) + bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp() * dboxes_xywh[:, :, 2:] + # Transform format to ltrb + l, t, r, b = ( + bboxes_in[:, :, 0] - 0.5 * bboxes_in[:, :, 2], + bboxes_in[:, :, 1] - 0.5 * bboxes_in[:, :, 3], + bboxes_in[:, :, 0] + 0.5 * bboxes_in[:, :, 2], + bboxes_in[:, :, 1] + 0.5 * bboxes_in[:, :, 3], + ) bboxes_in[:, :, 0] = l bboxes_in[:, :, 1] = t bboxes_in[:, :, 2] = r @@ -149,92 +166,116 @@ def scale_back_batch(bboxes_in, scores_in,scale_xy,scale_wh,dboxes_xywh): class DefaultBoxes(object): - def __init__(self, fig_size, feat_size, steps, scales, aspect_ratios, \ - scale_xy=0.1, scale_wh=0.2): + def __init__( + self, + fig_size, + feat_size, + steps, + scales, + aspect_ratios, + scale_xy=0.1, + scale_wh=0.2, + ): self.feat_size = feat_size - self.fig_size_w,self.fig_size_h = fig_size + self.fig_size_w, self.fig_size_h = fig_size self.scale_xy_ = scale_xy self.scale_wh_ = scale_wh - + # According to https://github.com/weiliu89/caffe # Calculation method slightly different from paper self.steps_w = [st[0] for st in steps] self.steps_h = [st[1] for st in steps] self.scales = scales - fkw = self.fig_size_w//np.array(self.steps_w) - fkh = self.fig_size_h//np.array(self.steps_h) + fkw = self.fig_size_w // np.array(self.steps_w) + fkh = self.fig_size_h // np.array(self.steps_h) self.aspect_ratios = aspect_ratios self.default_boxes = [] # size of feature and number of feature for idx, sfeat in enumerate(self.feat_size): - sfeat_w,sfeat_h=sfeat - sk1 = scales[idx][0]/self.fig_size_w - sk2 = scales[idx+1][1]/self.fig_size_h - sk3 = sqrt(sk1*sk2) + sfeat_w, sfeat_h = sfeat + sk1 = scales[idx][0] / self.fig_size_w + sk2 = scales[idx + 1][1] / self.fig_size_h + sk3 = sqrt(sk1 * sk2) all_sizes = [(sk1, sk1), (sk3, sk3)] for alpha in aspect_ratios[idx]: - w, h = sk1*sqrt(alpha), sk1/sqrt(alpha) + w, h = sk1 * sqrt(alpha), sk1 / sqrt(alpha) all_sizes.append((w, h)) all_sizes.append((h, w)) for w, h in all_sizes: for i, j in itertools.product(range(sfeat_w), range(sfeat_h)): - cx, cy = (j+0.5)/fkh[idx], (i+0.5)/fkw[idx] - self.default_boxes.append((cx, cy, w, h)) + cx, cy = (j + 0.5) / fkh[idx], (i + 0.5) / fkw[idx] + self.default_boxes.append((cx, cy, w, h)) self.dboxes = torch.tensor(self.default_boxes) self.dboxes.clamp_(min=0, max=1) # For IoU calculation self.dboxes_ltrb = self.dboxes.clone() - self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5*self.dboxes[:, 2] - self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5*self.dboxes[:, 3] - self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5*self.dboxes[:, 2] - self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5*self.dboxes[:, 3] - + self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5 * self.dboxes[:, 2] + self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5 * self.dboxes[:, 3] + self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5 * self.dboxes[:, 2] + self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5 * self.dboxes[:, 
3] + @property def scale_xy(self): return self.scale_xy_ - - @property + + @property def scale_wh(self): return self.scale_wh_ def __call__(self, order="ltrb"): - if order == "ltrb": return self.dboxes_ltrb - if order == "xywh": return self.dboxes + if order == "ltrb": + return self.dboxes_ltrb + if order == "xywh": + return self.dboxes + -def dboxes_R34_coco(figsize,strides): +def dboxes_R34_coco(figsize, strides): feat_size = [[50, 50], [25, 25], [13, 13], [7, 7], [3, 3], [3, 3]] - steps=[(int(figsize[0]/fs[0]),int(figsize[1]/fs[1])) for fs in feat_size] - scales = [(int(s*figsize[0]/300),int(s*figsize[1]/300)) for s in [21, 45, 99, 153, 207, 261, 315]] - aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] + steps = [(int(figsize[0] / fs[0]), int(figsize[1] / fs[1])) + for fs in feat_size] + scales = [ + (int(s * figsize[0] / 300), int(s * figsize[1] / 300)) + for s in [21, 45, 99, 153, 207, 261, 315] + ] + aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] dboxes = DefaultBoxes(figsize, feat_size, steps, scales, aspect_ratios) return dboxes + class SSD_R34(nn.Module): """ - Build a SSD module to take 300x300 image input, - and output 8732 per class bounding boxes + Build a SSD module to take 300x300 image input, + and output 8732 per class bounding boxes - vggt: pretrained vgg16 (partial) model - label_num: number of classes (including background 0) + vggt: pretrained vgg16 (partial) model + label_num: number of classes (including background 0) """ - def __init__(self, label_num=81, backbone='resnet34', model_path="./resnet34-333f7ec4.pth",strides=[3,3 ,2 ,2 ,2 ,2],extract_shapes=False): + + def __init__( + self, + label_num=81, + backbone="resnet34", + model_path="./resnet34-333f7ec4.pth", + strides=[3, 3, 2, 2, 2, 2], + extract_shapes=False, + ): super(SSD_R34, self).__init__() self.label_num = label_num self.strides = strides - if backbone == 'resnet34': + if backbone == "resnet34": self.model = ResNet34() out_channels = 256 self.out_chan = [out_channels, 512, 512, 256, 256, 256] else: - raise ValueError('Invalid backbone chosen') + raise ValueError("Invalid backbone chosen") self._build_additional_features(self.out_chan) - self.extract_shapes=extract_shapes + self.extract_shapes = extract_shapes # after l2norm, conv7, conv8_2, conv9_2, conv10_2, conv11_2 # classifer 1, 2, 3, 4, 5 ,6 @@ -242,110 +283,159 @@ def __init__(self, label_num=81, backbone='resnet34', model_path="./resnet34-333 self.loc = [] self.conf = [] for nd, oc in zip(self.num_defaults, self.out_chan): - self.loc.append(nn.Conv2d(oc, nd*4, kernel_size=3, padding=1,stride=self.strides[0])) - self.conf.append(nn.Conv2d(oc, nd*label_num, kernel_size=3, padding=1,stride=self.strides[1])) + self.loc.append( + nn.Conv2d( + oc, + nd * 4, + kernel_size=3, + padding=1, + stride=self.strides[0]) + ) + self.conf.append( + nn.Conv2d( + oc, nd * label_num, kernel_size=3, padding=1, stride=self.strides[1] + ) + ) self.loc = nn.ModuleList(self.loc) self.conf = nn.ModuleList(self.conf) if not extract_shapes: - self.size=(1200,1200) - dboxes = dboxes_R34_coco(list(self.size),[3,3,2,2,2,2]) + self.size = (1200, 1200) + dboxes = dboxes_R34_coco(list(self.size), [3, 3, 2, 2, 2, 2]) self.encoder = Encoder(dboxes) # intitalize all weights self._init_weights() self.device = 1 + def _build_additional_features(self, input_channels): idx = 0 self.additional_blocks = [] - - self.additional_blocks.append(nn.Sequential( - nn.Conv2d(input_channels[idx], 256, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(256, input_channels[idx+1], 
kernel_size=3, padding=1,stride=self.strides[2]), - nn.ReLU(inplace=True), - )) + + self.additional_blocks.append( + nn.Sequential( + nn.Conv2d(input_channels[idx], 256, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d( + 256, + input_channels[idx + 1], + kernel_size=3, + padding=1, + stride=self.strides[2], + ), + nn.ReLU(inplace=True), + ) + ) idx += 1 - self.additional_blocks.append(nn.Sequential( - nn.Conv2d(input_channels[idx], 256, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(256, input_channels[idx+1], kernel_size=3, padding=1, stride=self.strides[3]), - nn.ReLU(inplace=True), - )) + self.additional_blocks.append( + nn.Sequential( + nn.Conv2d(input_channels[idx], 256, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d( + 256, + input_channels[idx + 1], + kernel_size=3, + padding=1, + stride=self.strides[3], + ), + nn.ReLU(inplace=True), + ) + ) idx += 1 # conv9_1, conv9_2 - self.additional_blocks.append(nn.Sequential( - nn.Conv2d(input_channels[idx], 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, input_channels[idx+1], kernel_size=3, padding=1, stride=self.strides[4]), - nn.ReLU(inplace=True), - )) + self.additional_blocks.append( + nn.Sequential( + nn.Conv2d(input_channels[idx], 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d( + 128, + input_channels[idx + 1], + kernel_size=3, + padding=1, + stride=self.strides[4], + ), + nn.ReLU(inplace=True), + ) + ) idx += 1 # conv10_1, conv10_2 - self.additional_blocks.append(nn.Sequential( - nn.Conv2d(input_channels[idx], 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, input_channels[idx+1], kernel_size=3,stride=self.strides[5]), - nn.ReLU(inplace=True), - )) + self.additional_blocks.append( + nn.Sequential( + nn.Conv2d(input_channels[idx], 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d( + 128, input_channels[idx + 1], kernel_size=3, stride=self.strides[5] + ), + nn.ReLU(inplace=True), + ) + ) idx += 1 - - # conv11_1, conv11_2 - self.additional_blocks.append(nn.Sequential( - nn.Conv2d(input_channels[idx], 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, input_channels[idx+1], kernel_size=3), - nn.ReLU(inplace=True), - )) + self.additional_blocks.append( + nn.Sequential( + nn.Conv2d(input_channels[idx], 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, input_channels[idx + 1], kernel_size=3), + nn.ReLU(inplace=True), + ) + ) self.additional_blocks = nn.ModuleList(self.additional_blocks) def _init_weights(self): - layers = [ - *self.additional_blocks, - *self.loc, *self.conf] + layers = [*self.additional_blocks, *self.loc, *self.conf] for layer in layers: for param in layer.parameters(): - if param.dim() > 1: nn.init.xavier_uniform_(param) + if param.dim() > 1: + nn.init.xavier_uniform_(param) # Shape the classifier to the view of bboxes - def bbox_view(self, src, loc, conf,extract_shapes=False): + def bbox_view(self, src, loc, conf, extract_shapes=False): ret = [] features_shapes = [] for s, l, c in zip(src, loc, conf): - ret.append((l(s).view(s.size(0), 4, -1), c(s).view(s.size(0), self.label_num, -1))) - # extract shapes for prior box initliziation + ret.append( + (l(s).view(s.size(0), 4, -1), + c(s).view(s.size(0), self.label_num, -1)) + ) + # extract shapes for prior box initliziation if extract_shapes: - ls=l(s) - features_shapes.append([ls.shape[2],ls.shape[3]]) + ls = l(s) + features_shapes.append([ls.shape[2], ls.shape[3]]) locs, confs = list(zip(*ret)) - locs, confs = torch.cat(locs, 2).contiguous(), torch.cat(confs, 2).contiguous() - return 
locs, confs,features_shapes + locs, confs = torch.cat( + locs, 2).contiguous(), torch.cat( + confs, 2).contiguous() + return locs, confs, features_shapes def forward(self, data): layers = self.model(data) # last result from network goes into additional blocks x = layers[-1] - + additional_results = [] for i, l in enumerate(self.additional_blocks): - + x = l(x) additional_results.append(x) src = [*layers, *additional_results] - # Feature maps sizes depend on the image size. For 300x300 with strides=[1,1,2,2,2,1] it is 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4 - locs, confs,features_shapes = self.bbox_view(src, self.loc, self.conf,extract_shapes=self.extract_shapes) + # Feature maps sizes depend on the image size. For 300x300 with + # strides=[1,1,2,2,2,1] it is 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, + # 1x1x4 + locs, confs, features_shapes = self.bbox_view( + src, self.loc, self.conf, extract_shapes=self.extract_shapes + ) if self.extract_shapes: - return locs, confs,features_shapes - else: - # For SSD 300 with strides=[1,1,2,2,2,1] , shall return nbatch x 8732 x {nlabels, nlocs} results - results=self.encoder.decode_batch(locs, confs, 0.50, 200) #[0] - return results #locs, confs,features_shapes + return locs, confs, features_shapes + else: + # For SSD 300 with strides=[1,1,2,2,2,1] , shall return nbatch x + # 8732 x {nlabels, nlocs} results + results = self.encoder.decode_batch(locs, confs, 0.50, 200) # [0] + return results # locs, confs,features_shapes diff --git a/retired_benchmarks/vision/classification_and_detection/python/models/utils.py b/retired_benchmarks/vision/classification_and_detection/python/models/utils.py index 940722075..49299f584 100644 --- a/retired_benchmarks/vision/classification_and_detection/python/models/utils.py +++ b/retired_benchmarks/vision/classification_and_detection/python/models/utils.py @@ -15,7 +15,7 @@ def __init__(self, out): self.register_buffer("scale", torch.ones(out)) self.register_buffer("bias", torch.zeros(out)) - #@torch.jit.script_method + # @torch.jit.script_method def forward(self, x): scale = self.scale.view(1, -1, 1, 1) bias = self.bias.view(1, -1, 1, 1) @@ -31,7 +31,7 @@ def __init__(self, out): super(BiasAdd, self).__init__() self.register_buffer("bias", torch.zeros(out)) - #@torch.jit.script_method + # @torch.jit.script_method def forward(self, x): bias = self.bias.view(1, -1, 1, 1) return x + bias @@ -52,14 +52,15 @@ def _compute_padding(self, input, dim): effective_filter_size = (filter_size - 1) * self.dilation[dim] + 1 out_size = (input_size + self.stride[dim] - 1) // self.stride[dim] total_padding = max( - 0, (out_size - 1) * self.stride[dim] + effective_filter_size - input_size + 0, (out_size - 1) * self.stride[dim] + + effective_filter_size - input_size ) additional_padding = int(total_padding % 2 != 0) return additional_padding, total_padding def forward(self, input): - #import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() if self.padding == "VALID": return F.conv2d( input, @@ -151,8 +152,8 @@ def decode_boxes(rel_codes, boxes, weights): # type: (torch.Tensor, torch.Tensor, torch.Tensor) -> torch.Tensor # perform some unpacking to make it JIT-fusion friendly - - #rel_codes=rel_codes[0][None] + + # rel_codes=rel_codes[0][None] wx = weights[1] wy = weights[0] ww = weights[3] @@ -163,10 +164,10 @@ def decode_boxes(rel_codes, boxes, weights): boxes_x2 = boxes[:, 3].unsqueeze(1).unsqueeze(0) boxes_y2 = boxes[:, 2].unsqueeze(1).unsqueeze(0) - dx = rel_codes[:,:, 1].unsqueeze(2) - dy = rel_codes[:,:, 0].unsqueeze(2) - dw = 
rel_codes[:,:, 3].unsqueeze(2) - dh = rel_codes[:,:, 2].unsqueeze(2) + dx = rel_codes[:, :, 1].unsqueeze(2) + dy = rel_codes[:, :, 0].unsqueeze(2) + dw = rel_codes[:, :, 3].unsqueeze(2) + dh = rel_codes[:, :, 2].unsqueeze(2) # implementation starts here widths = boxes_x2 - boxes_x1 @@ -180,7 +181,7 @@ def decode_boxes(rel_codes, boxes, weights): dh = dh / wh pred_ctr_x = dx * widths + ctr_x - #import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() pred_ctr_y = dy * heights + ctr_y pred_w = torch.exp(dw) * widths pred_h = torch.exp(dh) * heights @@ -194,5 +195,5 @@ def decode_boxes(rel_codes, boxes, weights): ], dim=2, ) - #import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() return pred_boxes diff --git a/retired_benchmarks/vision/classification_and_detection/python/pycoco.py b/retired_benchmarks/vision/classification_and_detection/python/pycoco.py index 931863569..f9d5f2e87 100644 --- a/retired_benchmarks/vision/classification_and_detection/python/pycoco.py +++ b/retired_benchmarks/vision/classification_and_detection/python/pycoco.py @@ -1,5 +1,5 @@ -__author__ = 'tylin' -__version__ = '2.0' +__author__ = "tylin" +__version__ = "2.0" # Interface for accessing the Microsoft COCO dataset. # Microsoft COCO is a large image dataset designed for object detection, @@ -46,9 +46,10 @@ import json import time -#import matplotlib.pyplot as plt -#from matplotlib.collections import PatchCollection -#from matplotlib.patches import Polygon + +# import matplotlib.pyplot as plt +# from matplotlib.collections import PatchCollection +# from matplotlib.patches import Polygon import numpy as np import copy import itertools @@ -56,6 +57,7 @@ import os from collections import defaultdict import sys + PYTHON_VERSION = sys.version_info[0] if PYTHON_VERSION == 2: from urllib import urlretrieve @@ -64,7 +66,7 @@ def _isArrayLike(obj): - return hasattr(obj, '__iter__') and hasattr(obj, '__len__') + return hasattr(obj, "__iter__") and hasattr(obj, "__len__") class COCO: @@ -76,40 +78,42 @@ def __init__(self, annotation_file=None): :return: """ # load dataset - self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict() + self.dataset, self.anns, self.cats, self.imgs = dict(), dict(), dict(), dict() self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list) - if not annotation_file == None: - print('loading annotations into memory...') + if not annotation_file is None: + print("loading annotations into memory...") tic = time.time() - dataset = json.load(open(annotation_file, 'r')) - assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) - print('Done (t={:0.2f}s)'.format(time.time()- tic)) + dataset = json.load(open(annotation_file, "r")) + assert ( + isinstance(dataset, dict) + ), "annotation file format {} not supported".format(type(dataset)) + print("Done (t={:0.2f}s)".format(time.time() - tic)) self.dataset = dataset self.createIndex() def createIndex(self): # create index - print('creating index...') + print("creating index...") anns, cats, imgs = {}, {}, {} - imgToAnns,catToImgs = defaultdict(list),defaultdict(list) - if 'annotations' in self.dataset: - for ann in self.dataset['annotations']: - imgToAnns[ann['image_id']].append(ann) - anns[ann['id']] = ann + imgToAnns, catToImgs = defaultdict(list), defaultdict(list) + if "annotations" in self.dataset: + for ann in self.dataset["annotations"]: + imgToAnns[ann["image_id"]].append(ann) + anns[ann["id"]] = ann - if 'images' in self.dataset: - for img in self.dataset['images']: - 
imgs[img['id']] = img + if "images" in self.dataset: + for img in self.dataset["images"]: + imgs[img["id"]] = img - if 'categories' in self.dataset: - for cat in self.dataset['categories']: - cats[cat['id']] = cat + if "categories" in self.dataset: + for cat in self.dataset["categories"]: + cats[cat["id"]] = cat - if 'annotations' in self.dataset and 'categories' in self.dataset: - for ann in self.dataset['annotations']: - catToImgs[ann['category_id']].append(ann['image_id']) + if "annotations" in self.dataset and "categories" in self.dataset: + for ann in self.dataset["annotations"]: + catToImgs[ann["category_id"]].append(ann["image_id"]) - print('index created!') + print("index created!") # create class members self.anns = anns @@ -123,8 +127,8 @@ def info(self): Print information about the annotation file. :return: """ - for key, value in self.dataset['info'].items(): - print('{}: {}'.format(key, value)) + for key, value in self.dataset["info"].items(): + print("{}: {}".format(key, value)) def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): """ @@ -139,19 +143,33 @@ def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): catIds = catIds if _isArrayLike(catIds) else [catIds] if len(imgIds) == len(catIds) == len(areaRng) == 0: - anns = self.dataset['annotations'] + anns = self.dataset["annotations"] else: if not len(imgIds) == 0: - lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] + lists = [ + self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns + ] anns = list(itertools.chain.from_iterable(lists)) else: - anns = self.dataset['annotations'] - anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] - anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] - if not iscrowd == None: - ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] + anns = self.dataset["annotations"] + anns = ( + anns + if len(catIds) == 0 + else [ann for ann in anns if ann["category_id"] in catIds] + ) + anns = ( + anns + if len(areaRng) == 0 + else [ + ann + for ann in anns + if ann["area"] > areaRng[0] and ann["area"] < areaRng[1] + ] + ) + if not iscrowd is None: + ids = [ann["id"] for ann in anns if ann["iscrowd"] == iscrowd] else: - ids = [ann['id'] for ann in anns] + ids = [ann["id"] for ann in anns] return ids def getCatIds(self, catNms=[], supNms=[], catIds=[]): @@ -167,22 +185,34 @@ def getCatIds(self, catNms=[], supNms=[], catIds=[]): catIds = catIds if _isArrayLike(catIds) else [catIds] if len(catNms) == len(supNms) == len(catIds) == 0: - cats = self.dataset['categories'] + cats = self.dataset["categories"] else: - cats = self.dataset['categories'] - cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] - cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] - cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] - ids = [cat['id'] for cat in cats] + cats = self.dataset["categories"] + cats = ( + cats + if len(catNms) == 0 + else [cat for cat in cats if cat["name"] in catNms] + ) + cats = ( + cats + if len(supNms) == 0 + else [cat for cat in cats if cat["supercategory"] in supNms] + ) + cats = ( + cats + if len(catIds) == 0 + else [cat for cat in cats if cat["id"] in catIds] + ) + ids = [cat["id"] for cat in cats] return ids def getImgIds(self, imgIds=[], catIds=[]): - ''' + """ Get img ids that satisfy given filter conditions. 
:param imgIds (int array) : get imgs for given ids :param catIds (int array) : get imgs with all given cats :return: ids (int array) : integer array of img ids - ''' + """ imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] catIds = catIds if _isArrayLike(catIds) else [catIds] @@ -205,7 +235,7 @@ def loadAnns(self, ids=[]): """ if _isArrayLike(ids): return [self.anns[id] for id in ids] - elif type(ids) == int: + elif isinstance(ids, int): return [self.anns[ids]] def loadCats(self, ids=[]): @@ -216,7 +246,7 @@ def loadCats(self, ids=[]): """ if _isArrayLike(ids): return [self.cats[id] for id in ids] - elif type(ids) == int: + elif isinstance(ids, int): return [self.cats[ids]] def loadImgs(self, ids=[]): @@ -227,7 +257,7 @@ def loadImgs(self, ids=[]): """ if _isArrayLike(ids): return [self.imgs[id] for id in ids] - elif type(ids) == int: + elif isinstance(ids, int): return [self.imgs[ids]] def showAnns(self, anns): @@ -238,61 +268,88 @@ def showAnns(self, anns): """ if len(anns) == 0: return 0 - if 'segmentation' in anns[0] or 'keypoints' in anns[0]: - datasetType = 'instances' - elif 'caption' in anns[0]: - datasetType = 'captions' + if "segmentation" in anns[0] or "keypoints" in anns[0]: + datasetType = "instances" + elif "caption" in anns[0]: + datasetType = "captions" else: - raise Exception('datasetType not supported') - if datasetType == 'instances': + raise Exception("datasetType not supported") + if datasetType == "instances": ax = plt.gca() ax.set_autoscale_on(False) polygons = [] color = [] for ann in anns: - c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] - if 'segmentation' in ann: - if type(ann['segmentation']) == list: + c = (np.random.random((1, 3)) * 0.6 + 0.4).tolist()[0] + if "segmentation" in ann: + if isinstance(ann["segmentation"], list): # polygon - for seg in ann['segmentation']: - poly = np.array(seg).reshape((int(len(seg)/2), 2)) + for seg in ann["segmentation"]: + poly = np.array(seg).reshape( + (int(len(seg) / 2), 2)) polygons.append(Polygon(poly)) color.append(c) else: # mask - t = self.imgs[ann['image_id']] - if type(ann['segmentation']['counts']) == list: - rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width']) + t = self.imgs[ann["image_id"]] + if isinstance(ann["segmentation"]["counts"], list): + rle = maskUtils.frPyObjects( + [ann["segmentation"]], t["height"], t["width"] + ) else: - rle = [ann['segmentation']] + rle = [ann["segmentation"]] m = maskUtils.decode(rle) - img = np.ones( (m.shape[0], m.shape[1], 3) ) - if ann['iscrowd'] == 1: - color_mask = np.array([2.0,166.0,101.0])/255 - if ann['iscrowd'] == 0: + img = np.ones((m.shape[0], m.shape[1], 3)) + if ann["iscrowd"] == 1: + color_mask = np.array([2.0, 166.0, 101.0]) / 255 + if ann["iscrowd"] == 0: color_mask = np.random.random((1, 3)).tolist()[0] for i in range(3): - img[:,:,i] = color_mask[i] - ax.imshow(np.dstack( (img, m*0.5) )) - if 'keypoints' in ann and type(ann['keypoints']) == list: + img[:, :, i] = color_mask[i] + ax.imshow(np.dstack((img, m * 0.5))) + if "keypoints" in ann and isinstance(ann["keypoints"], list): # turn skeleton into zero-based index - sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 - kp = np.array(ann['keypoints']) + sks = np.array( + self.loadCats( + ann["category_id"])[0]["skeleton"]) - 1 + kp = np.array(ann["keypoints"]) x = kp[0::3] y = kp[1::3] v = kp[2::3] for sk in sks: - if np.all(v[sk]>0): - plt.plot(x[sk],y[sk], linewidth=3, color=c) - plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, 
markeredgecolor='k',markeredgewidth=2) - plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2) - p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) + if np.all(v[sk] > 0): + plt.plot(x[sk], y[sk], linewidth=3, color=c) + plt.plot( + x[v > 0], + y[v > 0], + "o", + markersize=8, + markerfacecolor=c, + markeredgecolor="k", + markeredgewidth=2, + ) + plt.plot( + x[v > 1], + y[v > 1], + "o", + markersize=8, + markerfacecolor=c, + markeredgecolor=c, + markeredgewidth=2, + ) + p = PatchCollection( + polygons, + facecolor=color, + linewidths=0, + alpha=0.4) ax.add_collection(p) - p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2) + p = PatchCollection( + polygons, facecolor="none", edgecolors=color, linewidths=2 + ) ax.add_collection(p) - elif datasetType == 'captions': + elif datasetType == "captions": for ann in anns: - print(ann['caption']) + print(ann["caption"]) def loadRes(self, resFile): """ @@ -301,69 +358,78 @@ def loadRes(self, resFile): :return: res (obj) : result api object """ res = COCO() - res.dataset['images'] = [img for img in self.dataset['images']] + res.dataset["images"] = [img for img in self.dataset["images"]] - print('Loading and preparing results...') + print("Loading and preparing results...") tic = time.time() - if type(resFile) == str: #or type(resFile) == unicode: + if isinstance(resFile, str): # or type(resFile) == unicode: anns = json.load(open(resFile)) - elif type(resFile) == np.ndarray: + elif isinstance(resFile, np.ndarray): anns = self.loadNumpyAnnotations(resFile) else: anns = resFile - assert type(anns) == list, 'results in not an array of objects' - annsImgIds = [ann['image_id'] for ann in anns] - assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ - 'Results do not correspond to current coco set' - if 'caption' in anns[0]: - imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) - res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] + assert isinstance(anns, list), "results in not an array of objects" + annsImgIds = [ann["image_id"] for ann in anns] + assert set(annsImgIds) == ( + set(annsImgIds) & set(self.getImgIds()) + ), "Results do not correspond to current coco set" + if "caption" in anns[0]: + imgIds = set([img["id"] for img in res.dataset["images"]]) & set( + [ann["image_id"] for ann in anns] + ) + res.dataset["images"] = [ + img for img in res.dataset["images"] if img["id"] in imgIds + ] for id, ann in enumerate(anns): - ann['id'] = id+1 - elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + ann["id"] = id + 1 + elif "bbox" in anns[0] and not anns[0]["bbox"] == []: + res.dataset["categories"] = copy.deepcopy( + self.dataset["categories"]) for id, ann in enumerate(anns): - bb = ann['bbox'] - x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] - if not 'segmentation' in ann: - ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] - ann['area'] = bb[2]*bb[3] - ann['id'] = id+1 - ann['iscrowd'] = 0 - elif 'segmentation' in anns[0]: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + bb = ann["bbox"] + x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]] + if not "segmentation" in ann: + ann["segmentation"] = [[x1, y1, x1, y2, x2, y2, x2, y1]] + ann["area"] = bb[2] * bb[3] + ann["id"] = id + 1 + ann["iscrowd"] = 0 + elif "segmentation" in anns[0]: + 
res.dataset["categories"] = copy.deepcopy( + self.dataset["categories"]) for id, ann in enumerate(anns): - # now only support compressed RLE format as segmentation results - ann['area'] = maskUtils.area(ann['segmentation']) - if not 'bbox' in ann: - ann['bbox'] = maskUtils.toBbox(ann['segmentation']) - ann['id'] = id+1 - ann['iscrowd'] = 0 - elif 'keypoints' in anns[0]: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + # now only support compressed RLE format as segmentation + # results + ann["area"] = maskUtils.area(ann["segmentation"]) + if not "bbox" in ann: + ann["bbox"] = maskUtils.toBbox(ann["segmentation"]) + ann["id"] = id + 1 + ann["iscrowd"] = 0 + elif "keypoints" in anns[0]: + res.dataset["categories"] = copy.deepcopy( + self.dataset["categories"]) for id, ann in enumerate(anns): - s = ann['keypoints'] + s = ann["keypoints"] x = s[0::3] y = s[1::3] - x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y) - ann['area'] = (x1-x0)*(y1-y0) - ann['id'] = id + 1 - ann['bbox'] = [x0,y0,x1-x0,y1-y0] - print('DONE (t={:0.2f}s)'.format(time.time()- tic)) + x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y) + ann["area"] = (x1 - x0) * (y1 - y0) + ann["id"] = id + 1 + ann["bbox"] = [x0, y0, x1 - x0, y1 - y0] + print("DONE (t={:0.2f}s)".format(time.time() - tic)) - res.dataset['annotations'] = anns + res.dataset["annotations"] = anns res.createIndex() return res - def download(self, tarDir = None, imgIds = [] ): - ''' + def download(self, tarDir=None, imgIds=[]): + """ Download COCO images from mscoco.org server. :param tarDir (str): COCO results directory name imgIds (list): images to be downloaded :return: - ''' + """ if tarDir is None: - print('Please specify target directory') + print("Please specify target directory") return -1 if len(imgIds) == 0: imgs = self.imgs.values() @@ -374,10 +440,13 @@ def download(self, tarDir = None, imgIds = [] ): os.makedirs(tarDir) for i, img in enumerate(imgs): tic = time.time() - fname = os.path.join(tarDir, img['file_name']) + fname = os.path.join(tarDir, img["file_name"]) if not os.path.exists(fname): - urlretrieve(img['coco_url'], fname) - print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)) + urlretrieve(img["coco_url"], fname) + print( + "downloaded {}/{} images (t={:0.1f}s)".format(i, + N, time.time() - tic) + ) def loadNumpyAnnotations(self, data): """ @@ -385,21 +454,23 @@ def loadNumpyAnnotations(self, data): :param data (numpy.ndarray) :return: annotations (python nested list) """ - print('Converting ndarray to lists...') - assert(type(data) == np.ndarray) + print("Converting ndarray to lists...") + assert isinstance(data, np.ndarray) print(data.shape) - assert(data.shape[1] == 7) + assert data.shape[1] == 7 N = data.shape[0] ann = [] for i in range(N): if i % 1000000 == 0: - print('{}/{}'.format(i,N)) - ann += [{ - 'image_id' : int(data[i, 0]), - 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ], - 'score' : data[i, 5], - 'category_id': int(data[i, 6]), - }] + print("{}/{}".format(i, N)) + ann += [ + { + "image_id": int(data[i, 0]), + "bbox": [data[i, 1], data[i, 2], data[i, 3], data[i, 4]], + "score": data[i, 5], + "category_id": int(data[i, 6]), + } + ] return ann def annToRLE(self, ann): @@ -407,20 +478,20 @@ def annToRLE(self, ann): Convert annotation which can be polygons, uncompressed RLE to RLE. 
:return: binary mask (numpy 2D array) """ - t = self.imgs[ann['image_id']] - h, w = t['height'], t['width'] - segm = ann['segmentation'] - if type(segm) == list: + t = self.imgs[ann["image_id"]] + h, w = t["height"], t["width"] + segm = ann["segmentation"] + if isinstance(segm, list): # polygon -- a single object might consist of multiple parts # we merge all parts into one mask rle code rles = maskUtils.frPyObjects(segm, h, w) rle = maskUtils.merge(rles) - elif type(segm['counts']) == list: + elif isinstance(segm["counts"], list): # uncompressed RLE rle = maskUtils.frPyObjects(segm, h, w) else: # rle - rle = ann['segmentation'] + rle = ann["segmentation"] return rle def annToMask(self, ann): diff --git a/retired_benchmarks/vision/classification_and_detection/python/version.py b/retired_benchmarks/vision/classification_and_detection/python/version.py index 1152dbb41..570348596 100644 --- a/retired_benchmarks/vision/classification_and_detection/python/version.py +++ b/retired_benchmarks/vision/classification_and_detection/python/version.py @@ -1,3 +1,2 @@ - -version = '0.1.0' -git_version = '05df3bae82ef9fc933277385eb778e3f22cd0c6a' +version = "0.1.0" +git_version = "05df3bae82ef9fc933277385eb778e3f22cd0c6a" diff --git a/retired_benchmarks/vision/classification_and_detection/setup.py b/retired_benchmarks/vision/classification_and_detection/setup.py index c1e2fbcf0..758d874fb 100644 --- a/retired_benchmarks/vision/classification_and_detection/setup.py +++ b/retired_benchmarks/vision/classification_and_detection/setup.py @@ -13,17 +13,20 @@ from setuptools import setup, find_packages, Command TOP_DIR = os.path.realpath(os.path.dirname(__file__)) -SRC_DIR = os.path.join(TOP_DIR, 'python') +SRC_DIR = os.path.join(TOP_DIR, "python") try: - git_version = subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=TOP_DIR).decode('ascii').strip() + git_version = ( + subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=TOP_DIR) + .decode("ascii") + .strip() + ) except (OSError, subprocess.CalledProcessError): git_version = None -with open(os.path.join(TOP_DIR, 'VERSION_NUMBER')) as version_file: - VersionInfo = namedtuple('VersionInfo', ['version', 'git_version'])( - version=version_file.read().strip(), - git_version=git_version +with open(os.path.join(TOP_DIR, "VERSION_NUMBER")) as version_file: + VersionInfo = namedtuple("VersionInfo", ["version", "git_version"])( + version=version_file.read().strip(), git_version=git_version ) @@ -37,49 +40,67 @@ def finalize_options(self): pass def run(self): - with open(os.path.join(SRC_DIR, 'version.py'), 'w') as f: - f.write(dedent(''' + with open(os.path.join(SRC_DIR, "version.py"), "w") as f: + f.write( + dedent( + """ version = '{version}' git_version = '{git_version}' - '''.format(**dict(VersionInfo._asdict())))) + """.format( + **dict(VersionInfo._asdict()) + ) + ) + ) class build_py(setuptools.command.build_py.build_py): def run(self): - self.run_command('create_version') + self.run_command("create_version") setuptools.command.build_py.build_py.run(self) class build(distutils.command.build.build): def run(self): - self.run_command('build_py') + self.run_command("build_py") class develop(setuptools.command.develop.develop): def run(self): - self.run_command('create_version') - self.run_command('build') + self.run_command("create_version") + self.run_command("build") setuptools.command.develop.develop.run(self) cmdclass = { - 'create_version': create_version, - 'build_py': build_py, - 'build': build, - 'develop': develop, + "create_version": 
create_version, + "build_py": build_py, + "build": build, + "develop": develop, } setup( name="mlperf-inference", version=VersionInfo.version, - description='mlperf inference benchmark', - setup_requires=['pytest-runner'], - tests_require=['graphviz', 'parameterized', 'pytest', 'pytest-cov', 'pyyaml'], + description="mlperf inference benchmark", + setup_requires=["pytest-runner"], + tests_require=[ + "graphviz", + "parameterized", + "pytest", + "pytest-cov", + "pyyaml"], cmdclass=cmdclass, packages=find_packages(), - author='guschmue@microsoft.com', - author_email='guschmue@microsoft.com', - url='https://github.com/mlperf/inference', - install_requires=['numpy>=1.14.1', 'onnx>=1.5', 'pybind11', 'Cython', - 'pycocotools', 'mlperf_loadgen', 'opencv-python-headless'] + author="guschmue@microsoft.com", + author_email="guschmue@microsoft.com", + url="https://github.com/mlperf/inference", + install_requires=[ + "numpy>=1.14.1", + "onnx>=1.5", + "pybind11", + "Cython", + "pycocotools", + "mlperf_loadgen", + "opencv-python-headless", + ], ) diff --git a/retired_benchmarks/vision/classification_and_detection/tools/accuracy-coco.py b/retired_benchmarks/vision/classification_and_detection/tools/accuracy-coco.py index 1e15999f1..95e04193b 100644 --- a/retired_benchmarks/vision/classification_and_detection/tools/accuracy-coco.py +++ b/retired_benchmarks/vision/classification_and_detection/tools/accuracy-coco.py @@ -19,15 +19,29 @@ # pylint: disable=missing-docstring + def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() - parser.add_argument("--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json") + parser.add_argument( + "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" + ) parser.add_argument("--coco-dir", required=True, help="coco directory") - parser.add_argument("--verbose", action="store_true", help="verbose messages") - parser.add_argument("--output-file", default="coco-results.json", help="path to output file") - parser.add_argument("--use-inv-map", action="store_true", help="use inverse label map") - parser.add_argument("--remove-48-empty-images", action="store_true", help="used in case you removed 48 empty images while preprocessing the dataset") + parser.add_argument( + "--verbose", + action="store_true", + help="verbose messages") + parser.add_argument( + "--output-file", default="coco-results.json", help="path to output file" + ) + parser.add_argument( + "--use-inv-map", action="store_true", help="use inverse label map" + ) + parser.add_argument( + "--remove-48-empty-images", + action="store_true", + help="used in case you removed 48 empty images while preprocessing the dataset", + ) args = parser.parse_args() return args @@ -35,10 +49,14 @@ def get_args(): def main(): args = get_args() - cocoGt = COCO(os.path.join(args.coco_dir, "annotations/instances_val2017.json")) + cocoGt = COCO( + os.path.join( + args.coco_dir, + "annotations/instances_val2017.json")) if args.use_inv_map: - inv_map = [0] + cocoGt.getCatIds() # First label in inv_map is not used + # First label in inv_map is not used + inv_map = [0] + cocoGt.getCatIds() with open(args.mlperf_accuracy_file, "r") as f: results = json.load(f) @@ -47,7 +65,7 @@ def main(): image_ids = set() seen = set() no_results = 0 - if args.remove_48_empty_images: + if args.remove_48_empty_images: im_ids = [] for i in cocoGt.getCatIds(): im_ids += cocoGt.catToImgs[i] @@ -57,7 +75,7 @@ def main(): image_map = cocoGt.dataset["images"] for j in results: - idx = j['qsl_idx'] + 
idx = j["qsl_idx"] # de-dupe in case loadgen sends the same image multiple times if idx in seen: continue @@ -66,12 +84,14 @@ def main(): # reconstruct from mlperf accuracy log # what is written by the benchmark is an array of float32's: # id, box[0], box[1], box[2], box[3], score, detection_class - # note that id is a index into instances_val2017.json, not the actual image_id - data = np.frombuffer(bytes.fromhex(j['data']), np.float32) + # note that id is a index into instances_val2017.json, not the actual + # image_id + data = np.frombuffer(bytes.fromhex(j["data"]), np.float32) if len(data) < 7: # handle images that had no results image = image_map[idx] - # by adding the id to image_ids we make pycoco aware of the no-result image + # by adding the id to image_ids we make pycoco aware of the + # no-result image image_ids.add(image["id"]) no_results += 1 if args.verbose: @@ -79,11 +99,15 @@ def main(): continue for i in range(0, len(data), 7): - image_idx, ymin, xmin, ymax, xmax, score, label = data[i:i + 7] + image_idx, ymin, xmin, ymax, xmax, score, label = data[i: i + 7] image = image_map[idx] image_idx = int(image_idx) if image_idx != idx: - print("ERROR: loadgen({}) and payload({}) disagree on image_idx".format(idx, image_idx)) + print( + "ERROR: loadgen({}) and payload({}) disagree on image_idx".format( + idx, image_idx + ) + ) image_id = image["id"] height, width = image["height"], image["width"] ymin *= height @@ -95,25 +119,35 @@ def main(): if args.use_inv_map: label = inv_map[label] # pycoco wants {imageID,x1,y1,w,h,score,class} - detections.append({ - "image_id": image_id, - "image_loc": loc, - "category_id": label, - "bbox": [float(xmin), float(ymin), float(xmax - xmin), float(ymax - ymin)], - "score": float(score)}) + detections.append( + { + "image_id": image_id, + "image_loc": loc, + "category_id": label, + "bbox": [ + float(xmin), + float(ymin), + float(xmax - xmin), + float(ymax - ymin), + ], + "score": float(score), + } + ) image_ids.add(image_id) with open(args.output_file, "w") as fp: json.dump(detections, fp, sort_keys=True, indent=4) - cocoDt = cocoGt.loadRes(args.output_file) # Load from file to bypass error with Python3 - cocoEval = COCOeval(cocoGt, cocoDt, iouType='bbox') + cocoDt = cocoGt.loadRes( + args.output_file + ) # Load from file to bypass error with Python3 + cocoEval = COCOeval(cocoGt, cocoDt, iouType="bbox") cocoEval.params.imgIds = list(image_ids) cocoEval.evaluate() cocoEval.accumulate() cocoEval.summarize() - print("mAP={:.3f}%".format(100. 
* cocoEval.stats[0])) + print("mAP={:.3f}%".format(100.0 * cocoEval.stats[0])) if args.verbose: print("found {} results".format(len(results))) print("found {} images".format(len(image_ids))) diff --git a/retired_benchmarks/vision/classification_and_detection/tools/accuracy-imagenet.py b/retired_benchmarks/vision/classification_and_detection/tools/accuracy-imagenet.py index 1879e0f09..a57810891 100644 --- a/retired_benchmarks/vision/classification_and_detection/tools/accuracy-imagenet.py +++ b/retired_benchmarks/vision/classification_and_detection/tools/accuracy-imagenet.py @@ -15,21 +15,32 @@ # pylint: disable=missing-docstring + def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() - parser.add_argument("--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json") - parser.add_argument("--imagenet-val-file", required=True, help="path to imagenet val_map.txt") - parser.add_argument("--verbose", action="store_true", help="verbose messages") - parser.add_argument("--dtype", default="float32", choices=["float32", "int32", "int64"], help="data type of the label") + parser.add_argument( + "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" + ) + parser.add_argument( + "--imagenet-val-file", required=True, help="path to imagenet val_map.txt" + ) + parser.add_argument( + "--verbose", + action="store_true", + help="verbose messages") + parser.add_argument( + "--dtype", + default="float32", + choices=["float32", "int32", "int64"], + help="data type of the label", + ) args = parser.parse_args() return args -dtype_map = { - "float32": np.float32, - "int32": np.int32, - "int64": np.int64 -} + +dtype_map = {"float32": np.float32, "int32": np.int32, "int64": np.int64} + def main(): args = get_args() @@ -46,7 +57,7 @@ def main(): seen = set() good = 0 for j in results: - idx = j['qsl_idx'] + idx = j["qsl_idx"] # de-dupe in case loadgen sends the same image multiple times if idx in seen: @@ -57,7 +68,7 @@ def main(): img, label = imagenet[idx] # reconstruct label from mlperf accuracy log - data = np.frombuffer(bytes.fromhex(j['data']), dtype_map[args.dtype]) + data = np.frombuffer(bytes.fromhex(j["data"]), dtype_map[args.dtype]) found = int(data[0]) if label == found: good += 1 @@ -65,7 +76,11 @@ def main(): if args.verbose: print("{}, expected: {}, found {}".format(img, label, found)) - print("accuracy={:.3f}%, good={}, total={}".format(100. 
* good / len(seen), good, len(seen))) + print( + "accuracy={:.3f}%, good={}, total={}".format( + 100.0 * good / len(seen), good, len(seen) + ) + ) if args.verbose: print("found and ignored {} dupes".format(len(results) - len(seen))) diff --git a/retired_benchmarks/vision/classification_and_detection/tools/calibrate_torchvision_model.py b/retired_benchmarks/vision/classification_and_detection/tools/calibrate_torchvision_model.py index 815e5fe20..3b002003a 100644 --- a/retired_benchmarks/vision/classification_and_detection/tools/calibrate_torchvision_model.py +++ b/retired_benchmarks/vision/classification_and_detection/tools/calibrate_torchvision_model.py @@ -12,12 +12,13 @@ class CalibrationDataset(Dataset): def __init__(self, root, files, transform): - with open(files, 'r') as f: - self.files = [os.path.join(root, fn.strip()) for fn in f.readlines()] + with open(files, "r") as f: + self.files = [os.path.join(root, fn.strip()) + for fn in f.readlines()] self.transform = transform def __getitem__(self, idx): - image = Image.open(self.files[idx]).convert('RGB') + image = Image.open(self.files[idx]).convert("RGB") image = self.transform(image) return image @@ -25,7 +26,7 @@ def __len__(self): return len(self.files) -def quantize_model(model, dataloader, backend='fbgemm'): +def quantize_model(model, dataloader, backend="fbgemm"): if backend not in torch.backends.quantized.supported_engines: raise RuntimeError("Quantized backend not supported ") torch.backends.quantized.engine = backend @@ -36,29 +37,40 @@ def quantize_model(model, dataloader, backend='fbgemm'): # Make sure that weight qconfig matches that of the serialized models model.qconfig = torch.quantization.get_default_qconfig(backend) torch.quantization.prepare(model, inplace=True) - print('calibrating...') + print("calibrating...") for x in tqdm(dataloader): model(x) - print('calibration DONE!') + print("calibration DONE!") torch.quantization.convert(model, inplace=True) def main(): parser = argparse.ArgumentParser() - parser.add_argument('--model', type=str, default='resnet50') - parser.add_argument('--image-dir', type=str, default='imagenet/val') - parser.add_argument('--image-list', type=str, default='../../calibration/ImageNet/cal_image_list_option_1.txt') + parser.add_argument("--model", type=str, default="resnet50") + parser.add_argument("--image-dir", type=str, default="imagenet/val") + parser.add_argument( + "--image-list", + type=str, + default="../../calibration/ImageNet/cal_image_list_option_1.txt", + ) args = parser.parse_args() print(args) - transform = transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ]) - - dataset = CalibrationDataset(root=args.image_dir, files=args.image_list, transform=transform) + transform = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[ + 0.485, 0.456, 0.406], std=[ + 0.229, 0.224, 0.225]), + ] + ) + + dataset = CalibrationDataset( + root=args.image_dir, files=args.image_list, transform=transform + ) dataloader = DataLoader(dataset, batch_size=1) model = eval(args.model)(pretrained=True, progress=True, quantize=False) @@ -67,11 +79,10 @@ def main(): inp = torch.rand(1, 3, 224, 224) script_module = torch.jit.trace(model, inp) - save_path = f'{args.model}.pt' + save_path = f"{args.model}.pt" torch.jit.save(script_module, save_path) - print(f'saved: {save_path}') + print(f"saved: 
{save_path}") -if __name__=='__main__': +if __name__ == "__main__": main() - diff --git a/retired_benchmarks/vision/classification_and_detection/tools/coco-analyze.py b/retired_benchmarks/vision/classification_and_detection/tools/coco-analyze.py index 7f22daa84..f58677f9d 100755 --- a/retired_benchmarks/vision/classification_and_detection/tools/coco-analyze.py +++ b/retired_benchmarks/vision/classification_and_detection/tools/coco-analyze.py @@ -36,7 +36,7 @@ def annotate_image(results, cocoGt, output): new_results = collections.defaultdict(list) for result in results: - new_results[result['image_id']].append(result) + new_results[result["image_id"]].append(result) print("Unique images = {}".format(len(new_results))) results = new_results @@ -44,32 +44,40 @@ def annotate_image(results, cocoGt, output): draw = None image = None for v in result: - box = v['bbox'] - score = v['score'] + box = v["bbox"] + score = v["score"] predicted_class = v["category_id"] try: predicted_class = cocoGt.loadCats(predicted_class)[0]["name"] except Exception as ex: - print("category {} not found, image {}".format(predicted_class, v["image_loc"])) + print( + "category {} not found, image {}".format( + predicted_class, v["image_loc"] + ) + ) # predicted_class = self.class_names[c] # "image_loc": "/home/gs/data/coco300/val2017/000000397133.jpg", if not draw: - image = Image.open(v['image_loc']) - if image.mode != 'RGB': - image = image.convert('RGB') + image = Image.open(v["image_loc"]) + if image.mode != "RGB": + image = image.convert("RGB") draw = ImageDraw.Draw(image) # font = ImageFont.truetype(font='FreeMono.ttf', - # size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32')) + # size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32')) try: left, top, w, h = box bottom = top + h right = left + w - top = max(0, np.floor(top + 0.5).astype('int32')) - left = max(0, np.floor(left + 0.5).astype('int32')) - bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32')) - right = min(image.size[0], np.floor(right + 0.5).astype('int32')) - label = '{} {:.2f}'.format(predicted_class, score) + top = max(0, np.floor(top + 0.5).astype("int32")) + left = max(0, np.floor(left + 0.5).astype("int32")) + bottom = min( + image.size[1], np.floor( + bottom + 0.5).astype("int32")) + right = min( + image.size[0], np.floor( + right + 0.5).astype("int32")) + label = "{} {:.2f}".format(predicted_class, score) # label_size = draw.textsize(label, font) label_size = draw.textsize(label) @@ -80,11 +88,19 @@ def annotate_image(results, cocoGt, output): color = ImageColor.getrgb("red") thickness = 0 - draw.rectangle([left + thickness, top + thickness, right - thickness, bottom - thickness], outline=color) + draw.rectangle( + [ + left + thickness, + top + thickness, + right - thickness, + bottom - thickness, + ], + outline=color, + ) draw.text(text_origin, label, fill=color) # , font=font) except Exception as ex: - print("{} failed, ex {}".format(v['image_loc'], ex)) - image.save(os.path.join(output, os.path.basename(v['image_loc']))) + print("{} failed, ex {}".format(v["image_loc"], ex)) + image.save(os.path.join(output, os.path.basename(v["image_loc"]))) del draw @@ -93,7 +109,7 @@ def calculate_map(results, cocoGt, output): # x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]] cocoDt = cocoGt.loadRes(results) - cocoEval = COCOeval(cocoGt, cocoDt, iouType='bbox') + cocoEval = COCOeval(cocoGt, cocoDt, iouType="bbox") cocoEval.evaluate() cocoEval.accumulate() cocoEval.summarize() @@ -110,11 +126,11 @@ def 
calculate_map(results, cocoGt, output): "DetectionBoxes_Recall/AR@100": cocoEval.stats[8], "DetectionBoxes_Recall/AR@100 (small)": cocoEval.stats[9], "DetectionBoxes_Recall/AR@100 (medium)": cocoEval.stats[10], - "DetectionBoxes_Recall/AR@100 (large)": cocoEval.stats[11] + "DetectionBoxes_Recall/AR@100 (large)": cocoEval.stats[11], } - mAP = all_metrics['DetectionBoxes_Precision/mAP'] - recall = all_metrics['DetectionBoxes_Recall/AR@100'] + mAP = all_metrics["DetectionBoxes_Precision/mAP"] + recall = all_metrics["DetectionBoxes_Recall/AR@100"] print("mAP={}, recall={}".format(mAP, recall)) @@ -124,7 +140,8 @@ def main(): with open(args.input, "r") as f: results = json.load(f) - annotation_file = os.path.join(args.coco, "annotations/instances_val2017.json") + annotation_file = os.path.join( + args.coco, "annotations/instances_val2017.json") cocoGt = COCO(annotation_file) annotate_image(results, cocoGt, args.output) calculate_map(args.input, cocoGt, args.output) diff --git a/retired_benchmarks/vision/classification_and_detection/tools/lglog2csv.py b/retired_benchmarks/vision/classification_and_detection/tools/lglog2csv.py index 901bd795d..6f32433f2 100644 --- a/retired_benchmarks/vision/classification_and_detection/tools/lglog2csv.py +++ b/retired_benchmarks/vision/classification_and_detection/tools/lglog2csv.py @@ -36,32 +36,43 @@ def main(): with open(args.input, "r") as fp: mode, mean, latency_90, latency_99, qps = None, 0, 0, 0, 0 for line in fp: - m = re.match("^Scenario\s*:\s*(\w+).*", line) + m = re.match("^Scenario\\s*:\\s*(\\w+).*", line) if m: mode = m.group(1) - m = re.match("^90.00 percentile latency.*:\s*(\d+).*", line) + m = re.match("^90.00 percentile latency.*:\\s*(\\d+).*", line) if m: latency_90 = m.group(1) - m = re.match("^99.00 percentile latency.*:\s*(\d+).*", line) + m = re.match("^99.00 percentile latency.*:\\s*(\\d+).*", line) if m: latency_99 = m.group(1) - m = re.match("^Mean latency.*:\s*(\d+).*", line) + m = re.match("^Mean latency.*:\\s*(\\d+).*", line) if m: mean = m.group(1) - m = re.match("^Completed samples per second.*:\s*(\d+).*", line) + m = re.match("^Completed samples per second.*:\\s*(\\d+).*", line) if m: qps = m.group(1) - m = re.match("^QPS w/ loadgen overhead.*:\s*(\d+).*", line) + m = re.match("^QPS w/ loadgen overhead.*:\\s*(\\d+).*", line) if m: qps = m.group(1) - m = re.match("^Samples per second.*:\s*(\d+).*", line) + m = re.match("^Samples per second.*:\\s*(\\d+).*", line) if m: qps = m.group(1) m = re.match("Test Parameters Used.*", line) if m: - print("{},{},{},{},{},{},{},{},{},{}".format( - args.name, now, args.machine, args.runtime, args.model, - mode, qps, mean, latency_90, latency_99)) + print( + "{},{},{},{},{},{},{},{},{},{}".format( + args.name, + now, + args.machine, + args.runtime, + args.model, + mode, + qps, + mean, + latency_90, + latency_99, + ) + ) mode, mean, latency_90, latency_99, qps = None, 0, 0, 0, 0 diff --git a/retired_benchmarks/vision/classification_and_detection/tools/resnet_save.py b/retired_benchmarks/vision/classification_and_detection/tools/resnet_save.py index fca66ea26..0eaa0ef58 100755 --- a/retired_benchmarks/vision/classification_and_detection/tools/resnet_save.py +++ b/retired_benchmarks/vision/classification_and_detection/tools/resnet_save.py @@ -43,256 +43,308 @@ from official.utils.misc import model_helpers - - def image_bytes_serving_input_fn(image_shape, dtype=tf.float32): - """Serving input fn for raw jpeg images.""" - - def _preprocess_image(image_bytes): - """Preprocess a single raw image.""" - # 
Bounding box around the whole image. - bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=dtype, shape=[1, 1, 4]) - height, width, num_channels = image_shape - image = imagenet_preprocessing.preprocess_image( - image_bytes, bbox, height, width, num_channels, is_training=False) - return image - - image_bytes_list = tf.placeholder( - shape=[None], dtype=tf.string, name='input_tensor') - images = tf.map_fn( - _preprocess_image, image_bytes_list, back_prop=False, dtype=dtype) - return tf.estimator.export.TensorServingInputReceiver( - images, {'image_bytes': image_bytes_list}) - - - - -def resnet_model_fn(features, labels, mode, model_class, - resnet_size, weight_decay, learning_rate_fn, momentum, - data_format, resnet_version, loss_scale, - loss_filter_fn=None, dtype=resnet_model.DEFAULT_DTYPE, - fine_tune=False): - """Shared functionality for different resnet model_fns. - - Initializes the ResnetModel representing the model layers - and uses that model to build the necessary EstimatorSpecs for - the `mode` in question. For training, this means building losses, - the optimizer, and the train op that get passed into the EstimatorSpec. - For evaluation and prediction, the EstimatorSpec is returned without - a train op, but with the necessary parameters for the given mode. - - Args: - features: tensor representing input images - labels: tensor representing class labels for all input images - mode: current estimator mode; should be one of - `tf.estimator.ModeKeys.TRAIN`, `EVALUATE`, `PREDICT` - model_class: a class representing a TensorFlow model that has a __call__ - function. We assume here that this is a subclass of ResnetModel. - resnet_size: A single integer for the size of the ResNet model. - weight_decay: weight decay loss rate used to regularize learned variables. - learning_rate_fn: function that returns the current learning rate given - the current global_step - momentum: momentum term used for optimization - data_format: Input format ('channels_last', 'channels_first', or None). - If set to None, the format is dependent on whether a GPU is available. - resnet_version: Integer representing which version of the ResNet network to - use. See README for details. Valid values: [1, 2] - loss_scale: The factor to scale the loss for numerical stability. A detailed - summary is present in the arg parser help text. - loss_filter_fn: function that takes a string variable name and returns - True if the var should be included in loss calculation, and False - otherwise. If None, batch_normalization variables will be excluded - from the loss. - dtype: the TensorFlow dtype to use for calculations. - fine_tune: If True only train the dense layers(final layers). - - Returns: - EstimatorSpec parameterized according to the input params and the - current mode. - """ - - model = model_class(resnet_size, data_format, resnet_version=resnet_version, - dtype=dtype) - - logits = model(features, mode == tf.estimator.ModeKeys.TRAIN) - - # This acts as a no-op if the logits are already in fp32 (provided logits are - # not a SparseTensor). If dtype is is low precision, logits must be cast to - # fp32 for numerical stability. 
- logits = tf.cast(logits, tf.float32) - - predictions = { - 'classes': tf.argmax(logits, axis=1), - 'probabilities': tf.nn.softmax(logits, name='softmax_tensor') - } - - if mode == tf.estimator.ModeKeys.PREDICT: - # Return the predictions and the specification for serving a SavedModel + """Serving input fn for raw jpeg images.""" + + def _preprocess_image(image_bytes): + """Preprocess a single raw image.""" + # Bounding box around the whole image. + bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=dtype, shape=[1, 1, 4]) + height, width, num_channels = image_shape + image = imagenet_preprocessing.preprocess_image( + image_bytes, bbox, height, width, num_channels, is_training=False + ) + return image + + image_bytes_list = tf.placeholder( + shape=[None], dtype=tf.string, name="input_tensor" + ) + images = tf.map_fn( + _preprocess_image, image_bytes_list, back_prop=False, dtype=dtype + ) + return tf.estimator.export.TensorServingInputReceiver( + images, {"image_bytes": image_bytes_list} + ) + + +def resnet_model_fn( + features, + labels, + mode, + model_class, + resnet_size, + weight_decay, + learning_rate_fn, + momentum, + data_format, + resnet_version, + loss_scale, + loss_filter_fn=None, + dtype=resnet_model.DEFAULT_DTYPE, + fine_tune=False, +): + """Shared functionality for different resnet model_fns. + + Initializes the ResnetModel representing the model layers + and uses that model to build the necessary EstimatorSpecs for + the `mode` in question. For training, this means building losses, + the optimizer, and the train op that get passed into the EstimatorSpec. + For evaluation and prediction, the EstimatorSpec is returned without + a train op, but with the necessary parameters for the given mode. + + Args: + features: tensor representing input images + labels: tensor representing class labels for all input images + mode: current estimator mode; should be one of + `tf.estimator.ModeKeys.TRAIN`, `EVALUATE`, `PREDICT` + model_class: a class representing a TensorFlow model that has a __call__ + function. We assume here that this is a subclass of ResnetModel. + resnet_size: A single integer for the size of the ResNet model. + weight_decay: weight decay loss rate used to regularize learned variables. + learning_rate_fn: function that returns the current learning rate given + the current global_step + momentum: momentum term used for optimization + data_format: Input format ('channels_last', 'channels_first', or None). + If set to None, the format is dependent on whether a GPU is available. + resnet_version: Integer representing which version of the ResNet network to + use. See README for details. Valid values: [1, 2] + loss_scale: The factor to scale the loss for numerical stability. A detailed + summary is present in the arg parser help text. + loss_filter_fn: function that takes a string variable name and returns + True if the var should be included in loss calculation, and False + otherwise. If None, batch_normalization variables will be excluded + from the loss. + dtype: the TensorFlow dtype to use for calculations. + fine_tune: If True only train the dense layers(final layers). + + Returns: + EstimatorSpec parameterized according to the input params and the + current mode. + """ + + model = model_class( + resnet_size, data_format, resnet_version=resnet_version, dtype=dtype + ) + + logits = model(features, mode == tf.estimator.ModeKeys.TRAIN) + + # This acts as a no-op if the logits are already in fp32 (provided logits are + # not a SparseTensor). 
If dtype is is low precision, logits must be cast to + # fp32 for numerical stability. + logits = tf.cast(logits, tf.float32) + + predictions = { + "classes": tf.argmax(logits, axis=1), + "probabilities": tf.nn.softmax(logits, name="softmax_tensor"), + } + + if mode == tf.estimator.ModeKeys.PREDICT: + # Return the predictions and the specification for serving a SavedModel + return tf.estimator.EstimatorSpec( + mode=mode, + predictions=predictions, + export_outputs={ + "predict": tf.estimator.export.PredictOutput(predictions)}, + ) + + # Calculate loss, which includes softmax cross entropy and L2 + # regularization. + cross_entropy = tf.losses.sparse_softmax_cross_entropy( + logits=logits, labels=labels) + + # Create a tensor named cross_entropy for logging purposes. + tf.identity(cross_entropy, name="cross_entropy") + + # If no loss_filter_fn is passed, assume we want the default behavior, + # which is that batch_normalization variables are excluded from loss. + def exclude_batch_norm(name): + return "batch_normalization" not in name + + loss_filter_fn = loss_filter_fn or exclude_batch_norm + + # Add weight decay to the loss. + l2_loss = weight_decay * tf.add_n( + # loss is computed using fp32 for numerical stability. + [ + tf.nn.l2_loss(tf.cast(v, tf.float32)) + for v in tf.trainable_variables() + if loss_filter_fn(v.name) + ] + ) + tf.summary.scalar("l2_loss", l2_loss) + loss = cross_entropy + l2_loss + return tf.estimator.EstimatorSpec( mode=mode, predictions=predictions, - export_outputs={ - 'predict': tf.estimator.export.PredictOutput(predictions) - }) - - # Calculate loss, which includes softmax cross entropy and L2 regularization. - cross_entropy = tf.losses.sparse_softmax_cross_entropy( - logits=logits, labels=labels) - - # Create a tensor named cross_entropy for logging purposes. - tf.identity(cross_entropy, name='cross_entropy') - - # If no loss_filter_fn is passed, assume we want the default behavior, - # which is that batch_normalization variables are excluded from loss. - def exclude_batch_norm(name): - return 'batch_normalization' not in name - loss_filter_fn = loss_filter_fn or exclude_batch_norm - - # Add weight decay to the loss. - l2_loss = weight_decay * tf.add_n( - # loss is computed using fp32 for numerical stability. - [tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables() - if loss_filter_fn(v.name)]) - tf.summary.scalar('l2_loss', l2_loss) - loss = cross_entropy + l2_loss - - return tf.estimator.EstimatorSpec( - mode=mode, - predictions=predictions, - loss=loss, - train_op=train_op, - eval_metric_ops=metrics) - - -def resnet_main( - flags_obj, model_function, input_function, dataset_name, shape=None): - """Shared main loop for ResNet Models. - - Args: - flags_obj: An object containing parsed flags. See define_resnet_flags() - for details. - model_function: the function that instantiates the Model and builds the - ops for train/eval. This will be passed directly into the estimator. - input_function: the function that processes the dataset and returns a - dataset that the estimator can train on. This will be wrapped with - all the relevant flags for running and passed to estimator. - dataset_name: the name of the dataset for training and evaluation. This is - used for logging purpose. - shape: list of ints representing the shape of the images used for training. - This is only used if flags_obj.export_dir is passed. 
- """ - - print("RESNET MAIN") - model_helpers.apply_clean(flags.FLAGS) - - # Ensures flag override logic is only executed if explicitly triggered. - if flags_obj.tf_gpu_thread_mode: - override_flags_and_set_envars_for_gpu_thread_pool(flags_obj) - - # Creates session config. allow_soft_placement = True, is required for - # multi-GPU and is not harmful for other modes. - session_config = tf.ConfigProto(allow_soft_placement=True) - - run_config = tf.estimator.RunConfig( - session_config=session_config, - save_checkpoints_secs=60*60*24) - - # Initializes model with all but the dense layer from pretrained ResNet. - if flags_obj.pretrained_model_checkpoint_path is not None: - warm_start_settings = tf.estimator.WarmStartSettings( - flags_obj.pretrained_model_checkpoint_path, - vars_to_warm_start='^(?!.*dense)') - else: - warm_start_settings = None - - classifier = tf.estimator.Estimator( - model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, - warm_start_from=warm_start_settings, params={ - 'resnet_size': int(flags_obj.resnet_size), - 'data_format': flags_obj.data_format, - 'batch_size': flags_obj.batch_size, - 'resnet_version': int(flags_obj.resnet_version), - 'loss_scale': flags_core.get_loss_scale(flags_obj), - 'dtype': flags_core.get_tf_dtype(flags_obj), - 'fine_tune': flags_obj.fine_tune - }) - - run_params = { - 'batch_size': flags_obj.batch_size, - 'dtype': flags_core.get_tf_dtype(flags_obj), - 'resnet_size': flags_obj.resnet_size, - 'resnet_version': flags_obj.resnet_version, - 'synthetic_data': flags_obj.use_synthetic_data, - 'train_epochs': flags_obj.train_epochs, - } - - def input_fn_eval(): - return input_function( - is_training=False, - data_dir=flags_obj.data_dir, - batch_size=distribution_utils.per_device_batch_size( - flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), - num_epochs=1, - dtype=flags_core.get_tf_dtype(flags_obj)) - - schedule, n_loops = [0], 1 - if flags_obj.export_dir is not None: - # Exports a saved model for the given classifier. - export_dtype = flags_core.get_tf_dtype(flags_obj) - if flags_obj.image_bytes_as_serving_input: - input_receiver_fn = functools.partial( - image_bytes_serving_input_fn, shape, dtype=export_dtype) + loss=loss, + train_op=train_op, + eval_metric_ops=metrics, + ) + + +def resnet_main(flags_obj, model_function, + input_function, dataset_name, shape=None): + """Shared main loop for ResNet Models. + + Args: + flags_obj: An object containing parsed flags. See define_resnet_flags() + for details. + model_function: the function that instantiates the Model and builds the + ops for train/eval. This will be passed directly into the estimator. + input_function: the function that processes the dataset and returns a + dataset that the estimator can train on. This will be wrapped with + all the relevant flags for running and passed to estimator. + dataset_name: the name of the dataset for training and evaluation. This is + used for logging purpose. + shape: list of ints representing the shape of the images used for training. + This is only used if flags_obj.export_dir is passed. + """ + + print("RESNET MAIN") + model_helpers.apply_clean(flags.FLAGS) + + # Ensures flag override logic is only executed if explicitly triggered. + if flags_obj.tf_gpu_thread_mode: + override_flags_and_set_envars_for_gpu_thread_pool(flags_obj) + + # Creates session config. allow_soft_placement = True, is required for + # multi-GPU and is not harmful for other modes. 
+ session_config = tf.ConfigProto(allow_soft_placement=True) + + run_config = tf.estimator.RunConfig( + session_config=session_config, save_checkpoints_secs=60 * 60 * 24 + ) + + # Initializes model with all but the dense layer from pretrained ResNet. + if flags_obj.pretrained_model_checkpoint_path is not None: + warm_start_settings = tf.estimator.WarmStartSettings( + flags_obj.pretrained_model_checkpoint_path, + vars_to_warm_start="^(?!.*dense)", + ) else: - input_receiver_fn = export.build_tensor_serving_input_receiver_fn( - shape, batch_size=flags_obj.batch_size, dtype=export_dtype) - classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn, - strip_default_attrs=True) + warm_start_settings = None + + classifier = tf.estimator.Estimator( + model_fn=model_function, + model_dir=flags_obj.model_dir, + config=run_config, + warm_start_from=warm_start_settings, + params={ + "resnet_size": int(flags_obj.resnet_size), + "data_format": flags_obj.data_format, + "batch_size": flags_obj.batch_size, + "resnet_version": int(flags_obj.resnet_version), + "loss_scale": flags_core.get_loss_scale(flags_obj), + "dtype": flags_core.get_tf_dtype(flags_obj), + "fine_tune": flags_obj.fine_tune, + }, + ) + + run_params = { + "batch_size": flags_obj.batch_size, + "dtype": flags_core.get_tf_dtype(flags_obj), + "resnet_size": flags_obj.resnet_size, + "resnet_version": flags_obj.resnet_version, + "synthetic_data": flags_obj.use_synthetic_data, + "train_epochs": flags_obj.train_epochs, + } + + def input_fn_eval(): + return input_function( + is_training=False, + data_dir=flags_obj.data_dir, + batch_size=distribution_utils.per_device_batch_size( + flags_obj.batch_size, flags_core.get_num_gpus(flags_obj) + ), + num_epochs=1, + dtype=flags_core.get_tf_dtype(flags_obj), + ) + + schedule, n_loops = [0], 1 + if flags_obj.export_dir is not None: + # Exports a saved model for the given classifier. + export_dtype = flags_core.get_tf_dtype(flags_obj) + if flags_obj.image_bytes_as_serving_input: + input_receiver_fn = functools.partial( + image_bytes_serving_input_fn, shape, dtype=export_dtype + ) + else: + input_receiver_fn = export.build_tensor_serving_input_receiver_fn( + shape, batch_size=flags_obj.batch_size, dtype=export_dtype + ) + classifier.export_savedmodel( + flags_obj.export_dir, input_receiver_fn, strip_default_attrs=True + ) def define_resnet_flags(resnet_size_choices=None): - """Add flags and validators for ResNet.""" - flags_core.define_base() - flags_core.define_performance(num_parallel_calls=False, - tf_gpu_thread_mode=True, - datasets_num_private_threads=True, - datasets_num_parallel_batches=True) - flags_core.define_image() - flags_core.define_benchmark() - flags.adopt_module_key_flags(flags_core) - - flags.DEFINE_enum( - name='resnet_version', short_name='rv', default='1', - enum_values=['1', '2'], - help=flags_core.help_wrap( - 'Version of ResNet. 
(1 or 2) See README.md for details.')) - flags.DEFINE_bool( - name='fine_tune', short_name='ft', default=False, - help=flags_core.help_wrap( - 'If True do not train any parameters except for the final layer.')) - flags.DEFINE_string( - name='pretrained_model_checkpoint_path', short_name='pmcp', default=None, - help=flags_core.help_wrap( - 'If not None initialize all the network except the final layer with ' - 'these values')) - flags.DEFINE_boolean( - name='eval_only', default=False, - help=flags_core.help_wrap('Skip training and only perform evaluation on ' - 'the latest checkpoint.')) - flags.DEFINE_boolean( - name='image_bytes_as_serving_input', default=False, - help=flags_core.help_wrap( - 'If True exports savedmodel with serving signature that accepts ' - 'JPEG image bytes instead of a fixed size [HxWxC] tensor that ' - 'represents the image. The former is easier to use for serving at ' - 'the expense of image resize/cropping being done as part of model ' - 'inference. Note, this flag only applies to ImageNet and cannot ' - 'be used for CIFAR.')) - - choice_kwargs = dict( - name='resnet_size', short_name='rs', default='50', - help=flags_core.help_wrap('The size of the ResNet model to use.')) - - if resnet_size_choices is None: - flags.DEFINE_string(**choice_kwargs) - else: - flags.DEFINE_enum(enum_values=resnet_size_choices, **choice_kwargs) + """Add flags and validators for ResNet.""" + flags_core.define_base() + flags_core.define_performance( + num_parallel_calls=False, + tf_gpu_thread_mode=True, + datasets_num_private_threads=True, + datasets_num_parallel_batches=True, + ) + flags_core.define_image() + flags_core.define_benchmark() + flags.adopt_module_key_flags(flags_core) + + flags.DEFINE_enum( + name="resnet_version", + short_name="rv", + default="1", + enum_values=["1", "2"], + help=flags_core.help_wrap( + "Version of ResNet. (1 or 2) See README.md for details." + ), + ) + flags.DEFINE_bool( + name="fine_tune", + short_name="ft", + default=False, + help=flags_core.help_wrap( + "If True do not train any parameters except for the final layer." + ), + ) + flags.DEFINE_string( + name="pretrained_model_checkpoint_path", + short_name="pmcp", + default=None, + help=flags_core.help_wrap( + "If not None initialize all the network except the final layer with " + "these values" + ), + ) + flags.DEFINE_boolean( + name="eval_only", + default=False, + help=flags_core.help_wrap( + "Skip training and only perform evaluation on " "the latest checkpoint." + ), + ) + flags.DEFINE_boolean( + name="image_bytes_as_serving_input", + default=False, + help=flags_core.help_wrap( + "If True exports savedmodel with serving signature that accepts " + "JPEG image bytes instead of a fixed size [HxWxC] tensor that " + "represents the image. The former is easier to use for serving at " + "the expense of image resize/cropping being done as part of model " + "inference. Note, this flag only applies to ImageNet and cannot " + "be used for CIFAR." 
+ ), + ) + + choice_kwargs = dict( + name="resnet_size", + short_name="rs", + default="50", + help=flags_core.help_wrap("The size of the ResNet model to use."), + ) + + if resnet_size_choices is None: + flags.DEFINE_string(**choice_kwargs) + else: + flags.DEFINE_enum(enum_values=resnet_size_choices, **choice_kwargs) diff --git a/retired_benchmarks/vision/classification_and_detection/tools/ssd-nhwc.py b/retired_benchmarks/vision/classification_and_detection/tools/ssd-nhwc.py index f255fd965..d7ef9599e 100644 --- a/retired_benchmarks/vision/classification_and_detection/tools/ssd-nhwc.py +++ b/retired_benchmarks/vision/classification_and_detection/tools/ssd-nhwc.py @@ -4,33 +4,39 @@ import tensorflow as tf from tensorflow.core.framework import graph_pb2 + def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('pbfile') + parser.add_argument("pbfile") return parser.parse_args() + def insert_transpose(graph, a, b, to_nchw): if not isinstance(b, list): b = [b] trans_perm = graph.node.add() - trans_perm.name = a.name + '/transpose/perm' - trans_perm.op = 'Const' - trans_perm.attr['dtype'].type = 3 # DT_INT32 - trans_perm.attr['value'].tensor.dtype = 3 # DT_INT32 - trans_perm.attr['value'].tensor.tensor_shape.dim.add() - trans_perm.attr['value'].tensor.tensor_shape.dim[0].size = 4 + trans_perm.name = a.name + "/transpose/perm" + trans_perm.op = "Const" + trans_perm.attr["dtype"].type = 3 # DT_INT32 + trans_perm.attr["value"].tensor.dtype = 3 # DT_INT32 + trans_perm.attr["value"].tensor.tensor_shape.dim.add() + trans_perm.attr["value"].tensor.tensor_shape.dim[0].size = 4 if to_nchw: - trans_perm.attr['value'].tensor.tensor_content = b'\000\000\000\000\003\000\000\000\001\000\000\000\002\000\000\000' + trans_perm.attr["value"].tensor.tensor_content = ( + b"\000\000\000\000\003\000\000\000\001\000\000\000\002\000\000\000" + ) else: - trans_perm.attr['value'].tensor.tensor_content = b'\000\000\000\000\002\000\000\000\003\000\000\000\001\000\000\000' - + trans_perm.attr["value"].tensor.tensor_content = ( + b"\000\000\000\000\002\000\000\000\003\000\000\000\001\000\000\000" + ) + trans = graph.node.add() - trans.name = a.name + '/transpose' - trans.op = 'Transpose' + trans.name = a.name + "/transpose" + trans.op = "Transpose" trans.input.append(a.name) trans.input.append(trans_perm.name) - trans.attr['T'].type = 1 - trans.attr['Tperm'].type = 3 + trans.attr["T"].type = 1 + trans.attr["Tperm"].type = 3 for n in b: inputs = [] @@ -45,6 +51,7 @@ def insert_transpose(graph, a, b, to_nchw): for i in range(0, cnt): n.input.append(inputs[i]) + def convert_list_nhwc(l): c = l.i[1] h = l.i[2] @@ -52,102 +59,116 @@ def convert_list_nhwc(l): l.i[1] = h l.i[2] = w l.i[3] = c - + + def convert_conv_nhwc(node_conv): - node_conv.attr['data_format'].s = b'NHWC' - convert_list_nhwc(node_conv.attr['dilations'].list) - convert_list_nhwc(node_conv.attr['strides'].list) + node_conv.attr["data_format"].s = b"NHWC" + convert_list_nhwc(node_conv.attr["dilations"].list) + convert_list_nhwc(node_conv.attr["strides"].list) + def convert_general_nhwc(node): - node.attr['data_format'].s = b'NHWC' + node.attr["data_format"].s = b"NHWC" + def convert_mp_nhwc(node_mp): - node_mp.attr['data_format'].s = b'NHWC' - convert_list_nhwc(node_mp.attr['ksize'].list) - convert_list_nhwc(node_mp.attr['strides'].list) + node_mp.attr["data_format"].s = b"NHWC" + convert_list_nhwc(node_mp.attr["ksize"].list) + convert_list_nhwc(node_mp.attr["strides"].list) + def convert_image_nhwc(node_image): - c = 
node_image.attr['shape'].shape.dim[1].size - del node_image.attr['shape'].shape.dim[1] - d = node_image.attr['shape'].shape.dim.add() + c = node_image.attr["shape"].shape.dim[1].size + del node_image.attr["shape"].shape.dim[1] + d = node_image.attr["shape"].shape.dim.add() d.size = c + def init_node(n): node = {} - node['node'] = n - node['inputs'] = [] - node['outputs'] = [] + node["node"] = n + node["inputs"] = [] + node["outputs"] = [] return node + def connect_nodes(n1, n2): - if n2['node'].name not in n1['outputs']: - n1['outputs'].append(n2['node'].name) - n2['inputs'].append(n1['node'].name) + if n2["node"].name not in n1["outputs"]: + n1["outputs"].append(n2["node"].name) + n2["inputs"].append(n1["node"].name) else: - print('{} -> {} already connected'.format(n1['node'].name, n2['node'].name)) + print( + "{} -> {} already connected".format(n1["node"].name, n2["node"].name)) + def disconnect_nodes(n1, n2): - if n1['node'].name not in n2['inputs'] or n2['node'].name not in n1['outputs']: - print('{} -> {} not connected'.format(n1['node'].name, n2['node'].name)) - for i in range(0, len(n1['outputs'])): - if n1['outputs'][i] == n2['node'].name: - del n1['outputs'][i] + if n1["node"].name not in n2["inputs"] or n2["node"].name not in n1["outputs"]: + print( + "{} -> {} not connected".format(n1["node"].name, n2["node"].name)) + for i in range(0, len(n1["outputs"])): + if n1["outputs"][i] == n2["node"].name: + del n1["outputs"][i] break - for i in range(0, len(n2['inputs'])): - if n2['inputs'][i] == n1['node'].name: - del n2['inputs'][i] + for i in range(0, len(n2["inputs"])): + if n2["inputs"][i] == n1["node"].name: + del n2["inputs"][i] break - + + def build_graph(graph): node_map = {} for n in graph.node: node = init_node(n) node_map[n.name] = node for n in node_map: - for i in node_map[n]['node'].input: - if ':' in i: - i = i[:i.find(':')] - i = i.lstrip('^') + for i in node_map[n]["node"].input: + if ":" in i: + i = i[: i.find(":")] + i = i.lstrip("^") if i not in node_map: - print('node {} not found'.format(i)) + print("node {} not found".format(i)) else: connect_nodes(node_map[i], node_map[n]) return node_map + def trim_const_from_graph(node_map): trim_list = [] for n in node_map: - if node_map[n]['node'].op == 'Const': + if node_map[n]["node"].op == "Const": trim_list.append(n) for n in trim_list: - print('trimming {}'.format(n)) - for o in node_map[n]['outputs']: + print("trimming {}".format(n)) + for o in node_map[n]["outputs"]: disconnect_nodes(node_map[n], node_map[o]) del node_map[n] trim_list = [] for n in node_map: - if node_map[n]['node'].op == 'Identity' and len(node_map[n]['inputs']) == 0: + if node_map[n]["node"].op == "Identity" and len( + node_map[n]["inputs"]) == 0: trim_list.append(n) for n in trim_list: - print('trimming {}'.format(n)) - for o in node_map[n]['outputs']: + print("trimming {}".format(n)) + for o in node_map[n]["outputs"]: disconnect_nodes(node_map[n], node_map[o]) del node_map[n] def all_input_in_nhwc(n, node_map, nhwc_nodes): - for i in node_map[n]['inputs']: + for i in node_map[n]["inputs"]: if i not in nhwc_nodes: return False return True + def all_output_in_nhwc(n, node_map, nhwc_nodes): - for o in node_map[n]['outputs']: + for o in node_map[n]["outputs"]: if o not in nhwc_nodes: return False return True + def find_nhwc_region(node_map): transpose_nhwc_nodes = {} transpose_nchw_nodes = {} @@ -156,7 +177,7 @@ def find_nhwc_region(node_map): transpose_nhwc_nodes_append_list = [] transpose_nchw_nodes_append_list = [] for n in node_map: - if 
node_map[n]['node'].op == 'Conv2D': + if node_map[n]["node"].op == "Conv2D": transpose_nhwc_nodes_append_list.append(n) transpose_nchw_nodes_append_list.append(n) nhwc_nodes.append(n) @@ -168,12 +189,18 @@ def find_nhwc_region(node_map): transpose_nchw_nodes[n] = 1 prev_cnt_nhwc_nodes = len(nhwc_nodes) - nhwc_op_list = ['Conv2D', 'Relu', 'FusedBatchNorm', 'MaxPool', 'BiasAdd', 'Add'] + nhwc_op_list = [ + "Conv2D", + "Relu", + "FusedBatchNorm", + "MaxPool", + "BiasAdd", + "Add"] while True: transpose_nchw_nodes_append_list = [] for n in transpose_nchw_nodes: - for o in node_map[n]['outputs']: - if o not in nhwc_nodes and node_map[o]['node'].op in nhwc_op_list: + for o in node_map[n]["outputs"]: + if o not in nhwc_nodes and node_map[o]["node"].op in nhwc_op_list: if all_input_in_nhwc(o, node_map, nhwc_nodes): nhwc_nodes.append(o) if o not in transpose_nchw_nodes_append_list: @@ -182,15 +209,19 @@ def find_nhwc_region(node_map): transpose_nhwc_nodes_remove_list = [] transpose_nchw_nodes_remove_list = [] for n in transpose_nhwc_nodes: - if (all_input_in_nhwc(n, node_map, nhwc_nodes) and - n not in transpose_nhwc_nodes_remove_list): + if ( + all_input_in_nhwc(n, node_map, nhwc_nodes) + and n not in transpose_nhwc_nodes_remove_list + ): transpose_nhwc_nodes_remove_list.append(n) for n in transpose_nhwc_nodes_remove_list: del transpose_nhwc_nodes[n] for n in transpose_nchw_nodes: - if (all_output_in_nhwc(n, node_map, nhwc_nodes) and - n not in transpose_nchw_nodes_remove_list): + if ( + all_output_in_nhwc(n, node_map, nhwc_nodes) + and n not in transpose_nchw_nodes_remove_list + ): transpose_nchw_nodes_remove_list.append(n) for n in transpose_nchw_nodes_remove_list: del transpose_nchw_nodes[n] @@ -202,66 +233,81 @@ def find_nhwc_region(node_map): if len(nhwc_nodes) == prev_cnt_nhwc_nodes: break prev_cnt_nhwc_nodes = len(nhwc_nodes) - - print('\n\nTranspose to NHWC at nodes:') + + print("\n\nTranspose to NHWC at nodes:") for n in transpose_nhwc_nodes: - print(' {}'.format(n)) - - print('\n\nTranspose to NCHW at nodes:') + print(" {}".format(n)) + + print("\n\nTranspose to NCHW at nodes:") for n in transpose_nchw_nodes: - print(' {}'.format(n)) - + print(" {}".format(n)) + return nhwc_nodes, transpose_nhwc_nodes, transpose_nchw_nodes + def main(): args = get_args() graph = graph_pb2.GraphDef() - with open(args.pbfile, 'rb') as f: + with open(args.pbfile, "rb") as f: graph.ParseFromString(f.read()) node_map = build_graph(graph) trim_const_from_graph(node_map) - nhwc_nodes, transpose_nhwc_nodes, transpose_nchw_nodes = find_nhwc_region(node_map) + nhwc_nodes, transpose_nhwc_nodes, transpose_nchw_nodes = find_nhwc_region( + node_map) - nhwc_op_list = ['Conv2D', 'Relu', 'FusedBatchNorm', 'MaxPool', 'BiasAdd', 'Add'] + nhwc_op_list = [ + "Conv2D", + "Relu", + "FusedBatchNorm", + "MaxPool", + "BiasAdd", + "Add"] for n in nhwc_nodes: - if node_map[n]['node'].op == 'Conv2D': - convert_conv_nhwc(node_map[n]['node']) - elif node_map[n]['node'].op in ['FusedBatchNorm', 'BiasAdd']: - convert_general_nhwc(node_map[n]['node']) - elif node_map[n]['node'].op == 'MaxPool': - convert_mp_nhwc(node_map[n]['node']) - + if node_map[n]["node"].op == "Conv2D": + convert_conv_nhwc(node_map[n]["node"]) + elif node_map[n]["node"].op in ["FusedBatchNorm", "BiasAdd"]: + convert_general_nhwc(node_map[n]["node"]) + elif node_map[n]["node"].op == "MaxPool": + convert_mp_nhwc(node_map[n]["node"]) + done_nhwc = False if len(transpose_nhwc_nodes) == 1: for n in transpose_nhwc_nodes: - if len(node_map[n]['inputs']) == 1 and 
node_map[n]['inputs'][0] == 'image': + if len(node_map[n]["inputs"] + ) == 1 and node_map[n]["inputs"][0] == "image": image_outputs = [] - for o in node_map['image']['outputs']: + for o in node_map["image"]["outputs"]: if o != n: - image_outputs.append(node_map[o]['node']) - insert_transpose(graph, node_map['image']['node'], image_outputs, True) - convert_image_nhwc(node_map['image']['node']) + image_outputs.append(node_map[o]["node"]) + insert_transpose( + graph, + node_map["image"]["node"], + image_outputs, + True) + convert_image_nhwc(node_map["image"]["node"]) done_nhwc = True if not done_nhwc: for n in transpose_nhwc_nodes: - for i in node_map[n]['inputs']: + for i in node_map[n]["inputs"]: if i not in nhwc_nodes: - insert_transpose(graph, node_map[i]['node'], node_map[n]['node'], False) + insert_transpose( + graph, node_map[i]["node"], node_map[n]["node"], False + ) for n in transpose_nchw_nodes: node_outputs = [] - for o in node_map[n]['outputs']: + for o in node_map[n]["outputs"]: if o not in nhwc_nodes: - node_outputs.append(node_map[o]['node']) - insert_transpose(graph, node_map[n]['node'], node_outputs, True) + node_outputs.append(node_map[o]["node"]) + insert_transpose(graph, node_map[n]["node"], node_outputs, True) - with open(args.pbfile+'.patch', 'wb') as f: + with open(args.pbfile + ".patch", "wb") as f: f.write(graph.SerializeToString()) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/text_to_image/backend_pytorch.py b/text_to_image/backend_pytorch.py index 300d7e58b..1a45c1ca5 100644 --- a/text_to_image/backend_pytorch.py +++ b/text_to_image/backend_pytorch.py @@ -90,7 +90,7 @@ def load(self): # self.pipe.unet = torch.compile(self.pipe.unet, mode="reduce-overhead", fullgraph=True) self.pipe.to(self.device) - #self.pipe.set_progress_bar_config(disable=True) + # self.pipe.set_progress_bar_config(disable=True) self.negative_prompt_tokens = self.pipe.tokenizer( self.convert_prompt(self.negative_prompt, self.pipe.tokenizer), @@ -211,13 +211,15 @@ def encode_tokens( text_input_ids.to(device), output_hidden_states=True ) - # We are only ALWAYS interested in the pooled output of the final text encoder + # We are only ALWAYS interested in the pooled output of the + # final text encoder pooled_prompt_embeds = prompt_embeds[0] if clip_skip is None: prompt_embeds = prompt_embeds.hidden_states[-2] else: # "2" because SDXL always indexes from the penultimate layer. 
- prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] + prompt_embeds = prompt_embeds.hidden_states[-( + clip_skip + 2)] prompt_embeds_list.append(prompt_embeds) @@ -233,7 +235,8 @@ def encode_tokens( and zero_out_negative_prompt ): negative_prompt_embeds = torch.zeros_like(prompt_embeds) - negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like( + pooled_prompt_embeds) elif do_classifier_free_guidance and negative_prompt_embeds is None: negative_prompt = negative_prompt or "" negative_prompt_2 = negative_prompt_2 or negative_prompt @@ -260,30 +263,35 @@ def encode_tokens( uncond_input.to(device), output_hidden_states=True, ) - # We are only ALWAYS interested in the pooled output of the final text encoder + # We are only ALWAYS interested in the pooled output of the + # final text encoder negative_pooled_prompt_embeds = negative_prompt_embeds[0] negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] negative_prompt_embeds_list.append(negative_prompt_embeds) - negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + negative_prompt_embeds = torch.concat( + negative_prompt_embeds_list, dim=-1) if pipe.text_encoder_2 is not None: prompt_embeds = prompt_embeds.to( dtype=pipe.text_encoder_2.dtype, device=device ) else: - prompt_embeds = prompt_embeds.to(dtype=pipe.unet.dtype, device=device) + prompt_embeds = prompt_embeds.to( + dtype=pipe.unet.dtype, device=device) bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method + # duplicate text embeddings for each generation per prompt, using mps + # friendly method prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) prompt_embeds = prompt_embeds.view( bs_embed * num_images_per_prompt, seq_len, -1 ) if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + # duplicate unconditional embeddings for each generation per + # prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] if pipe.text_encoder_2 is not None: @@ -315,7 +323,7 @@ def encode_tokens( pooled_prompt_embeds, negative_pooled_prompt_embeds, ) - + def prepare_inputs(self, inputs, i): if self.batch_size == 1: return self.encode_tokens( @@ -330,7 +338,7 @@ def prepare_inputs(self, inputs, i): negative_prompt_embeds = [] pooled_prompt_embeds = [] negative_pooled_prompt_embeds = [] - for prompt in inputs[i:min(i+self.batch_size, len(inputs))]: + for prompt in inputs[i: min(i + self.batch_size, len(inputs))]: assert isinstance(prompt, dict) text_input = prompt["input_tokens"] text_input_2 = prompt["input_tokens_2"] @@ -351,18 +359,26 @@ def prepare_inputs(self, inputs, i): pooled_prompt_embeds.append(p_p_e) negative_pooled_prompt_embeds.append(n_p_p_e) - prompt_embeds = torch.cat(prompt_embeds) negative_prompt_embeds = torch.cat(negative_prompt_embeds) pooled_prompt_embeds = torch.cat(pooled_prompt_embeds) - negative_pooled_prompt_embeds = torch.cat(negative_pooled_prompt_embeds) - return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + negative_pooled_prompt_embeds = torch.cat( + negative_pooled_prompt_embeds) + return ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) def predict(self, inputs): images = [] with torch.no_grad(): for i in range(0, len(inputs), self.batch_size): - latents_input = 
[inputs[idx]["latents"] for idx in range(i, min(i+self.batch_size, len(inputs)))] + latents_input = [ + inputs[idx]["latents"] + for idx in range(i, min(i + self.batch_size, len(inputs))) + ] latents_input = torch.cat(latents_input).to(self.device) ( prompt_embeds, @@ -382,4 +398,3 @@ def predict(self, inputs): ).images images.extend(generated) return images - diff --git a/text_to_image/coco.py b/text_to_image/coco.py index b2c9d6dfc..cb3956a01 100644 --- a/text_to_image/coco.py +++ b/text_to_image/coco.py @@ -38,7 +38,8 @@ def __init__( **kwargs, ): super().__init__() - self.captions_df = pd.read_csv(f"{data_path}/captions/captions.tsv", sep="\t") + self.captions_df = pd.read_csv( + f"{data_path}/captions/captions.tsv", sep="\t") self.image_size = image_size self.preprocessed_dir = os.path.abspath(f"{data_path}/preprocessed/") self.img_dir = os.path.abspath(f"{data_path}/validation/data/") @@ -116,7 +117,10 @@ def get_item_count(self): return len(self.captions_df) def get_img(self, id): - img = Image.open(self.img_dir + "/" + self.captions_df.loc[id]["file_name"]) + img = Image.open( + self.img_dir + + "/" + + self.captions_df.loc[id]["file_name"]) return self.image_to_tensor(img) def get_imgs(self, id_list): @@ -137,7 +141,11 @@ def get_item_loc(self, id): class PostProcessCoco: def __init__( - self, device="cpu", dtype="uint8", statistics_path=os.path.join(os.path.dirname(__file__), "tools", "val2014.npz") + self, + device="cpu", + dtype="uint8", + statistics_path=os.path.join( + os.path.dirname(__file__), "tools", "val2014.npz"), ): self.results = [] self.good = 0 @@ -159,10 +167,12 @@ def add_results(self, results): def __call__(self, results, ids, expected=None, result_dict=None): self.content_ids.extend(ids) return [ - (t.cpu().permute(1, 2, 0).float().numpy() * 255).round().astype(self.numpy_dtype) + (t.cpu().permute(1, 2, 0).float().numpy() * 255) + .round() + .astype(self.numpy_dtype) for t in results ] - + def save_images(self, ids, ds): info = [] idx = {} @@ -195,7 +205,10 @@ def finalize(self, result_dict, ds=None, output_dir=None): 100 * clip.get_clip_score(caption, generated).item() ) - fid_score = compute_fid(self.results, self.statistics_path, self.device) + fid_score = compute_fid( + self.results, + self.statistics_path, + self.device) result_dict["FID_SCORE"] = fid_score result_dict["CLIP_SCORE"] = np.mean(self.clip_scores) diff --git a/text_to_image/main.py b/text_to_image/main.py index 32425762f..82383bf58 100644 --- a/text_to_image/main.py +++ b/text_to_image/main.py @@ -73,15 +73,22 @@ def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset") - parser.add_argument("--dataset-path", required=True, help="path to the dataset") + parser.add_argument( + "--dataset", + choices=SUPPORTED_DATASETS.keys(), + help="dataset") + parser.add_argument( + "--dataset-path", + required=True, + help="path to the dataset") parser.add_argument( "--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles" ) parser.add_argument( "--scenario", default="SingleStream", - help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys())), + help="mlperf benchmark scenario, one of " + + str(list(SCENARIO_MAP.keys())), ) parser.add_argument( "--max-batchsize", @@ -90,7 +97,10 @@ def get_args(): help="max batch size in a single inference", ) parser.add_argument("--threads", default=1, type=int, help="threads") - parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") + 
parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") parser.add_argument( "--find-peak-performance", action="store_true", @@ -139,14 +149,20 @@ def get_args(): # pass this argument for official submission # parser.add_argument("--output-images", action="store_true", help="Store a subset of the generated images") # do not modify this argument for official submission - parser.add_argument("--ids-path", help="Path to caption ids", default="tools/sample_ids.txt") + parser.add_argument( + "--ids-path", help="Path to caption ids", default="tools/sample_ids.txt" + ) - # below will override mlperf rules compliant settings - don't use for official submission + # below will override mlperf rules compliant settings - don't use for + # official submission parser.add_argument("--time", type=int, help="time to scan in seconds") parser.add_argument("--count", type=int, help="dataset items to use") parser.add_argument("--debug", action="store_true", help="debug") parser.add_argument( - "--performance-sample-count", type=int, help="performance sample count", default=5000 + "--performance-sample-count", + type=int, + help="performance sample count", + default=5000, ) parser.add_argument( "--max-latency", type=float, help="mlperf max latency in pct tile" @@ -259,9 +275,9 @@ def enqueue(self, query_samples): else: bs = self.max_batchsize for i in range(0, len(idx), bs): - data, label = self.ds.get_samples(idx[i : i + bs]) + data, label = self.ds.get_samples(idx[i: i + bs]) self.run_one_item( - Item(query_id[i : i + bs], idx[i : i + bs], data, label) + Item(query_id[i: i + bs], idx[i: i + bs], data, label) ) def finish(self): @@ -276,7 +292,9 @@ def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): self.result_dict = {} for _ in range(self.threads): - worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,)) + worker = threading.Thread( + target=self.handle_tasks, args=( + self.tasks,)) worker.daemon = True self.workers.append(worker) worker.start() @@ -324,7 +342,7 @@ def main(): precision=args.dtype, device=args.device, model_path=args.model_path, - batch_size=args.max_batchsize + batch_size=args.max_batchsize, ) if args.dtype == "fp16": dtype = torch.float16 @@ -377,7 +395,7 @@ def main(): sys.exit(1) audit_config = os.path.abspath(args.audit_conf) - + if args.accuracy: ids_path = os.path.abspath(args.ids_path) with open(ids_path) as f: @@ -459,7 +477,8 @@ def flush_queries(): settings.multi_stream_samples_per_query = args.samples_per_query if args.max_latency: settings.server_target_latency_ns = int(args.max_latency * NANO_SEC) - settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC) + settings.multi_stream_expected_latency_ns = int( + args.max_latency * NANO_SEC) performance_sample_count = ( args.performance_sample_count diff --git a/text_to_image/tools/accuracy_coco.py b/text_to_image/tools/accuracy_coco.py index f831c4b6e..508ed9602 100644 --- a/text_to_image/tools/accuracy_coco.py +++ b/text_to_image/tools/accuracy_coco.py @@ -4,7 +4,6 @@ the images in coco's captions/captions.tsv. 
""" - import argparse import json import os @@ -17,16 +16,33 @@ from fid.fid_score import compute_fid - def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() - parser.add_argument("--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json") - parser.add_argument("--caption-path", default="coco2014/captions/captions_source.tsv", help="path to coco captions") - parser.add_argument("--statistics-path", default=None, help="path to statistics") - parser.add_argument("--verbose", action="store_true", help="verbose messages") - parser.add_argument("--output-file", default="coco-results.json", help="path to output file") - parser.add_argument("--compliance-images-path", required=False, help="path to dump 10 stable diffusion xl compliance images") + parser.add_argument( + "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" + ) + parser.add_argument( + "--caption-path", + default="coco2014/captions/captions_source.tsv", + help="path to coco captions", + ) + parser.add_argument( + "--statistics-path", + default=None, + help="path to statistics") + parser.add_argument( + "--verbose", + action="store_true", + help="verbose messages") + parser.add_argument( + "--output-file", default="coco-results.json", help="path to output file" + ) + parser.add_argument( + "--compliance-images-path", + required=False, + help="path to dump 10 stable diffusion xl compliance images", + ) parser.add_argument("--device", default="cpu", choices=["gpu", "cpu"]) args = parser.parse_args() return args @@ -37,11 +53,12 @@ def preprocess_image(img_dir, file_name): img = np.asarray(img) if len(img.shape) == 2: img = np.expand_dims(img, axis=-1) - tensor = torch.Tensor(np.asarray(img).transpose([2,0,1])).to(torch.uint8) + tensor = torch.Tensor(np.asarray(img).transpose([2, 0, 1])).to(torch.uint8) if tensor.shape[0] == 1: - tensor = tensor.repeat(3,1,1) + tensor = tensor.repeat(3, 1, 1) return tensor.unsqueeze(0) + def main(): args = get_args() result_dict = {} @@ -61,7 +78,8 @@ def main(): # set statistics path statistics_path = args.statistics_path if args.statistics_path is None: - statistics_path = os.path.join(os.path.dirname(__file__), "val2014.npz") + statistics_path = os.path.join( + os.path.dirname(__file__), "val2014.npz") # Set compliance images path dump_compliance_images = False @@ -70,14 +88,19 @@ def main(): os.makedirs(args.compliance_images_path) dump_compliance_images = True compliance_images_idx_list = [] - with open(os.path.join(os.path.dirname(__file__), "sample_ids.txt"), 'r') as compliance_id_file: + with open( + os.path.join(os.path.dirname(__file__), "sample_ids.txt"), "r" + ) as compliance_id_file: for line in compliance_id_file: idx = int(line.strip()) compliance_images_idx_list.append(idx) # Dump caption.txt - with open(os.path.join(args.compliance_images_path, "captions.txt"), "w+") as caption_file: + with open( + os.path.join(args.compliance_images_path, "captions.txt"), "w+" + ) as caption_file: for idx in compliance_images_idx_list: - caption_file.write(f"{idx} {df_captions.iloc[idx]['caption']}\n") + caption_file.write( + f"{idx} {df_captions.iloc[idx]['caption']}\n") # Load torchmetrics modules clip = CLIPEncoder(device=device) @@ -85,26 +108,33 @@ def main(): seen = set() result_list = [] for j in results: - idx = j['qsl_idx'] + idx = j["qsl_idx"] if idx in seen: continue seen.add(idx) # Load generated image - generated_img = np.frombuffer(bytes.fromhex(j['data']), np.uint8).reshape(1024, 1024, 3) + generated_img = 
np.frombuffer(bytes.fromhex(j["data"]), np.uint8).reshape( + 1024, 1024, 3 + ) result_list.append(generated_img) generated_img = Image.fromarray(generated_img) # Dump compliance images if dump_compliance_images and idx in compliance_images_idx_list: - generated_img.save(os.path.join(args.compliance_images_path, f"{idx}.png")) + generated_img.save( + os.path.join( + args.compliance_images_path, + f"{idx}.png")) # generated_img = torch.Tensor(generated_img).to(torch.uint8).to(device) # Load Ground Truth caption = df_captions.iloc[idx]["caption"] clip_scores.append( - 100 * clip.get_clip_score(caption, generated_img).item() - ) + 100 * + clip.get_clip_score( + caption, + generated_img).item()) fid_score = compute_fid(result_list, statistics_path, device) result_dict["FID_SCORE"] = fid_score @@ -114,5 +144,6 @@ def main(): with open(args.output_file, "w") as fp: json.dump(result_dict, fp, sort_keys=True, indent=4) + if __name__ == "__main__": main() diff --git a/text_to_image/tools/clip/clip_encoder.py b/text_to_image/tools/clip/clip_encoder.py index da24ada27..8f83170b2 100644 --- a/text_to_image/tools/clip/clip_encoder.py +++ b/text_to_image/tools/clip/clip_encoder.py @@ -9,7 +9,7 @@ class CLIPEncoder(nn.Module): """ A class for encoding images and texts using a specified CLIP model and computing the similarity between them. - + Attributes: ----------- clip_version: str @@ -23,14 +23,17 @@ class CLIPEncoder(nn.Module): device: str The device to which the model is moved. """ - def __init__(self, - clip_version: str = 'ViT-B/32', - pretrained: Optional[str] = '', - cache_dir: Optional[str] = None, - device: str = 'cpu'): + + def __init__( + self, + clip_version: str = "ViT-B/32", + pretrained: Optional[str] = "", + cache_dir: Optional[str] = None, + device: str = "cpu", + ): """ Initializes the CLIPEncoder with the specified CLIP model version and pre-trained weights. - + Parameters: ----------- clip_version: str, optional @@ -46,10 +49,10 @@ def __init__(self, self.clip_version = clip_version self.pretrained = pretrained if pretrained else self._get_default_pretrained() - - self.model, _, self.preprocess = open_clip.create_model_and_transforms(self.clip_version, - pretrained=self.pretrained, - cache_dir=cache_dir) + + self.model, _, self.preprocess = open_clip.create_model_and_transforms( + self.clip_version, pretrained=self.pretrained, cache_dir=cache_dir + ) self.model.eval() self.model.to(device) @@ -57,25 +60,27 @@ def __init__(self, def _get_default_pretrained(self) -> str: """Returns the default pretrained weights based on the clip_version.""" - if self.clip_version == 'ViT-H-14': - return 'laion2b_s32b_b79k' - elif self.clip_version == 'ViT-g-14': - return 'laion2b_s12b_b42k' + if self.clip_version == "ViT-H-14": + return "laion2b_s32b_b79k" + elif self.clip_version == "ViT-g-14": + return "laion2b_s12b_b42k" else: - return 'openai' + return "openai" @torch.no_grad() - def get_clip_score(self, text: Union[str, List[str]], image: Union[Image.Image, torch.Tensor]) -> torch.Tensor: + def get_clip_score( + self, text: Union[str, List[str]], image: Union[Image.Image, torch.Tensor] + ) -> torch.Tensor: """ Computes the similarity score between the given text(s) and image using the CLIP model. - + Parameters: ----------- text: Union[str, List[str]] The text or list of texts to compare with the image. image: Image.Image The input image. 
- + Returns: -------- torch.Tensor @@ -83,7 +88,7 @@ def get_clip_score(self, text: Union[str, List[str]], image: Union[Image.Image, """ # Preprocess the image and move it to the specified device image = self.preprocess(image).unsqueeze(0).to(self.device) - + # Normalize the image features image_features = self.model.encode_image(image).float() image_features /= image_features.norm(dim=-1, keepdim=True) @@ -91,15 +96,15 @@ def get_clip_score(self, text: Union[str, List[str]], image: Union[Image.Image, # If a single text string is provided, convert it to a list if not isinstance(text, (list, tuple)): text = [text] - + # Tokenize the text and move it to the specified device text = open_clip.tokenize(text).to(self.device) - + # Normalize the text features text_features = self.model.encode_text(text).float() text_features /= text_features.norm(dim=-1, keepdim=True) - + # Compute the similarity between the image and text features similarity = image_features @ text_features.T - return similarity \ No newline at end of file + return similarity diff --git a/text_to_image/tools/coco.py b/text_to_image/tools/coco.py index c11f2225c..77b2a5da2 100644 --- a/text_to_image/tools/coco.py +++ b/text_to_image/tools/coco.py @@ -27,24 +27,36 @@ def get_args(): type=int, help="Maximun number of images to download", ) - parser.add_argument("--num-workers", default=1, type=int, help="Number of processes to download images") + parser.add_argument( + "--num-workers", + default=1, + type=int, + help="Number of processes to download images", + ) parser.add_argument( "--allow-duplicate-images", action="store_true", - help="Allow mulple captions per image" + help="Allow multiple captions per image", ) parser.add_argument( - "--latents-path-torch", default="latents.pt", type=str, help="Path to pytorch latents" + "--latents-path-torch", + default="latents.pt", + type=str, + help="Path to pytorch latents", ) parser.add_argument( - "--latents-path-numpy", default="latents.npy", type=str, help="Path to numpy latents" + "--latents-path-numpy", + default="latents.npy", + type=str, + help="Path to numpy latents", ) parser.add_argument( "--seed", type=int, default=2023, help="Seed to choose the dataset" ) parser.add_argument( - "--keep-raw", action="store_true", help="Keep raw folder" - ) + "--keep-raw", + action="store_true", + help="Keep raw folder") parser.add_argument( "--download-images", action="store_true", help="Download the calibration set" ) @@ -66,22 +78,29 @@ def download_img(args): dataset_dir = os.path.abspath(args.dataset_dir) # Check if the annotation dataframe is there if os.path.exists(f"{dataset_dir}/captions/captions_source.tsv"): - df_annotations = pd.read_csv(f"{dataset_dir}/captions/captions_source.tsv", sep="\t") + df_annotations = pd.read_csv( + f"{dataset_dir}/captions/captions_source.tsv", sep="\t" + ) df_annotations = df_annotations.iloc[: args.max_images] elif os.path.exists(f"{dataset_dir}/../captions_source.tsv"): os.makedirs(f"{dataset_dir}/captions/", exist_ok=True) - os.system(f"cp {dataset_dir}/../captions_source.tsv {dataset_dir}/captions/") - df_annotations = pd.read_csv(f"{dataset_dir}/captions/captions_source.tsv", sep="\t") + os.system( + f"cp {dataset_dir}/../captions_source.tsv {dataset_dir}/captions/") + df_annotations = pd.read_csv( + f"{dataset_dir}/captions/captions_source.tsv", sep="\t" + ) df_annotations = df_annotations.iloc[: args.max_images] elif args.tsv_path is not None and os.path.exists(f"{args.tsv_path}"): file_name = args.tsv_path.split("/")[-1]
os.makedirs(f"{dataset_dir}/captions/", exist_ok=True) os.system(f"cp {args.tsv_path} {dataset_dir}/captions/") - df_annotations = pd.read_csv(f"{dataset_dir}/captions/{file_name}", sep="\t") + df_annotations = pd.read_csv( + f"{dataset_dir}/captions/{file_name}", sep="\t") df_annotations = df_annotations.iloc[: args.max_images] else: # Check if raw annotations file already exist - if not os.path.exists(f"{dataset_dir}/raw/annotations/captions_val2014.json"): + if not os.path.exists( + f"{dataset_dir}/raw/annotations/captions_val2014.json"): # Download annotations os.makedirs(f"{dataset_dir}/raw/", exist_ok=True) os.makedirs(f"{dataset_dir}/download_aux/", exist_ok=True) @@ -121,7 +140,9 @@ def download_img(args): frac=1, random_state=args.seed ).reset_index(drop=True) df_annotations = df_annotations.iloc[: args.max_images] - df_annotations['caption'] = df_annotations['caption'].apply(lambda x: x.replace('\n', '').strip()) + df_annotations["caption"] = df_annotations["caption"].apply( + lambda x: x.replace("\n", "").strip() + ) df_annotations = ( df_annotations.merge( df_images, how="inner", left_on="image_id", right_on="id" @@ -135,11 +156,18 @@ def download_img(args): if args.download_images: os.makedirs(f"{dataset_dir}/validation/data/", exist_ok=True) tasks = [ - (row["coco_url"], f"{dataset_dir}/validation/data/", row["file_name"]) + (row["coco_url"], + f"{dataset_dir}/validation/data/", + row["file_name"]) for i, row in df_annotations.iterrows() ] pool = Pool(processes=args.num_workers) - [_ for _ in tqdm.tqdm(pool.imap_unordered(download_img, tasks), total=len(tasks))] + [ + _ + for _ in tqdm.tqdm( + pool.imap_unordered(download_img, tasks), total=len(tasks) + ) + ] # Finalize annotations df_annotations[ ["id", "image_id", "caption", "height", "width", "file_name", "coco_url"] diff --git a/text_to_image/tools/coco_calibration.py b/text_to_image/tools/coco_calibration.py index c8417f52a..dc4f49009 100644 --- a/text_to_image/tools/coco_calibration.py +++ b/text_to_image/tools/coco_calibration.py @@ -22,13 +22,19 @@ def get_args(): parser.add_argument( "--tsv-path", default=None, help="Precomputed tsv file location" ) - parser.add_argument("--num-workers", default=1, type=int, help="Number of processes to download images") parser.add_argument( - "--calibration-dir", default=None, help="Calibration ids location" + "--num-workers", + default=1, + type=int, + help="Number of processes to download images", ) parser.add_argument( - "--keep-raw", action="store_true", help="Keep the raw dataset" + "--calibration-dir", default=None, help="Calibration ids location" ) + parser.add_argument( + "--keep-raw", + action="store_true", + help="Keep the raw dataset") parser.add_argument( "--download-images", action="store_true", help="Download the calibration set" ) @@ -47,17 +53,28 @@ def download_img(args): if __name__ == "__main__": args = get_args() dataset_dir = os.path.abspath(args.dataset_dir) - calibration_dir = args.calibration_dir if args.calibration_dir is not None else os.path.join(os.path.dirname(__file__), "..", "..", "calibration", "COCO-2014") + calibration_dir = ( + args.calibration_dir + if args.calibration_dir is not None + else os.path.join( + os.path.dirname(__file__), "..", "..", "calibration", "COCO-2014" + ) + ) # Check if the annotation dataframe is there if os.path.exists(f"{dataset_dir}/calibration/captions.tsv"): - df_annotations = pd.read_csv(f"{dataset_dir}/calibration/captions.tsv", sep="\t") + df_annotations = pd.read_csv( + f"{dataset_dir}/calibration/captions.tsv", 
sep="\t" + ) elif args.tsv_path is not None and os.path.exists(f"{args.tsv_path}"): os.makedirs(f"{dataset_dir}/calibration/", exist_ok=True) os.system(f"cp {args.tsv_path} {dataset_dir}/calibration/") - df_annotations = pd.read_csv(f"{dataset_dir}/calibration/captions.tsv", sep="\t") + df_annotations = pd.read_csv( + f"{dataset_dir}/calibration/captions.tsv", sep="\t" + ) else: # Check if raw annotations file already exist - if not os.path.exists(f"{dataset_dir}/raw/annotations/captions_train2014.json"): + if not os.path.exists( + f"{dataset_dir}/raw/annotations/captions_train2014.json"): # Download annotations os.makedirs(f"{dataset_dir}/raw/", exist_ok=True) os.makedirs(f"{dataset_dir}/download_aux/", exist_ok=True) @@ -88,15 +105,19 @@ def download_img(args): df_annotations = pd.DataFrame(annotations) df_images = pd.DataFrame(images) - # Calibration images + # Calibration images with open(f"{calibration_dir}/coco_cal_captions_list.txt") as f: calibration_ids = f.readlines() - calibration_ids = [int(id.replace('\n', '')) for id in calibration_ids] + calibration_ids = [int(id.replace("\n", "")) + for id in calibration_ids] calibration_ids = calibration_ids - df_annotations = df_annotations[np.isin(df_annotations["id"], calibration_ids)] + df_annotations = df_annotations[np.isin( + df_annotations["id"], calibration_ids)] df_annotations = df_annotations.sort_values(by=["id"]) - df_annotations['caption'] = df_annotations['caption'].apply(lambda x: x.replace('\n', '').strip()) + df_annotations["caption"] = df_annotations["caption"].apply( + lambda x: x.replace("\n", "").strip() + ) df_annotations = ( df_annotations.merge( df_images, how="inner", left_on="image_id", right_on="id" @@ -111,11 +132,18 @@ def download_img(args): if args.download_images: os.makedirs(f"{dataset_dir}/calibration/data/", exist_ok=True) tasks = [ - (row["coco_url"], f"{dataset_dir}/calibration/data/", row["file_name"]) + (row["coco_url"], + f"{dataset_dir}/calibration/data/", + row["file_name"]) for i, row in df_annotations.iterrows() ] pool = Pool(processes=args.num_workers) - [_ for _ in tqdm.tqdm(pool.imap_unordered(download_img, tasks), total=len(tasks))] + [ + _ + for _ in tqdm.tqdm( + pool.imap_unordered(download_img, tasks), total=len(tasks) + ) + ] # Finalize annotations df_annotations[ ["id", "image_id", "caption", "height", "width", "file_name", "coco_url"] diff --git a/text_to_image/tools/coco_generate_calibration.py b/text_to_image/tools/coco_generate_calibration.py index a2a7254c0..4c83489af 100644 --- a/text_to_image/tools/coco_generate_calibration.py +++ b/text_to_image/tools/coco_generate_calibration.py @@ -32,8 +32,9 @@ def get_args(): "--seed", type=int, default=2023, help="Seed to choose the dataset" ) parser.add_argument( - "--keep-raw", action="store_true", help="Keep raw folder" - ) + "--keep-raw", + action="store_true", + help="Keep raw folder") args = parser.parse_args() return args @@ -42,10 +43,17 @@ def get_args(): if __name__ == "__main__": args = get_args() dataset_dir = os.path.abspath(args.dataset_dir) - calibration_dir = args.calibration_dir if args.calibration_dir is not None else os.path.join(os.path.dirname(__file__), "..", "..", "calibration", "COCO-2014") - + calibration_dir = ( + args.calibration_dir + if args.calibration_dir is not None + else os.path.join( + os.path.dirname(__file__), "..", "..", "calibration", "COCO-2014" + ) + ) + # Check if raw annotations file already exist - if not os.path.exists(f"{dataset_dir}/raw/annotations/captions_train2014.json"): + if not 
os.path.exists( + f"{dataset_dir}/raw/annotations/captions_train2014.json"): # Download annotations os.makedirs(f"{dataset_dir}/raw/", exist_ok=True) os.makedirs(f"{dataset_dir}/download_aux/", exist_ok=True) @@ -68,19 +76,24 @@ def get_args(): df_annotations = pd.DataFrame(annotations) df_images = pd.DataFrame(images) - # Calibration images + # Calibration images df_annotations = df_annotations.drop_duplicates( - subset=["image_id"], keep="first" - ) + subset=["image_id"], keep="first") # Sort, shuffle and choose the final dataset df_annotations = df_annotations.sort_values(by=["id"]) - df_annotations = df_annotations.sample(frac=1, random_state=args.seed).reset_index(drop=True) + df_annotations = df_annotations.sample(frac=1, random_state=args.seed).reset_index( + drop=True + ) df_annotations = df_annotations.iloc[: args.max_images] - df_annotations['caption'] = df_annotations['caption'].apply(lambda x: x.replace('\n', '').strip()) + df_annotations["caption"] = df_annotations["caption"].apply( + lambda x: x.replace("\n", "").strip() + ) df_annotations = ( df_annotations.merge( - df_images, how="inner", left_on="image_id", right_on="id" - ) + df_images, + how="inner", + left_on="image_id", + right_on="id") .drop(["id_y"], axis=1) .rename(columns={"id_x": "id"}) .sort_values(by=["id"]) @@ -92,4 +105,3 @@ def get_args(): f.write(s) # Remove Folder os.system(f"rm -rf {dataset_dir}") - diff --git a/text_to_image/tools/fid/fid_score.py b/text_to_image/tools/fid/fid_score.py index ce7cfe4b0..17094c98a 100644 --- a/text_to_image/tools/fid/fid_score.py +++ b/text_to_image/tools/fid/fid_score.py @@ -31,20 +31,22 @@ See the License for the specific language governing permissions and limitations under the License. """ + +from inception import InceptionV3 +from torch.nn.functional import adaptive_avg_pool2d +from scipy import linalg +from PIL import Image +import torchvision.transforms as TF +import torch +import random +import numpy as np +from typing import Any +from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser +import pathlib import os import sys -sys.path.append(os.path.dirname(__file__)) -import pathlib -from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser -from typing import Any -import numpy as np -import random -import torch -import torchvision.transforms as TF -from PIL import Image -from scipy import linalg -from torch.nn.functional import adaptive_avg_pool2d +sys.path.append(os.path.dirname(__file__)) try: @@ -55,9 +57,6 @@ def tqdm(x): return x -from inception import InceptionV3 - - class ImagesDataset(torch.utils.data.Dataset): def __init__(self, imgs, transforms=None): self.imgs = imgs @@ -71,7 +70,7 @@ def __getitem__(self, i): if self.transforms is not None: img = self.transforms(img) return img - + def get_activations( files, model, batch_size=50, dims=2048, device="cpu", num_workers=1 @@ -132,7 +131,7 @@ def get_activations( pred = pred.squeeze(3).squeeze(2).cpu().numpy() - pred_arr[start_idx : start_idx + pred.shape[0]] = pred + pred_arr[start_idx: start_idx + pred.shape[0]] = pred start_idx = start_idx + pred.shape[0] @@ -196,7 +195,8 @@ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): tr_covmean = np.trace(covmean) - return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean + return diff.dot(diff) + np.trace(sigma1) + \ + np.trace(sigma2) - 2 * tr_covmean def calculate_activation_statistics( @@ -241,7 +241,8 @@ def compute_statistics_of_path( else: path = pathlib.Path(path) files = sorted( - [file for ext in 
IMAGE_EXTENSIONS for file in path.glob("*.{}".format(ext))] + [file for ext in IMAGE_EXTENSIONS for file in path.glob( + "*.{}".format(ext))] ) if subset_size is not None: random.seed(shuffle_seed) @@ -319,7 +320,7 @@ def compute_fid( num_workers=1, batch_size=1, subset_size=None, - shuffle_seed=None + shuffle_seed=None, ): imgs = [Image.fromarray(e).convert("RGB") for e in results] device = torch.device(device if torch.cuda.is_available() else "cpu") @@ -359,4 +360,3 @@ def compute_fid( fid_value = calculate_frechet_distance(m1, s1, m2, s2) return fid_value - diff --git a/text_to_image/tools/fid/inception.py b/text_to_image/tools/fid/inception.py index cc5687052..7854487ca 100644 --- a/text_to_image/tools/fid/inception.py +++ b/text_to_image/tools/fid/inception.py @@ -10,7 +10,7 @@ # Inception weights ported to Pytorch from # http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz -FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth' # noqa: E501 +FID_WEIGHTS_URL = "https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth" # noqa: E501 class InceptionV3(nn.Module): @@ -22,18 +22,20 @@ class InceptionV3(nn.Module): # Maps feature dimensionality to their output blocks indices BLOCK_INDEX_BY_DIM = { - 64: 0, # First max pooling features + 64: 0, # First max pooling features 192: 1, # Second max pooling featurs 768: 2, # Pre-aux classifier features - 2048: 3 # Final average pooling features + 2048: 3, # Final average pooling features } - def __init__(self, - output_blocks=(DEFAULT_BLOCK_INDEX,), - resize_input=True, - normalize_input=True, - requires_grad=False, - use_fid_inception=True): + def __init__( + self, + output_blocks=(DEFAULT_BLOCK_INDEX,), + resize_input=True, + normalize_input=True, + requires_grad=False, + use_fid_inception=True, + ): """Build pretrained InceptionV3 Parameters @@ -71,22 +73,21 @@ def __init__(self, self.output_blocks = sorted(output_blocks) self.last_needed_block = max(output_blocks) - assert self.last_needed_block <= 3, \ - 'Last possible output block index is 3' + assert self.last_needed_block <= 3, "Last possible output block index is 3" self.blocks = nn.ModuleList() if use_fid_inception: inception = fid_inception_v3() else: - inception = _inception_v3(weights='DEFAULT') + inception = _inception_v3(weights="DEFAULT") # Block 0: input to maxpool1 block0 = [ inception.Conv2d_1a_3x3, inception.Conv2d_2a_3x3, inception.Conv2d_2b_3x3, - nn.MaxPool2d(kernel_size=3, stride=2) + nn.MaxPool2d(kernel_size=3, stride=2), ] self.blocks.append(nn.Sequential(*block0)) @@ -95,7 +96,7 @@ def __init__(self, block1 = [ inception.Conv2d_3b_1x1, inception.Conv2d_4a_3x3, - nn.MaxPool2d(kernel_size=3, stride=2) + nn.MaxPool2d(kernel_size=3, stride=2), ] self.blocks.append(nn.Sequential(*block1)) @@ -119,7 +120,7 @@ def __init__(self, inception.Mixed_7a, inception.Mixed_7b, inception.Mixed_7c, - nn.AdaptiveAvgPool2d(output_size=(1, 1)) + nn.AdaptiveAvgPool2d(output_size=(1, 1)), ] self.blocks.append(nn.Sequential(*block3)) @@ -144,10 +145,13 @@ def forward(self, inp): x = inp if self.resize_input: - x = F.interpolate(x, - size=(299, 299), - mode='bilinear', - align_corners=False) + x = F.interpolate( + x, + size=( + 299, + 299), + mode="bilinear", + align_corners=False) if self.normalize_input: x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1) @@ -166,7 +170,7 @@ def forward(self, inp): def _inception_v3(*args, **kwargs): """Wraps 
`torchvision.models.inception_v3`""" try: - version = tuple(map(int, torchvision.__version__.split('.')[:2])) + version = tuple(map(int, torchvision.__version__.split(".")[:2])) except ValueError: # Just a caution against weird version strings version = (0,) @@ -174,22 +178,22 @@ def _inception_v3(*args, **kwargs): # Skips default weight inititialization if supported by torchvision # version. See https://github.com/mseitzer/pytorch-fid/issues/28. if version >= (0, 6): - kwargs['init_weights'] = False + kwargs["init_weights"] = False # Backwards compatibility: `weights` argument was handled by `pretrained` # argument prior to version 0.13. - if version < (0, 13) and 'weights' in kwargs: - if kwargs['weights'] == 'DEFAULT': - kwargs['pretrained'] = True - elif kwargs['weights'] is None: - kwargs['pretrained'] = False + if version < (0, 13) and "weights" in kwargs: + if kwargs["weights"] == "DEFAULT": + kwargs["pretrained"] = True + elif kwargs["weights"] is None: + kwargs["pretrained"] = False else: raise ValueError( - 'weights=={} not supported in torchvision {}'.format( - kwargs['weights'], torchvision.__version__ + "weights=={} not supported in torchvision {}".format( + kwargs["weights"], torchvision.__version__ ) ) - del kwargs['weights'] + del kwargs["weights"] return torchvision.models.inception_v3(*args, **kwargs) @@ -203,9 +207,7 @@ def fid_inception_v3(): This method first constructs torchvision's Inception and then patches the necessary parts that are different in the FID Inception model. """ - inception = _inception_v3(num_classes=1008, - aux_logits=False, - weights=None) + inception = _inception_v3(num_classes=1008, aux_logits=False, weights=None) inception.Mixed_5b = FIDInceptionA(192, pool_features=32) inception.Mixed_5c = FIDInceptionA(256, pool_features=64) inception.Mixed_5d = FIDInceptionA(288, pool_features=64) @@ -223,6 +225,7 @@ def fid_inception_v3(): class FIDInceptionA(torchvision.models.inception.InceptionA): """InceptionA block patched for FID computation""" + def __init__(self, in_channels, pool_features): super(FIDInceptionA, self).__init__(in_channels, pool_features) @@ -238,8 +241,9 @@ def forward(self, x): # Patch: Tensorflow's average pool does not use the padded zero's in # its average calculation - branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1, - count_include_pad=False) + branch_pool = F.avg_pool2d( + x, kernel_size=3, stride=1, padding=1, count_include_pad=False + ) branch_pool = self.branch_pool(branch_pool) outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool] @@ -248,6 +252,7 @@ def forward(self, x): class FIDInceptionC(torchvision.models.inception.InceptionC): """InceptionC block patched for FID computation""" + def __init__(self, in_channels, channels_7x7): super(FIDInceptionC, self).__init__(in_channels, channels_7x7) @@ -266,8 +271,9 @@ def forward(self, x): # Patch: Tensorflow's average pool does not use the padded zero's in # its average calculation - branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1, - count_include_pad=False) + branch_pool = F.avg_pool2d( + x, kernel_size=3, stride=1, padding=1, count_include_pad=False + ) branch_pool = self.branch_pool(branch_pool) outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool] @@ -276,6 +282,7 @@ def forward(self, x): class FIDInceptionE_1(torchvision.models.inception.InceptionE): """First InceptionE block patched for FID computation""" + def __init__(self, in_channels): super(FIDInceptionE_1, self).__init__(in_channels) @@ -299,8 +306,9 @@ def forward(self, 
x): # Patch: Tensorflow's average pool does not use the padded zero's in # its average calculation - branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1, - count_include_pad=False) + branch_pool = F.avg_pool2d( + x, kernel_size=3, stride=1, padding=1, count_include_pad=False + ) branch_pool = self.branch_pool(branch_pool) outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] @@ -309,6 +317,7 @@ def forward(self, x): class FIDInceptionE_2(torchvision.models.inception.InceptionE): """Second InceptionE block patched for FID computation""" + def __init__(self, in_channels): super(FIDInceptionE_2, self).__init__(in_channels) @@ -338,4 +347,4 @@ def forward(self, x): branch_pool = self.branch_pool(branch_pool) outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] - return torch.cat(outputs, 1) \ No newline at end of file + return torch.cat(outputs, 1) diff --git a/text_to_image/tools/latent.py b/text_to_image/tools/latent.py index dcbfc9761..d8c3e4efe 100644 --- a/text_to_image/tools/latent.py +++ b/text_to_image/tools/latent.py @@ -12,8 +12,16 @@ def get_args(): parser.add_argument( "--num-channels-latents", type=int, default=4, help="Batch size of the latent" ) - parser.add_argument("--height", type=int, default=1024, help="Height of the image") - parser.add_argument("--width", type=int, default=1024, help="Width of the image") + parser.add_argument( + "--height", + type=int, + default=1024, + help="Height of the image") + parser.add_argument( + "--width", + type=int, + default=1024, + help="Width of the image") parser.add_argument( "--dtype", type=str, @@ -33,7 +41,13 @@ def get_args(): default=8, help="Variational Autoencoder scale factor, obtainded from model inspection", ) - parser.add_argument("--output-type", type=str, default="pt", choices=["pt", "np"]) + parser.add_argument( + "--output-type", + type=str, + default="pt", + choices=[ + "pt", + "np"]) args = parser.parse_args() return args diff --git a/text_to_image/tools/sample_ids.py b/text_to_image/tools/sample_ids.py index e1d6effb4..29cf91202 100644 --- a/text_to_image/tools/sample_ids.py +++ b/text_to_image/tools/sample_ids.py @@ -1,20 +1,24 @@ - import argparse import numpy as np import pandas as pd + def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() parser.add_argument( - "--tsv-path", default="../coco2014/captions/captions_source.tsv", help="Dataset download location" + "--tsv-path", + default="../coco2014/captions/captions_source.tsv", + help="Dataset download location", ) parser.add_argument( "--output-path", default="sample_ids.txt", help="Dataset download location" ) parser.add_argument( - "--n", type=int, default=10, help="Dataset download location" - ) + "--n", + type=int, + default=10, + help="Dataset download location") parser.add_argument( "--seed", "-s", type=int, default=926019364, help="Dataset download location" ) @@ -29,8 +33,7 @@ def get_args(): sample_ids = list(np.random.choice(df_annotations.shape[0], args.n)) with open(args.output_path, "w+") as f: for i, sample in enumerate(sample_ids): - if i != (len(sample_ids)-1): + if i != (len(sample_ids) - 1): f.write(str(sample) + "\n") else: f.write(str(sample)) - \ No newline at end of file diff --git a/tools/submission/filter_errors.py b/tools/submission/filter_errors.py index 95fe7a294..2da9dbe95 100644 --- a/tools/submission/filter_errors.py +++ b/tools/submission/filter_errors.py @@ -43,8 +43,10 @@ def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() - parser.add_argument("--input", 
required=True, - help="orignal submission directory") + parser.add_argument( + "--input", + required=True, + help="original submission directory") parser.add_argument("--output", help="new submission directory") args = parser.parse_args() return args diff --git a/tools/submission/generate_final_report.py b/tools/submission/generate_final_report.py index 13a01fe55..60139b572 100644 --- a/tools/submission/generate_final_report.py +++ b/tools/submission/generate_final_report.py @@ -2,6 +2,7 @@ The resulting excel files can be imported into google sheets. """ + import argparse import os import sys @@ -11,242 +12,285 @@ def get_args(): - """Parse commandline.""" - parser = argparse.ArgumentParser() - parser.add_argument('--input', required=True, help='results csv from checker') - parser.add_argument('--version', default='4.0', help='mlperf version') - parser.add_argument('--repository', default='submissions_inference_4.0', help='mlperf repository') - args = parser.parse_args() - return args + """Parse commandline.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--input", + required=True, + help="results csv from checker") + parser.add_argument("--version", default="4.0", help="mlperf version") + parser.add_argument( + "--repository", default="submissions_inference_4.0", help="mlperf repository" + ) + args = parser.parse_args() + return args def main(): - args = get_args() - - df = pd.read_csv(args.input).fillna('') - - # rename some fields - df.rename( - columns={ - 'Organization': 'Submitter', - 'Division': 'Category', - 'SystemType': 'Suite', - 'SystemName': 'System', - 'number_of_nodes': 'Nodes', - 'host_processor_model_name': 'Processor', - 'accelerator_model_name': 'Accelerator', - 'accelerators_per_node': 'a#', - 'notes': 'Notes', - 'framework': 'Software', - }, - inplace=True) - df.rename(columns={'Model': 'UsedModel'}, inplace=True) - df.rename(columns={'MlperfModel': 'Model'}, inplace=True) - - # fix issues with raw data - df['host_processor_core_count'] = df['host_processor_core_count'].apply( - lambda x: 2 if x == '2 (big); 4 (LITTLE)' else x) - df['Availability'] = df['Availability'].apply(lambda x: 'available' - if x == 'on-premise' else x) - - # cleanup counts - df['Accelerator'] = df['Accelerator'].apply(lambda x: x if x != '-' else '') - df['a#'] = df['a#'].apply(lambda x: int(x) if str(x).isnumeric() else x) - df['a#'] = df['a#'].apply(lambda x: x if x != 0 else '') - df['p#'] = df.apply(lambda x: int(x['host_processors_per_node']), axis=1) - - # details url - base_url = f'https://github.com/mlcommons/{args.repository}/tree/main' - df['Details'] = df.apply( - lambda x: '=HYPERLINK("{}","details")'.format('/'.join( - [base_url, x['Category'], x['Submitter'], 'results', x['Platform']])), - axis=1) - - # code url - df['Code'] = df.apply( - lambda x: '=HYPERLINK("{}","code")'.format('/'.join( - [base_url, x['Category'], x['Submitter'], 'code'])), - axis=1) - - output = args.input[:-4] - writer = pd.ExcelWriter(output + '.xlsx', engine='xlsxwriter') - - indices = {} - indices['closed'] = [ - 'ID', - 'Unique ID (e.g.
for Audit)', - 'ColorKey', - 'Submitter', - 'Availability', - 'System', - 'Nodes', - 'Processor', - 'p#', - 'Accelerator', - 'a#', - 'Software', - 'Notes', - ] - indices['open'] = indices['closed'].copy() - indices['closed'].append('Details') - indices['closed'].append('Code') - indices['network'] = indices['closed'].copy() - indices['open'].append('UsedModel') - indices['open'].append('Accuracy') - indices['open'].append('Details') - indices['open'].append('Code') - columns = [ - 'Model', - 'Scenario', - 'Units', - ] - columns_order = [['Result'], - [ - 'resnet', 'retinanet', '3d-unet-99', '3d-unet-99.9', - 'rnnt', 'bert-99', 'bert-99.9', 'dlrm-v2-99', 'dlrm-v2-99.9', - 'gptj-99', 'gptj-99.9', 'stable-diffusion-xl', 'llama2-70b-99', 'llama2-70b-99.9' - ], ['SingleStream', 'MultiStream', 'Server', 'Offline'], - [ - 'Latency (ms)', - 'Samples/s', - 'Queries/s', - 'millijoules', - 'Watts', - ]] - - filter_scenarios = { - 'datacenter': { - 'resnet': ['Server', 'Offline'], - 'retinanet': ['Server', 'Offline'], - 'rnnt': ['Server', 'Offline'], - 'bert-99': ['Server', 'Offline'], - 'bert-99.9': ['Server', 'Offline'], - 'dlrm-v2-99': ['Server', 'Offline'], - 'dlrm-v2-99.9': ['Server', 'Offline'], - '3d-unet-99': ['Offline'], - '3d-unet-99.9': ['Offline'], - 'gptj-99': ['Server', 'Offline'], - 'gptj-99.9': ['Server', 'Offline'], - 'stable-diffusion-xl': ['Server', 'Offline'], - 'llama2-70b-99': ['Server', 'Offline'], - 'llama2-70b-99.9': ['Server', 'Offline'], - }, - 'edge': { - 'resnet': ['SingleStream', 'MultiStream', 'Offline'], - 'retinanet': ['SingleStream', 'MultiStream', 'Offline'], - 'rnnt': ['SingleStream', 'Offline'], - 'bert-99': ['SingleStream', 'Offline'], - 'bert-99.9': [], - 'dlrm-v2-99': [], - 'dlrm-v2-99.9': [], - '3d-unet-99': ['SingleStream', 'Offline'], - '3d-unet-99.9': ['SingleStream', 'Offline'], - 'gptj-99': ['SingleStream', 'Offline'], - 'gptj-99.9': ['SingleStream', 'Offline'], - 'stable-diffusion-xl': ['SingleStream', 'Offline'], - } - } - - def MakeWorksheet(df, index, filter_dict, sheet_name): - for key, value in filter_dict.items(): - if type(key) == tuple: - key = list(key) - df = df[value(df[key])] - df = df.pivot_table(index=index, columns=columns, values=['Result']) - df = df.fillna('') - if df.size == 0: - return - for i, order in enumerate(columns_order): - df = df.reindex(columns=order, level=i) - df.to_excel(writer, sheet_name=sheet_name) - - def Equal(x): - return lambda y: y == x - - def NotEqual(x): - return lambda y: y != x - - def Contain(x): - return lambda y: y.str.find(x) != -1 - - def And(x, y): - return lambda z: x(z) & y(z) - - def Apply(f, *args): - return lambda x: f(x, *args) - - def FilterScenario(x, suite): - return x.apply( - lambda y: y['Scenario'] in filter_scenarios[suite][y['Model']], axis=1) - - def MakeUniqueID(x): - key_list = ['Suite', 'Category', 'Submitter', 'Platform'] - if x['Category'] == 'open': - key_list.append('UsedModel') - return '/'.join(x[key_list]) - - df['Unique ID (e.g. for Audit)'] = df.apply(MakeUniqueID, axis=1) - df['ColorKey'] = df.apply( - lambda x: ''.join(x[['Availability', 'Submitter']]), axis=1) - df.sort_values( - by=[ - 'Category', 'Availability', 'Submitter', 'Unique ID (e.g. for Audit)' - ], - inplace=True) - id_dict = { - key: 1 + value - for (value, - key) in enumerate(pd.unique(df['Unique ID (e.g. for Audit)'])) - } - df['ID'] = df.apply( - lambda x: '{}-{:04}'.format(args.version, id_dict[x['Unique ID (e.g. 
for Audit)']]), - axis=1) - - for category in ['closed', 'open', 'network']: - for suite in ['datacenter', 'edge']: - MakeWorksheet( - df, indices[category], { - 'Category': - Equal(category), - 'Suite': - Contain(suite), - 'Units': - And( - And(NotEqual('Watts'), NotEqual('millijoules')), - NotEqual('millijoules/Stream')), - ('Scenario', 'Model'): - Apply(FilterScenario, suite) - }, suite + ' - ' + category) - - MakeWorksheet( - df, indices[category], { - 'Category': Equal(category), - 'Suite': Contain(suite), - 'has_power': Equal(True), - ('Scenario', 'Model'): Apply(FilterScenario, suite) - }, suite + ' - ' + category + ' - power') - - score_format = writer.book.add_format({'num_format': '#,##0.00'}) - bg_format = writer.book.add_format({'bg_color': '#efefef'}) - for ws in writer.book.worksheets(): - ws.set_column(1, 1, None, None, {'hidden': 1}) - ws.set_column(2, 2, None, None, {'hidden': 1}) - ws.set_column(len(indices['closed']), 100, None, score_format) - ws.conditional_format( - 2 + len(columns), 0, 200, 100, { - 'type': - 'formula', - 'criteria': - '=mod(countunique($c$' + str(len(columns) + 3) + ':$c' + - str(len(columns) + 3) + '), 2) = 0', - 'format': - bg_format, - }) - - writer.close() - - return 0 - - -if __name__ == '__main__': - sys.exit(main()) + args = get_args() + + df = pd.read_csv(args.input).fillna("") + + # rename some fields + df.rename( + columns={ + "Organization": "Submitter", + "Division": "Category", + "SystemType": "Suite", + "SystemName": "System", + "number_of_nodes": "Nodes", + "host_processor_model_name": "Processor", + "accelerator_model_name": "Accelerator", + "accelerators_per_node": "a#", + "notes": "Notes", + "framework": "Software", + }, + inplace=True, + ) + df.rename(columns={"Model": "UsedModel"}, inplace=True) + df.rename(columns={"MlperfModel": "Model"}, inplace=True) + + # fix issues with raw data + df["host_processor_core_count"] = df["host_processor_core_count"].apply( + lambda x: 2 if x == "2 (big); 4 (LITTLE)" else x + ) + df["Availability"] = df["Availability"].apply( + lambda x: "available" if x == "on-premise" else x + ) + + # cleanup counts + df["Accelerator"] = df["Accelerator"].apply( + lambda x: x if x != "-" else "") + df["a#"] = df["a#"].apply(lambda x: int(x) if str(x).isnumeric() else x) + df["a#"] = df["a#"].apply(lambda x: x if x != 0 else "") + df["p#"] = df.apply(lambda x: int(x["host_processors_per_node"]), axis=1) + + # details url + base_url = f"https://github.com/mlcommons/{args.repository}/tree/main" + df["Details"] = df.apply( + lambda x: '=HYPERLINK("{}","details")'.format( + "/".join( + [base_url, x["Category"], x["Submitter"], "results", x["Platform"]] + ) + ), + axis=1, + ) + + # code url + df["Code"] = df.apply( + lambda x: '=HYPERLINK("{}","code")'.format( + "/".join([base_url, x["Category"], x["Submitter"], "code"]) + ), + axis=1, + ) + + output = args.input[:-4] + writer = pd.ExcelWriter(output + ".xlsx", engine="xlsxwriter") + + indices = {} + indices["closed"] = [ + "ID", + "Unique ID (e.g. 
for Audit)", + "ColorKey", + "Submitter", + "Availability", + "System", + "Nodes", + "Processor", + "p#", + "Accelerator", + "a#", + "Software", + "Notes", + ] + indices["open"] = indices["closed"].copy() + indices["closed"].append("Details") + indices["closed"].append("Code") + indices["network"] = indices["closed"].copy() + indices["open"].append("UsedModel") + indices["open"].append("Accuracy") + indices["open"].append("Details") + indices["open"].append("Code") + columns = [ + "Model", + "Scenario", + "Units", + ] + columns_order = [ + ["Result"], + [ + "resnet", + "retinanet", + "3d-unet-99", + "3d-unet-99.9", + "rnnt", + "bert-99", + "bert-99.9", + "dlrm-v2-99", + "dlrm-v2-99.9", + "gptj-99", + "gptj-99.9", + "stable-diffusion-xl", + "llama2-70b-99", + "llama2-70b-99.9", + ], + ["SingleStream", "MultiStream", "Server", "Offline"], + [ + "Latency (ms)", + "Samples/s", + "Queries/s", + "millijoules", + "Watts", + ], + ] + + filter_scenarios = { + "datacenter": { + "resnet": ["Server", "Offline"], + "retinanet": ["Server", "Offline"], + "rnnt": ["Server", "Offline"], + "bert-99": ["Server", "Offline"], + "bert-99.9": ["Server", "Offline"], + "dlrm-v2-99": ["Server", "Offline"], + "dlrm-v2-99.9": ["Server", "Offline"], + "3d-unet-99": ["Offline"], + "3d-unet-99.9": ["Offline"], + "gptj-99": ["Server", "Offline"], + "gptj-99.9": ["Server", "Offline"], + "stable-diffusion-xl": ["Server", "Offline"], + "llama2-70b-99": ["Server", "Offline"], + "llama2-70b-99.9": ["Server", "Offline"], + }, + "edge": { + "resnet": ["SingleStream", "MultiStream", "Offline"], + "retinanet": ["SingleStream", "MultiStream", "Offline"], + "rnnt": ["SingleStream", "Offline"], + "bert-99": ["SingleStream", "Offline"], + "bert-99.9": [], + "dlrm-v2-99": [], + "dlrm-v2-99.9": [], + "3d-unet-99": ["SingleStream", "Offline"], + "3d-unet-99.9": ["SingleStream", "Offline"], + "gptj-99": ["SingleStream", "Offline"], + "gptj-99.9": ["SingleStream", "Offline"], + "stable-diffusion-xl": ["SingleStream", "Offline"], + }, + } + + def MakeWorksheet(df, index, filter_dict, sheet_name): + for key, value in filter_dict.items(): + if isinstance(key, tuple): + key = list(key) + df = df[value(df[key])] + df = df.pivot_table(index=index, columns=columns, values=["Result"]) + df = df.fillna("") + if df.size == 0: + return + for i, order in enumerate(columns_order): + df = df.reindex(columns=order, level=i) + df.to_excel(writer, sheet_name=sheet_name) + + def Equal(x): + return lambda y: y == x + + def NotEqual(x): + return lambda y: y != x + + def Contain(x): + return lambda y: y.str.find(x) != -1 + + def And(x, y): + return lambda z: x(z) & y(z) + + def Apply(f, *args): + return lambda x: f(x, *args) + + def FilterScenario(x, suite): + return x.apply( + lambda y: y["Scenario"] in filter_scenarios[suite][y["Model"]], axis=1 + ) + + def MakeUniqueID(x): + key_list = ["Suite", "Category", "Submitter", "Platform"] + if x["Category"] == "open": + key_list.append("UsedModel") + return "/".join(x[key_list]) + + df["Unique ID (e.g. for Audit)"] = df.apply(MakeUniqueID, axis=1) + df["ColorKey"] = df.apply( + lambda x: "".join(x[["Availability", "Submitter"]]), axis=1 + ) + df.sort_values( + by=["Category", "Availability", "Submitter", + "Unique ID (e.g. for Audit)"], + inplace=True, + ) + id_dict = { + key: 1 + value + for (value, key) in enumerate(pd.unique(df["Unique ID (e.g. for Audit)"])) + } + df["ID"] = df.apply( + lambda x: "{}-{:04}".format( + args.version, id_dict[x["Unique ID (e.g. 
for Audit)"]] + ), + axis=1, + ) + + for category in ["closed", "open", "network"]: + for suite in ["datacenter", "edge"]: + MakeWorksheet( + df, + indices[category], + { + "Category": Equal(category), + "Suite": Contain(suite), + "Units": And( + And(NotEqual("Watts"), NotEqual("millijoules")), + NotEqual("millijoules/Stream"), + ), + ("Scenario", "Model"): Apply(FilterScenario, suite), + }, + suite + " - " + category, + ) + + MakeWorksheet( + df, + indices[category], + { + "Category": Equal(category), + "Suite": Contain(suite), + "has_power": Equal(True), + ("Scenario", "Model"): Apply(FilterScenario, suite), + }, + suite + " - " + category + " - power", + ) + + score_format = writer.book.add_format({"num_format": "#,##0.00"}) + bg_format = writer.book.add_format({"bg_color": "#efefef"}) + for ws in writer.book.worksheets(): + ws.set_column(1, 1, None, None, {"hidden": 1}) + ws.set_column(2, 2, None, None, {"hidden": 1}) + ws.set_column(len(indices["closed"]), 100, None, score_format) + ws.conditional_format( + 2 + len(columns), + 0, + 200, + 100, + { + "type": "formula", + "criteria": "=mod(countunique($c$" + + str(len(columns) + 3) + + ":$c" + + str(len(columns) + 3) + + "), 2) = 0", + "format": bg_format, + }, + ) + + writer.close() + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tools/submission/log_parser.py b/tools/submission/log_parser.py index 178495e32..df9d82ad1 100755 --- a/tools/submission/log_parser.py +++ b/tools/submission/log_parser.py @@ -21,9 +21,13 @@ # pylint: disable=missing-docstring -logging.basicConfig(level=logging.INFO, format="[%(asctime)s %(filename)s:%(lineno)d %(levelname)s] %(message)s") +logging.basicConfig( + level=logging.INFO, + format="[%(asctime)s %(filename)s:%(lineno)d %(levelname)s] %(message)s", +) -class MLPerfLog(): + +class MLPerfLog: def __init__(self, log_path, strict=True): """ Helper class to parse the detail logs. @@ -38,16 +42,22 @@ def __init__(self, log_path, strict=True): line = line.rstrip() if line.find(self.marker) == 0: try: - self.messages.append(json.loads(line[len(self.marker):])) - except: + self.messages.append( + json.loads(line[len(self.marker):])) + except BaseException: if strict: - raise RuntimeError("Encountered invalid line: {:}".format(line)) + raise RuntimeError( + "Encountered invalid line: {:}".format(line) + ) else: - self.logger.warning("Skipping invalid line: {:}".format(line)) + self.logger.warning( + "Skipping invalid line: {:}".format(line) + ) self.keys = set() for message in self.messages: self.keys.add(message["key"]) - self.logger.info("Sucessfully loaded MLPerf log from {:}.".format(log_path)) + self.logger.info( + "Sucessfully loaded MLPerf log from {:}.".format(log_path)) def __getitem__(self, key): """ @@ -60,7 +70,11 @@ def __getitem__(self, key): if message["key"] == key: results.append(message) if len(results) != 1: - self.logger.warning("There are multiple messages with key {:} in the log. Emprically choosing the first one.".format(key)) + self.logger.warning( + "There are multiple messages with key {:} in the log. Emprically choosing the first one.".format( + key + ) + ) return results[0]["value"] def get(self, key): @@ -95,7 +109,11 @@ def get_dict(self): if message["key"] not in result: result[message["key"]] = message["value"] else: - self.logger.warning("There are multiple messages with key {:} in the log. Emprically choosing the first one.".format(key)) + self.logger.warning( + "There are multiple messages with key {:} in the log. 
Empirically choosing the first one.".format( + key + ) + ) def dump(self, output_path): """ @@ -105,11 +123,11 @@ def dump(self, output_path): json.dump(self.messages, f, indent=4) def num_messages(self): - """ Get number of messages (including errors and warnings) in the log. """ + """Get number of messages (including errors and warnings) in the log.""" return len(self.messages) def num_errors(self): - """ Get number of errors in the log. """ + """Get number of errors in the log.""" count = 0 for message in self.messages: if message["metadata"]["is_error"]: @@ -117,7 +135,7 @@ def num_errors(self): return count def num_warnings(self): - """ Get number of warning in the log. """ + """Get number of warnings in the log.""" count = 0 for message in self.messages: if message["metadata"]["is_warning"]: @@ -125,11 +143,11 @@ def num_warnings(self): return count def has_error(self): - """ Check if the log contains any errors. """ + """Check if the log contains any errors.""" return self.num_errors() != 0 def has_warning(self): - """ Check if the log contains any warnings. """ + """Check if the log contains any warnings.""" return self.num_warnings() != 0 def get_errors(self): @@ -152,14 +170,23 @@ def get_warnings(self): results.append(message) return results + def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() - parser.add_argument("--input", required=True, help="path to the detail log") - parser.add_argument("--ignore_invalid_lines", action="store_true", help="whether to stop if there are lines with invalid formats") + parser.add_argument( + "--input", + required=True, + help="path to the detail log") + parser.add_argument( + "--ignore_invalid_lines", + action="store_true", + help="skip lines with invalid formats instead of raising an error", + ) args = parser.parse_args() return args + def main(): """ Inspect a detailed log.
@@ -174,8 +201,9 @@ def main(): logger.info("- Contents:") messages = mlperf_log.get_messages() for message in messages: - logger.info("\"{:}\": {:}".format(message["key"], message["value"])) + logger.info('"{:}": {:}'.format(message["key"], message["value"])) logger.info("Done!") + if __name__ == "__main__": sys.exit(main()) diff --git a/tools/submission/power/power_checker.py b/tools/submission/power/power_checker.py index 93d9d6fb9..834cd59dd 100755 --- a/tools/submission/power/power_checker.py +++ b/tools/submission/power/power_checker.py @@ -81,7 +81,8 @@ class CheckerWarning(Exception): ] COMMON_ERROR_TESTING = ["USB."] WARNING_NEEDS_TO_BE_ERROR_TESTING_RE = [ - re.compile(r"Uncertainty \d+.\d+%, which is above 1.00% limit for the last sample!") + re.compile( + r"Uncertainty \d+.\d+%, which is above 1.00% limit for the last sample!") ] TIME_DELTA_TOLERANCE = 800 # in milliseconds @@ -89,7 +90,7 @@ class CheckerWarning(Exception): def _normalize(path: str) -> str: allparts: List[str] = [] - while 1: + while True: parts = os.path.split(path) if parts[0] == path: # sentinel for absolute paths allparts.insert(0, parts[0]) @@ -126,8 +127,10 @@ def get_time_from_line( ) -> float: log_time_str = re.search(data_regexp, line) if log_time_str and log_time_str.group(0): - log_datetime = datetime.strptime(log_time_str.group(0), "%m-%d-%Y %H:%M:%S.%f") - return log_datetime.replace(tzinfo=timezone.utc).timestamp() + timezone_offset + log_datetime = datetime.strptime( + log_time_str.group(0), "%m-%d-%Y %H:%M:%S.%f") + return log_datetime.replace( + tzinfo=timezone.utc).timestamp() + timezone_offset raise LineWithoutTimeStamp(f"{line.strip()!r} in {file}.") @@ -156,8 +159,10 @@ def required_fields_check(self) -> None: ), f"Required fields {', '.join(absent_keys)!r} does not exist in {self.path!r}" -def compare_dicts_values(d1: Dict[str, str], d2: Dict[str, str], comment: str) -> None: - files_with_diff_check_sum = {k: d1[k] for k in d1 if k in d2 and d1[k] != d2[k]} +def compare_dicts_values( + d1: Dict[str, str], d2: Dict[str, str], comment: str) -> None: + files_with_diff_check_sum = {k: d1[k] + for k in d1 if k in d2 and d1[k] != d2[k]} assert len(files_with_diff_check_sum) == 0, f"{comment}" + "".join( [ f"Expected {d1[i]}, but got {d2[i]} for {i}\n" @@ -166,7 +171,8 @@ def compare_dicts_values(d1: Dict[str, str], d2: Dict[str, str], comment: str) - ) -def compare_dicts(s1: Dict[str, str], s2: Dict[str, str], comment: str) -> None: +def compare_dicts(s1: Dict[str, str], + s2: Dict[str, str], comment: str) -> None: assert ( not s1.keys() - s2.keys() ), f"{comment} Missing {', '.join(sorted(s1.keys() - s2.keys()))!r}" @@ -224,7 +230,8 @@ def check_reply(cmd: str, reply: str) -> None: for msg in msgs: if msg["cmd"].startswith(cmd): if msg["cmd"] == "Stop": - # In normal flow the third answer to stop command is `Error: no measurement to stop` + # In normal flow the third answer to stop command is + # `Error: no measurement to stop` if stop_counter == 2: reply = "Error: no measurement to stop" stop_counter += 1 @@ -243,13 +250,15 @@ def check_reply(cmd: str, reply: str) -> None: def get_initial_range(param_num: int, reply: str) -> str: reply_list = reply.split(",") try: - if reply_list[param_num] == "0" and float(reply_list[param_num + 1]) > 0: + if reply_list[param_num] == "0" and float( + reply_list[param_num + 1]) > 0: return reply_list[param_num + 1] except (ValueError, IndexError): assert False, f"Can not get power meters initial values from {reply!r}" return "Auto" - def 
get_command_by_value_and_number(cmd: str, number: int) -> Optional[str]: + def get_command_by_value_and_number( + cmd: str, number: int) -> Optional[str]: command_counter = 0 for msg in msgs: if msg["cmd"].startswith(cmd): @@ -273,7 +282,8 @@ def get_command_by_value_and_number(cmd: str, number: int) -> Optional[str]: ), f"Do not set Volts range as initial. Expected 'SR,V,{initial_volts}', got {initial_volts_command!r}." -def uuid_check(client_sd: SessionDescriptor, server_sd: SessionDescriptor) -> None: +def uuid_check(client_sd: SessionDescriptor, + server_sd: SessionDescriptor) -> None: """Compare UUIDs from client.json and server.json. They should be the same.""" uuid_c = client_sd.json_object["uuid"] uuid_s = server_sd.json_object["uuid"] @@ -362,7 +372,8 @@ def compare_duration(range_duration: float, test_duration: float) -> None: def compare_time_boundaries( begin: float, end: float, phases: List[Any], mode: str ) -> None: - # TODO: temporary workaround, remove when proper DST handling is implemented! + # TODO: temporary workaround, remove when proper DST handling is + # implemented! assert ( phases[1][0] < begin < phases[2][0] or phases[1][0] < begin - 3600 < phases[2][0] @@ -380,8 +391,16 @@ def compare_time_boundaries( os.path.join(path, "run_1"), client_sd ) - compare_time_boundaries(system_begin_r, system_end_r, phases_ranging_c, "ranging") - compare_time_boundaries(system_begin_t, system_end_t, phases_testing_c, "testing") + compare_time_boundaries( + system_begin_r, + system_end_r, + phases_ranging_c, + "ranging") + compare_time_boundaries( + system_begin_t, + system_end_t, + phases_testing_c, + "testing") ranging_duration_d = system_end_r - system_begin_r testing_duration_d = system_end_t - system_begin_t @@ -464,7 +483,8 @@ def session_name_check( ), f"Session name is not equal. Client session name is {session_name_c!r}. Server session name is {session_name_s!r}" -def messages_check(client_sd: SessionDescriptor, server_sd: SessionDescriptor) -> None: +def messages_check(client_sd: SessionDescriptor, + server_sd: SessionDescriptor) -> None: """Compare client and server messages list length. Compare messages values and replies from client.json and server.json. Compare client and server version. @@ -488,14 +508,19 @@ def messages_check(client_sd: SessionDescriptor, server_sd: SessionDescriptor) - ) # Check client and server version from server.json. - # Server.json contains all client.json messages and replies. Checked earlier. + # Server.json contains all client.json messages and replies. Checked + # earlier. def get_version(regexp: str, line: str) -> str: version_o = re.search(regexp, line) assert version_o is not None, f"Server version is not defined in:'{line}'" return version_o.group(1) - client_version = get_version(r"mlcommons\/power client v(\d+)$", ms[0]["cmd"]) - server_version = get_version(r"mlcommons\/power server v(\d+)$", ms[0]["reply"]) + client_version = get_version( + r"mlcommons\/power client v(\d+)$", + ms[0]["cmd"]) + server_version = get_version( + r"mlcommons\/power server v(\d+)$", + ms[0]["reply"]) assert ( client_version == server_version @@ -550,7 +575,8 @@ def remove_optional_path(res: Dict[str, str]) -> None: f"{client_sd.path} and {server_sd.path} results checksum comparison", ) - # Check if the hashes of the files in results directory match the ones recorded in server.json/client.json. + # Check if the hashes of the files in results directory match the ones + # recorded in server.json/client.json. 
result_c_s = {**results_c, **results_s} compare_dicts( @@ -616,7 +642,8 @@ def find_error_or_warning(reg_exp: str, line: str, error: bool) -> None: # Treat uncommon errors in ranging phase as warnings if all( - not problem_line.group(0).strip().startswith(common_ranging_error) + not problem_line.group(0).strip().startswith( + common_ranging_error) for common_ranging_error in COMMON_ERROR_RANGING ): raise CheckerWarning( @@ -674,7 +701,8 @@ def get_msg_without_time(line: str) -> Optional[str]: is_uncertainty_check_activated = False for line in ptd_log_lines: - msg_o = re.search(r"Uncertainty checking for Yokogawa\S+ is activated", line) + msg_o = re.search( + r"Uncertainty checking for Yokogawa\S+ is activated", line) if msg_o is not None: try: log_time = None @@ -742,7 +770,8 @@ def debug_check(server_sd: SessionDescriptor) -> None: ), "Server was running in debug mode" -def check_with_logging(check_name: str, check: Callable[[], None]) -> Tuple[bool, bool]: +def check_with_logging( + check_name: str, check: Callable[[], None]) -> Tuple[bool, bool]: try: check() except AssertionError as e: @@ -811,7 +840,9 @@ def check(path: str) -> int: parser = argparse.ArgumentParser( description="Check PTD client-server session results" ) - parser.add_argument("session_directory", help="directory with session results data") + parser.add_argument( + "session_directory", + help="directory with session results data") args = parser.parse_args() diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py index fee2aadde..d49d0f1ea 100644 --- a/tools/submission/preprocess_submission.py +++ b/tools/submission/preprocess_submission.py @@ -30,29 +30,44 @@ """ + def get_args(): """Parse commandline.""" - parser = argparse.ArgumentParser(description="Infer scenario results", - formatter_class=argparse.RawDescriptionHelpFormatter, epilog=HELP_TEXT) - parser.add_argument("--input", required=True, help="orignal submission directory") + parser = argparse.ArgumentParser( + description="Infer scenario results", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=HELP_TEXT, + ) + parser.add_argument( + "--input", + required=True, + help="orignal submission directory") parser.add_argument("--output", help="new submission directory") - parser.add_argument("--noinfer_low_accuracy_results", + parser.add_argument( + "--noinfer_low_accuracy_results", help="do not infer low accuracy results if a high accuracy result is present", - default=False, action="store_true") - parser.add_argument("--nodelete_empty_dirs", + default=False, + action="store_true", + ) + parser.add_argument( + "--nodelete_empty_dirs", help="do not delete empty dirs in submission tree", - default=False, action="store_true") + default=False, + action="store_true", + ) parser.add_argument( "--version", default="v4.0", choices=list(checker.MODEL_CONFIG.keys()), - help="mlperf version") + help="mlperf version", + ) parser.add_argument("--submitter", help="filter to submitter") parser.add_argument( "--extra-model-benchmark-map", help="File containing extra custom model mapping.\ It is assumed to be inside the folder open/", - default="model_mapping.json") + default="model_mapping.json", + ) args = parser.parse_args() if not args.output: @@ -69,6 +84,7 @@ def list_dir(*path): path = os.path.join(*path) return next(os.walk(path))[1] + def delete_empty_dirs(src): """ Deletes any empty directory in the src tree @@ -76,13 +92,15 @@ def delete_empty_dirs(src): if not os.path.isdir(src): return False - if all 
([delete_empty_dirs(os.path.join(src, file)) for file in os.listdir(src)]): + if all([delete_empty_dirs(os.path.join(src, file)) + for file in os.listdir(src)]): log.info("Removing empty dir: (%s)", src) os.rmdir(src) return True return False + def copy_submission_dir(src, dst, filter_submitter): """ Copies the submission tree to output directory for processing @@ -93,11 +111,14 @@ def copy_submission_dir(src, dst, filter_submitter): for submitter in next(os.walk(os.path.join(src, division)))[1]: if filter_submitter and submitter != filter_submitter: continue - shutil.copytree(os.path.join(src, division, submitter), - os.path.join(dst, division, submitter)) + shutil.copytree( + os.path.join(src, division, submitter), + os.path.join(dst, division, submitter), + ) -def infer_scenario_results(filter_submitter, noinfer_low_accuracy_results, config): +def infer_scenario_results( + filter_submitter, noinfer_low_accuracy_results, config): """Walk result dir and check for singlestream (SS) folders and \ corresponding offline and multistream (MS) ones. If SS exists and offline and MS are not existing, \ @@ -116,8 +137,9 @@ def infer_scenario_results(filter_submitter, noinfer_low_accuracy_results, confi continue # process results - for directory in ["results", "measurements"] + \ - (["compliance"] if division == "closed" else []): + for directory in ["results", "measurements"] + ( + ["compliance"] if division == "closed" else [] + ): log_path = os.path.join(division, submitter, directory) if not os.path.exists(log_path): @@ -125,136 +147,216 @@ def infer_scenario_results(filter_submitter, noinfer_low_accuracy_results, confi continue for system_desc in list_dir(log_path): - system_id_json = os.path.join(division, submitter, "systems", - system_desc + ".json") + system_id_json = os.path.join( + division, submitter, "systems", system_desc + ".json" + ) if not os.path.exists(system_id_json): - log.error("no system_desc for %s/%s/%s", division, submitter, - system_desc) + log.error( + "no system_desc for %s/%s/%s", + division, + submitter, + system_desc, + ) continue with open(system_id_json) as system_info: system_json = json.load(system_info) system_type = system_json.get("system_type") - valid_system_types = ["datacenter", "edge", \ - "datacenter,edge", "edge,datacenter"] + valid_system_types = [ + "datacenter", + "edge", + "datacenter,edge", + "edge,datacenter", + ] if system_type not in valid_system_types: - log.error("Division %s, submitter %s, "\ - "system %s has invalid system type (%s)", \ - division, submitter, system_id_json, system_type) + log.error( + "Division %s, submitter %s, " + "system %s has invalid system type (%s)", + division, + submitter, + system_id_json, + system_type, + ) config.set_type(system_type) for model in list_dir(log_path, system_desc): extra_model_mapping = None if division == "open": - model_mapping_path = ( - f"{division}/{submitter}/{config.extra_model_benchmark_map}" - ) + model_mapping_path = f"{division}/{submitter}/{config.extra_model_benchmark_map}" if os.path.exists(model_mapping_path): with open(model_mapping_path) as fp: extra_model_mapping = json.load(fp) - mlperf_model = config.get_mlperf_model(model, extra_model_mapping) + mlperf_model = config.get_mlperf_model( + model, extra_model_mapping + ) if not mlperf_model: - log.error("Division %s, submitter %s, system %s has "\ - "invalid model (%s)", division, submitter, \ - system_id_json, model) + log.error( + "Division %s, submitter %s, system %s has " + "invalid model (%s)", + division, + submitter, + 
system_id_json, + model, + ) continue if mlperf_model not in config.required: - log.error("Division %s, submitter %s, system %s has invalid "\ - "MLPerf model (%s) corresponding to given model (%s). "\ - "Valid ones for MLPerf inference version (%s) in (%s) "\ - "category are [%s]", division, submitter, system_id_json,\ - mlperf_model, model, config.version, system_type, \ - config.required.keys()) + log.error( + "Division %s, submitter %s, system %s has invalid " + "MLPerf model (%s) corresponding to given model (%s). " + "Valid ones for MLPerf inference version (%s) in (%s) " + "category are [%s]", + division, + submitter, + system_id_json, + mlperf_model, + model, + config.version, + system_type, + config.required.keys(), + ) continue - required_scenarios = config.get_required(model) all_scenarios = set( - list(required_scenarios) + - list(config.get_optional(mlperf_model))) + list(required_scenarios) + + list(config.get_optional(mlperf_model)) + ) for scenario in list_dir(log_path, system_desc, model): - scenario_path = os.path.join(log_path, system_desc, model, scenario) + scenario_path = os.path.join( + log_path, system_desc, model, scenario + ) if scenario.lower() == "singlestream": tobeinferredpaths = [] - offline_scenario_path = os.path.join(log_path, system_desc, \ - model, "offline") - multistream_scenario_path = os.path.join(log_path, system_desc, \ - model, "multistream") - if not os.path.exists(multistream_scenario_path) and \ - not os.path.exists(offline_scenario_path): - #infer both the scenarios from SS - tobeinferredpaths = [ offline_scenario_path ] + offline_scenario_path = os.path.join( + log_path, system_desc, model, "offline" + ) + multistream_scenario_path = os.path.join( + log_path, system_desc, model, "multistream" + ) + if not os.path.exists( + multistream_scenario_path + ) and not os.path.exists(offline_scenario_path): + # infer both the scenarios from SS + tobeinferredpaths = [offline_scenario_path] if "MultiStream" in all_scenarios: - tobeinferredpaths.append(multistream_scenario_path) + tobeinferredpaths.append( + multistream_scenario_path + ) for tobeinferredpath in tobeinferredpaths: - inferred_scenario = os.path.basename(tobeinferredpath) - log.info("Division %s, submitter %s, system %s, " \ + inferred_scenario = os.path.basename( + tobeinferredpath + ) + log.info( + "Division %s, submitter %s, system %s, " "model %s: \ - inferring %s results from %s", \ - division, submitter, system_desc, model, \ - inferred_scenario, "singlestream") - shutil.copytree(scenario_path, tobeinferredpath) - - elif not os.path.exists(multistream_scenario_path) and \ - "MultiStream" in all_scenarios: - #infer MS from SS - for tobeinferredpath in [ multistream_scenario_path ]: - log.info("Division %s, submitter %s, system %s, model %s: \ - inferring %s results from %s", division, submitter, \ - system_desc, model, "multistream", "singlestream") - shutil.copytree(scenario_path, multistream_scenario_path) + inferring %s results from %s", + division, + submitter, + system_desc, + model, + inferred_scenario, + "singlestream", + ) + shutil.copytree( + scenario_path, tobeinferredpath) + + elif ( + not os.path.exists( + multistream_scenario_path) + and "MultiStream" in all_scenarios + ): + # infer MS from SS + for tobeinferredpath in [ + multistream_scenario_path]: + log.info( + "Division %s, submitter %s, system %s, model %s: \ + inferring %s results from %s", + division, + submitter, + system_desc, + model, + "multistream", + "singlestream", + ) + shutil.copytree( + scenario_path, 
multistream_scenario_path + ) elif not os.path.exists(offline_scenario_path): - '''we have both MS and SS results. Inferring from MS is \ + """we have both MS and SS results. Inferring from MS is \ expected to be better \ - ''' + """ pass elif scenario.lower() == "multistream": - offline_scenario_path = os.path.join(log_path, system_desc, \ - model, "offline") - '''Need to check if MS is indeed a measured result and not infeered.\ + offline_scenario_path = os.path.join( + log_path, system_desc, model, "offline" + ) + """Need to check if MS is indeed a measured result and not infeered.\ But if MS is indeed inferred from SS, offline scenario will also be \ inferred already by the inferring code above \ - ''' - for tobeinferredpath in [offline_scenario_path]: + """ + for tobeinferredpath in [ + offline_scenario_path]: if not os.path.exists(tobeinferredpath): - log.info("Division %s, submitter %s, system %s, model %s: \ - inferring %s results from %s", division, submitter,\ - system_desc, model, "offline", "multistream") - shutil.copytree(scenario_path, tobeinferredpath) + log.info( + "Division %s, submitter %s, system %s, model %s: \ + inferring %s results from %s", + division, + submitter, + system_desc, + model, + "offline", + "multistream", + ) + shutil.copytree( + scenario_path, tobeinferredpath) if not noinfer_low_accuracy_results: for system_desc in list_dir(log_path): for model in list_dir(log_path, system_desc): if model.endswith("-99.9"): - low_accuracy_model =model[:-2] + low_accuracy_model = model[:-2] if low_accuracy_model not in config.required: continue - high_accuracy_model_path = os.path.join(log_path, \ - system_desc, model) - low_accuracy_model_path = os.path.join(log_path, system_desc, \ - low_accuracy_model) + high_accuracy_model_path = os.path.join( + log_path, system_desc, model + ) + low_accuracy_model_path = os.path.join( + log_path, system_desc, low_accuracy_model + ) if not os.path.exists(low_accuracy_model_path): - log.info("Division %s, submitter %s, system %s: \ - copying %s results to %s", division, submitter, \ - system_desc, model, low_accuracy_model) - - shutil.copytree(high_accuracy_model_path, \ - low_accuracy_model_path) - high_accuracy_model_code_path = os.path.join(log_path, "..", \ - "code", model) - low_accuracy_model_code_path = os.path.join(log_path, "..", \ - "code", low_accuracy_model) - if not os.path.exists(low_accuracy_model_code_path): - shutil.copytree(high_accuracy_model_code_path, \ - low_accuracy_model_code_path) - + log.info( + "Division %s, submitter %s, system %s: \ + copying %s results to %s", + division, + submitter, + system_desc, + model, + low_accuracy_model, + ) + + shutil.copytree( + high_accuracy_model_path, + low_accuracy_model_path, + ) + high_accuracy_model_code_path = os.path.join( + log_path, "..", "code", model + ) + low_accuracy_model_code_path = os.path.join( + log_path, "..", "code", low_accuracy_model + ) + if not os.path.exists( + low_accuracy_model_code_path): + shutil.copytree( + high_accuracy_model_code_path, + low_accuracy_model_code_path, + ) def main(): @@ -272,16 +374,17 @@ def main(): copy_submission_dir(args.input, args.output, args.submitter) src_dir = args.output - config = checker.Config( - args.version, - args.extra_model_benchmark_map) + config = checker.Config(args.version, args.extra_model_benchmark_map) if not args.nodelete_empty_dirs: delete_empty_dirs(os.path.join(src_dir)) os.chdir(src_dir) - infer_scenario_results(args.submitter, args.noinfer_low_accuracy_results, config) + infer_scenario_results( + 
args.submitter, + args.noinfer_low_accuracy_results, + config) return 0 diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 451cd66b6..d6ea18c2b 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -98,16 +98,62 @@ "dlrm-v2-99.9": ("AUC", 80.31 * 0.999), "3d-unet-99": ("DICE", 0.86170 * 0.99), "3d-unet-99.9": ("DICE", 0.86170 * 0.999), - "gptj-99" : ("ROUGE1", 42.9865 * 0.99, "ROUGE2", 20.1235 * 0.99, "ROUGEL", 29.9881 * 0.99, "GEN_LEN", 4016878*0.9), - "gptj-99.9" : ("ROUGE1", 42.9865 * 0.999, "ROUGE2", 20.1235 * 0.999, "ROUGEL", 29.9881 * 0.999, "GEN_LEN", 4016878*0.9), - "llama2-70b-99" : ("ROUGE1", 44.4312 * 0.99, "ROUGE2", 22.0352 * 0.99, "ROUGEL", 28.6162 * 0.99, "TOKENS_PER_SAMPLE", 294.45*0.9), - "llama2-70b-99.9" : ("ROUGE1", 44.4312 * 0.999, "ROUGE2", 22.0352 * 0.999, "ROUGEL", 28.6162 * 0.999, "TOKENS_PER_SAMPLE", 294.45*0.9), - "stable-diffusion-xl": ("CLIP_SCORE", 31.68631873, "FID_SCORE", 23.01085758) + "gptj-99": ( + "ROUGE1", + 42.9865 * 0.99, + "ROUGE2", + 20.1235 * 0.99, + "ROUGEL", + 29.9881 * 0.99, + "GEN_LEN", + 4016878 * 0.9, + ), + "gptj-99.9": ( + "ROUGE1", + 42.9865 * 0.999, + "ROUGE2", + 20.1235 * 0.999, + "ROUGEL", + 29.9881 * 0.999, + "GEN_LEN", + 4016878 * 0.9, + ), + "llama2-70b-99": ( + "ROUGE1", + 44.4312 * 0.99, + "ROUGE2", + 22.0352 * 0.99, + "ROUGEL", + 28.6162 * 0.99, + "TOKENS_PER_SAMPLE", + 294.45 * 0.9, + ), + "llama2-70b-99.9": ( + "ROUGE1", + 44.4312 * 0.999, + "ROUGE2", + 22.0352 * 0.999, + "ROUGEL", + 28.6162 * 0.999, + "TOKENS_PER_SAMPLE", + 294.45 * 0.9, + ), + "stable-diffusion-xl": ( + "CLIP_SCORE", + 31.68631873, + "FID_SCORE", + 23.01085758, + ), }, "accuracy-upper-limit": { - "stable-diffusion-xl": ("CLIP_SCORE", 31.81331801, "FID_SCORE", 23.95007626), - "llama2-70b-99" : ("TOKENS_PER_SAMPLE", 294.45*1.1), - "llama2-70b-99.9" : ("TOKENS_PER_SAMPLE", 294.45*1.1) + "stable-diffusion-xl": ( + "CLIP_SCORE", + 31.81331801, + "FID_SCORE", + 23.95007626, + ), + "llama2-70b-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1), + "llama2-70b-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1), }, "performance-sample-count": { "resnet": 1024, @@ -123,14 +169,14 @@ "gptj-99.9": 13368, "llama2-70b-99": 24576, "llama2-70b-99.9": 24576, - "stable-diffusion-xl": 5000 + "stable-diffusion-xl": 5000, }, # TODO: Update this list. 
"model_mapping": { # map model names to the official mlperf model class "ssd-resnet34": "retinanet", "mobilenet": "resnet", - "resnet50": "resnet" + "resnet50": "resnet", }, "seeds": { # TODO: Update random seeds @@ -157,7 +203,7 @@ "gptj-99.9": {"Server": 20000000000}, "llama2-70b-99": {"Server": 20000000000}, "llama2-70b-99.9": {"Server": 20000000000}, - "stable-diffusion-xl" : {"Server": 20000000000} + "stable-diffusion-xl": {"Server": 20000000000}, }, "min-queries": { "resnet": { @@ -183,7 +229,11 @@ "gptj-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, "llama2-70b-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, "llama2-70b-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, - "stable-diffusion-xl": {"SingleStream": 1024, "Server": 270336, "Offline": 1} + "stable-diffusion-xl": { + "SingleStream": 1024, + "Server": 270336, + "Offline": 1, + }, }, }, } @@ -219,7 +269,7 @@ "3757", "1578", "3319", - "95" + "95", ] } } @@ -255,7 +305,7 @@ "gptj-99.9": 13368, "llama2-70b-99": 24576, "llama2-70b-99.9": 24576, - "stable-diffusion-xl": 5000 + "stable-diffusion-xl": 5000, } SCENARIO_MAPPING = { @@ -286,7 +336,7 @@ "MultiStreamLegacy": "effective_samples_per_query", "MultiStream": "early_stopping_latency_ms", "Server": "result_completed_samples_per_sec", - } + }, } RESULT_FIELD_BENCHMARK_OVERWRITE = { @@ -302,8 +352,8 @@ }, "v4.1": { "llama2-70b-99": { - "Offline": "result_tokens_per_second", - "Server": "result_completed_tokens_per_second", + "Offline": "result_tokens_per_second", + "Server": "result_completed_tokens_per_second", }, "llama2-70b-99.9": { "Offline": "result_tokens_per_second", @@ -316,16 +366,13 @@ "gptj-99.9": { "Offline": "result_inferred_tokens_per_second", "Server": "result_inferred_completed_tokens_per_second", - } - } + }, + }, } LLAMA2_LATENCY_LIMITS = { # We might add interactive in the next round. Latency in ns - "conversational": { - "ttft": 2000 * 1000000, - "tpot": 200 * 1000000 - } + "conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000} } ACC_PATTERN = { @@ -381,7 +428,7 @@ "hw_notes", "sw_notes", "host_network_card_count", - "system_type_detail" + "system_type_detail", ] SYSTEM_DESC_MEANINGFUL_RESPONSE_REQUIRED_FIELDS = [ @@ -545,7 +592,7 @@ def get_accuracy_target(self, model): if model not in self.accuracy_target: raise ValueError("model not known: " + model) return self.accuracy_target[model] - + def get_accuracy_upper_limit(self, model): return self.accuracy_upper_limit.get(model, None) @@ -575,11 +622,8 @@ def get_min_query_count(self, model, scenario): def has_new_logging_format(self): return True - def uses_early_stopping(self, scenario): - return ( - scenario in ["Server", "SingleStream", "MultiStream"] - ) + return scenario in ["Server", "SingleStream", "MultiStream"] def get_args(): @@ -593,7 +637,10 @@ def get_args(): help="mlperf version", ) parser.add_argument("--submitter", help="filter to submitter") - parser.add_argument("--csv", default="summary.csv", help="csv file with results") + parser.add_argument( + "--csv", + default="summary.csv", + help="csv file with results") parser.add_argument( "--skip_compliance", action="store_true", @@ -604,7 +651,10 @@ def get_args(): help="File containing extra custom model mapping. 
It is assumed to be inside the folder open/", default="model_mapping.json", ) - parser.add_argument("--debug", action="store_true", help="extra debug output") + parser.add_argument( + "--debug", + action="store_true", + help="extra debug output") parser.add_argument( "--submission-exceptions", action="store_true", @@ -641,17 +691,20 @@ def get_args(): def list_dir(*path): path = os.path.join(*path) - return [f for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))] + return [f for f in os.listdir( + path) if os.path.isdir(os.path.join(path, f))] def list_files(*path): path = os.path.join(*path) - return [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))] + return [f for f in os.listdir( + path) if os.path.isfile(os.path.join(path, f))] def list_empty_dirs_recursively(*path): path = os.path.join(*path) - return [dirpath for dirpath, dirs, files in os.walk(path) if not dirs and not files] + return [dirpath for dirpath, dirs, files in os.walk( + path) if not dirs and not files] def list_dirs_recursively(*path): @@ -677,13 +730,16 @@ def check_extra_files(path, target_files): check_pass = False missing_files.append(os.path.join(path, dir)) else: - files = [f.split(".")[0] for f in list_files(os.path.join(path, dir))] + files = [f.split(".")[0] + for f in list_files(os.path.join(path, dir))] for target_file in target_files[dir]: if target_file not in files: check_pass = False - missing_files.append(f"{os.path.join(path, dir, target_file)}.png") + missing_files.append( + f"{os.path.join(path, dir, target_file)}.png") if "captions" not in files: - missing_files.append(f"{os.path.join(path, dir, 'captions.txt')}") + missing_files.append( + f"{os.path.join(path, dir, 'captions.txt')}") return check_pass, missing_files @@ -702,10 +758,7 @@ def find_error_in_detail_log(config, fname): if config.ignore_uncommited: has_other_errors = False for error in mlperf_log.get_errors(): - if ( - "Loadgen built with uncommitted changes!" - not in error["value"] - ): + if "Loadgen built with uncommitted changes!" 
not in error["value"]: has_other_errors = True log.error("%s contains errors:", fname) @@ -732,18 +785,19 @@ def check_accuracy_dir(config, model, path, verbose): up_patterns = [] acc_limit_check = True for i in range(0, len(acc_upper_limit), 2): - acc_type, acc_target = acc_upper_limit[i:i+2] + acc_type, acc_target = acc_upper_limit[i: i + 2] acc_limits.append(acc_target) up_patterns.append(ACC_PATTERN[acc_type]) for i in range(0, len(target), 2): - acc_type, acc_target = target[i:i+2] + acc_type, acc_target = target[i: i + 2] patterns.append(ACC_PATTERN[acc_type]) acc_targets.append(acc_target) acc_seen = [False for _ in acc_targets] with open(os.path.join(path, "accuracy.txt"), "r", encoding="utf-8") as f: for line in f: - for i, (pattern, acc_target) in enumerate(zip(patterns, acc_targets)): + for i, (pattern, acc_target) in enumerate( + zip(patterns, acc_targets)): m = re.match(pattern, line) if m: acc = m.group(1) @@ -755,24 +809,39 @@ def check_accuracy_dir(config, model, path, verbose): acc_seen[i] = True elif acc is not None: all_accuracy_valid = False - log.warning("%s accuracy not met: expected=%f, found=%s", path, acc_target, acc) + log.warning( + "%s accuracy not met: expected=%f, found=%s", + path, + acc_target, + acc, + ) if i == 0 and acc: result_acc = acc acc = None if acc_upper_limit is not None: - for i, (pattern, acc_limit) in enumerate(zip(up_patterns, acc_limits)): + for i, (pattern, acc_limit) in enumerate( + zip(up_patterns, acc_limits)): m = re.match(pattern, line) if m: acc = m.group(1) m = re.match(r"^hash=([\w\d]+)$", line) if m: hash_val = m.group(1) - if acc is not None and acc_upper_limit is not None and float(acc) > acc_limit: + if ( + acc is not None + and acc_upper_limit is not None + and float(acc) > acc_limit + ): acc_limit_check = False - log.warning("%s accuracy not met: upper limit=%f, found=%s", path, acc_limit, acc) + log.warning( + "%s accuracy not met: upper limit=%f, found=%s", + path, + acc_limit, + acc, + ) acc = None if all(acc_seen) and hash_val: - break; + break is_valid = all_accuracy_valid & all(acc_seen) if acc_upper_limit is not None: is_valid &= acc_limit_check @@ -800,26 +869,34 @@ def check_accuracy_dir(config, model, path, verbose): def extra_check_llama2(mlperf_log, scenario): - if (mlperf_log["requested_use_token_latencies"]): + if mlperf_log["requested_use_token_latencies"]: if scenario == "Offline": # For offline no further checks are necessary return None, True else: for constraint, limits in LLAMA2_LATENCY_LIMITS.items(): - if mlperf_log["result_first_token_99.00_percentile_latency_ns"] < limits["ttft"] and mlperf_log["result_time_per_output_token_99.00_percentile_ns"] < limits["tpot"]: + if ( + mlperf_log["result_first_token_99.00_percentile_latency_ns"] + < limits["ttft"] + and mlperf_log["result_time_per_output_token_99.00_percentile_ns"] + < limits["tpot"] + ): return constraint, True else: - log.error(f'use_token_latencies flag needs to be enabled for Llama2 benchmark') + log.error( + f"use_token_latencies flag needs to be enabled for Llama2 benchmark") return None, False - log.error(f'Failed Llama2 extra check for TTFT and TPOT. TTFT 99-tile: {mlperf_log["result_first_token_99.00_percentile_latency_ns"]}, TPOT 99-tile: {mlperf_log["result_time_per_output_token_99.00_percentile_ns"]}') + log.error( + f'Failed Llama2 extra check for TTFT and TPOT. 
TTFT 99-tile: {mlperf_log["result_first_token_99.00_percentile_latency_ns"]}, TPOT 99-tile: {mlperf_log["result_time_per_output_token_99.00_percentile_ns"]}' + ) return None, False - + def get_performance_metric( config, model, path, scenario_fixed, division, system_json, has_power=False ): - #Assumes new logging format + # Assumes new logging format version = config.version fname = os.path.join(path, "mlperf_log_detail.txt") @@ -832,15 +909,25 @@ def get_performance_metric( scenario = mlperf_log["effective_scenario"] res = float(mlperf_log[RESULT_FIELD_NEW[version][scenario]]) - if version in RESULT_FIELD_BENCHMARK_OVERWRITE and model in RESULT_FIELD_BENCHMARK_OVERWRITE[version] and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[version][model]: - res = float(mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[version][model][scenario]]) + if ( + version in RESULT_FIELD_BENCHMARK_OVERWRITE + and model in RESULT_FIELD_BENCHMARK_OVERWRITE[version] + and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[version][model] + ): + res = float( + mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[version] + [model][scenario]] + ) inferred = False if scenario_fixed != scenario: - inferred, res = get_inferred_result(scenario_fixed, scenario, res, mlperf_log, config, False) + inferred, res = get_inferred_result( + scenario_fixed, scenario, res, mlperf_log, config, False + ) return res + def check_performance_dir( config, model, path, scenario_fixed, division, system_json, has_power=False ): @@ -863,12 +950,19 @@ def check_performance_dir( scenario = mlperf_log["effective_scenario"] res = float(mlperf_log[RESULT_FIELD_NEW[version][scenario]]) - if version in RESULT_FIELD_BENCHMARK_OVERWRITE and model in RESULT_FIELD_BENCHMARK_OVERWRITE[version] and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[version][model]: - res = float(mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[version][model][scenario]]) + if ( + version in RESULT_FIELD_BENCHMARK_OVERWRITE + and model in RESULT_FIELD_BENCHMARK_OVERWRITE[version] + and scenario in RESULT_FIELD_BENCHMARK_OVERWRITE[version][model] + ): + res = float( + mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[version] + [model][scenario]] + ) - if model in ["llama2-70b-99", "llama2-70b-99.9"]: - llama_constraint, is_valid = extra_check_llama2(mlperf_log, scenario_fixed) + llama_constraint, is_valid = extra_check_llama2( + mlperf_log, scenario_fixed) latency_99_percentile = mlperf_log["result_99.00_percentile_latency_ns"] latency_mean = mlperf_log["result_mean_latency_ns"] @@ -887,7 +981,8 @@ def check_performance_dir( if not find_error_in_detail_log(config, fname): is_valid = False - required_performance_sample_count = config.get_performance_sample_count(model) + required_performance_sample_count = config.get_performance_sample_count( + model) if performance_sample_count < required_performance_sample_count: log.error( "%s performance_sample_count, found %d, needs to be >= %d", @@ -941,7 +1036,8 @@ def check_performance_dir( # If the scenario has a target latency (Server scenario), check # that the target latency that was passed to the early stopping # is less than the target latency. 
- target_latency = config.latency_constraint.get(model, dict()).get(scenario) + target_latency = config.latency_constraint.get( + model, dict()).get(scenario) if target_latency: early_stopping_latency_ns = mlperf_log["effective_target_latency_ns"] log.info( @@ -961,7 +1057,8 @@ def check_performance_dir( else: # check if the benchmark meets latency constraint - target_latency = config.latency_constraint.get(model, dict()).get(scenario) + target_latency = config.latency_constraint.get( + model, dict()).get(scenario) log.info( "Target latency: %s, Latency: %s, Scenario: %s", target_latency, @@ -992,7 +1089,8 @@ def check_performance_dir( ) is_valid = False - if scenario == "Offline" and (samples_per_query < OFFLINE_MIN_SPQ_SINCE_V4[model]): + if scenario == "Offline" and ( + samples_per_query < OFFLINE_MIN_SPQ_SINCE_V4[model]): log.error( "%s Required minimum samples per query not met by user config, Expected=%s, Found=%s", fname, @@ -1015,14 +1113,17 @@ def check_performance_dir( inferred = False if scenario_fixed != scenario: - inferred, res = get_inferred_result(scenario_fixed, scenario, res, mlperf_log, config, True) + inferred, res = get_inferred_result( + scenario_fixed, scenario, res, mlperf_log, config, True + ) is_network_system, is_network_mode_valid = is_system_over_network( division, system_json, path ) is_valid &= is_network_mode_valid if is_network_system: - # for network mode verify the SUT name is valid, according to the rules (must include "Network SUT" in name) + # for network mode verify the SUT name is valid, according to the rules + # (must include "Network SUT" in name) if NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME not in sut_name: log.error( f"{fname} invalid sut name for network mode. expecting the substring '{NETWORK_MODE_REQUIRED_SUBSTRING_IN_SUT_NAME}' got '{sut_name}'" @@ -1031,7 +1132,10 @@ def check_performance_dir( return is_valid, res, inferred -def get_inferred_result(scenario_fixed, scenario, res, mlperf_log, config, log_error=False): + +def get_inferred_result( + scenario_fixed, scenario, res, mlperf_log, config, log_error=False +): inferred = False # Check if current scenario (and version) uses early stopping @@ -1045,7 +1149,8 @@ def get_inferred_result(scenario_fixed, scenario, res, mlperf_log, config, log_e latency_mean = mlperf_log["result_mean_query_latency_ns"] samples_per_query = mlperf_log["effective_samples_per_query"] if scenario == "SingleStream": - # qps_wo_loadgen_overhead is only used for inferring Offline from SingleStream; only for old submissions + # qps_wo_loadgen_overhead is only used for inferring Offline from + # SingleStream; only for old submissions qps_wo_loadgen_overhead = mlperf_log["result_qps_without_loadgen_overhead"] # special case for results inferred from different scenario @@ -1053,15 +1158,11 @@ def get_inferred_result(scenario_fixed, scenario, res, mlperf_log, config, log_e inferred = True res = qps_wo_loadgen_overhead - if ( - scenario_fixed in ["Offline"] - ) and scenario in ["MultiStream"]: + if (scenario_fixed in ["Offline"]) and scenario in ["MultiStream"]: inferred = True res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS) - if ( - scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + if (scenario_fixed in ["MultiStream"]) and scenario in ["SingleStream"]: inferred = True # samples_per_query does not match with the one reported in the logs # when inferring MultiStream from SingleStream @@ -1078,11 +1179,12 @@ def get_inferred_result(scenario_fixed, scenario, res, mlperf_log, config, 
log_e res = (latency_99_percentile * samples_per_query) / MS_TO_NS return inferred, res + def get_power_metric(config, scenario_fixed, log_path, is_valid, res): # parse the power logs server_timezone = datetime.timedelta(0) client_timezone = datetime.timedelta(0) - + detail_log_fname = os.path.join(log_path, "mlperf_log_detail.txt") mlperf_log = MLPerfLog(detail_log_fname) datetime_format = "%m-%d-%Y %H:%M:%S.%f" @@ -1096,7 +1198,7 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res): ) # Obtain the scenario also from logs to check if power is inferred scenario = mlperf_log["effective_scenario"] - + spl_fname = os.path.join(log_path, "spl.txt") power_list = [] with open(spl_fname) as f: @@ -1129,7 +1231,8 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res): avg_power_efficiency = res / avg_power else: - # In SingleStream and MultiStream scenarios, the power metric is in mJ/query. + # In SingleStream and MultiStream scenarios, the power metric is in + # mJ/query. assert scenario_fixed in [ "MultiStream", "SingleStream", @@ -1144,9 +1247,8 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res): elif scenario_fixed in ["MultiStream"]: samples_per_query = 8 - if ( - scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + if (scenario_fixed in ["MultiStream"] + ) and scenario in ["SingleStream"]: power_metric = ( avg_power * power_duration * samples_per_query * 1000 / num_queries ) @@ -1172,11 +1274,17 @@ def check_power_dir( # check if all the required files are present required_files = REQUIRED_PERF_FILES + REQUIRED_PERF_POWER_FILES - diff = files_diff(list_files(testing_path), required_files, OPTIONAL_PERF_FILES) + diff = files_diff( + list_files(testing_path), + required_files, + OPTIONAL_PERF_FILES) if diff: log.error("%s has file list mismatch (%s)", testing_path, diff) is_valid = False - diff = files_diff(list_files(ranging_path), required_files, OPTIONAL_PERF_FILES) + diff = files_diff( + list_files(ranging_path), + required_files, + OPTIONAL_PERF_FILES) if diff: log.error("%s has file list mismatch (%s)", ranging_path, diff) is_valid = False @@ -1186,7 +1294,7 @@ def check_power_dir( is_valid = False # uncomment to measure ranging mode power - ''' + """ ( is_valid, power_metric_ranging, @@ -1195,7 +1303,7 @@ def check_power_dir( ) = get_power_metric( config, scenario_fixed, ranging_path, is_valid, power_res_ranging ) - ''' + """ is_valid, power_metric, scenario, power_efficiency_testing = get_power_metric( config, scenario_fixed, testing_path, is_valid, power_res_testing ) @@ -1214,7 +1322,9 @@ def check_power_dir( sys.stdout.flush() sys.stderr.flush() if check_power_result != 0: - log.error("Power WG power_checker.py did not pass for: %s", perf_path) + log.error( + "Power WG power_checker.py did not pass for: %s", + perf_path) is_valid = False return is_valid, power_metric, power_efficiency_testing @@ -1468,7 +1578,7 @@ def log_result( if not os.path.exists(results_path): continue - ## Apply folder checks + # Apply folder checks dirs = list_dirs_recursively(division, submitter) files = list_files_recursively(division, submitter) @@ -1569,7 +1679,9 @@ def log_result( extra_model_mapping = json.load(fp) for system_desc in list_dir(results_path): - # we are looking at ./$division/$submitter/results/$system_desc, ie ./closed/mlperf_org/results/t4-ort + # we are looking at + # ./$division/$submitter/results/$system_desc, ie + # ./closed/mlperf_org/results/t4-ort # # check if system_id is good. 
@@ -1598,7 +1710,8 @@ def log_result( if config.version not in ["v0.5"]: valid_system_types = ["datacenter", "edge"] if config.version not in ["v0.7"]: - valid_system_types += ["datacenter,edge", "edge,datacenter"] + valid_system_types += ["datacenter,edge", + "edge,datacenter"] if system_type not in valid_system_types: log.error( "%s has invalid system type (%s)", @@ -1632,7 +1745,8 @@ def log_result( if is_closed_or_network and mlperf_model not in config.models: # for closed/network divisions we want the model name to match. - # for open division the model_name might be different than the task + # for open division the model_name might be different + # than the task log.error( "%s has an invalid model %s for closed/network division", name, @@ -1660,9 +1774,12 @@ def log_result( list(required_scenarios) + list(config.get_optional(mlperf_model)) ) - for scenario in list_dir(results_path, system_desc, model_name): - # some submissions in v0.5 use lower case scenarios - map them for now - scenario_fixed = SCENARIO_MAPPING.get(scenario, scenario) + for scenario in list_dir( + results_path, system_desc, model_name): + # some submissions in v0.5 use lower case scenarios - + # map them for now + scenario_fixed = SCENARIO_MAPPING.get( + scenario, scenario) # we are looking at ./$division/$submitter/results/$system_desc/$model/$scenario, # ie ./closed/mlperf_org/results/t4-ort/bert/Offline @@ -1710,7 +1827,8 @@ def log_result( scenario, ) if not os.path.exists(measurement_dir): - log.error("no measurement_dir for %s", measurement_dir) + log.error( + "no measurement_dir for %s", measurement_dir) results[measurement_dir] = None errors += 1 continue @@ -1737,7 +1855,8 @@ def log_result( # check accuracy accuracy_is_valid = False acc_path = os.path.join(name, "accuracy") - if not os.path.exists(os.path.join(acc_path, "accuracy.txt")): + if not os.path.exists( + os.path.join(acc_path, "accuracy.txt")): log.error( "%s has no accuracy.txt. 
Generate it with accuracy-imagenet.py or accuracy-coco.py or " "process_accuracy.py", @@ -1746,7 +1865,8 @@ def log_result( errors += 1 continue else: - diff = files_diff(list_files(acc_path), REQUIRED_ACC_FILES) + diff = files_diff( + list_files(acc_path), REQUIRED_ACC_FILES) if diff: log.error( "%s has file list mismatch (%s)", acc_path, diff @@ -1760,11 +1880,21 @@ def log_result( debug or is_closed_or_network, ) if mlperf_model in REQUIRED_ACC_BENCHMARK: - if config.version in REQUIRED_ACC_BENCHMARK[mlperf_model]: - extra_files_pass, missing_files = check_extra_files(acc_path, REQUIRED_ACC_BENCHMARK[mlperf_model][config.version]) + if ( + config.version + in REQUIRED_ACC_BENCHMARK[mlperf_model] + ): + extra_files_pass, missing_files = check_extra_files( + acc_path, + REQUIRED_ACC_BENCHMARK[mlperf_model][ + config.version + ], + ) if not extra_files_pass: log.error( - "%s expected to have the following extra files (%s)", acc_path, missing_files + "%s expected to have the following extra files (%s)", + acc_path, + missing_files, ) accuracy_is_valid = False if not accuracy_is_valid and not is_closed_or_network: @@ -1775,7 +1905,8 @@ def log_result( ) accuracy_is_valid = True if not accuracy_is_valid: - # a little below we'll not copy this into the results csv + # a little below we'll not copy this into the + # results csv errors += 1 log.error("%s, accuracy not valid", acc_path) @@ -1836,9 +1967,7 @@ def log_result( ranging_path = os.path.join( name, "performance", "ranging" ) - ( - ranging_r - ) = get_performance_metric( + (ranging_r) = get_performance_metric( config, mlperf_model, ranging_path, @@ -1924,7 +2053,7 @@ def log_result( config, division, system_json, - name + name, ): log.error( "compliance dir %s has issues", compliance_dir @@ -1957,10 +2086,12 @@ def log_result( ) else: results[name] = None - log.error("%s is OK but accuracy has issues", name) + log.error( + "%s is OK but accuracy has issues", name) if required_scenarios: - name = os.path.join(results_path, system_desc, model_name) + name = os.path.join( + results_path, system_desc, model_name) if is_closed_or_network: results[name] = None log.error( @@ -1988,9 +2119,8 @@ def check_system_desc_id( ): is_valid = True # check all required fields - - required_fields = SYSTEM_DESC_REQUIRED_FIELDS.copy() + required_fields = SYSTEM_DESC_REQUIRED_FIELDS.copy() is_network_system, is_network_mode_valid = is_system_over_network( division, systems_json, fname @@ -2015,7 +2145,6 @@ def check_system_desc_id( "%s, field %s requires a meaningful response but is empty", fname, k ) - # SYSTEM_DESC_REQUIRED_FIELDS_POWER should be mandatory when a submission has power logs, but since we # check power submission in check_results_dir, the information is not available yet at this stage and we do # this check later @@ -2101,21 +2230,37 @@ def check_measurement_dir( if has_power and not skip_check_power_measure_files: path = measurement_dir - all_files_1 = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))] + all_files_1 = [ + os.path.join(path, f) + for f in os.listdir(path) + if os.path.isfile(os.path.join(path, f)) + ] path = os.path.join(path, "..") - all_files_2 = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))] + all_files_2 = [ + os.path.join(path, f) + for f in os.listdir(path) + if os.path.isfile(os.path.join(path, f)) + ] path = os.path.join(path, "..") - all_files_3 = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, 
f))] + all_files_3 = [ + os.path.join(path, f) + for f in os.listdir(path) + if os.path.isfile(os.path.join(path, f)) + ] path = os.path.join(path, "..") - all_files_4 = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))] + all_files_4 = [ + os.path.join(path, f) + for f in os.listdir(path) + if os.path.isfile(os.path.join(path, f)) + ] all_files = all_files_1 + all_files_2 + all_files_3 + all_files_4 for i in REQUIRED_POWER_MEASURE_FILES: found = False for file in all_files: if re.match(i, os.path.basename(file)): - found = True - file_path = file + found = True + file_path = file if not found: log.error("%s is missing %s", measurement_dir, i) is_valid = False @@ -2153,9 +2298,10 @@ def check_measurement_dir( log.error("%s, field %s is missing", fname, k) elif check_empty_fields and not j[k]: is_valid = False - log.error("%s, field %s is missing meaningful value", fname, k) + log.error( + "%s, field %s is missing meaningful value", fname, k) - impl = system_file[len(system_desc) + 1 : -end] + impl = system_file[len(system_desc) + 1: -end] code_dir = os.path.join(root, "code", model) if os.path.isfile(code_dir): with open(code_dir, "r") as f: @@ -2191,7 +2337,9 @@ def check_compliance_perf_dir(test_dir): is_valid = True break if is_valid == False: - log.error("Compliance test performance check in %s failed", test_dir) + log.error( + "Compliance test performance check in %s failed", + test_dir) # Check performance dir test_perf_path = os.path.join(test_dir, "performance", "run_1") @@ -2205,7 +2353,10 @@ def check_compliance_perf_dir(test_dir): ["mlperf_log_accuracy.json"], ) if diff: - log.error("%s has file list mismatch (%s)", test_perf_path, diff) + log.error( + "%s has file list mismatch (%s)", + test_perf_path, + diff) is_valid = False return is_valid @@ -2242,19 +2393,24 @@ def check_compliance_acc_dir(test_dir, model, config): else: diff = files_diff( list_files(test_acc_path), - REQUIRED_TEST01_ACC_FILES_1 - if acc_passed - else REQUIRED_TEST01_ACC_FILES, + ( + REQUIRED_TEST01_ACC_FILES_1 + if acc_passed + else REQUIRED_TEST01_ACC_FILES + ), ) if diff: - log.error("%s has file list mismatch (%s)", test_acc_path, diff) + log.error( + "%s has file list mismatch (%s)", + test_acc_path, + diff) is_valid = False elif not acc_passed: target = config.get_accuracy_target(model) patterns = [] acc_types = [] for i in range(0, len(target), 2): - acc_type = target[i:i+2] + acc_type = target[i: i + 2] acc_types.append(acc_type) patterns.append(ACC_PATTERN[acc_type[0]]) acc_seen = [False for _ in acc_type] @@ -2264,12 +2420,8 @@ def check_compliance_acc_dir(test_dir, model, config): required_delta_perc = 1 else: required_delta_perc = 0.1 - acc_baseline = { - acc_type: 0 for acc_type in acc_types - } - acc_compliance = { - acc_type: 0 for acc_type in acc_types - } + acc_baseline = {acc_type: 0 for acc_type in acc_types} + acc_compliance = {acc_type: 0 for acc_type in acc_types} with open( os.path.join(test_acc_path, "baseline_accuracy.txt"), "r", @@ -2289,13 +2441,21 @@ def check_compliance_acc_dir(test_dir, model, config): for acc_type, pattern in zip(acc_types, patterns): m = re.match(pattern, line) if m: - acc_compliance[acc_type] = float(m.group(1)) + acc_compliance[acc_type] = float( + m.group(1)) for acc_type in acc_types: if acc_baseline[acc_type] == 0 or acc_compliance[acc_type] == 0: is_valid = False break else: - delta_perc = abs(1 - acc_baseline[acc_type] / acc_compliance[acc_type]) * 100 + delta_perc = ( + abs( + 1 + - acc_baseline[acc_type] / + 
acc_compliance[acc_type] + ) + * 100 + ) if delta_perc <= required_delta_perc: is_valid = True else: @@ -2311,14 +2471,21 @@ def check_compliance_acc_dir(test_dir, model, config): with open(fname, "r") as f: lines = f.readlines() lines = [line.strip() for line in lines] - first_token_pass = "First token check pass: True" in lines or "First token check pass: Skipped" in lines + first_token_pass = ( + "First token check pass: True" in lines + or "First token check pass: Skipped" in lines + ) eos_pass = "EOS check pass: True" in lines length_check_pass = "Sample length check pass: True" in lines is_valid = first_token_pass and eos_pass and length_check_pass if not is_valid: - log.error(f"TEST06 accuracy check failed. first_token_check: {first_token_pass} eos_check: {eos_pass} length_check: {length_check_pass}.") + log.error( + f"TEST06 accuracy check failed. first_token_check: {first_token_pass} eos_check: {eos_pass} length_check: {length_check_pass}." + ) else: - raise NotImplemented(f"{test_dir} is neither TEST01 and TEST06, which doesn't require accuracy check") + raise NotImplemented( + f"{test_dir} is neither TEST01 and TEST06, which doesn't require accuracy check" + ) return is_valid @@ -2344,8 +2511,7 @@ def check_compliance_dir( "gptj-99.9", "llama2-70b-99", "llama2-70b-99.9", - "stable-diffusion-xl" - + "stable-diffusion-xl", ]: test_list.remove("TEST04") @@ -2354,16 +2520,16 @@ def check_compliance_dir( "gptj-99.9", "llama2-70b-99", "llama2-70b-99.9", - "stable-diffusion-xl" + "stable-diffusion-xl", ]: test_list.remove("TEST05") - test_list.remove("TEST01") + test_list.remove("TEST01") if model in [ "llama2-70b-99", "llama2-70b-99.9", ]: - test_list.append("TEST06") + test_list.append("TEST06") if test_list and not os.path.exists(compliance_dir): log.error("no compliance dir for %s: %s", name, compliance_dir) @@ -2387,7 +2553,10 @@ def check_compliance_dir( config, model, compliance_perf_dir, scenario, division, system_json ) if is_inferred: - log.info("%s has inferred results, qps=%s", compliance_perf_dir, r) + log.info( + "%s has inferred results, qps=%s", + compliance_perf_dir, + r) except Exception as e: log.error( "%s caused exception in check_performance_dir: %s", @@ -2401,7 +2570,7 @@ def check_compliance_dir( and compliance_perf_valid ) - compliance_acc_pass= True + compliance_acc_pass = True for test in ["TEST01", "TEST06"]: if test in test_list: # Check accuracy for TEST01 @@ -2409,7 +2578,6 @@ def check_compliance_dir( os.path.join(compliance_dir, test), model, config ) - return compliance_perf_pass and compliance_acc_pass and compliance_perf_dir_pass @@ -2435,7 +2603,7 @@ def main(): args.skip_meaningful_fields_emptiness_check, args.skip_empty_files_check, args.skip_check_power_measure_files, - args.skip_extra_files_in_root_check + args.skip_extra_files_in_root_check, ) # log results @@ -2487,7 +2655,8 @@ def merge_two_dict(x, y): unique_closed_systems = merge_two_dict( closed_power_systems, closed_non_power_systems ) - unique_open_systems = merge_two_dict(open_power_systems, open_non_power_systems) + unique_open_systems = merge_two_dict( + open_power_systems, open_non_power_systems) unique_network_systems = merge_two_dict( network_power_systems, network_non_power_systems ) @@ -2496,8 +2665,10 @@ def merge_two_dict(x, y): unique_systems = merge_two_dict(unique_systems, unique_network_systems) # power systems can be repeating in open, closed and network - unique_power_systems = merge_two_dict(closed_power_systems, open_power_systems) - unique_power_systems = 
merge_two_dict(unique_power_systems, network_power_systems) + unique_power_systems = merge_two_dict( + closed_power_systems, open_power_systems) + unique_power_systems = merge_two_dict( + unique_power_systems, network_power_systems) number_systems = len(unique_systems) number_power_systems = len(unique_power_systems) @@ -2518,7 +2689,8 @@ def sum_dict_values(x): count_open_results = count_open_power_results + count_open_non_power_results count_network_power_results = sum_dict_values(network_power_systems) - count_network_non_power_results = sum_dict_values(network_non_power_systems) + count_network_non_power_results = sum_dict_values( + network_non_power_systems) count_network_results = ( count_network_power_results + count_network_non_power_results ) @@ -2556,7 +2728,10 @@ def sum_dict_values(x): ) log.info("---") - log.info("Systems=%d, Power Systems=%d", number_systems, number_power_systems) + log.info( + "Systems=%d, Power Systems=%d", + number_systems, + number_power_systems) log.info( "Closed Systems=%d, Closed Power Systems=%d", number_closed_systems, diff --git a/tools/submission/truncate_accuracy_log.py b/tools/submission/truncate_accuracy_log.py index 50b9b6977..90a3f0b24 100755 --- a/tools/submission/truncate_accuracy_log.py +++ b/tools/submission/truncate_accuracy_log.py @@ -35,17 +35,29 @@ submission directory. python tools/submission/truncate_accuracy_log.py --input ROOT_OF_SUBMISSION_DIRECTORY --submitter MY_ORG \\ - --backup MY_SUPER_SAFE_STORAGE + --backup MY_SUPER_SAFE_STORAGE """ + def get_args(): """Parse commandline.""" - parser = argparse.ArgumentParser(description="Truncate mlperf_log_accuracy.json files.", - formatter_class=argparse.RawDescriptionHelpFormatter, epilog=HELP_TEXT) - parser.add_argument("--input", required=True, help="orignal submission directory") + parser = argparse.ArgumentParser( + description="Truncate mlperf_log_accuracy.json files.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=HELP_TEXT, + ) + parser.add_argument( + "--input", + required=True, + help="orignal submission directory") parser.add_argument("--output", help="new submission directory") - parser.add_argument("--submitter", required=True, help="filter to submitter") - parser.add_argument("--backup", help="directory to store the original accuacy log") + parser.add_argument( + "--submitter", + required=True, + help="filter to submitter") + parser.add_argument( + "--backup", + help="directory to store the original accuacy log") args = parser.parse_args() if not args.output and not args.backup: @@ -57,12 +69,14 @@ def get_args(): def list_dir(*path): path = os.path.join(*path) - return [f for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))] + return [f for f in os.listdir( + path) if os.path.isdir(os.path.join(path, f))] def list_files(*path): path = os.path.join(*path) - return [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))] + return [f for f in os.listdir( + path) if os.path.isfile(os.path.join(path, f))] def split_path(m): @@ -82,7 +96,7 @@ def truncate_file(fname): """Truncate file to 4K from start and 4K from end.""" size = os.stat(fname).st_size if size < VIEWABLE_SIZE: - return + return with open(fname, "r") as src: start = src.read(VIEWABLE_SIZE) src.seek(size - VIEWABLE_SIZE, 0) @@ -100,15 +114,17 @@ def copy_submission_dir(src, dst, filter_submitter): for submitter in list_dir(os.path.join(src, division)): if filter_submitter and submitter != filter_submitter: continue - shutil.copytree(os.path.join(src, division, 
submitter), - os.path.join(dst, division, submitter)) + shutil.copytree( + os.path.join(src, division, submitter), + os.path.join(dst, division, submitter), + ) def truncate_results_dir(filter_submitter, backup): - """Walk result dir and - write a hash of mlperf_log_accuracy.json to accuracy.txt - copy mlperf_log_accuracy.json to a backup location - truncate mlperf_log_accuracy. + """Walk result dir and + write a hash of mlperf_log_accuracy.json to accuracy.txt + copy mlperf_log_accuracy.json to a backup location + truncate mlperf_log_accuracy. """ for division in list_dir("."): # we are looking at ./$division, ie ./closed @@ -131,16 +147,25 @@ def truncate_results_dir(filter_submitter, backup): for system_desc in list_dir(log_path): for model in list_dir(log_path, system_desc): for scenario in list_dir(log_path, system_desc, model): - for test in list_dir(log_path, system_desc, model, scenario): + for test in list_dir( + log_path, system_desc, model, scenario + ): - name = os.path.join(log_path, system_desc, model, scenario) + name = os.path.join( + log_path, system_desc, model, scenario + ) if directory == "compliance": - name = os.path.join(log_path, system_desc, model, scenario, test) + name = os.path.join( + log_path, system_desc, model, scenario, test + ) hash_val = None acc_path = os.path.join(name, "accuracy") - acc_log = os.path.join(acc_path, "mlperf_log_accuracy.json") - acc_txt = os.path.join(acc_path, "accuracy.txt") + acc_log = os.path.join( + acc_path, "mlperf_log_accuracy.json" + ) + acc_txt = os.path.join( + acc_path, "accuracy.txt") # only TEST01 has an accuracy log if directory == "compliance" and test != "TEST01": @@ -148,30 +173,53 @@ def truncate_results_dir(filter_submitter, backup): if not os.path.exists(acc_log): log.error("%s missing", acc_log) continue - if not os.path.exists(acc_txt) and directory == "compliance": - # compliance test directory will not have an accuracy.txt file by default - log.info("no accuracy.txt in compliance directory %s", acc_path) + if ( + not os.path.exists(acc_txt) + and directory == "compliance" + ): + # compliance test directory will not have + # an accuracy.txt file by default + log.info( + "no accuracy.txt in compliance directory %s", + acc_path, + ) else: if not os.path.exists(acc_txt): - log.error("%s missing, generate to continue", acc_txt) + log.error( + "%s missing, generate to continue", acc_txt + ) continue with open(acc_txt, "r", encoding="utf-8") as f: for line in f: - m = re.match(r"^hash=([\w\d]+)$", line) + m = re.match( + r"^hash=([\w\d]+)$", line) if m: hash_val = m.group(1) break size = os.stat(acc_log).st_size if hash_val and size < MAX_ACCURACY_LOG_SIZE: - log.info("%s already has hash and size seems truncated", acc_path) + log.info( + "%s already has hash and size seems truncated", + acc_path, + ) continue if backup: - backup_dir = os.path.join(backup, name, "accuracy") + backup_dir = os.path.join( + backup, name, "accuracy") os.makedirs(backup_dir, exist_ok=True) - dst = os.path.join(backup, name, "accuracy", "mlperf_log_accuracy.json") + dst = os.path.join( + backup, + name, + "accuracy", + "mlperf_log_accuracy.json", + ) if os.path.exists(dst): - log.error("not processing %s because %s already exist", acc_log, dst) + log.error( + "not processing %s because %s already exist", + acc_log, + dst, + ) continue shutil.copy(acc_log, dst) @@ -182,7 +230,8 @@ def truncate_results_dir(filter_submitter, backup): truncate_file(acc_log) log.info("%s truncated", acc_log) - # No need to iterate on compliance test 
subdirectories in the results folder + # No need to iterate on compliance test + # subdirectories in the results folder if directory == "results": break @@ -204,7 +253,10 @@ def main(): truncate_results_dir(args.submitter, args.backup) backup_location = args.output or args.backup - log.info("Make sure you keep a backup of %s in case mlperf wants to see the original accuracy logs", backup_location) + log.info( + "Make sure you keep a backup of %s in case mlperf wants to see the original accuracy logs", + backup_location, + ) return 0 diff --git a/tools/upscale_coco/coco.py b/tools/upscale_coco/coco.py index dd0e880be..11813957e 100755 --- a/tools/upscale_coco/coco.py +++ b/tools/upscale_coco/coco.py @@ -1,5 +1,5 @@ -__author__ = 'tylin' -__version__ = '2.0' +__author__ = "tylin" +__version__ = "2.0" # Interface for accessing the Microsoft COCO dataset. # Microsoft COCO is a large image dataset designed for object detection, @@ -56,6 +56,7 @@ import os from collections import defaultdict import sys + PYTHON_VERSION = sys.version_info[0] if PYTHON_VERSION == 2: from urllib import urlretrieve @@ -64,7 +65,7 @@ def _isArrayLike(obj): - return hasattr(obj, '__iter__') and hasattr(obj, '__len__') + return hasattr(obj, "__iter__") and hasattr(obj, "__len__") class COCO: @@ -76,40 +77,42 @@ def __init__(self, annotation_file=None): :return: """ # load dataset - self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict() + self.dataset, self.anns, self.cats, self.imgs = dict(), dict(), dict(), dict() self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list) - if not annotation_file == None: - print('loading annotations into memory...') + if not annotation_file is None: + print("loading annotations into memory...") tic = time.time() - dataset = json.load(open(annotation_file, 'r')) - assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) - print('Done (t={:0.2f}s)'.format(time.time()- tic)) + dataset = json.load(open(annotation_file, "r")) + assert ( + isinstance(dataset, dict) + ), "annotation file format {} not supported".format(type(dataset)) + print("Done (t={:0.2f}s)".format(time.time() - tic)) self.dataset = dataset self.createIndex() def createIndex(self): # create index - print('creating index...') + print("creating index...") anns, cats, imgs = {}, {}, {} - imgToAnns,catToImgs = defaultdict(list),defaultdict(list) - if 'annotations' in self.dataset: - for ann in self.dataset['annotations']: - imgToAnns[ann['image_id']].append(ann) - anns[ann['id']] = ann + imgToAnns, catToImgs = defaultdict(list), defaultdict(list) + if "annotations" in self.dataset: + for ann in self.dataset["annotations"]: + imgToAnns[ann["image_id"]].append(ann) + anns[ann["id"]] = ann - if 'images' in self.dataset: - for img in self.dataset['images']: - imgs[img['id']] = img + if "images" in self.dataset: + for img in self.dataset["images"]: + imgs[img["id"]] = img - if 'categories' in self.dataset: - for cat in self.dataset['categories']: - cats[cat['id']] = cat + if "categories" in self.dataset: + for cat in self.dataset["categories"]: + cats[cat["id"]] = cat - if 'annotations' in self.dataset and 'categories' in self.dataset: - for ann in self.dataset['annotations']: - catToImgs[ann['category_id']].append(ann['image_id']) + if "annotations" in self.dataset and "categories" in self.dataset: + for ann in self.dataset["annotations"]: + catToImgs[ann["category_id"]].append(ann["image_id"]) - print('index created!') + print("index created!") # create class 
members self.anns = anns @@ -123,8 +126,8 @@ def info(self): Print information about the annotation file. :return: """ - for key, value in self.dataset['info'].items(): - print('{}: {}'.format(key, value)) + for key, value in self.dataset["info"].items(): + print("{}: {}".format(key, value)) def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): """ @@ -139,19 +142,33 @@ def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): catIds = catIds if _isArrayLike(catIds) else [catIds] if len(imgIds) == len(catIds) == len(areaRng) == 0: - anns = self.dataset['annotations'] + anns = self.dataset["annotations"] else: if not len(imgIds) == 0: - lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] + lists = [ + self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns + ] anns = list(itertools.chain.from_iterable(lists)) else: - anns = self.dataset['annotations'] - anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] - anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] - if not iscrowd == None: - ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] + anns = self.dataset["annotations"] + anns = ( + anns + if len(catIds) == 0 + else [ann for ann in anns if ann["category_id"] in catIds] + ) + anns = ( + anns + if len(areaRng) == 0 + else [ + ann + for ann in anns + if ann["area"] > areaRng[0] and ann["area"] < areaRng[1] + ] + ) + if not iscrowd is None: + ids = [ann["id"] for ann in anns if ann["iscrowd"] == iscrowd] else: - ids = [ann['id'] for ann in anns] + ids = [ann["id"] for ann in anns] return ids def getCatIds(self, catNms=[], supNms=[], catIds=[]): @@ -167,22 +184,34 @@ def getCatIds(self, catNms=[], supNms=[], catIds=[]): catIds = catIds if _isArrayLike(catIds) else [catIds] if len(catNms) == len(supNms) == len(catIds) == 0: - cats = self.dataset['categories'] + cats = self.dataset["categories"] else: - cats = self.dataset['categories'] - cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] - cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] - cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] - ids = [cat['id'] for cat in cats] + cats = self.dataset["categories"] + cats = ( + cats + if len(catNms) == 0 + else [cat for cat in cats if cat["name"] in catNms] + ) + cats = ( + cats + if len(supNms) == 0 + else [cat for cat in cats if cat["supercategory"] in supNms] + ) + cats = ( + cats + if len(catIds) == 0 + else [cat for cat in cats if cat["id"] in catIds] + ) + ids = [cat["id"] for cat in cats] return ids def getImgIds(self, imgIds=[], catIds=[]): - ''' + """ Get img ids that satisfy given filter conditions. 
:param imgIds (int array) : get imgs for given ids :param catIds (int array) : get imgs with all given cats :return: ids (int array) : integer array of img ids - ''' + """ imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] catIds = catIds if _isArrayLike(catIds) else [catIds] @@ -205,7 +234,7 @@ def loadAnns(self, ids=[]): """ if _isArrayLike(ids): return [self.anns[id] for id in ids] - elif type(ids) == int: + elif isinstance(ids, int): return [self.anns[ids]] def loadCats(self, ids=[]): @@ -216,7 +245,7 @@ def loadCats(self, ids=[]): """ if _isArrayLike(ids): return [self.cats[id] for id in ids] - elif type(ids) == int: + elif isinstance(ids, int): return [self.cats[ids]] def loadImgs(self, ids=[]): @@ -227,7 +256,7 @@ def loadImgs(self, ids=[]): """ if _isArrayLike(ids): return [self.imgs[id] for id in ids] - elif type(ids) == int: + elif isinstance(ids, int): return [self.imgs[ids]] def showAnns(self, anns): @@ -238,61 +267,88 @@ def showAnns(self, anns): """ if len(anns) == 0: return 0 - if 'segmentation' in anns[0] or 'keypoints' in anns[0]: - datasetType = 'instances' - elif 'caption' in anns[0]: - datasetType = 'captions' + if "segmentation" in anns[0] or "keypoints" in anns[0]: + datasetType = "instances" + elif "caption" in anns[0]: + datasetType = "captions" else: - raise Exception('datasetType not supported') - if datasetType == 'instances': + raise Exception("datasetType not supported") + if datasetType == "instances": ax = plt.gca() ax.set_autoscale_on(False) polygons = [] color = [] for ann in anns: - c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] - if 'segmentation' in ann: - if type(ann['segmentation']) == list: + c = (np.random.random((1, 3)) * 0.6 + 0.4).tolist()[0] + if "segmentation" in ann: + if isinstance(ann["segmentation"], list): # polygon - for seg in ann['segmentation']: - poly = np.array(seg).reshape((int(len(seg)/2), 2)) + for seg in ann["segmentation"]: + poly = np.array(seg).reshape( + (int(len(seg) / 2), 2)) polygons.append(Polygon(poly)) color.append(c) else: # mask - t = self.imgs[ann['image_id']] - if type(ann['segmentation']['counts']) == list: - rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width']) + t = self.imgs[ann["image_id"]] + if isinstance(ann["segmentation"]["counts"], list): + rle = maskUtils.frPyObjects( + [ann["segmentation"]], t["height"], t["width"] + ) else: - rle = [ann['segmentation']] + rle = [ann["segmentation"]] m = maskUtils.decode(rle) - img = np.ones( (m.shape[0], m.shape[1], 3) ) - if ann['iscrowd'] == 1: - color_mask = np.array([2.0,166.0,101.0])/255 - if ann['iscrowd'] == 0: + img = np.ones((m.shape[0], m.shape[1], 3)) + if ann["iscrowd"] == 1: + color_mask = np.array([2.0, 166.0, 101.0]) / 255 + if ann["iscrowd"] == 0: color_mask = np.random.random((1, 3)).tolist()[0] for i in range(3): - img[:,:,i] = color_mask[i] - ax.imshow(np.dstack( (img, m*0.5) )) - if 'keypoints' in ann and type(ann['keypoints']) == list: + img[:, :, i] = color_mask[i] + ax.imshow(np.dstack((img, m * 0.5))) + if "keypoints" in ann and isinstance(ann["keypoints"], list): # turn skeleton into zero-based index - sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 - kp = np.array(ann['keypoints']) + sks = np.array( + self.loadCats( + ann["category_id"])[0]["skeleton"]) - 1 + kp = np.array(ann["keypoints"]) x = kp[0::3] y = kp[1::3] v = kp[2::3] for sk in sks: - if np.all(v[sk]>0): - plt.plot(x[sk],y[sk], linewidth=3, color=c) - plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, 
markeredgecolor='k',markeredgewidth=2) - plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2) - p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) + if np.all(v[sk] > 0): + plt.plot(x[sk], y[sk], linewidth=3, color=c) + plt.plot( + x[v > 0], + y[v > 0], + "o", + markersize=8, + markerfacecolor=c, + markeredgecolor="k", + markeredgewidth=2, + ) + plt.plot( + x[v > 1], + y[v > 1], + "o", + markersize=8, + markerfacecolor=c, + markeredgecolor=c, + markeredgewidth=2, + ) + p = PatchCollection( + polygons, + facecolor=color, + linewidths=0, + alpha=0.4) ax.add_collection(p) - p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2) + p = PatchCollection( + polygons, facecolor="none", edgecolors=color, linewidths=2 + ) ax.add_collection(p) - elif datasetType == 'captions': + elif datasetType == "captions": for ann in anns: - print(ann['caption']) + print(ann["caption"]) def loadRes(self, resFile): """ @@ -301,69 +357,78 @@ def loadRes(self, resFile): :return: res (obj) : result api object """ res = COCO() - res.dataset['images'] = [img for img in self.dataset['images']] + res.dataset["images"] = [img for img in self.dataset["images"]] - print('Loading and preparing results...') + print("Loading and preparing results...") tic = time.time() - if type(resFile) == str: #or type(resFile) == unicode: + if isinstance(resFile, str): # or type(resFile) == unicode: anns = json.load(open(resFile)) - elif type(resFile) == np.ndarray: + elif isinstance(resFile, np.ndarray): anns = self.loadNumpyAnnotations(resFile) else: anns = resFile - assert type(anns) == list, 'results in not an array of objects' - annsImgIds = [ann['image_id'] for ann in anns] - assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ - 'Results do not correspond to current coco set' - if 'caption' in anns[0]: - imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) - res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] + assert isinstance(anns, list), "results in not an array of objects" + annsImgIds = [ann["image_id"] for ann in anns] + assert set(annsImgIds) == ( + set(annsImgIds) & set(self.getImgIds()) + ), "Results do not correspond to current coco set" + if "caption" in anns[0]: + imgIds = set([img["id"] for img in res.dataset["images"]]) & set( + [ann["image_id"] for ann in anns] + ) + res.dataset["images"] = [ + img for img in res.dataset["images"] if img["id"] in imgIds + ] for id, ann in enumerate(anns): - ann['id'] = id+1 - elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + ann["id"] = id + 1 + elif "bbox" in anns[0] and not anns[0]["bbox"] == []: + res.dataset["categories"] = copy.deepcopy( + self.dataset["categories"]) for id, ann in enumerate(anns): - bb = ann['bbox'] - x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] - if not 'segmentation' in ann: - ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] - ann['area'] = bb[2]*bb[3] - ann['id'] = id+1 - ann['iscrowd'] = 0 - elif 'segmentation' in anns[0]: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + bb = ann["bbox"] + x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]] + if not "segmentation" in ann: + ann["segmentation"] = [[x1, y1, x1, y2, x2, y2, x2, y1]] + ann["area"] = bb[2] * bb[3] + ann["id"] = id + 1 + ann["iscrowd"] = 0 + elif "segmentation" in anns[0]: + 
res.dataset["categories"] = copy.deepcopy( + self.dataset["categories"]) for id, ann in enumerate(anns): - # now only support compressed RLE format as segmentation results - ann['area'] = maskUtils.area(ann['segmentation']) - if not 'bbox' in ann: - ann['bbox'] = maskUtils.toBbox(ann['segmentation']) - ann['id'] = id+1 - ann['iscrowd'] = 0 - elif 'keypoints' in anns[0]: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + # now only support compressed RLE format as segmentation + # results + ann["area"] = maskUtils.area(ann["segmentation"]) + if not "bbox" in ann: + ann["bbox"] = maskUtils.toBbox(ann["segmentation"]) + ann["id"] = id + 1 + ann["iscrowd"] = 0 + elif "keypoints" in anns[0]: + res.dataset["categories"] = copy.deepcopy( + self.dataset["categories"]) for id, ann in enumerate(anns): - s = ann['keypoints'] + s = ann["keypoints"] x = s[0::3] y = s[1::3] - x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y) - ann['area'] = (x1-x0)*(y1-y0) - ann['id'] = id + 1 - ann['bbox'] = [x0,y0,x1-x0,y1-y0] - print('DONE (t={:0.2f}s)'.format(time.time()- tic)) + x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y) + ann["area"] = (x1 - x0) * (y1 - y0) + ann["id"] = id + 1 + ann["bbox"] = [x0, y0, x1 - x0, y1 - y0] + print("DONE (t={:0.2f}s)".format(time.time() - tic)) - res.dataset['annotations'] = anns + res.dataset["annotations"] = anns res.createIndex() return res - def download(self, tarDir = None, imgIds = [] ): - ''' + def download(self, tarDir=None, imgIds=[]): + """ Download COCO images from mscoco.org server. :param tarDir (str): COCO results directory name imgIds (list): images to be downloaded :return: - ''' + """ if tarDir is None: - print('Please specify target directory') + print("Please specify target directory") return -1 if len(imgIds) == 0: imgs = self.imgs.values() @@ -374,10 +439,13 @@ def download(self, tarDir = None, imgIds = [] ): os.makedirs(tarDir) for i, img in enumerate(imgs): tic = time.time() - fname = os.path.join(tarDir, img['file_name']) + fname = os.path.join(tarDir, img["file_name"]) if not os.path.exists(fname): - urlretrieve(img['coco_url'], fname) - print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)) + urlretrieve(img["coco_url"], fname) + print( + "downloaded {}/{} images (t={:0.1f}s)".format(i, + N, time.time() - tic) + ) def loadNumpyAnnotations(self, data): """ @@ -385,21 +453,23 @@ def loadNumpyAnnotations(self, data): :param data (numpy.ndarray) :return: annotations (python nested list) """ - print('Converting ndarray to lists...') - assert(type(data) == np.ndarray) + print("Converting ndarray to lists...") + assert isinstance(data, np.ndarray) print(data.shape) - assert(data.shape[1] == 7) + assert data.shape[1] == 7 N = data.shape[0] ann = [] for i in range(N): if i % 1000000 == 0: - print('{}/{}'.format(i,N)) - ann += [{ - 'image_id' : int(data[i, 0]), - 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ], - 'score' : data[i, 5], - 'category_id': int(data[i, 6]), - }] + print("{}/{}".format(i, N)) + ann += [ + { + "image_id": int(data[i, 0]), + "bbox": [data[i, 1], data[i, 2], data[i, 3], data[i, 4]], + "score": data[i, 5], + "category_id": int(data[i, 6]), + } + ] return ann def annToRLE(self, ann): @@ -407,20 +477,20 @@ def annToRLE(self, ann): Convert annotation which can be polygons, uncompressed RLE to RLE. 
:return: binary mask (numpy 2D array) """ - t = self.imgs[ann['image_id']] - h, w = t['height'], t['width'] - segm = ann['segmentation'] - if type(segm) == list: + t = self.imgs[ann["image_id"]] + h, w = t["height"], t["width"] + segm = ann["segmentation"] + if isinstance(segm, list): # polygon -- a single object might consist of multiple parts # we merge all parts into one mask rle code rles = maskUtils.frPyObjects(segm, h, w) rle = maskUtils.merge(rles) - elif type(segm['counts']) == list: + elif isinstance(segm["counts"], list): # uncompressed RLE rle = maskUtils.frPyObjects(segm, h, w) else: # rle - rle = ann['segmentation'] + rle = ann["segmentation"] return rle def annToMask(self, ann): diff --git a/tools/upscale_coco/upscale_coco.py b/tools/upscale_coco/upscale_coco.py index 19c9624c1..6a97e3214 100755 --- a/tools/upscale_coco/upscale_coco.py +++ b/tools/upscale_coco/upscale_coco.py @@ -10,24 +10,49 @@ def parse_args(): parser = argparse.ArgumentParser(description="Upscale COCO dataset") - parser.add_argument('--inputs', '-i', type=str, default='/coco', - help='input directory for coco dataset') - parser.add_argument('--outputs', '-o', type=str, default='/cocoup', - help='output directory for upscaled coco dataset') - parser.add_argument('--images', '-im', type=str, default='val2017', - help='image directory') - parser.add_argument('--annotations', '-a', type=str, default='annotations/instances_val2017.json', - help='annotations directory') - parser.add_argument('--size', required=True, type=int, nargs='+', - help='upscaled image sizes (e.g 300 300, 1200 1200') - parser.add_argument('--format', '-f', type=str, default='jpg', - help='image format') + parser.add_argument( + "--inputs", + "-i", + type=str, + default="/coco", + help="input directory for coco dataset", + ) + parser.add_argument( + "--outputs", + "-o", + type=str, + default="/cocoup", + help="output directory for upscaled coco dataset", + ) + parser.add_argument( + "--images", "-im", type=str, default="val2017", help="image directory" + ) + parser.add_argument( + "--annotations", + "-a", + type=str, + default="annotations/instances_val2017.json", + help="annotations directory", + ) + parser.add_argument( + "--size", + required=True, + type=int, + nargs="+", + help="upscaled image sizes (e.g 300 300, 1200 1200", + ) + parser.add_argument( + "--format", + "-f", + type=str, + default="jpg", + help="image format") return parser.parse_args() def upscale_coco(indir, outdir, image_dir, annotate_file, size, fmt): # Build directories. - print('Building directories...') + print("Building directories...") size = tuple(size) image_in_path = os.path.join(indir, image_dir) image_out_path = os.path.join(outdir, image_dir) @@ -41,68 +66,71 @@ def upscale_coco(indir, outdir, image_dir, annotate_file, size, fmt): os.makedirs(annotate_out_path) # Read annotations. - print('Reading COCO dataset...') + print("Reading COCO dataset...") coco = COCO(annotate_in_file) - print(len(coco.imgs), 'images') - print(len(coco.anns), 'annotations') + print(len(coco.imgs), "images") + print(len(coco.anns), "annotations") # Upscale annotations. 
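A short illustrative sketch of the segmentation-to-mask flow that the annToRLE hunk above reformats; it is not part of the patch and assumes pycocotools is available and that `coco` is an instance of the COCO class from this file:

from pycocotools import mask as maskUtils

def ann_to_mask(coco, ann):
    # mirror annToRLE: polygons and uncompressed RLE are converted to
    # compressed RLE for the annotation's image size, then decoded
    t = coco.imgs[ann["image_id"]]
    h, w = t["height"], t["width"]
    segm = ann["segmentation"]
    if isinstance(segm, list):
        rle = maskUtils.merge(maskUtils.frPyObjects(segm, h, w))
    elif isinstance(segm["counts"], list):
        rle = maskUtils.frPyObjects(segm, h, w)
    else:
        rle = segm
    return maskUtils.decode(rle)  # (H, W) uint8 binary mask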
- print('Upscaling annotations...') + print("Upscaling annotations...") annotations = [] for idx in coco.anns: ann = coco.anns[idx] # Scaling factors - img = coco.imgs[ann['image_id']] - sx = size[0] / img['width'] - sy = size[1] / img['height'] + img = coco.imgs[ann["image_id"]] + sx = size[0] / img["width"] + sy = size[1] / img["height"] # Bounding boxes - bb = ann['bbox'] + bb = ann["bbox"] bb[0] = bb[0] * sx bb[1] = bb[1] * sy bb[2] = bb[2] * sx bb[3] = bb[3] * sy # Area - ann['area'] = ann['area'] * sx * sy + ann["area"] = ann["area"] * sx * sy annotations.append(ann) # Upscale images. - print('Upscaling images...') + print("Upscaling images...") count = 0 images = [] for idx in coco.imgs: img = coco.imgs[idx] # Load, upscale, and save image. - image = cv2.imread(os.path.join(image_in_path, img['file_name'])) + image = cv2.imread(os.path.join(image_in_path, img["file_name"])) if len(image.shape) < 3 or image.shape[2] != 3: # some images might be grayscale image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB) image = cv2.resize(image, size, interpolation=cv2.INTER_LINEAR) - cv2.imwrite(os.path.join(image_out_path, img['file_name'][0:-3] + fmt), image) + cv2.imwrite(os.path.join(image_out_path, + img["file_name"][0:-3] + fmt), image) # Update image file extension - img['file_name'] = img['file_name'][0:-3] + fmt + img["file_name"] = img["file_name"][0:-3] + fmt # Image dimensions - img['width'] = size[0] - img['height'] = size[1] + img["width"] = size[0] + img["height"] = size[1] count = count + 1 # print(count, end=' ', flush=True) images.append(img) # Save annotations. - print('Saving annotations...') + print("Saving annotations...") with open(annotate_in_file) as f: dataset = json.load(f) - dataset['images'] = images - dataset['annotations'] = annotations - with open(annotate_out_file, 'w') as outfile: + dataset["images"] = images + dataset["annotations"] = annotations + with open(annotate_out_file, "w") as outfile: json.dump(dataset, outfile) - print('Done.') + print("Done.") def main(): # Get arguments. args = parse_args() # Upscale coco. - upscale_coco(args.inputs, args.outputs, args.images, args.annotations, args.size, args.format) + upscale_coco( + args.inputs, args.outputs, args.images, args.annotations, args.size, args.format + ) main() diff --git a/upcomming_benchmarks/graph/R-GAT/backend_pytorch.py b/upcomming_benchmarks/graph/R-GAT/backend_pytorch.py index 05b899c08..70777cef1 100644 --- a/upcomming_benchmarks/graph/R-GAT/backend_pytorch.py +++ b/upcomming_benchmarks/graph/R-GAT/backend_pytorch.py @@ -1,3 +1,7 @@ +from graphlearn_torch.typing import InputNodes, NumNeighbors +from graphlearn_torch.sampler import NeighborSampler, NodeSamplerInput +from graphlearn_torch.data import Dataset +from graphlearn_torch.loader import NodeLoader from typing import Optional, List, Union import os import torch @@ -11,12 +15,6 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger("backend-pytorch") -from graphlearn_torch.loader import NodeLoader - -from graphlearn_torch.data import Dataset -from graphlearn_torch.sampler import NeighborSampler, NodeSamplerInput -from graphlearn_torch.typing import InputNodes, NumNeighbors - class CustomNeighborLoader(NodeLoader): # Copyright 2022 Alibaba Group Holding Limited. All Rights Reserved. 
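A toy check (numbers assumed, not taken from the dataset) of the scaling rule used in upscale_coco above: box coordinates scale by sx and sy, and area scales by their product:

# e.g. a 640x480 image upscaled to 1200x1200
sx, sy = 1200 / 640, 1200 / 480
bbox = [100.0, 50.0, 200.0, 150.0]           # [x, y, width, height]
scaled_bbox = [bbox[0] * sx, bbox[1] * sy, bbox[2] * sx, bbox[3] * sy]
scaled_area = (200.0 * 150.0) * sx * sy      # equals scaled_bbox[2] * scaled_bbox[3]
assert abs(scaled_area - scaled_bbox[2] * scaled_bbox[3]) < 1e-6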
@@ -143,14 +141,17 @@ def __init__( # Create Node and neighbor loade self.glt_dataset = glt.data.Dataset(edge_dir=edge_dir) self.glt_dataset.init_node_features( - node_feature_data=igbh_dataset.feat_dict, with_gpu=(device == "gpu"), dtype=self.type + node_feature_data=igbh_dataset.feat_dict, + with_gpu=(device == "gpu"), + dtype=self.type, ) self.glt_dataset.init_graph( edge_index=igbh_dataset.edge_dict, layout=layout, graph_mode="ZERO_COPY" if (device == "gpu") else "CPU", ) - self.glt_dataset.init_node_labels(node_label_data={"paper": igbh_dataset.label}) + self.glt_dataset.init_node_labels( + node_label_data={"paper": igbh_dataset.label}) self.neighbor_loader = CustomNeighborLoader( self.glt_dataset, [15, 10, 5], @@ -161,17 +162,21 @@ def __init__( seed=42, ) - self.model = RGNN( - self.glt_dataset.get_edge_types(), - self.glt_dataset.node_features["paper"].shape[1], - 512, - 2983, - num_layers=3, - dropout=0.2, - model=model_type, - heads=4, - node_type="paper", - ).to(self.type).to(self.device) + self.model = ( + RGNN( + self.glt_dataset.get_edge_types(), + self.glt_dataset.node_features["paper"].shape[1], + 512, + 2983, + num_layers=3, + dropout=0.2, + model=model_type, + heads=4, + node_type="paper", + ) + .to(self.type) + .to(self.device) + ) self.model.eval() ckpt = None if ckpt_path is not None: @@ -207,4 +212,3 @@ def predict(self, inputs: torch.Tensor): batch.edge_index_dict, )[:input_size] return out - diff --git a/upcomming_benchmarks/graph/R-GAT/igbh.py b/upcomming_benchmarks/graph/R-GAT/igbh.py index 70c668880..e23a816e1 100644 --- a/upcomming_benchmarks/graph/R-GAT/igbh.py +++ b/upcomming_benchmarks/graph/R-GAT/igbh.py @@ -5,9 +5,12 @@ # pylint: disable=unused-argument,missing-docstring # Parts of this script were taken from: # https://github.com/mlcommons/training/blob/master/graph_neural_network/dataset.py -# Specifically the float2half function and the IGBH class are +# Specifically the float2half function and the IGBH class are # slightly modified copies. 
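A minimal sketch of restoring weights into the RGNN backend built above; the checkpoint path and the "model_state_dict" key are assumptions based on tools/format_model.py later in this patch, not a prescribed API:

import torch

def load_checkpoint(model, ckpt_path="model/FULL_model_seq_69294_formatted.ckpt"):
    # checkpoint layout assumed: {"model_state_dict": <state dict>}
    ckpt = torch.load(ckpt_path, map_location="cpu")
    model.load_state_dict(ckpt["model_state_dict"])
    return model.eval()  # disable dropout for inference

In this sketch the backend would call this once during initialization, before serving any LoadGen queries.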
+from typing import Literal +from torch_geometric.utils import add_self_loops, remove_self_loops +import torch import os import logging import argparse @@ -21,13 +24,6 @@ log = logging.getLogger("coco") -import numpy as np -import torch - -from torch_geometric.utils import add_self_loops, remove_self_loops -from typing import Literal - - def float2half(base_path, dataset_size): paper_nodes_num = { "tiny": 100000, @@ -45,7 +41,8 @@ def float2half(base_path, dataset_size): } # paper node paper_feat_path = os.path.join(base_path, "paper", "node_feat.npy") - paper_fp16_feat_path = os.path.join(base_path, "paper", "node_feat_fp16.pt") + paper_fp16_feat_path = os.path.join( + base_path, "paper", "node_feat_fp16.pt") if not os.path.exists(paper_fp16_feat_path): if dataset_size in ["large", "full"]: num_paper_nodes = paper_nodes_num[dataset_size] @@ -66,7 +63,8 @@ def float2half(base_path, dataset_size): # author node author_feat_path = os.path.join(base_path, "author", "node_feat.npy") - author_fp16_feat_path = os.path.join(base_path, "author", "node_feat_fp16.pt") + author_fp16_feat_path = os.path.join( + base_path, "author", "node_feat_fp16.pt") if not os.path.exists(author_fp16_feat_path): if dataset_size in ["large", "full"]: num_author_nodes = author_nodes_num[dataset_size] @@ -87,7 +85,8 @@ def float2half(base_path, dataset_size): # institute node institute_feat_path = os.path.join(base_path, "institute", "node_feat.npy") - institute_fp16_feat_path = os.path.join(base_path, "institute", "node_feat_fp16.pt") + institute_fp16_feat_path = os.path.join( + base_path, "institute", "node_feat_fp16.pt") if not os.path.exists(institute_fp16_feat_path): institute_node_features = torch.from_numpy( np.load(institute_feat_path, mmap_mode="r") @@ -99,12 +98,14 @@ def float2half(base_path, dataset_size): fos_feat_path = os.path.join(base_path, "fos", "node_feat.npy") fos_fp16_feat_path = os.path.join(base_path, "fos", "node_feat_fp16.pt") if not os.path.exists(fos_fp16_feat_path): - fos_node_features = torch.from_numpy(np.load(fos_feat_path, mmap_mode="r")) + fos_node_features = torch.from_numpy( + np.load(fos_feat_path, mmap_mode="r")) fos_node_features = fos_node_features.half() torch.save(fos_node_features, fos_fp16_feat_path) # conference node - conference_feat_path = os.path.join(base_path, "conference", "node_feat.npy") + conference_feat_path = os.path.join( + base_path, "conference", "node_feat.npy") conference_fp16_feat_path = os.path.join( base_path, "conference", "node_feat_fp16.pt" ) @@ -117,7 +118,8 @@ def float2half(base_path, dataset_size): # journal node journal_feat_path = os.path.join(base_path, "journal", "node_feat.npy") - journal_fp16_feat_path = os.path.join(base_path, "journal", "node_feat_fp16.pt") + journal_fp16_feat_path = os.path.join( + base_path, "journal", "node_feat_fp16.pt") if not os.path.exists(journal_fp16_feat_path): journal_node_features = torch.from_numpy( np.load(journal_feat_path, mmap_mode="r") @@ -145,7 +147,13 @@ def __init__( self.layout = layout self.use_fp16 = use_fp16 - self.ntypes = ["paper", "author", "institute", "fos", "journal", "conference"] + self.ntypes = [ + "paper", + "author", + "institute", + "fos", + "journal", + "conference"] self.etypes = None self.edge_dict = {} self.feat_dict = {} @@ -286,7 +294,8 @@ def process(self): ) ).t() - cites_edge = add_self_loops(remove_self_loops(paper_paper_edges)[0])[0] + cites_edge = add_self_loops( + remove_self_loops(paper_paper_edges)[0])[0] self.edge_dict = { ("paper", "cites", "paper"): ( 
torch.cat([cites_edge[1, :], cites_edge[0, :]]), @@ -308,7 +317,7 @@ def process(self): paper_fos_edges[0, :], ), } - + self.edge_dict[("paper", "published", "journal")] = ( paper_published_journal ) @@ -324,17 +333,20 @@ def process(self): paper_venue_conference[0, :], ) - # directly load from CSC or CSC files, which can be generated using compress_graph.py + # directly load from CSC or CSC files, which can be generated using + # compress_graph.py else: compress_edge_dict = {} - compress_edge_dict[("paper", "cites", "paper")] = "paper__cites__paper" + compress_edge_dict[("paper", "cites", "paper") + ] = "paper__cites__paper" compress_edge_dict[("paper", "written_by", "author")] = ( "paper__written_by__author" ) compress_edge_dict[("author", "affiliated_to", "institute")] = ( "author__affiliated_to__institute" ) - compress_edge_dict[("paper", "topic", "fos")] = "paper__topic__fos" + compress_edge_dict[("paper", "topic", "fos") + ] = "paper__topic__fos" compress_edge_dict[("author", "rev_written_by", "paper")] = ( "author__rev_written_by__paper" ) @@ -365,8 +377,10 @@ def process(self): edge_path = os.path.join( self.base_path, self.layout, compress_edge_dict[etype] ) - indptr = torch.load(os.path.join(edge_path, "indptr.pt")) - indices = torch.load(os.path.join(edge_path, "indices.pt")) + indptr = torch.load( + os.path.join(edge_path, "indptr.pt")) + indices = torch.load( + os.path.join(edge_path, "indices.pt")) if self.layout == "CSC": self.edge_dict[etype] = (indices, indptr) else: @@ -383,7 +397,8 @@ def process(self): label_file = ( "node_label_19.npy" if not self.use_label_2K else "node_label_2K.npy" ) - paper_feat_path = os.path.join(self.base_path, "paper", "node_feat.npy") + paper_feat_path = os.path.join( + self.base_path, "paper", "node_feat.npy") paper_lbl_path = os.path.join(self.base_path, "paper", label_file) num_paper_nodes = self.paper_nodes_num[self.dataset_size] if self.in_memory: @@ -392,7 +407,8 @@ def process(self): os.path.join(self.base_path, "paper", "node_feat_fp16.pt") ) else: - paper_node_features = torch.from_numpy(np.load(paper_feat_path)) + paper_node_features = torch.from_numpy( + np.load(paper_feat_path)) else: if self.dataset_size in ["large", "full"]: paper_node_features = torch.from_numpy( @@ -414,19 +430,23 @@ def process(self): ) ).to(torch.long) else: - paper_node_labels = torch.from_numpy(np.load(paper_lbl_path)).to(torch.long) + paper_node_labels = torch.from_numpy( + np.load(paper_lbl_path)).to( + torch.long) self.feat_dict["paper"] = paper_node_features self.label = paper_node_labels num_author_nodes = self.author_nodes_num[self.dataset_size] - author_feat_path = os.path.join(self.base_path, "author", "node_feat.npy") + author_feat_path = os.path.join( + self.base_path, "author", "node_feat.npy") if self.in_memory: if self.use_fp16: author_node_features = torch.load( os.path.join(self.base_path, "author", "node_feat_fp16.pt") ) else: - author_node_features = torch.from_numpy(np.load(author_feat_path)) + author_node_features = torch.from_numpy( + np.load(author_feat_path)) else: if self.dataset_size in ["large", "full"]: author_node_features = torch.from_numpy( @@ -446,11 +466,18 @@ def process(self): if self.in_memory: if self.use_fp16: institute_node_features = torch.load( - os.path.join(self.base_path, "institute", "node_feat_fp16.pt") + os.path.join( + self.base_path, + "institute", + "node_feat_fp16.pt") ) else: institute_node_features = torch.from_numpy( - np.load(os.path.join(self.base_path, "institute", "node_feat.npy")) + np.load( + 
os.path.join( + self.base_path, + "institute", + "node_feat.npy")) ) else: institute_node_features = torch.from_numpy( @@ -468,7 +495,11 @@ def process(self): ) else: fos_node_features = torch.from_numpy( - np.load(os.path.join(self.base_path, "fos", "node_feat.npy")) + np.load( + os.path.join( + self.base_path, + "fos", + "node_feat.npy")) ) else: fos_node_features = torch.from_numpy( @@ -481,18 +512,26 @@ def process(self): if self.in_memory: if self.use_fp16: conference_node_features = torch.load( - os.path.join(self.base_path, "conference", "node_feat_fp16.pt") + os.path.join( + self.base_path, + "conference", + "node_feat_fp16.pt") ) else: conference_node_features = torch.from_numpy( np.load( - os.path.join(self.base_path, "conference", "node_feat.npy") - ) + os.path.join( + self.base_path, + "conference", + "node_feat.npy")) ) else: conference_node_features = torch.from_numpy( np.load( - os.path.join(self.base_path, "conference", "node_feat.npy"), + os.path.join( + self.base_path, + "conference", + "node_feat.npy"), mmap_mode="r", ) ) @@ -501,13 +540,18 @@ def process(self): if self.in_memory: if self.use_fp16: journal_node_features = torch.load( - os.path.join(self.base_path, "journal", "node_feat_fp16.pt") + os.path.join( + self.base_path, + "journal", + "node_feat_fp16.pt") ) else: journal_node_features = torch.from_numpy( np.load( - os.path.join(self.base_path, "journal", "node_feat.npy") - ) + os.path.join( + self.base_path, + "journal", + "node_feat.npy")) ) else: journal_node_features = torch.from_numpy( @@ -518,10 +562,17 @@ def process(self): ) self.feat_dict["journal"] = journal_node_features - # Please ensure that train_idx and val_idx have been generated using split_seeds.py + # Please ensure that train_idx and val_idx have been generated using + # split_seeds.py try: - self.train_idx = torch.load(os.path.join(self.base_path, "train_idx.pt")) - self.val_idx = torch.load(os.path.join(self.base_path, "val_idx.pt")) + self.train_idx = torch.load( + os.path.join( + self.base_path, + "train_idx.pt")) + self.val_idx = torch.load( + os.path.join( + self.base_path, + "val_idx.pt")) except FileNotFoundError as e: print( f"FileNotFound: {e}, please ensure that train_idx and val_idx have been generated using split_seeds.py" @@ -565,10 +616,10 @@ def get_samples(self, id_list): def get_labels(self, id_list): return self.igbh_dataset.label[self.get_samples(id_list)] - + def get_item_count(self): return len(self.igbh_dataset.val_idx) - + def load_query_samples(self, id): pass @@ -581,7 +632,8 @@ def __init__( self, device="cpu", dtype="uint8", - statistics_path=os.path.join(os.path.dirname(__file__), "tools", "val2014.npz"), + statistics_path=os.path.join( + os.path.dirname(__file__), "tools", "val2014.npz"), ): self.results = [] self.content_ids = [] @@ -603,8 +655,7 @@ def finalize(self, result_dict, ds=None, output_dir=None): total = len(self.results) good = 0 for l, r in zip(labels, self.results): - if (l == r): + if l == r: good += 1 result_dict["accuracy"] = good / total return result_dict - diff --git a/upcomming_benchmarks/graph/R-GAT/main.py b/upcomming_benchmarks/graph/R-GAT/main.py index 145c3931b..a7697481a 100644 --- a/upcomming_benchmarks/graph/R-GAT/main.py +++ b/upcomming_benchmarks/graph/R-GAT/main.py @@ -60,7 +60,7 @@ dataset.preprocess, igbh.PostProcessIGBH(), {"dataset_size": "full", "use_label_2K": True}, - ) + ), } @@ -108,17 +108,32 @@ def get_args(): parser = argparse.ArgumentParser() # Dataset arguments - parser.add_argument("--dataset", 
choices=SUPPORTED_DATASETS.keys(), help="dataset") - parser.add_argument("--dataset-path", required=True, help="path to the dataset") - parser.add_argument("--in-memory", action="store_true", help="path to the dataset") - parser.add_argument("--layout", default="COO", choices=["CSC", "CSR", "COO"], help="path to the dataset") + parser.add_argument( + "--dataset", + choices=SUPPORTED_DATASETS.keys(), + help="dataset") + parser.add_argument( + "--dataset-path", + required=True, + help="path to the dataset") + parser.add_argument( + "--in-memory", + action="store_true", + help="path to the dataset") + parser.add_argument( + "--layout", + default="COO", + choices=["CSC", "CSR", "COO"], + help="path to the dataset", + ) parser.add_argument( "--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles" ) parser.add_argument( "--scenario", default="SingleStream", - help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys())), + help="mlperf benchmark scenario, one of " + + str(list(SCENARIO_MAP.keys())), ) parser.add_argument( "--max-batchsize", @@ -127,7 +142,10 @@ def get_args(): help="max batch size in a single inference", ) parser.add_argument("--threads", default=1, type=int, help="threads") - parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") parser.add_argument( "--find-peak-performance", action="store_true", @@ -168,12 +186,16 @@ def get_args(): "--audit_conf", default="audit.config", help="config for LoadGen audit settings" ) - # below will override mlperf rules compliant settings - don't use for official submission + # below will override mlperf rules compliant settings - don't use for + # official submission parser.add_argument("--time", type=int, help="time to scan in seconds") parser.add_argument("--count", type=int, help="dataset items to use") parser.add_argument("--debug", action="store_true", help="debug") parser.add_argument( - "--performance-sample-count", type=int, help="performance sample count", default=5000 + "--performance-sample-count", + type=int, + help="performance sample count", + default=5000, ) parser.add_argument( "--max-latency", type=float, help="mlperf max latency in pct tile" @@ -206,6 +228,7 @@ def get_args(): def get_backend(backend, **kwargs): if backend == "pytorch": from backend_pytorch import BackendPytorch + backend = BackendPytorch(**kwargs) else: raise ValueError("unknown backend: " + backend) @@ -279,10 +302,9 @@ def enqueue(self, query_samples): else: bs = self.max_batchsize for i in range(0, len(idx), bs): - samples = self.ds.get_samples(idx[i : i + bs]) + samples = self.ds.get_samples(idx[i: i + bs]) self.run_one_item( - Item(query_id[i : i + bs], idx[i : i + bs], samples) - ) + Item(query_id[i: i + bs], idx[i: i + bs], samples)) def finish(self): pass @@ -296,7 +318,9 @@ def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): self.result_dict = {} for _ in range(self.threads): - worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,)) + worker = threading.Thread( + target=self.handle_tasks, args=( + self.tasks,)) worker.daemon = True self.workers.append(worker) worker.start() @@ -357,7 +381,7 @@ def main(): ckpt_path=args.model_path, batch_size=args.max_batchsize, igbh_dataset=ds.igbh_dataset, - layout=args.layout + layout=args.layout, ) # --count applies to accuracy mode only and can be used to limit the number of images @@ -389,7 +413,6 @@ def main(): sys.exit(1) 
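Equivalent to the counting loop in PostProcessIGBH.finalize above, a vectorised sketch of the top-1 accuracy computation (the function name is illustrative only):

import numpy as np

def top1_accuracy(labels, predictions):
    # fraction of positions where prediction equals label, as in finalize()
    labels = np.asarray(labels)
    predictions = np.asarray(predictions)
    return float((labels == predictions).mean())

# top1_accuracy([3, 7, 7], [3, 7, 1]) -> 0.666...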
audit_config = os.path.abspath(args.audit_conf) - if args.output: output_dir = os.path.abspath(args.output) @@ -458,7 +481,8 @@ def flush_queries(): settings.multi_stream_samples_per_query = args.samples_per_query if args.max_latency: settings.server_target_latency_ns = int(args.max_latency * NANO_SEC) - settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC) + settings.multi_stream_expected_latency_ns = int( + args.max_latency * NANO_SEC) performance_sample_count = ( args.performance_sample_count diff --git a/upcomming_benchmarks/graph/R-GAT/rgnn.py b/upcomming_benchmarks/graph/R-GAT/rgnn.py index 8743fe309..11d43eac9 100644 --- a/upcomming_benchmarks/graph/R-GAT/rgnn.py +++ b/upcomming_benchmarks/graph/R-GAT/rgnn.py @@ -7,63 +7,99 @@ from torch_geometric.nn import HeteroConv, GATConv, GCNConv, SAGEConv from torch_geometric.utils import trim_to_layer + class RGNN(torch.nn.Module): - r""" [Relational GNN model](https://arxiv.org/abs/1703.06103). + r"""[Relational GNN model](https://arxiv.org/abs/1703.06103). + + Args: + etypes: edge types. + in_dim: input size. + h_dim: Dimension of hidden layer. + out_dim: Output dimension. + num_layers: Number of conv layers. + dropout: Dropout probability for hidden layers. + model: "rsage" or "rgat". + heads: Number of multi-head-attentions for GAT. + node_type: The predict node type for node classification. + + """ - Args: - etypes: edge types. - in_dim: input size. - h_dim: Dimension of hidden layer. - out_dim: Output dimension. - num_layers: Number of conv layers. - dropout: Dropout probability for hidden layers. - model: "rsage" or "rgat". - heads: Number of multi-head-attentions for GAT. - node_type: The predict node type for node classification. + def __init__( + self, + etypes, + in_dim, + h_dim, + out_dim, + num_layers=2, + dropout=0.2, + model="rgat", + heads=4, + node_type=None, + with_trim=False, + ): + super().__init__() + self.node_type = node_type + if node_type is not None: + self.lin = torch.nn.Linear(h_dim, out_dim) - """ - def __init__(self, etypes, in_dim, h_dim, out_dim, num_layers=2, - dropout=0.2, model='rgat', heads=4, node_type=None, with_trim=False): - super().__init__() - self.node_type = node_type - if node_type is not None: - self.lin = torch.nn.Linear(h_dim, out_dim) + self.convs = torch.nn.ModuleList() + for i in range(num_layers): + in_dim = in_dim if i == 0 else h_dim + h_dim = out_dim if ( + i == ( + num_layers - + 1) and node_type is None) else h_dim + if model == "rsage": + self.convs.append( + HeteroConv( + { + etype: SAGEConv(in_dim, h_dim, root_weight=False) + for etype in etypes + } + ) + ) + elif model == "rgat": + self.convs.append( + HeteroConv( + { + etype: GATConv( + in_dim, + h_dim // heads, + heads=heads, + add_self_loops=False, + ) + for etype in etypes + } + ) + ) + self.dropout = torch.nn.Dropout(dropout) + self.with_trim = with_trim - self.convs = torch.nn.ModuleList() - for i in range(num_layers): - in_dim = in_dim if i == 0 else h_dim - h_dim = out_dim if (i == (num_layers - 1) and node_type is None) else h_dim - if model == 'rsage': - self.convs.append(HeteroConv({ - etype: SAGEConv(in_dim, h_dim, root_weight=False) - for etype in etypes})) - elif model == 'rgat': - self.convs.append(HeteroConv({ - etype: GATConv(in_dim, h_dim // heads, heads=heads, add_self_loops=False) - for etype in etypes})) - self.dropout = torch.nn.Dropout(dropout) - self.with_trim = with_trim + def forward( + self, + x_dict, + edge_index_dict, + num_sampled_edges_dict=None, + num_sampled_nodes_dict=None, + 
): + for i, conv in enumerate(self.convs): + if self.with_trim: + x_dict, edge_index_dict, _ = trim_to_layer( + layer=i, + num_sampled_nodes_per_hop=num_sampled_nodes_dict, + num_sampled_edges_per_hop=num_sampled_edges_dict, + x=x_dict, + edge_index=edge_index_dict, + ) + for key in list(edge_index_dict.keys()): + if key[0] not in x_dict or key[-1] not in x_dict: + del edge_index_dict[key] - def forward(self, x_dict, edge_index_dict, num_sampled_edges_dict=None, - num_sampled_nodes_dict=None): - for i, conv in enumerate(self.convs): - if self.with_trim: - x_dict, edge_index_dict, _ = trim_to_layer( - layer=i, - num_sampled_nodes_per_hop=num_sampled_nodes_dict, - num_sampled_edges_per_hop=num_sampled_edges_dict, - x=x_dict, - edge_index=edge_index_dict - ) - for key in list(edge_index_dict.keys()): - if key[0] not in x_dict or key[-1] not in x_dict: - del edge_index_dict[key] - - x_dict = conv(x_dict, edge_index_dict) - if i != len(self.convs) - 1: - x_dict = {key: F.leaky_relu(x) for key, x in x_dict.items()} - x_dict = {key: self.dropout(x) for key, x in x_dict.items()} - if hasattr(self, 'lin'): # for node classification - return self.lin(x_dict[self.node_type]) - else: - return x_dict \ No newline at end of file + x_dict = conv(x_dict, edge_index_dict) + if i != len(self.convs) - 1: + x_dict = {key: F.leaky_relu(x) for key, x in x_dict.items()} + x_dict = {key: self.dropout(x) for key, x in x_dict.items()} + if hasattr(self, "lin"): # for node classification + return self.lin(x_dict[self.node_type]) + else: + return x_dict diff --git a/upcomming_benchmarks/graph/R-GAT/tools/compress_graph.py b/upcomming_benchmarks/graph/R-GAT/tools/compress_graph.py index 252d8458b..6fa2c2989 100644 --- a/upcomming_benchmarks/graph/R-GAT/tools/compress_graph.py +++ b/upcomming_benchmarks/graph/R-GAT/tools/compress_graph.py @@ -1,6 +1,13 @@ -# This is a modified version of a script taken from: https://github.com/mlcommons/training/blob/master/graph_neural_network/compress_graph.py +# This is a modified version of a script taken from: +# https://github.com/mlcommons/training/blob/master/graph_neural_network/compress_graph.py -import argparse, datetime, os +from typing import Literal +from torch_geometric.utils import add_self_loops, remove_self_loops +from igb.download import download_dataset +from igbh import float2half +import argparse +import datetime +import os import numpy as np import torch import os.path as osp @@ -8,113 +15,237 @@ import graphlearn_torch as glt import sys + sys.path.append(os.path.dirname(__file__)) sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir)) -from igbh import float2half -from igb.download import download_dataset -from torch_geometric.utils import add_self_loops, remove_self_loops -from typing import Literal class IGBHeteroDatasetCompress(object): - def __init__(self, - path, - dataset_size, - layout: Literal['CSC', 'CSR'] = 'CSC',): - self.dir = path - self.dataset_size = dataset_size - self.layout = layout - - self.ntypes = ['paper', 'author', 'institute', 'fos'] - self.etypes = None - self.edge_dict = {} - self.paper_nodes_num = {'tiny':100000, 'small':1000000, 'medium':10000000, 'large':100000000, 'full':269346174} - self.author_nodes_num = {'tiny':357041, 'small':1926066, 'medium':15544654, 'large':116959896, 'full':277220883} - if not osp.exists(osp.join(path, self.dataset_size, 'processed')): - download_dataset(path, 'heterogeneous', dataset_size) - self.process() + def __init__( + self, + path, + dataset_size, + layout: Literal["CSC", "CSR"] = 
"CSC", + ): + self.dir = path + self.dataset_size = dataset_size + self.layout = layout - def process(self): - paper_paper_edges = torch.from_numpy(np.load(osp.join(self.dir, self.dataset_size, 'processed', - 'paper__cites__paper', 'edge_index.npy'))).t() - author_paper_edges = torch.from_numpy(np.load(osp.join(self.dir, self.dataset_size, 'processed', - 'paper__written_by__author', 'edge_index.npy'))).t() - affiliation_author_edges = torch.from_numpy(np.load(osp.join(self.dir, self.dataset_size, 'processed', - 'author__affiliated_to__institute', 'edge_index.npy'))).t() - paper_fos_edges = torch.from_numpy(np.load(osp.join(self.dir, self.dataset_size, 'processed', - 'paper__topic__fos', 'edge_index.npy'))).t() - paper_published_journal = torch.from_numpy(np.load(osp.join(self.dir, self.dataset_size, 'processed', - 'paper__published__journal', 'edge_index.npy'))).t() - paper_venue_conference = torch.from_numpy(np.load(osp.join(self.dir, self.dataset_size, 'processed', - 'paper__venue__conference', 'edge_index.npy'))).t() + self.ntypes = ["paper", "author", "institute", "fos"] + self.etypes = None + self.edge_dict = {} + self.paper_nodes_num = { + "tiny": 100000, + "small": 1000000, + "medium": 10000000, + "large": 100000000, + "full": 269346174, + } + self.author_nodes_num = { + "tiny": 357041, + "small": 1926066, + "medium": 15544654, + "large": 116959896, + "full": 277220883, + } + if not osp.exists(osp.join(path, self.dataset_size, "processed")): + download_dataset(path, "heterogeneous", dataset_size) + self.process() - cites_edge = add_self_loops(remove_self_loops(paper_paper_edges)[0])[0] - self.edge_dict = { - ('paper', 'cites', 'paper'): (torch.cat([cites_edge[1, :], cites_edge[0, :]]), torch.cat([cites_edge[0, :], cites_edge[1, :]])), - ('paper', 'written_by', 'author'): author_paper_edges, - ('author', 'affiliated_to', 'institute'): affiliation_author_edges, - ('paper', 'topic', 'fos'): paper_fos_edges, - ('author', 'rev_written_by', 'paper'): (author_paper_edges[1, :], author_paper_edges[0, :]), - ('institute', 'rev_affiliated_to', 'author'): (affiliation_author_edges[1, :], affiliation_author_edges[0, :]), - ('fos', 'rev_topic', 'paper'): (paper_fos_edges[1, :], paper_fos_edges[0, :]) - } - self.edge_dict[('paper', 'published', 'journal')] = paper_published_journal - self.edge_dict[('paper', 'venue', 'conference')] = paper_venue_conference - self.edge_dict[('journal', 'rev_published', 'paper')] = (paper_published_journal[1, :], paper_published_journal[0, :]) - self.edge_dict[('conference', 'rev_venue', 'paper')] = (paper_venue_conference[1, :], paper_venue_conference[0, :]) - self.etypes = list(self.edge_dict.keys()) + def process(self): + paper_paper_edges = torch.from_numpy( + np.load( + osp.join( + self.dir, + self.dataset_size, + "processed", + "paper__cites__paper", + "edge_index.npy", + ) + ) + ).t() + author_paper_edges = torch.from_numpy( + np.load( + osp.join( + self.dir, + self.dataset_size, + "processed", + "paper__written_by__author", + "edge_index.npy", + ) + ) + ).t() + affiliation_author_edges = torch.from_numpy( + np.load( + osp.join( + self.dir, + self.dataset_size, + "processed", + "author__affiliated_to__institute", + "edge_index.npy", + ) + ) + ).t() + paper_fos_edges = torch.from_numpy( + np.load( + osp.join( + self.dir, + self.dataset_size, + "processed", + "paper__topic__fos", + "edge_index.npy", + ) + ) + ).t() + paper_published_journal = torch.from_numpy( + np.load( + osp.join( + self.dir, + self.dataset_size, + "processed", + "paper__published__journal", 
+ "edge_index.npy", + ) + ) + ).t() + paper_venue_conference = torch.from_numpy( + np.load( + osp.join( + self.dir, + self.dataset_size, + "processed", + "paper__venue__conference", + "edge_index.npy", + ) + ) + ).t() - # init graphlearn_torch Dataset. - edge_dir = 'out' if self.layout == 'CSR' else 'in' - glt_dataset = glt.data.Dataset(edge_dir=edge_dir) - glt_dataset.init_graph( - edge_index=self.edge_dict, - graph_mode='CPU', - ) + cites_edge = add_self_loops(remove_self_loops(paper_paper_edges)[0])[0] + self.edge_dict = { + ("paper", "cites", "paper"): ( + torch.cat([cites_edge[1, :], cites_edge[0, :]]), + torch.cat([cites_edge[0, :], cites_edge[1, :]]), + ), + ("paper", "written_by", "author"): author_paper_edges, + ("author", "affiliated_to", "institute"): affiliation_author_edges, + ("paper", "topic", "fos"): paper_fos_edges, + ("author", "rev_written_by", "paper"): ( + author_paper_edges[1, :], + author_paper_edges[0, :], + ), + ("institute", "rev_affiliated_to", "author"): ( + affiliation_author_edges[1, :], + affiliation_author_edges[0, :], + ), + ("fos", "rev_topic", "paper"): ( + paper_fos_edges[1, :], + paper_fos_edges[0, :], + ), + } + self.edge_dict[("paper", "published", "journal") + ] = paper_published_journal + self.edge_dict[("paper", "venue", "conference") + ] = paper_venue_conference + self.edge_dict[("journal", "rev_published", "paper")] = ( + paper_published_journal[1, :], + paper_published_journal[0, :], + ) + self.edge_dict[("conference", "rev_venue", "paper")] = ( + paper_venue_conference[1, :], + paper_venue_conference[0, :], + ) + self.etypes = list(self.edge_dict.keys()) - # save the corresponding csr or csc file - compress_edge_dict = {} - compress_edge_dict[('paper', 'cites', 'paper')] = 'paper__cites__paper' - compress_edge_dict[('paper', 'written_by', 'author')] = 'paper__written_by__author' - compress_edge_dict[('author', 'affiliated_to', 'institute')] = 'author__affiliated_to__institute' - compress_edge_dict[('paper', 'topic', 'fos')] = 'paper__topic__fos' - compress_edge_dict[('author', 'rev_written_by', 'paper')] = 'author__rev_written_by__paper' - compress_edge_dict[('institute', 'rev_affiliated_to', 'author')] = 'institute__rev_affiliated_to__author' - compress_edge_dict[('fos', 'rev_topic', 'paper')] = 'fos__rev_topic__paper' - compress_edge_dict[('paper', 'published', 'journal')] = 'paper__published__journal' - compress_edge_dict[('paper', 'venue', 'conference')] = 'paper__venue__conference' - compress_edge_dict[('journal', 'rev_published', 'paper')] = 'journal__rev_published__paper' - compress_edge_dict[('conference', 'rev_venue', 'paper')] = 'conference__rev_venue__paper' + # init graphlearn_torch Dataset. 
+ edge_dir = "out" if self.layout == "CSR" else "in" + glt_dataset = glt.data.Dataset(edge_dir=edge_dir) + glt_dataset.init_graph( + edge_index=self.edge_dict, + graph_mode="CPU", + ) - for etype in self.etypes: - graph = glt_dataset.get_graph(etype) - indptr, indices, _ = graph.export_topology() - path = os.path.join(self.dir, self.dataset_size, 'processed', self.layout, compress_edge_dict[etype]) - if not os.path.exists(path): - os.makedirs(path) - torch.save(indptr, os.path.join(path, 'indptr.pt')) - torch.save(indices, os.path.join(path, 'indices.pt')) - path = os.path.join(self.dir, self.dataset_size, 'processed', self.layout) - print(f"The {self.layout} graph has been persisted in path: {path}") + # save the corresponding csr or csc file + compress_edge_dict = {} + compress_edge_dict[("paper", "cites", "paper")] = "paper__cites__paper" + compress_edge_dict[("paper", "written_by", "author")] = ( + "paper__written_by__author" + ) + compress_edge_dict[("author", "affiliated_to", "institute")] = ( + "author__affiliated_to__institute" + ) + compress_edge_dict[("paper", "topic", "fos")] = "paper__topic__fos" + compress_edge_dict[("author", "rev_written_by", "paper")] = ( + "author__rev_written_by__paper" + ) + compress_edge_dict[("institute", "rev_affiliated_to", "author")] = ( + "institute__rev_affiliated_to__author" + ) + compress_edge_dict[("fos", "rev_topic", "paper") + ] = "fos__rev_topic__paper" + compress_edge_dict[("paper", "published", "journal")] = ( + "paper__published__journal" + ) + compress_edge_dict[("paper", "venue", "conference")] = ( + "paper__venue__conference" + ) + compress_edge_dict[("journal", "rev_published", "paper")] = ( + "journal__rev_published__paper" + ) + compress_edge_dict[("conference", "rev_venue", "paper")] = ( + "conference__rev_venue__paper" + ) + for etype in self.etypes: + graph = glt_dataset.get_graph(etype) + indptr, indices, _ = graph.export_topology() + path = os.path.join( + self.dir, + self.dataset_size, + "processed", + self.layout, + compress_edge_dict[etype], + ) + if not os.path.exists(path): + os.makedirs(path) + torch.save(indptr, os.path.join(path, "indptr.pt")) + torch.save(indices, os.path.join(path, "indices.pt")) + path = os.path.join( + self.dir, + self.dataset_size, + "processed", + self.layout) + print(f"The {self.layout} graph has been persisted in path: {path}") -if __name__ == '__main__': - parser = argparse.ArgumentParser() - root = osp.join(osp.dirname(osp.dirname(osp.dirname(osp.realpath(__file__)))), 'data', 'igbh') - glt.utils.ensure_dir(root) - parser.add_argument('--path', type=str, default=root, - help='path containing the datasets') - parser.add_argument('--dataset_size', type=str, default='full', - choices=['tiny', 'small', 'medium', 'large', 'full'], - help='size of the datasets') - parser.add_argument("--layout", type=str, default='CSC') - parser.add_argument('--use_fp16', action="store_true", - help="convert the node/edge feature into fp16 format") - args = parser.parse_args() - print(f"Start constructing the {args.layout} graph...") - igbh_dataset = IGBHeteroDatasetCompress(args.path, args.dataset_size, args.layout) - if args.use_fp16: - base_path = osp.join(args.path, args.dataset_size, 'processed') - float2half(base_path, args.dataset_size) - \ No newline at end of file +if __name__ == "__main__": + parser = argparse.ArgumentParser() + root = osp.join( + osp.dirname( + osp.dirname( + osp.dirname( + osp.realpath(__file__)))), "data", "igbh" + ) + glt.utils.ensure_dir(root) + parser.add_argument( + "--path", 
type=str, default=root, help="path containing the datasets" + ) + parser.add_argument( + "--dataset_size", + type=str, + default="full", + choices=["tiny", "small", "medium", "large", "full"], + help="size of the datasets", + ) + parser.add_argument("--layout", type=str, default="CSC") + parser.add_argument( + "--use_fp16", + action="store_true", + help="convert the node/edge feature into fp16 format", + ) + args = parser.parse_args() + print(f"Start constructing the {args.layout} graph...") + igbh_dataset = IGBHeteroDatasetCompress( + args.path, args.dataset_size, args.layout) + if args.use_fp16: + base_path = osp.join(args.path, args.dataset_size, "processed") + float2half(base_path, args.dataset_size) diff --git a/upcomming_benchmarks/graph/R-GAT/tools/format_model.py b/upcomming_benchmarks/graph/R-GAT/tools/format_model.py index b7cd50507..04d326722 100644 --- a/upcomming_benchmarks/graph/R-GAT/tools/format_model.py +++ b/upcomming_benchmarks/graph/R-GAT/tools/format_model.py @@ -1,33 +1,32 @@ +from collections import OrderedDict +from graphlearn_torch.typing import InputNodes, NumNeighbors +from graphlearn_torch.sampler import NeighborSampler, NodeSamplerInput +from graphlearn_torch.data import Dataset +from graphlearn_torch.loader import NodeLoader +import graphlearn_torch as glt +from igbh import IGBHeteroDataset, IGBH +from rgnn import RGNN from typing import Literal import torch -import os, sys +import os +import sys + sys.path.append(os.path.dirname(__file__)) sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir)) -from rgnn import RGNN -from igbh import IGBHeteroDataset, IGBH -import graphlearn_torch as glt - -from graphlearn_torch.loader import NodeLoader - -from graphlearn_torch.data import Dataset -from graphlearn_torch.sampler import NeighborSampler, NodeSamplerInput -from graphlearn_torch.typing import InputNodes, NumNeighbors - -from collections import OrderedDict - class Formatter: - def __init__(self, - model_type="rgat", - type: Literal["fp16", "fp32"] = "fp16", - device: Literal["cpu", "gpu"] = "gpu", - ckpt_path: str = None, - igbh_dataset: IGBHeteroDataset = None, - batch_size: int = 1, - layout: Literal["CSC", "CSR", "COO"] = "COO", - edge_dir: str = "in", - ) -> None: + def __init__( + self, + model_type="rgat", + type: Literal["fp16", "fp32"] = "fp16", + device: Literal["cpu", "gpu"] = "gpu", + ckpt_path: str = None, + igbh_dataset: IGBHeteroDataset = None, + batch_size: int = 1, + layout: Literal["CSC", "CSR", "COO"] = "COO", + edge_dir: str = "in", + ) -> None: if device == "gpu": self.device = torch.device("cuda") else: @@ -40,26 +39,33 @@ def __init__(self, # Create Node and neighbor loade self.glt_dataset = glt.data.Dataset(edge_dir=edge_dir) self.glt_dataset.init_node_features( - node_feature_data=igbh_dataset.feat_dict, with_gpu=(device == "gpu"), dtype=self.type + node_feature_data=igbh_dataset.feat_dict, + with_gpu=(device == "gpu"), + dtype=self.type, ) self.glt_dataset.init_graph( edge_index=igbh_dataset.edge_dict, layout=layout, graph_mode="ZERO_COPY" if (device == "gpu") else "CPU", ) - self.glt_dataset.init_node_labels(node_label_data={"paper": igbh_dataset.label}) + self.glt_dataset.init_node_labels( + node_label_data={"paper": igbh_dataset.label}) - self.model = RGNN( - self.glt_dataset.get_edge_types(), - self.glt_dataset.node_features["paper"].shape[1], - 512, - 2983, - num_layers=3, - dropout=0.2, - model=model_type, - heads=4, - node_type="paper", - ).to(self.type).to(self.device) + self.model = ( + RGNN( + 
self.glt_dataset.get_edge_types(), + self.glt_dataset.node_features["paper"].shape[1], + 512, + 2983, + num_layers=3, + dropout=0.2, + model=model_type, + heads=4, + node_type="paper", + ) + .to(self.type) + .to(self.device) + ) self.model.eval() ckpt = None if ckpt_path is not None: @@ -75,12 +81,20 @@ def __init__(self, if "lin_dst" in k: pass elif "lin_src" in k: - formatted_ckpt["model_state_dict"][str(k).replace("lin_src", "lin")] = ckpt["model_state_dict"][k] + formatted_ckpt["model_state_dict"][ + str(k).replace("lin_src", "lin") + ] = ckpt["model_state_dict"][k] else: formatted_ckpt["model_state_dict"][k] = ckpt["model_state_dict"][k] self.model.load_state_dict(formatted_ckpt["model_state_dict"]) - torch.save({'model_state_dict': self.model.state_dict()}, "model/FULL_model_seq_69294_formatted.ckpt") + torch.save( + {"model_state_dict": self.model.state_dict()}, + "model/FULL_model_seq_69294_formatted.ckpt", + ) + if __name__ == "__main__": igbh = IGBHeteroDataset("igbh", use_label_2K=True) - f = Formatter(igbh_dataset=igbh, ckpt_path="model/FULL_model_seq_69294.ckpt", device="cpu") + f = Formatter( + igbh_dataset=igbh, ckpt_path="model/FULL_model_seq_69294.ckpt", device="cpu" + ) diff --git a/upcomming_benchmarks/graph/R-GAT/tools/split_seeds.py b/upcomming_benchmarks/graph/R-GAT/tools/split_seeds.py index 9dcfe7168..5bbed6d0a 100644 --- a/upcomming_benchmarks/graph/R-GAT/tools/split_seeds.py +++ b/upcomming_benchmarks/graph/R-GAT/tools/split_seeds.py @@ -1,61 +1,93 @@ -# Script taken from https://github.com/mlcommons/training/blob/master/graph_neural_network/split_seeds.py +# Script taken from +# https://github.com/mlcommons/training/blob/master/graph_neural_network/split_seeds.py import argparse import os.path as osp import torch + class SeedSplitter(object): - def __init__(self, - path, - dataset_size='tiny', - use_label_2K=True, - random_seed=42, - validation_frac=0.01): - self.path = path - self.dataset_size = dataset_size - self.use_label_2K = use_label_2K - self.random_seed = random_seed - self.validation_frac = validation_frac - self.paper_nodes_num = {'tiny':100000, 'small':1000000, 'medium':10000000, 'large':100000000, 'full':269346174} - self.process() - - def process(self): - torch.manual_seed(self.random_seed) - n_labeled_idx = self.paper_nodes_num[self.dataset_size] - if self.dataset_size == 'full': - if self.use_label_2K: - n_labeled_idx = 157675969 - else: - n_labeled_idx = 227130858 - - shuffled_index = torch.randperm(n_labeled_idx) - n_train = int(n_labeled_idx * 0.6) - n_val = int(n_labeled_idx * self.validation_frac) - - train_idx = shuffled_index[:n_train] - val_idx = shuffled_index[n_train : n_train + n_val] - - path = osp.join(self.path, self.dataset_size, 'processed') - torch.save(train_idx, osp.join(path, 'train_idx.pt')) - torch.save(val_idx, osp.join(path, 'val_idx.pt')) - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - root = osp.join(osp.dirname(osp.dirname(osp.dirname(osp.realpath(__file__)))), 'data', 'igbh') - parser.add_argument('--path', type=str, default=root, - help='path containing the datasets') - parser.add_argument('--dataset_size', type=str, default='full', - choices=['tiny', 'small', 'medium', 'large', 'full'], - help='size of the datasets') - parser.add_argument("--random_seed", type=int, default='42') - parser.add_argument('--num_classes', type=int, default=2983, - choices=[19, 2983], help='number of classes') - parser.add_argument("--validation_frac", type=float, default=0.005, - help="Fraction of labeled vertices to 
be used for validation.") - - args = parser.parse_args() - splitter = SeedSplitter(path=args.path, - dataset_size=args.dataset_size, - use_label_2K=(args.num_classes==2983), - random_seed=args.random_seed, - validation_frac=args.validation_frac) \ No newline at end of file + def __init__( + self, + path, + dataset_size="tiny", + use_label_2K=True, + random_seed=42, + validation_frac=0.01, + ): + self.path = path + self.dataset_size = dataset_size + self.use_label_2K = use_label_2K + self.random_seed = random_seed + self.validation_frac = validation_frac + self.paper_nodes_num = { + "tiny": 100000, + "small": 1000000, + "medium": 10000000, + "large": 100000000, + "full": 269346174, + } + self.process() + + def process(self): + torch.manual_seed(self.random_seed) + n_labeled_idx = self.paper_nodes_num[self.dataset_size] + if self.dataset_size == "full": + if self.use_label_2K: + n_labeled_idx = 157675969 + else: + n_labeled_idx = 227130858 + + shuffled_index = torch.randperm(n_labeled_idx) + n_train = int(n_labeled_idx * 0.6) + n_val = int(n_labeled_idx * self.validation_frac) + + train_idx = shuffled_index[:n_train] + val_idx = shuffled_index[n_train: n_train + n_val] + + path = osp.join(self.path, self.dataset_size, "processed") + torch.save(train_idx, osp.join(path, "train_idx.pt")) + torch.save(val_idx, osp.join(path, "val_idx.pt")) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + root = osp.join( + osp.dirname( + osp.dirname( + osp.dirname( + osp.realpath(__file__)))), "data", "igbh" + ) + parser.add_argument( + "--path", type=str, default=root, help="path containing the datasets" + ) + parser.add_argument( + "--dataset_size", + type=str, + default="full", + choices=["tiny", "small", "medium", "large", "full"], + help="size of the datasets", + ) + parser.add_argument("--random_seed", type=int, default="42") + parser.add_argument( + "--num_classes", + type=int, + default=2983, + choices=[19, 2983], + help="number of classes", + ) + parser.add_argument( + "--validation_frac", + type=float, + default=0.005, + help="Fraction of labeled vertices to be used for validation.", + ) + + args = parser.parse_args() + splitter = SeedSplitter( + path=args.path, + dataset_size=args.dataset_size, + use_label_2K=(args.num_classes == 2983), + random_seed=args.random_seed, + validation_frac=args.validation_frac, + ) diff --git a/vision/classification_and_detection/python/backend.py b/vision/classification_and_detection/python/backend.py index 955eddb88..6fc13454a 100755 --- a/vision/classification_and_detection/python/backend.py +++ b/vision/classification_and_detection/python/backend.py @@ -2,10 +2,10 @@ abstract backend class """ - # pylint: disable=unused-argument,missing-docstring -class Backend(): + +class Backend: def __init__(self): self.inputs = [] self.outputs = [] diff --git a/vision/classification_and_detection/python/backend_ncnn.py b/vision/classification_and_detection/python/backend_ncnn.py index 3cc8ceabf..16d44a02f 100644 --- a/vision/classification_and_detection/python/backend_ncnn.py +++ b/vision/classification_and_detection/python/backend_ncnn.py @@ -3,6 +3,7 @@ import backend from ncnn_models import * + class BackendNCNN(backend.Backend): def __init__(self): super(BackendNCNN, self).__init__() @@ -24,9 +25,12 @@ def load(self, model_path, inputs=None, outputs=None): self.net = Resnet50(param_file, bin_file) else: import sys - print("please add your ncnn model .param and .bin files to dir named 'resnet'") + + print( + "please add your ncnn model .param and .bin 
files to dir named 'resnet'" + ) sys.exit() - + if not inputs: self.inputs = [self.net.input_name] else: diff --git a/vision/classification_and_detection/python/backend_onnxruntime.py b/vision/classification_and_detection/python/backend_onnxruntime.py index 2b25003c3..787e721f2 100755 --- a/vision/classification_and_detection/python/backend_onnxruntime.py +++ b/vision/classification_and_detection/python/backend_onnxruntime.py @@ -30,16 +30,23 @@ def load(self, model_path, inputs=None, outputs=None): # By default all optimizations are enabled # https://onnxruntime.ai/docs/performance/graph-optimizations.html - # Enable only upto extended optimizations on aarch64 due to an accuracy issue + # Enable only upto extended optimizations on aarch64 due to an accuracy + # issue if os.environ.get("HOST_PLATFORM_FLAVOR", "") == "aarch64": opt.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_EXTENDED # self.sess = rt.InferenceSession(model_path, opt) - if len(rt.get_all_providers()) > 1 and os.environ.get("USE_GPU", "yes").lower() not in [ "0", "false", "off", "no" ]: - self.sess = rt.InferenceSession(model_path, opt, providers=["CUDAExecutionProvider"]) + if len(rt.get_all_providers()) > 1 and os.environ.get( + "USE_GPU", "yes" + ).lower() not in ["0", "false", "off", "no"]: + self.sess = rt.InferenceSession( + model_path, opt, providers=["CUDAExecutionProvider"] + ) else: - self.sess = rt.InferenceSession(model_path, opt, providers=["CPUExecutionProvider"]) - + self.sess = rt.InferenceSession( + model_path, opt, providers=["CPUExecutionProvider"] + ) + # get input and output names if not inputs: self.inputs = [meta.name for meta in self.sess.get_inputs()] diff --git a/vision/classification_and_detection/python/backend_pytorch_native.py b/vision/classification_and_detection/python/backend_pytorch_native.py index 219e4159a..db152f8e0 100755 --- a/vision/classification_and_detection/python/backend_pytorch_native.py +++ b/vision/classification_and_detection/python/backend_pytorch_native.py @@ -1,6 +1,7 @@ """ -pytoch native backend +pytoch native backend """ + # pylint: disable=unused-argument,missing-docstring import torch # currently supports pytorch1.0 import torchvision diff --git a/vision/classification_and_detection/python/backend_tf.py b/vision/classification_and_detection/python/backend_tf.py index ab32b56fc..b53bf0b78 100755 --- a/vision/classification_and_detection/python/backend_tf.py +++ b/vision/classification_and_detection/python/backend_tf.py @@ -23,11 +23,13 @@ def name(self): return "tensorflow" def image_format(self): - # By default tensorflow uses NHWC (and the cpu implementation only does NHWC) + # By default tensorflow uses NHWC (and the cpu implementation only does + # NHWC) return "NHWC" def load(self, model_path, inputs=None, outputs=None): - # there is no input/output meta data i the graph so it need to come from config. + # there is no input/output meta data i the graph so it need to come + # from config. 
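# Illustration only, not part of this diff: the onnxruntime provider selection
# shown above can be read as a small helper. get_all_providers() and
# InferenceSession(path, sess_options, providers=...) are existing onnxruntime
# APIs; USE_GPU is the backend's own environment-variable opt-out switch.
import os
import onnxruntime as rt

def make_session(model_path, opt=None):
    want_gpu = os.environ.get("USE_GPU", "yes").lower() not in ["0", "false", "off", "no"]
    if len(rt.get_all_providers()) > 1 and want_gpu:
        providers = ["CUDAExecutionProvider"]  # GPU-capable build, GPU not disabled
    else:
        providers = ["CPUExecutionProvider"]   # portable default
    return rt.InferenceSession(model_path, opt, providers=providers)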
if not inputs: raise ValueError("BackendTensorflow needs inputs") if not outputs: @@ -36,10 +38,16 @@ def load(self, model_path, inputs=None, outputs=None): self.inputs = inputs infer_config = tf.compat.v1.ConfigProto() - infer_config.intra_op_parallelism_threads = int(os.environ['TF_INTRA_OP_PARALLELISM_THREADS']) \ - if 'TF_INTRA_OP_PARALLELISM_THREADS' in os.environ else os.cpu_count() - infer_config.inter_op_parallelism_threads = int(os.environ['TF_INTER_OP_PARALLELISM_THREADS']) \ - if 'TF_INTER_OP_PARALLELISM_THREADS' in os.environ else os.cpu_count() + infer_config.intra_op_parallelism_threads = ( + int(os.environ["TF_INTRA_OP_PARALLELISM_THREADS"]) + if "TF_INTRA_OP_PARALLELISM_THREADS" in os.environ + else os.cpu_count() + ) + infer_config.inter_op_parallelism_threads = ( + int(os.environ["TF_INTER_OP_PARALLELISM_THREADS"]) + if "TF_INTER_OP_PARALLELISM_THREADS" in os.environ + else os.cpu_count() + ) infer_config.use_per_session_threads = 1 # TODO: support checkpoint and saved_model formats? @@ -47,16 +55,26 @@ def load(self, model_path, inputs=None, outputs=None): with tf.compat.v1.gfile.FastGFile(model_path, "rb") as f: graph_def.ParseFromString(f.read()) try: - optimized_graph_def = optimize_for_inference(graph_def, [item.split(':')[0] for item in inputs], - [item.split(':')[0] for item in outputs], dtypes.float32.as_datatype_enum, False) - g = tf.compat.v1.import_graph_def(optimized_graph_def, name='') + optimized_graph_def = optimize_for_inference( + graph_def, + [item.split(":")[0] for item in inputs], + [item.split(":")[0] for item in outputs], + dtypes.float32.as_datatype_enum, + False, + ) + g = tf.compat.v1.import_graph_def(optimized_graph_def, name="") except ValueError: try: - optimized_graph_def = optimize_for_inference(graph_def, [item.split(':')[0] for item in inputs], - [item.split(':')[0] for item in outputs], dtypes.uint8.as_datatype_enum, False) - g = tf.compat.v1.import_graph_def(optimized_graph_def, name='') + optimized_graph_def = optimize_for_inference( + graph_def, + [item.split(":")[0] for item in inputs], + [item.split(":")[0] for item in outputs], + dtypes.uint8.as_datatype_enum, + False, + ) + g = tf.compat.v1.import_graph_def(optimized_graph_def, name="") except ValueError: - g = tf.compat.v1.import_graph_def(graph_def, name='') + g = tf.compat.v1.import_graph_def(graph_def, name="") self.sess = tf.compat.v1.Session(graph=g, config=infer_config) return self diff --git a/vision/classification_and_detection/python/backend_tflite.py b/vision/classification_and_detection/python/backend_tflite.py index 7c8c78c13..fa6cc5ba2 100755 --- a/vision/classification_and_detection/python/backend_tflite.py +++ b/vision/classification_and_detection/python/backend_tflite.py @@ -10,12 +10,14 @@ # try dedicated tflite package first import tflite_runtime import tflite_runtime.interpreter as tflite + _version = tflite_runtime.__version__ _git_version = tflite_runtime.__git_version__ -except: +except BaseException: # fall back to tflite bundled in tensorflow import tensorflow as tf from tensorflow.lite.python import interpreter as tflite + _version = tf.__version__ _git_version = tf.__git_version__ @@ -43,8 +45,12 @@ def load(self, model_path, inputs=None, outputs=None): self.sess = tflite.Interpreter(model_path=model_path) self.sess.allocate_tensors() # keep input/output name to index mapping - self.input2index = {i["name"]: i["index"] for i in self.sess.get_input_details()} - self.output2index = {i["name"]: i["index"] for i in self.sess.get_output_details()} + 
self.input2index = { + i["name"]: i["index"] for i in self.sess.get_input_details() + } + self.output2index = { + i["name"]: i["index"] for i in self.sess.get_output_details() + } # keep input/output names self.inputs = list(self.input2index.keys()) self.outputs = list(self.output2index.keys()) diff --git a/vision/classification_and_detection/python/backend_tvm.py b/vision/classification_and_detection/python/backend_tvm.py index 7e5a97787..86223b60e 100644 --- a/vision/classification_and_detection/python/backend_tvm.py +++ b/vision/classification_and_detection/python/backend_tvm.py @@ -41,8 +41,10 @@ def image_format(self): def create_omp_args(self, arena_idx): idx_start = self.arena_size * arena_idx - cur_arena_size = min(multiprocessing.cpu_count() - - idx_start, self.arena_size) + cur_arena_size = min( + multiprocessing.cpu_count() - + idx_start, + self.arena_size) # idx_end = idx_start + cur_arena_size # OMP_PLACES="{N},{N+1},{N+2},...,{N+SZ}" @@ -59,7 +61,7 @@ def create_omp_args(self, arena_idx): def set_omp_envs(omp_args): for env_arg in omp_args: os.environ[env_arg[0]] = env_arg[1] - + @staticmethod def vmobj_to_list(o): if isinstance(o, tvm.nd.NDArray): @@ -81,7 +83,8 @@ def vmobj_to_list(o): elif "tensor" in o.constructor.name_hint: result = [o.fields[0].numpy()] else: - raise RuntimeError(f"Unknown object type: {o.constructor.name_hint}") + raise RuntimeError( + f"Unknown object type: {o.constructor.name_hint}") else: raise RuntimeError(f"Unknown object type: {type(o)}") return result @@ -90,25 +93,24 @@ def load_impl(self, model_path, inputs, outputs, max_batchsize): self.max_batchsize = max_batchsize _, self.model_format = os.path.splitext(model_path) - + work_dir = os.path.dirname(model_path) - compiled_model = os.path.join(work_dir, 'model-tvm.so') - - with open(os.path.join(work_dir, "input_layer_name"), 'r') as file: + compiled_model = os.path.join(work_dir, "model-tvm.so") + + with open(os.path.join(work_dir, "input_layer_name"), "r") as file: self.input_layer_name = file.read().strip() - if compiled_model.endswith('.so') or compiled_model.endswith('.dylib'): + if compiled_model.endswith(".so") or compiled_model.endswith(".dylib"): if not os.path.isfile(compiled_model): print() raise RuntimeError( - f"Error: Model file {compiled_model} not found!" - ) + f"Error: Model file {compiled_model} not found!") else: raise RuntimeError( f"Error: The specified path ({model_path}) does not match path to the compiled model!" 
) - print('TVM: loading model ' + compiled_model) + print("TVM: loading model " + compiled_model) mod = tvm.runtime.load_module(compiled_model) device = tvm.device("llvm", 0) @@ -123,8 +125,7 @@ def load_impl(self, model_path, inputs, outputs, max_batchsize): for sub_dir in next(os.walk(work_dir))[1]: if sub_dir.endswith("-tvm-tmp"): - path_consts = os.path.join( - work_dir, sub_dir + "/consts") + path_consts = os.path.join(work_dir, sub_dir + "/consts") break vm_exec.mod["load_late_bound_consts"](path_consts) @@ -132,23 +133,26 @@ def load_impl(self, model_path, inputs, outputs, max_batchsize): self.executor = runtime_vm.VirtualMachine(vm_exec, device) else: self.executor_type = "graph_executor" - self.executor = graph_executor.GraphModule( - mod['default'](device)) + self.executor = graph_executor.GraphModule(mod["default"](device)) if not inputs: if self.executor_type == "virtual_machine": - inputs = [str(idx) for idx in range( - self.executor.module["get_num_outputs"]())] + inputs = [ + str(idx) for idx in range(self.executor.module["get_num_outputs"]()) + ] else: - inputs = [str(idx) for idx in range( - self.executor.get_num_outputs())] + inputs = [ + str(idx) for idx in range( + self.executor.get_num_outputs())] if not outputs: if self.executor_type == "virtual_machine": - outputs = [str(idx) for idx in range( - self.executor.module["get_num_outputs"]())] + outputs = [ + str(idx) for idx in range(self.executor.module["get_num_outputs"]()) + ] else: - outputs = [str(idx) for idx in range( - self.executor.get_num_outputs())] + outputs = [ + str(idx) for idx in range( + self.executor.get_num_outputs())] self.inputs = inputs self.outputs = outputs @@ -163,7 +167,8 @@ def predict_impl(self, feed): item = np.vstack((item, item_extra)) elif batch_size > self.max_batchsize: raise ValueError( - "Internal MLPerf error: dynamic batch size > max batch size") + "Internal MLPerf error: dynamic batch size > max batch size" + ) input_idx = self.inputs.index(iname) if self.executor_type == "virtual_machine": self.executor.set_input( @@ -176,11 +181,14 @@ def predict_impl(self, feed): else: self.executor.set_input(input_idx, tvm.nd.array(item)) self.executor.run() - return [self.executor.get_output(0).asnumpy()[:batch_size], - self.executor.get_output(1).asnumpy()[:batch_size]] + return [ + self.executor.get_output(0).asnumpy()[:batch_size], + self.executor.get_output(1).asnumpy()[:batch_size], + ] @staticmethod - def _worker_initializer(model_path, inputs, outputs, max_batchsize, omp_envs): + def _worker_initializer(model_path, inputs, outputs, + max_batchsize, omp_envs): BackendTVM.set_omp_envs(omp_envs) global global_executor global_executor = BackendTVM() @@ -197,12 +205,20 @@ def load(self, model_path, inputs=None, outputs=None): self.load_impl(model_path, inputs, outputs, self.max_batchsize) if self.arena_num > 1: - multiprocessing.set_start_method(os.getenv("PYTHON_MP_START_METHOD", "fork")) - self.pool = multiprocessing.Pool(self.arena_num, - initializer=self._worker_initializer, - initargs=(model_path, inputs, outputs, self.max_batchsize, - self.create_omp_args(0)) - ) + multiprocessing.set_start_method( + os.getenv("PYTHON_MP_START_METHOD", "fork") + ) + self.pool = multiprocessing.Pool( + self.arena_num, + initializer=self._worker_initializer, + initargs=( + model_path, + inputs, + outputs, + self.max_batchsize, + self.create_omp_args(0), + ), + ) return self diff --git a/vision/classification_and_detection/python/coco.py b/vision/classification_and_detection/python/coco.py index 
8e88bbfc7..a3e747276 100644 --- a/vision/classification_and_detection/python/coco.py +++ b/vision/classification_and_detection/python/coco.py @@ -20,8 +20,21 @@ class Coco(dataset.Dataset): - def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, - image_format="NHWC", pre_process=None, count=None, cache_dir=None, preprocessed_dir=None, use_label_map=False, threads=os.cpu_count()): + def __init__( + self, + data_path, + image_list, + name, + use_cache=0, + image_size=None, + image_format="NHWC", + pre_process=None, + count=None, + cache_dir=None, + preprocessed_dir=None, + use_label_map=False, + threads=os.cpu_count(), + ): super().__init__() self.image_size = image_size self.image_list = [] @@ -32,17 +45,19 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, self.use_cache = use_cache self.data_path = data_path self.pre_process = pre_process - self.use_label_map=use_label_map + self.use_label_map = use_label_map if not cache_dir: cache_dir = os.getcwd() - self.cache_dir = os.path.join(cache_dir, "preprocessed", name, image_format) + self.cache_dir = os.path.join( + cache_dir, "preprocessed", name, image_format) # input images are in HWC self.need_transpose = True if image_format == "NCHW" else False - not_found = 0 + not_found = 0 empty_80catageories = 0 if image_list is None: # by default look for val_map.txt - image_list = os.path.join(data_path, "annotations/instances_val2017.json") + image_list = os.path.join( + data_path, "annotations/instances_val2017.json") self.annotation_file = image_list if self.use_label_map: # for pytorch @@ -58,16 +73,22 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, with open(image_list, "r") as f: coco = json.load(f) for i in coco["images"]: - images[i["id"]] = {"file_name": i["file_name"], - "height": i["height"], - "width": i["width"], - "bbox": [], - "category": []} + images[i["id"]] = { + "file_name": i["file_name"], + "height": i["height"], + "width": i["width"], + "bbox": [], + "category": [], + } for a in coco["annotations"]: i = images.get(a["image_id"]) if i is None: continue - catagory_ids = label_map[a.get("category_id")] if self.use_label_map else a.get("category_id") + catagory_ids = ( + label_map[a.get("category_id")] + if self.use_label_map + else a.get("category_id") + ) i["category"].append(catagory_ids) i["bbox"].append(a.get("bbox")) @@ -78,17 +99,23 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, # if the image does not exists ignore it not_found += 1 continue - if len(img["category"])==0 and self.use_label_map: - #if an image doesn't have any of the 81 categories in it - empty_80catageories += 1 #should be 48 images - thus the validation sert has 4952 images - continue + if len(img["category"]) == 0 and self.use_label_map: + # if an image doesn't have any of the 81 categories in it + empty_80catageories += ( + 1 # should be 48 images - thus the validation sert has 4952 images + ) + continue - os.makedirs(os.path.dirname(os.path.join(self.cache_dir, image_name)), exist_ok=True) + os.makedirs( + os.path.dirname(os.path.join(self.cache_dir, image_name)), exist_ok=True + ) dst = os.path.join(self.cache_dir, image_name) if not os.path.exists(dst + ".npy"): # cache a preprocessed version of the image img_org = cv2.imread(src) - processed = self.pre_process(img_org, need_transpose=self.need_transpose, dims=self.image_size) + processed = self.pre_process( + img_org, need_transpose=self.need_transpose, dims=self.image_size + ) 
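# Illustration only, not part of this diff: the surrounding branch caches each
# preprocessed image as "<image_name>.npy" under cache_dir, so later runs can
# skip the cv2 decode/resize and np.load() the stored array instead. A
# hypothetical helper sketching that first-run/reuse pattern:
import os
import numpy as np

def cache_or_load(dst, compute):
    if not os.path.exists(dst + ".npy"):
        np.save(dst, compute())        # first run: preprocess and persist
    return np.load(dst + ".npy")       # later runs: reuse the cached array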
np.save(dst, processed) self.image_ids.append(image_id) @@ -107,10 +134,16 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, if not_found > 0: log.info("reduced image list, %d images not found", not_found) if empty_80catageories > 0: - log.info("reduced image list, %d images without any of the 80 categories", empty_80catageories) + log.info( + "reduced image list, %d images without any of the 80 categories", + empty_80catageories, + ) - log.info("loaded {} images, cache={}, took={:.1f}sec".format( - len(self.image_list), use_cache, time_taken)) + log.info( + "loaded {} images, cache={}, took={:.1f}sec".format( + len(self.image_list), use_cache, time_taken + ) + ) self.label_list = np.array(self.label_list) @@ -129,6 +162,7 @@ class PostProcessCoco: """ Post processing for tensorflow ssd-mobilenet style models """ + def __init__(self): self.results = [] self.good = 0 @@ -139,14 +173,22 @@ def __init__(self): def add_results(self, results): self.results.extend(results) - def __call__(self, results, ids, expected=None, result_dict=None, ): + def __call__( + self, + results, + ids, + expected=None, + result_dict=None, + ): # results come as: - # tensorflow, ssd-mobilenet: num_detections,detection_boxes,detection_scores,detection_classes + # tensorflow, ssd-mobilenet: + # num_detections,detection_boxes,detection_scores,detection_classes processed_results = [] # batch size bs = len(results[0]) for idx in range(0, bs): - # keep the content_id from loadgen to handle content_id's without results + # keep the content_id from loadgen to handle content_id's without + # results self.content_ids.append(ids[idx]) processed_results.append([]) detection_num = int(results[0][idx]) @@ -158,10 +200,17 @@ def __call__(self, results, ids, expected=None, result_dict=None, ): if detection_class in expected_classes: self.good += 1 box = detection_boxes[detection] - processed_results[idx].append([float(ids[idx]), - box[0], box[1], box[2], box[3], - results[2][idx][detection], - float(detection_class)]) + processed_results[idx].append( + [ + float(ids[idx]), + box[0], + box[1], + box[2], + box[3], + results[2][idx][detection], + float(detection_class), + ] + ) self.total += 1 return processed_results @@ -181,7 +230,7 @@ def finalize(self, result_dict, ds=None, output_dir=None): annotations = json.load(fin) for cnt, cat in enumerate(annotations["categories"]): label_map[cat["id"]] = cnt + 1 - inv_map = {v:k for k,v in label_map.items()} + inv_map = {v: k for k, v in label_map.items()} detections = [] image_indices = [] @@ -192,8 +241,13 @@ def finalize(self, result_dict, ds=None, output_dir=None): # this is the index of the coco image image_idx = int(detection[0]) if image_idx != self.content_ids[batch]: - # working with the coco index/id is error prone - extra check to make sure it is consistent - log.error("image_idx missmatch, lg={} / result={}".format(image_idx, self.content_ids[batch])) + # working with the coco index/id is error prone - extra + # check to make sure it is consistent + log.error( + "image_idx missmatch, lg={} / result={}".format( + image_idx, self.content_ids[batch] + ) + ) # map the index to the coco image id detection[0] = ds.image_ids[image_idx] height, width = ds.image_sizes[image_idx] @@ -211,16 +265,19 @@ def finalize(self, result_dict, ds=None, output_dir=None): cat_id = inv_map.get(int(detection[6]), -1) if cat_id == -1: # FIXME: - log.info("finalize can't map category {}".format(int(detection[6]))) - detection[6] = cat_id + log.info( + "finalize can't map 
category {}".format( + int(detection[6])) + ) + detection[6] = cat_id detections.append(np.array(detection)) # map indices to coco image id's - image_ids = [ds.image_ids[i] for i in image_indices] + image_ids = [ds.image_ids[i] for i in image_indices] self.results = [] cocoGt = pycoco.COCO(ds.annotation_file) cocoDt = cocoGt.loadRes(np.array(detections)) - cocoEval = COCOeval(cocoGt, cocoDt, iouType='bbox') + cocoEval = COCOeval(cocoGt, cocoDt, iouType="bbox") cocoEval.params.imgIds = image_ids cocoEval.evaluate() cocoEval.accumulate() @@ -232,11 +289,12 @@ class PostProcessCocoPt(PostProcessCoco): """ Post processing required by ssd-resnet34 / pytorch """ - def __init__(self,use_inv_map,score_threshold): + + def __init__(self, use_inv_map, score_threshold): super().__init__() self.use_inv_map = use_inv_map self.score_threshold = score_threshold - + def __call__(self, results, ids, expected=None, result_dict=None): # results come as: # detection_boxes,detection_classes,detection_scores @@ -251,7 +309,7 @@ def __call__(self, results, ids, expected=None, result_dict=None): detection_classes = results[1][idx] expected_classes = expected[idx][0] scores = results[2][idx] - #for detection in range(0, len(expected_classes)): + # for detection in range(0, len(expected_classes)): for detection in range(0, len(scores)): if scores[detection] < self.score_threshold: break @@ -260,10 +318,17 @@ def __call__(self, results, ids, expected=None, result_dict=None): self.good += 1 box = detection_boxes[detection] # comes from model as: 0=xmax 1=ymax 2=xmin 3=ymin - processed_results[idx].append([float(ids[idx]), - box[1], box[0], box[3], box[2], - scores[detection], - float(detection_class)]) + processed_results[idx].append( + [ + float(ids[idx]), + box[1], + box[0], + box[3], + box[2], + scores[detection], + float(detection_class), + ] + ) self.total += 1 return processed_results @@ -272,12 +337,14 @@ class PostProcessCocoOnnx(PostProcessCoco): """ Post processing required by ssd-resnet34 / onnx """ + def __init__(self): super().__init__() def __call__(self, results, ids, expected=None, result_dict=None): # results come as: - # onnx (from pytorch ssd-resnet34): detection_boxes,detection_classes,detection_scores + # onnx (from pytorch ssd-resnet34): + # detection_boxes,detection_classes,detection_scores processed_results = [] @@ -298,17 +365,26 @@ def __call__(self, results, ids, expected=None, result_dict=None): self.good += 1 box = detection_boxes[detection] # comes from model as: 0=xmax 1=ymax 2=xmin 3=ymin - processed_results[idx].append([float(ids[idx]), - box[1], box[0], box[3], box[2], - scores[detection], - float(detection_class)]) + processed_results[idx].append( + [ + float(ids[idx]), + box[1], + box[0], + box[3], + box[2], + scores[detection], + float(detection_class), + ] + ) self.total += 1 return processed_results + class PostProcessCocoTf(PostProcessCoco): """ Post processing required by ssd-resnet34 / pytorch """ + def __init__(self): super().__init__() self.use_inv_map = True @@ -335,10 +411,16 @@ def __call__(self, results, ids, expected=None, result_dict=None): self.good += 1 box = detection_boxes[detection] # comes from model as: 0=xmax 1=ymax 2=xmin 3=ymin - processed_results[idx].append([float(ids[idx]), - box[0], box[1], box[2], box[3], - scores[detection], - float(detection_class)]) + processed_results[idx].append( + [ + float(ids[idx]), + box[0], + box[1], + box[2], + box[3], + scores[detection], + float(detection_class), + ] + ) self.total += 1 return processed_results - diff --git 
a/vision/classification_and_detection/python/dataset.py b/vision/classification_and_detection/python/dataset.py index 02aa654c2..47c8c02a1 100755 --- a/vision/classification_and_detection/python/dataset.py +++ b/vision/classification_and_detection/python/dataset.py @@ -15,7 +15,8 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger("dataset") -class Item(): + +class Item: def __init__(self, label, img, idx): self.label = label self.img = img @@ -24,19 +25,25 @@ def __init__(self, label, img, idx): def usleep(sec): - if sys.platform == 'win32': + if sys.platform == "win32": # on windows time.sleep() doesn't work to well import ctypes + kernel32 = ctypes.windll.kernel32 - timer = kernel32.CreateWaitableTimerA(ctypes.c_void_p(), True, ctypes.c_void_p()) + timer = kernel32.CreateWaitableTimerA( + ctypes.c_void_p(), True, ctypes.c_void_p() + ) delay = ctypes.c_longlong(int(-1 * (10 * 1000000 * sec))) - kernel32.SetWaitableTimer(timer, ctypes.byref(delay), 0, ctypes.c_void_p(), ctypes.c_void_p(), False) - kernel32.WaitForSingleObject(timer, 0xffffffff) + kernel32.SetWaitableTimer( + timer, ctypes.byref( + delay), 0, ctypes.c_void_p(), ctypes.c_void_p(), False + ) + kernel32.WaitForSingleObject(timer, 0xFFFFFFFF) else: time.sleep(sec) -class Dataset(): +class Dataset: def __init__(self): self.arrival = None self.image_list = [] @@ -62,7 +69,7 @@ def load_query_samples(self, sample_list): def unload_query_samples(self, sample_list): if sample_list: for sample in sample_list: - if sample in self.image_list_inmemory : + if sample in self.image_list_inmemory: del self.image_list_inmemory[sample] else: self.image_list_inmemory = {} @@ -102,7 +109,7 @@ def start(self): self.good = 0 self.total = 0 - def finalize(self, results, ds=False, output_dir=None): + def finalize(self, results, ds=False, output_dir=None): results["good"] = self.good results["total"] = self.total @@ -141,6 +148,7 @@ def finalize(self, results, ds=False, output_dir=None): # pre-processing # + def center_crop(img, out_height, out_width): height, width, _ = img.shape left = int((width - out_width) / 2) @@ -151,10 +159,12 @@ def center_crop(img, out_height, out_width): return img -def resize_with_aspectratio(img, out_height, out_width, scale=87.5, inter_pol=cv2.INTER_LINEAR): +def resize_with_aspectratio( + img, out_height, out_width, scale=87.5, inter_pol=cv2.INTER_LINEAR +): height, width, _ = img.shape - new_height = int(100. * out_height / scale) - new_width = int(100. 
* out_width / scale) + new_height = int(100.0 * out_height / scale) + new_width = int(100.0 * out_width / scale) if height > width: w = new_width h = int(new_height * height / width) @@ -170,9 +180,11 @@ def pre_process_vgg(img, dims=None, need_transpose=False): output_height, output_width, _ = dims cv2_interpol = cv2.INTER_AREA - img = resize_with_aspectratio(img, output_height, output_width, inter_pol=cv2_interpol) + img = resize_with_aspectratio( + img, output_height, output_width, inter_pol=cv2_interpol + ) img = center_crop(img, output_height, output_width) - img = np.asarray(img, dtype='float32') + img = np.asarray(img, dtype="float32") # normalize image means = np.array([123.68, 116.78, 103.94], dtype=np.float32) @@ -188,9 +200,11 @@ def pre_process_mobilenet(img, dims=None, need_transpose=False): img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) output_height, output_width, _ = dims - img = resize_with_aspectratio(img, output_height, output_width, inter_pol=cv2.INTER_LINEAR) + img = resize_with_aspectratio( + img, output_height, output_width, inter_pol=cv2.INTER_LINEAR + ) img = center_crop(img, output_height, output_width) - img = np.asarray(img, dtype='float32') + img = np.asarray(img, dtype="float32") img /= 255.0 img -= 0.5 @@ -211,10 +225,12 @@ def pre_process_imagenet_pytorch(img, dims=None, need_transpose=False): img = F.resize(img, 256, Image.BILINEAR) img = F.center_crop(img, 224) img = F.to_tensor(img) - img = F.normalize(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], inplace=False) + img = F.normalize( + img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], inplace=False + ) if not need_transpose: - img = img.permute(1, 2, 0) # NHWC - img = np.asarray(img, dtype='float32') + img = img.permute(1, 2, 0) # NHWC + img = np.asarray(img, dtype="float32") return img @@ -224,9 +240,10 @@ def maybe_resize(img, dims): # some images might be grayscale img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - if dims != None: + if dims is not None: im_height, im_width, _ = dims - img = cv2.resize(img, (im_width, im_height), interpolation=cv2.INTER_LINEAR) + img = cv2.resize(img, (im_width, im_height), + interpolation=cv2.INTER_LINEAR) return img @@ -254,7 +271,7 @@ def pre_process_coco_resnet34(img, dims=None, need_transpose=False): mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) std = np.array([0.229, 0.224, 0.225], dtype=np.float32) - img = img / 255. - mean + img = img / 255.0 - mean img = img / std if need_transpose: @@ -275,8 +292,8 @@ def pre_process_coco_resnet34_tf(img, dims=None, need_transpose=False): def pre_process_openimages_retinanet(img, dims=None, need_transpose=False): img = maybe_resize(img, dims) - img /= 255. 
+ img /= 255.0 # transpose if needed if need_transpose: img = img.transpose([2, 0, 1]) - return img \ No newline at end of file + return img diff --git a/vision/classification_and_detection/python/imagenet.py b/vision/classification_and_detection/python/imagenet.py index 14734ab5f..31144befb 100644 --- a/vision/classification_and_detection/python/imagenet.py +++ b/vision/classification_and_detection/python/imagenet.py @@ -20,8 +20,20 @@ class Imagenet(dataset.Dataset): - def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, - image_format="NHWC", pre_process=None, count=None, cache_dir=None, preprocessed_dir=None, threads=os.cpu_count()): + def __init__( + self, + data_path, + image_list, + name, + use_cache=0, + image_size=None, + image_format="NHWC", + pre_process=None, + count=None, + cache_dir=None, + preprocessed_dir=None, + threads=os.cpu_count(), + ): super(Imagenet, self).__init__() if image_size is None: self.image_size = [224, 224, 3] @@ -33,13 +45,16 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, self.label_list = [] self.count = count self.data_path = data_path - self.pre_process = pre_process # if None we assume data_path is having preprocessed dataset + self.pre_process = ( + pre_process # if None we assume data_path is having preprocessed dataset + ) self.use_cache = use_cache if preprocessed_dir: self.cache_dir = preprocessed_dir elif pre_process: - self.cache_dir = os.path.join(cache_dir, "preprocessed", name, image_format) + self.cache_dir = os.path.join( + cache_dir, "preprocessed", name, image_format) else: self.cache_dir = cache_dir @@ -50,7 +65,7 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, if image_list is None: # by default look for val_map.txt image_list = os.path.join(data_path, "val_map.txt") - with open(image_list, 'r') as fp: + with open(image_list, "r") as fp: for count, line in enumerate(fp): pass count = count + 1 @@ -64,31 +79,42 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, start = time.time() N = threads import concurrent.futures + if N > CNT: N = CNT if not pre_process: - log.info("Loading {} preprocessed images using {} threads".format(CNT, N)) + log.info( + "Loading {} preprocessed images using {} threads".format( + CNT, N)) else: log.info("Preprocessing {} images using {} threads".format(CNT, N)) - with open(image_list, 'r') as f: + with open(image_list, "r") as f: lists = [] image_lists = [] label_lists = [] for i in range(N): - lists.append([ next(f) for x in range(int(CNT/N)) ]) + lists.append([next(f) for x in range(int(CNT / N))]) image_lists.append([]) label_lists.append([]) - if int(CNT%N) > 0: - lists.append([ next(f) for x in range(int(CNT%N)) ]) + if int(CNT % N) > 0: + lists.append([next(f) for x in range(int(CNT % N))]) image_lists.append([]) label_lists.append([]) executor = concurrent.futures.ThreadPoolExecutor(N) - futures = [executor.submit(self.process, data_path, item, image_lists[lists.index(item)], - label_lists[lists.index(item)]) for item in lists] + futures = [ + executor.submit( + self.process, + data_path, + item, + image_lists[lists.index(item)], + label_lists[lists.index(item)], + ) + for item in lists + ] concurrent.futures.wait(futures) - for i in range (len(image_lists)): + for i in range(len(image_lists)): self.image_list += image_lists[i] self.label_list += label_lists[i] time_taken = time.time() - start @@ -98,8 +124,11 @@ def __init__(self, data_path, image_list, name, use_cache=0, 
image_size=None, if self.not_found > 0: log.info("reduced image list, %d images not found", self.not_found) - log.info("loaded {} images, cache={}, already_preprocessed={}, took={:.1f}sec".format( - len(self.image_list), use_cache, pre_process is None, time_taken)) + log.info( + "loaded {} images, cache={}, already_preprocessed={}, took={:.1f}sec".format( + len(self.image_list), use_cache, pre_process is None, time_taken + ) + ) self.label_list = np.array(self.label_list) def process(self, data_path, files, image_list, label_list): @@ -107,7 +136,8 @@ def process(self, data_path, files, image_list, label_list): image_name, label = re.split(r"\s+", s.strip()) src = os.path.join(data_path, image_name) if not self.pre_process: - if not os.path.exists(os.path.join(data_path, image_name) + ".npy"): + if not os.path.exists(os.path.join( + data_path, image_name) + ".npy"): # if the image does not exists ignore it self.not_found += 1 continue @@ -116,12 +146,19 @@ def process(self, data_path, files, image_list, label_list): # if the image does not exists ignore it self.not_found += 1 continue - os.makedirs(os.path.dirname(os.path.join(self.cache_dir, image_name)), exist_ok=True) + os.makedirs( + os.path.dirname(os.path.join(self.cache_dir, image_name)), + exist_ok=True, + ) dst = os.path.join(self.cache_dir, image_name) if not os.path.exists(dst + ".npy"): # cache a preprocessed version of the image img_org = cv2.imread(src) - processed = self.pre_process(img_org, need_transpose=self.need_transpose, dims=self.image_size) + processed = self.pre_process( + img_org, + need_transpose=self.need_transpose, + dims=self.image_size, + ) np.save(dst, processed) image_list.append(image_name) label_list.append(int(label)) @@ -139,4 +176,3 @@ def get_item(self, nr): def get_item_loc(self, nr): src = os.path.join(self.data_path, self.image_list[nr]) return src - diff --git a/vision/classification_and_detection/python/main.py b/vision/classification_and_detection/python/main.py index 1c2cd9a5a..b4ff3972d 100755 --- a/vision/classification_and_detection/python/main.py +++ b/vision/classification_and_detection/python/main.py @@ -35,45 +35,85 @@ # the datasets we support SUPPORTED_DATASETS = { - "imagenet": - (imagenet.Imagenet, dataset.pre_process_vgg, dataset.PostProcessCommon(offset=-1), - {"image_size": [224, 224, 3]}), - "imagenet_mobilenet": - (imagenet.Imagenet, dataset.pre_process_mobilenet, dataset.PostProcessArgMax(offset=-1), - {"image_size": [224, 224, 3]}), - "imagenet_pytorch": - (imagenet.Imagenet, dataset.pre_process_imagenet_pytorch, dataset.PostProcessArgMax(offset=0), - {"image_size": [224, 224, 3]}), - "coco-300": - (coco.Coco, dataset.pre_process_coco_mobilenet, coco.PostProcessCoco(), - {"image_size": [300, 300, 3]}), - "coco-300-pt": - (coco.Coco, dataset.pre_process_coco_pt_mobilenet, coco.PostProcessCocoPt(False,0.3), - {"image_size": [300, 300, 3]}), - "openimages-300-retinanet": - (openimages.OpenImages, dataset.pre_process_openimages_retinanet, openimages.PostProcessOpenImagesRetinanet(False,0.05,300,300), - {"image_size": [300, 300, 3]}), - "openimages-800-retinanet": - (openimages.OpenImages, dataset.pre_process_openimages_retinanet, openimages.PostProcessOpenImagesRetinanet(False,0.05,800,800), - {"image_size": [800, 800, 3]}), - "openimages-1200-retinanet": - (openimages.OpenImages, dataset.pre_process_openimages_retinanet, openimages.PostProcessOpenImagesRetinanet(False,0.05,1200,1200), - {"image_size": [1200, 1200, 3]}), - "openimages-800-retinanet-onnx": - (openimages.OpenImages, 
dataset.pre_process_openimages_retinanet, openimages.PostProcessOpenImagesRetinanet(False,0.05,800,800,False), - {"image_size": [800, 800, 3]}), - "coco-1200": - (coco.Coco, dataset.pre_process_coco_resnet34, coco.PostProcessCoco(), - {"image_size": [1200, 1200, 3]}), - "coco-1200-onnx": - (coco.Coco, dataset.pre_process_coco_resnet34, coco.PostProcessCocoOnnx(), - {"image_size": [1200, 1200, 3]}), - "coco-1200-pt": - (coco.Coco, dataset.pre_process_coco_resnet34, coco.PostProcessCocoPt(True,0.05), - {"image_size": [1200, 1200, 3],"use_label_map": True}), - "coco-1200-tf": - (coco.Coco, dataset.pre_process_coco_resnet34, coco.PostProcessCocoTf(), - {"image_size": [1200, 1200, 3],"use_label_map": False}), + "imagenet": ( + imagenet.Imagenet, + dataset.pre_process_vgg, + dataset.PostProcessCommon(offset=-1), + {"image_size": [224, 224, 3]}, + ), + "imagenet_mobilenet": ( + imagenet.Imagenet, + dataset.pre_process_mobilenet, + dataset.PostProcessArgMax(offset=-1), + {"image_size": [224, 224, 3]}, + ), + "imagenet_pytorch": ( + imagenet.Imagenet, + dataset.pre_process_imagenet_pytorch, + dataset.PostProcessArgMax(offset=0), + {"image_size": [224, 224, 3]}, + ), + "coco-300": ( + coco.Coco, + dataset.pre_process_coco_mobilenet, + coco.PostProcessCoco(), + {"image_size": [300, 300, 3]}, + ), + "coco-300-pt": ( + coco.Coco, + dataset.pre_process_coco_pt_mobilenet, + coco.PostProcessCocoPt(False, 0.3), + {"image_size": [300, 300, 3]}, + ), + "openimages-300-retinanet": ( + openimages.OpenImages, + dataset.pre_process_openimages_retinanet, + openimages.PostProcessOpenImagesRetinanet(False, 0.05, 300, 300), + {"image_size": [300, 300, 3]}, + ), + "openimages-800-retinanet": ( + openimages.OpenImages, + dataset.pre_process_openimages_retinanet, + openimages.PostProcessOpenImagesRetinanet(False, 0.05, 800, 800), + {"image_size": [800, 800, 3]}, + ), + "openimages-1200-retinanet": ( + openimages.OpenImages, + dataset.pre_process_openimages_retinanet, + openimages.PostProcessOpenImagesRetinanet(False, 0.05, 1200, 1200), + {"image_size": [1200, 1200, 3]}, + ), + "openimages-800-retinanet-onnx": ( + openimages.OpenImages, + dataset.pre_process_openimages_retinanet, + openimages.PostProcessOpenImagesRetinanet( + False, 0.05, 800, 800, False), + {"image_size": [800, 800, 3]}, + ), + "coco-1200": ( + coco.Coco, + dataset.pre_process_coco_resnet34, + coco.PostProcessCoco(), + {"image_size": [1200, 1200, 3]}, + ), + "coco-1200-onnx": ( + coco.Coco, + dataset.pre_process_coco_resnet34, + coco.PostProcessCocoOnnx(), + {"image_size": [1200, 1200, 3]}, + ), + "coco-1200-pt": ( + coco.Coco, + dataset.pre_process_coco_resnet34, + coco.PostProcessCocoPt(True, 0.05), + {"image_size": [1200, 1200, 3], "use_label_map": True}, + ), + "coco-1200-tf": ( + coco.Coco, + dataset.pre_process_coco_resnet34, + coco.PostProcessCocoTf(), + {"image_size": [1200, 1200, 3], "use_label_map": False}, + ), } # pre-defined command line options so simplify things. 
They are used as defaults and can be @@ -86,7 +126,6 @@ "cache": 0, "max-batchsize": 32, }, - # resnet "resnet50-tf": { "inputs": "input_tensor:0", @@ -114,7 +153,6 @@ "backend": "ncnn", "model-name": "resnet50", }, - # mobilenet "mobilenet-tf": { "inputs": "input:0", @@ -129,7 +167,6 @@ "backend": "onnxruntime", "model-name": "mobilenet", }, - # ssd-mobilenet "ssd-mobilenet-tf": { "inputs": "image_tensor:0", @@ -152,7 +189,6 @@ "data-format": "NHWC", "model-name": "ssd-mobilenet", }, - # ssd-resnet34 "ssd-resnet34-tf": { "inputs": "image:0", @@ -186,7 +222,6 @@ "data-format": "NHWC", "model-name": "ssd-resnet34", }, - # retinanet "retinanet-pytorch": { "inputs": "image", @@ -201,7 +236,7 @@ "dataset": "openimages-800-retinanet-onnx", "backend": "onnxruntime", "model-name": "retinanet", - "max-batchsize": 1 + "max-batchsize": 1, }, } @@ -218,43 +253,107 @@ def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() - parser.add_argument("--dataset", choices=SUPPORTED_DATASETS.keys(), help="dataset") - parser.add_argument("--dataset-path", required=True, help="path to the dataset") + parser.add_argument( + "--dataset", + choices=SUPPORTED_DATASETS.keys(), + help="dataset") + parser.add_argument( + "--dataset-path", + required=True, + help="path to the dataset") parser.add_argument("--dataset-list", help="path to the dataset list") - parser.add_argument("--data-format", choices=["NCHW", "NHWC"], help="data format") - parser.add_argument("--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles") - parser.add_argument("--scenario", default="SingleStream", - help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys()))) - parser.add_argument("--max-batchsize", type=int, help="max batch size in a single inference") + parser.add_argument( + "--data-format", + choices=[ + "NCHW", + "NHWC"], + help="data format") + parser.add_argument( + "--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles" + ) + parser.add_argument( + "--scenario", + default="SingleStream", + help="mlperf benchmark scenario, one of " + + str(list(SCENARIO_MAP.keys())), + ) + parser.add_argument( + "--max-batchsize", type=int, help="max batch size in a single inference" + ) parser.add_argument("--model", required=True, help="model file") parser.add_argument("--output", default="output", help="test results") parser.add_argument("--inputs", help="model inputs") parser.add_argument("--outputs", help="model outputs") parser.add_argument("--backend", help="runtime to use") - parser.add_argument("--model-name", help="name of the mlperf model, ie. resnet50") - parser.add_argument("--threads", default=os.cpu_count(), type=int, help="threads") + parser.add_argument( + "--model-name", + help="name of the mlperf model, ie. 
resnet50") + parser.add_argument( + "--threads", + default=os.cpu_count(), + type=int, + help="threads") parser.add_argument("--qps", type=int, help="target qps") parser.add_argument("--cache", type=int, default=0, help="use cache") - parser.add_argument("--cache_dir", type=str, default=None, help="dir path for caching") - parser.add_argument("--preprocessed_dir", type=str, default=None, help="dir path for storing preprocessed images (overrides cache_dir)") - parser.add_argument("--use_preprocessed_dataset", action="store_true", help="use preprocessed dataset instead of the original") - parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") - parser.add_argument("--find-peak-performance", action="store_true", help="enable finding peak performance pass") - parser.add_argument("--debug", action="store_true", help="debug, turn traces on") + parser.add_argument( + "--cache_dir", type=str, default=None, help="dir path for caching" + ) + parser.add_argument( + "--preprocessed_dir", + type=str, + default=None, + help="dir path for storing preprocessed images (overrides cache_dir)", + ) + parser.add_argument( + "--use_preprocessed_dataset", + action="store_true", + help="use preprocessed dataset instead of the original", + ) + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") + parser.add_argument( + "--find-peak-performance", + action="store_true", + help="enable finding peak performance pass", + ) + parser.add_argument( + "--debug", + action="store_true", + help="debug, turn traces on") # file to use mlperf rules compliant parameters - parser.add_argument("--mlperf_conf", default="../../mlperf.conf", help="mlperf rules config") + parser.add_argument( + "--mlperf_conf", default="../../mlperf.conf", help="mlperf rules config" + ) # file for user LoadGen settings such as target QPS - parser.add_argument("--user_conf", default="user.conf", help="user config for user LoadGen settings such as target QPS") + parser.add_argument( + "--user_conf", + default="user.conf", + help="user config for user LoadGen settings such as target QPS", + ) # file for LoadGen audit settings - parser.add_argument("--audit_conf", default="audit.config", help="config for LoadGen audit settings") + parser.add_argument( + "--audit_conf", default="audit.config", help="config for LoadGen audit settings" + ) - # below will override mlperf rules compliant settings - don't use for official submission + # below will override mlperf rules compliant settings - don't use for + # official submission parser.add_argument("--time", type=int, help="time to scan in seconds") parser.add_argument("--count", type=int, help="dataset items to use") - parser.add_argument("--performance-sample-count", type=int, help="performance sample count") - parser.add_argument("--max-latency", type=float, help="mlperf max latency in pct tile") - parser.add_argument("--samples-per-query", default=8, type=int, help="mlperf multi-stream samples per query") + parser.add_argument( + "--performance-sample-count", type=int, help="performance sample count" + ) + parser.add_argument( + "--max-latency", type=float, help="mlperf max latency in pct tile" + ) + parser.add_argument( + "--samples-per-query", + default=8, + type=int, + help="mlperf multi-stream samples per query", + ) args = parser.parse_args() # don't use defaults in argparser. 
Instead we default to a dict, override that with a profile @@ -281,27 +380,35 @@ def get_args(): def get_backend(backend): if backend == "tensorflow": from backend_tf import BackendTensorflow + backend = BackendTensorflow() elif backend == "onnxruntime": from backend_onnxruntime import BackendOnnxruntime + backend = BackendOnnxruntime() elif backend == "tvm": from backend_tvm import BackendTVM + backend = BackendTVM() elif backend == "null": from backend_null import BackendNull + backend = BackendNull() elif backend == "pytorch": from backend_pytorch import BackendPytorch + backend = BackendPytorch() elif backend == "pytorch-native": from backend_pytorch_native import BackendPytorchNative - backend = BackendPytorchNative() + + backend = BackendPytorchNative() elif backend == "tflite": from backend_tflite import BackendTflite + backend = BackendTflite() elif backend == "ncnn": from backend_ncnn import BackendNCNN + backend = BackendNCNN() else: raise ValueError("unknown backend: " + backend) @@ -344,7 +451,9 @@ def run_one_item(self, qitem): processed_results = [] try: results = self.model.predict({self.model.inputs[0]: qitem.img}) - processed_results = self.post_process(results, qitem.content_id, qitem.label, self.result_dict) + processed_results = self.post_process( + results, qitem.content_id, qitem.label, self.result_dict + ) if self.take_accuracy: self.post_process.add_results(processed_results) self.result_timing.append(time.time() - qitem.start) @@ -357,7 +466,9 @@ def run_one_item(self, qitem): response_array_refs = [] response = [] for idx, query_id in enumerate(qitem.query_id): - response_array = array.array("B", np.array(processed_results[idx], np.float32).tobytes()) + response_array = array.array( + "B", np.array(processed_results[idx], np.float32).tobytes() + ) response_array_refs.append(response_array) bi = response_array.buffer_info() response.append(lg.QuerySampleResponse(query_id, bi[0], bi[1])) @@ -372,8 +483,10 @@ def enqueue(self, query_samples): else: bs = self.max_batchsize for i in range(0, len(idx), bs): - data, label = self.ds.get_samples(idx[i:i+bs]) - self.run_one_item(Item(query_id[i:i+bs], idx[i:i+bs], data, label)) + data, label = self.ds.get_samples(idx[i: i + bs]) + self.run_one_item( + Item(query_id[i: i + bs], idx[i: i + bs], data, label) + ) def finish(self): pass @@ -387,7 +500,9 @@ def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128): self.result_dict = {} for _ in range(self.threads): - worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,)) + worker = threading.Thread( + target=self.handle_tasks, args=( + self.tasks,)) worker.daemon = True self.workers.append(worker) worker.start() @@ -424,10 +539,14 @@ def finish(self): worker.join() -def add_results(final_results, name, result_dict, result_list, took, show_accuracy=False): - percentiles = [50., 80., 90., 95., 99., 99.9] +def add_results( + final_results, name, result_dict, result_list, took, show_accuracy=False +): + percentiles = [50.0, 80.0, 90.0, 95.0, 99.0, 99.9] buckets = np.percentile(result_list, percentiles).tolist() - buckets_str = ",".join(["{}:{:.4f}".format(p, b) for p, b in zip(percentiles, buckets)]) + buckets_str = ",".join( + ["{}:{:.4f}".format(p, b) for p, b in zip(percentiles, buckets)] + ) if result_dict["total"] == 0: result_dict["total"] = len(result_list) @@ -444,19 +563,27 @@ def add_results(final_results, name, result_dict, result_list, took, show_accura } acc_str = "" if show_accuracy: - result["accuracy"] = 100. 
* result_dict["good"] / result_dict["total"] + result["accuracy"] = 100.0 * result_dict["good"] / result_dict["total"] acc_str = ", acc={:.3f}%".format(result["accuracy"]) if "mAP" in result_dict: - result["mAP"] = 100. * result_dict["mAP"] + result["mAP"] = 100.0 * result_dict["mAP"] acc_str += ", mAP={:.3f}%".format(result["mAP"]) # add the result to the result dict final_results[name] = result # to stdout - print("{} qps={:.2f}, mean={:.4f}, time={:.3f}{}, queries={}, tiles={}".format( - name, result["qps"], result["mean"], took, acc_str, - len(result_list), buckets_str)) + print( + "{} qps={:.2f}, mean={:.4f}, time={:.3f}{}, queries={}, tiles={}".format( + name, + result["qps"], + result["mean"], + took, + acc_str, + len(result_list), + buckets_str, + ) + ) def main(): @@ -468,8 +595,8 @@ def main(): # find backend backend = get_backend(args.backend) - # If TVM, pass max_batchsize to the backend - if args.backend.startswith('tvm'): + # If TVM, pass max_batchsize to the backend + if args.backend.startswith("tvm"): backend.max_batchsize = args.max_batchsize backend.arena_num = args.threads backend.arena_size = 4 @@ -487,18 +614,20 @@ def main(): # dataset to use wanted_dataset, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset] if args.use_preprocessed_dataset: - pre_proc=None - ds = wanted_dataset(data_path=args.dataset_path, - image_list=args.dataset_list, - name=args.dataset, - image_format=image_format, - pre_process=pre_proc, - use_cache=args.cache, - count=count, - cache_dir=args.cache_dir, - preprocessed_dir=args.preprocessed_dir, - threads=args.threads, - **kwargs) + pre_proc = None + ds = wanted_dataset( + data_path=args.dataset_path, + image_list=args.dataset_list, + name=args.dataset, + image_format=image_format, + pre_process=pre_proc, + use_cache=args.cache, + count=count, + cache_dir=args.cache_dir, + preprocessed_dir=args.preprocessed_dir, + threads=args.threads, + **kwargs + ) # load model to backend model = backend.load(args.model, inputs=args.inputs, outputs=args.outputs) final_results = { @@ -543,9 +672,11 @@ def main(): lg.TestScenario.SingleStream: RunnerBase, lg.TestScenario.MultiStream: QueueRunner, lg.TestScenario.Server: QueueRunner, - lg.TestScenario.Offline: QueueRunner + lg.TestScenario.Offline: QueueRunner, } - runner = runner_map[scenario](model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize) + runner = runner_map[scenario]( + model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize + ) def issue_queries(query_samples): runner.enqueue(query_samples) @@ -588,11 +719,18 @@ def flush_queries(): settings.multi_stream_samples_per_query = args.samples_per_query if args.max_latency: settings.server_target_latency_ns = int(args.max_latency * NANO_SEC) - settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC) - - performance_sample_count = args.performance_sample_count if args.performance_sample_count else min(count, 500) + settings.multi_stream_expected_latency_ns = int( + args.max_latency * NANO_SEC) + + performance_sample_count = ( + args.performance_sample_count + if args.performance_sample_count + else min(count, 500) + ) sut = lg.ConstructSUT(issue_queries, flush_queries) - qsl = lg.ConstructQSL(count, performance_sample_count, ds.load_query_samples, ds.unload_query_samples) + qsl = lg.ConstructQSL( + count, performance_sample_count, ds.load_query_samples, ds.unload_query_samples + ) log.info("starting {}".format(scenario)) result_dict = {"good": 0, "total": 0, "scenario": 
str(scenario)} @@ -605,8 +743,14 @@ def flush_queries(): if args.accuracy: post_proc.finalize(result_dict, ds, output_dir=args.output) - add_results(final_results, "{}".format(scenario), - result_dict, last_timeing, time.time() - ds.last_loaded, args.accuracy) + add_results( + final_results, + "{}".format(scenario), + result_dict, + last_timeing, + time.time() - ds.last_loaded, + args.accuracy, + ) runner.finish() lg.DestroyQSL(qsl) diff --git a/vision/classification_and_detection/python/models/anchor_generator.py b/vision/classification_and_detection/python/models/anchor_generator.py index 9a2d9d490..be0401268 100644 --- a/vision/classification_and_detection/python/models/anchor_generator.py +++ b/vision/classification_and_detection/python/models/anchor_generator.py @@ -2,11 +2,12 @@ import numpy as np -# The following functions were taken from +# The following functions were taken from # https://github.com/tensorflow/models/tree/master/research/object_detection # with minor modifications so that they use # torch operations instead + def expanded_shape(orig_shape, start_dim, num_dims): s = (1,) * num_dims return orig_shape[:start_dim] + s + orig_shape[start_dim:] @@ -45,350 +46,407 @@ def meshgrid(x, y): return xgrid, ygrid -def tile_anchors(grid_height, - grid_width, - scales, - aspect_ratios, - base_anchor_size, - anchor_stride, - anchor_offset): - """Create a tiled set of anchors strided along a grid in image space. - This op creates a set of anchor boxes by placing a "basis" collection of - boxes with user-specified scales and aspect ratios centered at evenly - distributed points along a grid. The basis collection is specified via the - scale and aspect_ratios arguments. For example, setting scales=[.1, .2, .2] - and aspect ratios = [2,2,1/2] means that we create three boxes: one with scale - .1, aspect ratio 2, one with scale .2, aspect ratio 2, and one with scale .2 - and aspect ratio 1/2. Each box is multiplied by "base_anchor_size" before - placing it over its respective center. - Grid points are specified via grid_height, grid_width parameters as well as - the anchor_stride and anchor_offset parameters. - Args: - grid_height: size of the grid in the y direction (int or int scalar tensor) - grid_width: size of the grid in the x direction (int or int scalar tensor) - scales: a 1-d (float) tensor representing the scale of each box in the - basis set. - aspect_ratios: a 1-d (float) tensor representing the aspect ratio of each - box in the basis set. The length of the scales and aspect_ratios tensors - must be equal. - base_anchor_size: base anchor size as [height, width] - (float tensor of shape [2]) - anchor_stride: difference in centers between base anchors for adjacent grid - positions (float tensor of shape [2]) - anchor_offset: center of the anchor with scale and aspect ratio 1 for the - upper left element of the grid, this should be zero for - feature networks with only VALID padding and even receptive - field size, but may need some additional calculation if other - padding is used (float tensor of shape [2]) - Returns: - a BoxList holding a collection of N anchor boxes - """ - aspect_ratios = torch.as_tensor(aspect_ratios, dtype=torch.float32) - scales = torch.as_tensor(scales, dtype=torch.float32) +def tile_anchors( + grid_height, + grid_width, + scales, + aspect_ratios, + base_anchor_size, + anchor_stride, + anchor_offset, +): + """Create a tiled set of anchors strided along a grid in image space. 
+ This op creates a set of anchor boxes by placing a "basis" collection of + boxes with user-specified scales and aspect ratios centered at evenly + distributed points along a grid. The basis collection is specified via the + scale and aspect_ratios arguments. For example, setting scales=[.1, .2, .2] + and aspect ratios = [2,2,1/2] means that we create three boxes: one with scale + .1, aspect ratio 2, one with scale .2, aspect ratio 2, and one with scale .2 + and aspect ratio 1/2. Each box is multiplied by "base_anchor_size" before + placing it over its respective center. + Grid points are specified via grid_height, grid_width parameters as well as + the anchor_stride and anchor_offset parameters. + Args: + grid_height: size of the grid in the y direction (int or int scalar tensor) + grid_width: size of the grid in the x direction (int or int scalar tensor) + scales: a 1-d (float) tensor representing the scale of each box in the + basis set. + aspect_ratios: a 1-d (float) tensor representing the aspect ratio of each + box in the basis set. The length of the scales and aspect_ratios tensors + must be equal. + base_anchor_size: base anchor size as [height, width] + (float tensor of shape [2]) + anchor_stride: difference in centers between base anchors for adjacent grid + positions (float tensor of shape [2]) + anchor_offset: center of the anchor with scale and aspect ratio 1 for the + upper left element of the grid, this should be zero for + feature networks with only VALID padding and even receptive + field size, but may need some additional calculation if other + padding is used (float tensor of shape [2]) + Returns: + a BoxList holding a collection of N anchor boxes + """ + aspect_ratios = torch.as_tensor(aspect_ratios, dtype=torch.float32) + scales = torch.as_tensor(scales, dtype=torch.float32) - ratio_sqrts = torch.sqrt(aspect_ratios) - heights = scales / ratio_sqrts * base_anchor_size[0] - widths = scales * ratio_sqrts * base_anchor_size[1] + ratio_sqrts = torch.sqrt(aspect_ratios) + heights = scales / ratio_sqrts * base_anchor_size[0] + widths = scales * ratio_sqrts * base_anchor_size[1] - # Get a grid of box centers - y_centers = torch.arange(grid_height, dtype=torch.float32) - y_centers = y_centers * anchor_stride[0] + anchor_offset[0] - x_centers = torch.arange(grid_width, dtype=torch.float32) - x_centers = x_centers * anchor_stride[1] + anchor_offset[1] + # Get a grid of box centers + y_centers = torch.arange(grid_height, dtype=torch.float32) + y_centers = y_centers * anchor_stride[0] + anchor_offset[0] + x_centers = torch.arange(grid_width, dtype=torch.float32) + x_centers = x_centers * anchor_stride[1] + anchor_offset[1] - x_centers, y_centers = meshgrid(x_centers, y_centers) + x_centers, y_centers = meshgrid(x_centers, y_centers) - widths_grid, x_centers_grid = meshgrid(widths, x_centers) - heights_grid, y_centers_grid = meshgrid(heights, y_centers) + widths_grid, x_centers_grid = meshgrid(widths, x_centers) + heights_grid, y_centers_grid = meshgrid(heights, y_centers) - bbox_centers = torch.stack([y_centers_grid, x_centers_grid], dim=3) - bbox_sizes = torch.stack([heights_grid, widths_grid], dim=3) - bbox_centers = torch.reshape(bbox_centers, [-1, 2]) - bbox_sizes = torch.reshape(bbox_sizes, [-1, 2]) - bbox_corners = _center_size_bbox_to_corners_bbox(bbox_centers, bbox_sizes) - return bbox_corners + bbox_centers = torch.stack([y_centers_grid, x_centers_grid], dim=3) + bbox_sizes = torch.stack([heights_grid, widths_grid], dim=3) + bbox_centers = torch.reshape(bbox_centers, [-1, 
2]) + bbox_sizes = torch.reshape(bbox_sizes, [-1, 2]) + bbox_corners = _center_size_bbox_to_corners_bbox(bbox_centers, bbox_sizes) + return bbox_corners def _center_size_bbox_to_corners_bbox(centers, sizes): - """Converts bbox center-size representation to corners representation. - Args: - centers: a tensor with shape [N, 2] representing bounding box centers - sizes: a tensor with shape [N, 2] representing bounding boxes - Returns: - corners: tensor with shape [N, 4] representing bounding boxes in corners - representation - """ - return torch.cat([centers - .5 * sizes, centers + .5 * sizes], 1) - - -def create_ssd_anchors(num_layers=6, - min_scale=0.2, - max_scale=0.95, - scales=None, - aspect_ratios=(1.0, 2.0, 1.0 / 2, 3.0, 1.0 / 3), - interpolated_scale_aspect_ratio=1.0, - base_anchor_size=None, - anchor_strides=None, - anchor_offsets=None, - reduce_boxes_in_lowest_layer=True): - """Creates MultipleGridAnchorGenerator for SSD anchors. - This function instantiates a MultipleGridAnchorGenerator that reproduces - ``default box`` construction proposed by Liu et al in the SSD paper. - See Section 2.2 for details. Grid sizes are assumed to be passed in - at generation time from finest resolution to coarsest resolution --- this is - used to (linearly) interpolate scales of anchor boxes corresponding to the - intermediate grid sizes. - Anchors that are returned by calling the `generate` method on the returned - MultipleGridAnchorGenerator object are always in normalized coordinates - and clipped to the unit square: (i.e. all coordinates lie in [0, 1]x[0, 1]). - Args: - num_layers: integer number of grid layers to create anchors for (actual - grid sizes passed in at generation time) - min_scale: scale of anchors corresponding to finest resolution (float) - max_scale: scale of anchors corresponding to coarsest resolution (float) - scales: As list of anchor scales to use. When not None and not empty, - min_scale and max_scale are not used. - aspect_ratios: list or tuple of (float) aspect ratios to place on each - grid point. - interpolated_scale_aspect_ratio: An additional anchor is added with this - aspect ratio and a scale interpolated between the scale for a layer - and the scale for the next layer (1.0 for the last layer). - This anchor is not included if this value is 0. - base_anchor_size: base anchor size as [height, width]. - The height and width values are normalized to the minimum dimension of the - input height and width, so that when the base anchor height equals the - base anchor width, the resulting anchor is square even if the input image - is not square. - anchor_strides: list of pairs of strides in pixels (in y and x directions - respectively). For example, setting anchor_strides=[(25, 25), (50, 50)] - means that we want the anchors corresponding to the first layer to be - strided by 25 pixels and those in the second layer to be strided by 50 - pixels in both y and x directions. If anchor_strides=None, they are set to - be the reciprocal of the corresponding feature map shapes. - anchor_offsets: list of pairs of offsets in pixels (in y and x directions - respectively). The offset specifies where we want the center of the - (0, 0)-th anchor to lie for each layer. For example, setting - anchor_offsets=[(10, 10), (20, 20)]) means that we want the - (0, 0)-th anchor of the first layer to lie at (10, 10) in pixel space - and likewise that we want the (0, 0)-th anchor of the second layer to lie - at (25, 25) in pixel space. 
If anchor_offsets=None, then they are set to - be half of the corresponding anchor stride. - reduce_boxes_in_lowest_layer: a boolean to indicate whether the fixed 3 - boxes per location is used in the lowest layer. - Returns: - a MultipleGridAnchorGenerator - """ - if base_anchor_size is None: - base_anchor_size = [1.0, 1.0] - base_anchor_size = torch.tensor(base_anchor_size, dtype=torch.float32) - box_specs_list = [] - if scales is None or not scales: - scales = [min_scale + (max_scale - min_scale) * i / (num_layers - 1) - for i in range(num_layers)] + [1.0] - else: - # Add 1.0 to the end, which will only be used in scale_next below and used - # for computing an interpolated scale for the largest scale in the list. - scales += [1.0] - - for layer, scale, scale_next in zip( - range(num_layers), scales[:-1], scales[1:]): - layer_box_specs = [] - if layer == 0 and reduce_boxes_in_lowest_layer: - layer_box_specs = [(0.1, 1.0), (scale, 2.0), (scale, 0.5)] - else: - for aspect_ratio in aspect_ratios: - layer_box_specs.append((scale, aspect_ratio)) - # Add one more anchor, with a scale between the current scale, and the - # scale for the next layer, with a specified aspect ratio (1.0 by - # default). - if interpolated_scale_aspect_ratio > 0.0: - layer_box_specs.append((np.sqrt(scale*scale_next), - interpolated_scale_aspect_ratio)) - box_specs_list.append(layer_box_specs) - - return MultipleGridAnchorGenerator(box_specs_list, base_anchor_size, - anchor_strides, anchor_offsets) + """Converts bbox center-size representation to corners representation. + Args: + centers: a tensor with shape [N, 2] representing bounding box centers + sizes: a tensor with shape [N, 2] representing bounding boxes + Returns: + corners: tensor with shape [N, 4] representing bounding boxes in corners + representation + """ + return torch.cat([centers - 0.5 * sizes, centers + 0.5 * sizes], 1) -class MultipleGridAnchorGenerator(object): - """Generate a grid of anchors for multiple CNN layers.""" - def __init__(self, - box_specs_list, - base_anchor_size=None, - anchor_strides=None, - anchor_offsets=None, - clip_window=None): - """Constructs a MultipleGridAnchorGenerator. - To construct anchors, at multiple grid resolutions, one must provide a - list of feature_map_shape_list (e.g., [(8, 8), (4, 4)]), and for each grid - size, a corresponding list of (scale, aspect ratio) box specifications. - For example: - box_specs_list = [[(.1, 1.0), (.1, 2.0)], # for 8x8 grid - [(.2, 1.0), (.3, 1.0), (.2, 2.0)]] # for 4x4 grid - To support the fully convolutional setting, we pass grid sizes in at - generation time, while scale and aspect ratios are fixed at construction - time. +def create_ssd_anchors( + num_layers=6, + min_scale=0.2, + max_scale=0.95, + scales=None, + aspect_ratios=(1.0, 2.0, 1.0 / 2, 3.0, 1.0 / 3), + interpolated_scale_aspect_ratio=1.0, + base_anchor_size=None, + anchor_strides=None, + anchor_offsets=None, + reduce_boxes_in_lowest_layer=True, +): + """Creates MultipleGridAnchorGenerator for SSD anchors. + This function instantiates a MultipleGridAnchorGenerator that reproduces + ``default box`` construction proposed by Liu et al in the SSD paper. + See Section 2.2 for details. Grid sizes are assumed to be passed in + at generation time from finest resolution to coarsest resolution --- this is + used to (linearly) interpolate scales of anchor boxes corresponding to the + intermediate grid sizes. 
+ Anchors that are returned by calling the `generate` method on the returned + MultipleGridAnchorGenerator object are always in normalized coordinates + and clipped to the unit square: (i.e. all coordinates lie in [0, 1]x[0, 1]). Args: - box_specs_list: list of list of (scale, aspect ratio) pairs with the - outside list having the same number of entries as feature_map_shape_list - (which is passed in at generation time). - base_anchor_size: base anchor size as [height, width] - (length-2 float tensor, default=[1.0, 1.0]). - The height and width values are normalized to the - minimum dimension of the input height and width, so that - when the base anchor height equals the base anchor - width, the resulting anchor is square even if the input - image is not square. + num_layers: integer number of grid layers to create anchors for (actual + grid sizes passed in at generation time) + min_scale: scale of anchors corresponding to finest resolution (float) + max_scale: scale of anchors corresponding to coarsest resolution (float) + scales: As list of anchor scales to use. When not None and not empty, + min_scale and max_scale are not used. + aspect_ratios: list or tuple of (float) aspect ratios to place on each + grid point. + interpolated_scale_aspect_ratio: An additional anchor is added with this + aspect ratio and a scale interpolated between the scale for a layer + and the scale for the next layer (1.0 for the last layer). + This anchor is not included if this value is 0. + base_anchor_size: base anchor size as [height, width]. + The height and width values are normalized to the minimum dimension of the + input height and width, so that when the base anchor height equals the + base anchor width, the resulting anchor is square even if the input image + is not square. anchor_strides: list of pairs of strides in pixels (in y and x directions respectively). For example, setting anchor_strides=[(25, 25), (50, 50)] means that we want the anchors corresponding to the first layer to be strided by 25 pixels and those in the second layer to be strided by 50 - pixels in both y and x directions. If anchor_strides=None, they are set - to be the reciprocal of the corresponding feature map shapes. + pixels in both y and x directions. If anchor_strides=None, they are set to + be the reciprocal of the corresponding feature map shapes. anchor_offsets: list of pairs of offsets in pixels (in y and x directions respectively). The offset specifies where we want the center of the (0, 0)-th anchor to lie for each layer. For example, setting anchor_offsets=[(10, 10), (20, 20)]) means that we want the (0, 0)-th anchor of the first layer to lie at (10, 10) in pixel space - and likewise that we want the (0, 0)-th anchor of the second layer to - lie at (25, 25) in pixel space. If anchor_offsets=None, then they are - set to be half of the corresponding anchor stride. - clip_window: a tensor of shape [4] specifying a window to which all - anchors should be clipped. If clip_window is None, then no clipping - is performed. - Raises: - ValueError: if box_specs_list is not a list of list of pairs - ValueError: if clip_window is not either None or a tensor of shape [4] + and likewise that we want the (0, 0)-th anchor of the second layer to lie + at (25, 25) in pixel space. If anchor_offsets=None, then they are set to + be half of the corresponding anchor stride. + reduce_boxes_in_lowest_layer: a boolean to indicate whether the fixed 3 + boxes per location is used in the lowest layer. 
+ Returns: + a MultipleGridAnchorGenerator """ - if isinstance(box_specs_list, list) and all( - [isinstance(list_item, list) for list_item in box_specs_list]): - self._box_specs = box_specs_list - else: - raise ValueError('box_specs_list is expected to be a ' - 'list of lists of pairs') if base_anchor_size is None: - base_anchor_size = torch.tensor([256, 256], dtype=torch.float32) - self._base_anchor_size = base_anchor_size - self._anchor_strides = anchor_strides - self._anchor_offsets = anchor_offsets - if clip_window is not None and list(clip_window.shape) != [4]: - raise ValueError('clip_window must either be None or a shape [4] tensor') - self._clip_window = clip_window - self._scales = [] - self._aspect_ratios = [] - for box_spec in self._box_specs: - if not all([isinstance(entry, tuple) and len(entry) == 2 - for entry in box_spec]): - raise ValueError('box_specs_list is expected to be a ' - 'list of lists of pairs') - scales, aspect_ratios = zip(*box_spec) - self._scales.append(scales) - self._aspect_ratios.append(aspect_ratios) + base_anchor_size = [1.0, 1.0] + base_anchor_size = torch.tensor(base_anchor_size, dtype=torch.float32) + box_specs_list = [] + if scales is None or not scales: + scales = [ + min_scale + (max_scale - min_scale) * i / (num_layers - 1) + for i in range(num_layers) + ] + [1.0] + else: + # Add 1.0 to the end, which will only be used in scale_next below and used + # for computing an interpolated scale for the largest scale in the + # list. + scales += [1.0] - for arg, arg_name in zip([self._anchor_strides, self._anchor_offsets], - ['anchor_strides', 'anchor_offsets']): - if arg and not (isinstance(arg, list) and - len(arg) == len(self._box_specs)): - raise ValueError('%s must be a list with the same length ' - 'as self._box_specs' % arg_name) - if arg and not all([ - isinstance(list_item, tuple) and len(list_item) == 2 - for list_item in arg - ]): - raise ValueError('%s must be a list of pairs.' % arg_name) + for layer, scale, scale_next in zip( + range(num_layers), scales[:-1], scales[1:]): + layer_box_specs = [] + if layer == 0 and reduce_boxes_in_lowest_layer: + layer_box_specs = [(0.1, 1.0), (scale, 2.0), (scale, 0.5)] + else: + for aspect_ratio in aspect_ratios: + layer_box_specs.append((scale, aspect_ratio)) + # Add one more anchor, with a scale between the current scale, and the + # scale for the next layer, with a specified aspect ratio (1.0 by + # default). + if interpolated_scale_aspect_ratio > 0.0: + layer_box_specs.append( + (np.sqrt(scale * scale_next), interpolated_scale_aspect_ratio) + ) + box_specs_list.append(layer_box_specs) + return MultipleGridAnchorGenerator( + box_specs_list, base_anchor_size, anchor_strides, anchor_offsets + ) - def _generate(self, feature_map_shape_list, im_height=1, im_width=1): - """Generates a collection of bounding boxes to be used as anchors. - The number of anchors generated for a single grid with shape MxM where we - place k boxes over each grid center is k*M^2 and thus the total number of - anchors is the sum over all grids. In our box_specs_list example - (see the constructor docstring), we would place two boxes over each grid - point on an 8x8 grid and three boxes over each grid point on a 4x4 grid and - thus end up with 2*8^2 + 3*4^2 = 176 anchors in total. The layout of the - output anchors follows the order of how the grid sizes and box_specs are - specified (with box_spec index varying the fastest, followed by width - index, then height index, then grid index). 
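When no explicit scales are supplied, create_ssd_anchors (as reformatted above) builds them as a linear ramp from min_scale to max_scale, appending a trailing 1.0 that is used only to interpolate the extra anchor of the last layer. With the function's own defaults the ramp is easy to verify:

num_layers, min_scale, max_scale = 6, 0.2, 0.95   # defaults from create_ssd_anchors
scales = [
    min_scale + (max_scale - min_scale) * i / (num_layers - 1)
    for i in range(num_layers)
] + [1.0]
print(scales)   # approximately [0.2, 0.35, 0.5, 0.65, 0.8, 0.95, 1.0]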
- Args: - feature_map_shape_list: list of pairs of convnet layer resolutions in the - format [(height_0, width_0), (height_1, width_1), ...]. For example, - setting feature_map_shape_list=[(8, 8), (7, 7)] asks for anchors that - correspond to an 8x8 layer followed by a 7x7 layer. - im_height: the height of the image to generate the grid for. If both - im_height and im_width are 1, the generated anchors default to - absolute coordinates, otherwise normalized coordinates are produced. - im_width: the width of the image to generate the grid for. If both - im_height and im_width are 1, the generated anchors default to - absolute coordinates, otherwise normalized coordinates are produced. - Returns: - boxes_list: a list of BoxLists each holding anchor boxes corresponding to - the input feature map shapes. - Raises: - ValueError: if feature_map_shape_list, box_specs_list do not have the same - length. - ValueError: if feature_map_shape_list does not consist of pairs of - integers - """ - if not (isinstance(feature_map_shape_list, list) - and len(feature_map_shape_list) == len(self._box_specs)): - raise ValueError('feature_map_shape_list must be a list with the same ' - 'length as self._box_specs') - if not all([isinstance(list_item, tuple) and len(list_item) == 2 - for list_item in feature_map_shape_list]): - raise ValueError('feature_map_shape_list must be a list of pairs.') - im_height = float(im_height) - im_width = float(im_width) +class MultipleGridAnchorGenerator(object): + """Generate a grid of anchors for multiple CNN layers.""" - if not self._anchor_strides: - anchor_strides = [(1.0 / float(pair[0]), 1.0 / float(pair[1])) - for pair in feature_map_shape_list] - else: - anchor_strides = [(float(stride[0]) / im_height, - float(stride[1]) / im_width) - for stride in self._anchor_strides] - if not self._anchor_offsets: - anchor_offsets = [(0.5 * stride[0], 0.5 * stride[1]) - for stride in anchor_strides] - else: - anchor_offsets = [(float(offset[0]) / im_height, - float(offset[1]) / im_width) - for offset in self._anchor_offsets] + def __init__( + self, + box_specs_list, + base_anchor_size=None, + anchor_strides=None, + anchor_offsets=None, + clip_window=None, + ): + """Constructs a MultipleGridAnchorGenerator. + To construct anchors, at multiple grid resolutions, one must provide a + list of feature_map_shape_list (e.g., [(8, 8), (4, 4)]), and for each grid + size, a corresponding list of (scale, aspect ratio) box specifications. + For example: + box_specs_list = [[(.1, 1.0), (.1, 2.0)], # for 8x8 grid + [(.2, 1.0), (.3, 1.0), (.2, 2.0)]] # for 4x4 grid + To support the fully convolutional setting, we pass grid sizes in at + generation time, while scale and aspect ratios are fixed at construction + time. + Args: + box_specs_list: list of list of (scale, aspect ratio) pairs with the + outside list having the same number of entries as feature_map_shape_list + (which is passed in at generation time). + base_anchor_size: base anchor size as [height, width] + (length-2 float tensor, default=[1.0, 1.0]). + The height and width values are normalized to the + minimum dimension of the input height and width, so that + when the base anchor height equals the base anchor + width, the resulting anchor is square even if the input + image is not square. + anchor_strides: list of pairs of strides in pixels (in y and x directions + respectively). 
For example, setting anchor_strides=[(25, 25), (50, 50)] + means that we want the anchors corresponding to the first layer to be + strided by 25 pixels and those in the second layer to be strided by 50 + pixels in both y and x directions. If anchor_strides=None, they are set + to be the reciprocal of the corresponding feature map shapes. + anchor_offsets: list of pairs of offsets in pixels (in y and x directions + respectively). The offset specifies where we want the center of the + (0, 0)-th anchor to lie for each layer. For example, setting + anchor_offsets=[(10, 10), (20, 20)]) means that we want the + (0, 0)-th anchor of the first layer to lie at (10, 10) in pixel space + and likewise that we want the (0, 0)-th anchor of the second layer to + lie at (25, 25) in pixel space. If anchor_offsets=None, then they are + set to be half of the corresponding anchor stride. + clip_window: a tensor of shape [4] specifying a window to which all + anchors should be clipped. If clip_window is None, then no clipping + is performed. + Raises: + ValueError: if box_specs_list is not a list of list of pairs + ValueError: if clip_window is not either None or a tensor of shape [4] + """ + if isinstance(box_specs_list, list) and all( + [isinstance(list_item, list) for list_item in box_specs_list] + ): + self._box_specs = box_specs_list + else: + raise ValueError( + "box_specs_list is expected to be a " "list of lists of pairs" + ) + if base_anchor_size is None: + base_anchor_size = torch.tensor([256, 256], dtype=torch.float32) + self._base_anchor_size = base_anchor_size + self._anchor_strides = anchor_strides + self._anchor_offsets = anchor_offsets + if clip_window is not None and list(clip_window.shape) != [4]: + raise ValueError( + "clip_window must either be None or a shape [4] tensor") + self._clip_window = clip_window + self._scales = [] + self._aspect_ratios = [] + for box_spec in self._box_specs: + if not all( + [isinstance(entry, tuple) and len(entry) + == 2 for entry in box_spec] + ): + raise ValueError( + "box_specs_list is expected to be a " "list of lists of pairs" + ) + scales, aspect_ratios = zip(*box_spec) + self._scales.append(scales) + self._aspect_ratios.append(aspect_ratios) + + for arg, arg_name in zip( + [self._anchor_strides, self._anchor_offsets], + ["anchor_strides", "anchor_offsets"], + ): + if arg and not (isinstance(arg, list) and len(arg) + == len(self._box_specs)): + raise ValueError( + "%s must be a list with the same length " + "as self._box_specs" % arg_name + ) + if arg and not all( + [ + isinstance(list_item, tuple) and len(list_item) == 2 + for list_item in arg + ] + ): + raise ValueError("%s must be a list of pairs." % arg_name) + + def _generate(self, feature_map_shape_list, im_height=1, im_width=1): + """Generates a collection of bounding boxes to be used as anchors. + The number of anchors generated for a single grid with shape MxM where we + place k boxes over each grid center is k*M^2 and thus the total number of + anchors is the sum over all grids. In our box_specs_list example + (see the constructor docstring), we would place two boxes over each grid + point on an 8x8 grid and three boxes over each grid point on a 4x4 grid and + thus end up with 2*8^2 + 3*4^2 = 176 anchors in total. The layout of the + output anchors follows the order of how the grid sizes and box_specs are + specified (with box_spec index varying the fastest, followed by width + index, then height index, then grid index). 
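The _generate docstring above works through a concrete count: two box specs on an 8x8 grid plus three on a 4x4 grid. The quoted total of 176 anchors is just boxes-per-location times grid area, summed over layers:

feature_map_shape_list = [(8, 8), (4, 4)]   # example grids from the docstring
boxes_per_location = [2, 3]                 # len of each box_specs entry
total = sum(
    k * h * w for k, (h, w) in zip(boxes_per_location, feature_map_shape_list)
)
print(total)   # 2*8*8 + 3*4*4 = 176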
+ Args: + feature_map_shape_list: list of pairs of convnet layer resolutions in the + format [(height_0, width_0), (height_1, width_1), ...]. For example, + setting feature_map_shape_list=[(8, 8), (7, 7)] asks for anchors that + correspond to an 8x8 layer followed by a 7x7 layer. + im_height: the height of the image to generate the grid for. If both + im_height and im_width are 1, the generated anchors default to + absolute coordinates, otherwise normalized coordinates are produced. + im_width: the width of the image to generate the grid for. If both + im_height and im_width are 1, the generated anchors default to + absolute coordinates, otherwise normalized coordinates are produced. + Returns: + boxes_list: a list of BoxLists each holding anchor boxes corresponding to + the input feature map shapes. + Raises: + ValueError: if feature_map_shape_list, box_specs_list do not have the same + length. + ValueError: if feature_map_shape_list does not consist of pairs of + integers + """ + if not ( + isinstance(feature_map_shape_list, list) + and len(feature_map_shape_list) == len(self._box_specs) + ): + raise ValueError( + "feature_map_shape_list must be a list with the same " + "length as self._box_specs" + ) + if not all( + [ + isinstance(list_item, tuple) and len(list_item) == 2 + for list_item in feature_map_shape_list + ] + ): + raise ValueError("feature_map_shape_list must be a list of pairs.") + + im_height = float(im_height) + im_width = float(im_width) + + if not self._anchor_strides: + anchor_strides = [ + (1.0 / float(pair[0]), 1.0 / float(pair[1])) + for pair in feature_map_shape_list + ] + else: + anchor_strides = [ + (float(stride[0]) / im_height, float(stride[1]) / im_width) + for stride in self._anchor_strides + ] + if not self._anchor_offsets: + anchor_offsets = [ + (0.5 * stride[0], 0.5 * stride[1]) for stride in anchor_strides + ] + else: + anchor_offsets = [ + (float(offset[0]) / im_height, float(offset[1]) / im_width) + for offset in self._anchor_offsets + ] - for arg, arg_name in zip([anchor_strides, anchor_offsets], - ['anchor_strides', 'anchor_offsets']): - if not (isinstance(arg, list) and len(arg) == len(self._box_specs)): - raise ValueError('%s must be a list with the same length ' - 'as self._box_specs' % arg_name) - if not all([isinstance(list_item, tuple) and len(list_item) == 2 - for list_item in arg]): - raise ValueError('%s must be a list of pairs.' % arg_name) + for arg, arg_name in zip( + [anchor_strides, anchor_offsets], [ + "anchor_strides", "anchor_offsets"] + ): + if not (isinstance(arg, list) and len( + arg) == len(self._box_specs)): + raise ValueError( + "%s must be a list with the same length " + "as self._box_specs" % arg_name + ) + if not all( + [ + isinstance(list_item, tuple) and len(list_item) == 2 + for list_item in arg + ] + ): + raise ValueError("%s must be a list of pairs." 
% arg_name) - anchor_grid_list = [] - min_im_shape = min(im_height, im_width) - scale_height = min_im_shape / im_height - scale_width = min_im_shape / im_width - base_anchor_size = [ - scale_height * self._base_anchor_size[0], - scale_width * self._base_anchor_size[1] - ] - for feature_map_index, (grid_size, scales, aspect_ratios, stride, - offset) in enumerate( - zip(feature_map_shape_list, self._scales, - self._aspect_ratios, anchor_strides, - anchor_offsets)): - tiled_anchors = tile_anchors( - grid_height=grid_size[0], - grid_width=grid_size[1], - scales=scales, - aspect_ratios=aspect_ratios, - base_anchor_size=base_anchor_size, - anchor_stride=stride, - anchor_offset=offset) - if self._clip_window is not None: - raise NotImplementedError("Oups!") - num_anchors_in_layer = len(tiled_anchors) - anchor_indices = feature_map_index * torch.ones(num_anchors_in_layer) - anchor_grid_list.append(tiled_anchors) + anchor_grid_list = [] + min_im_shape = min(im_height, im_width) + scale_height = min_im_shape / im_height + scale_width = min_im_shape / im_width + base_anchor_size = [ + scale_height * self._base_anchor_size[0], + scale_width * self._base_anchor_size[1], + ] + for feature_map_index, ( + grid_size, + scales, + aspect_ratios, + stride, + offset, + ) in enumerate( + zip( + feature_map_shape_list, + self._scales, + self._aspect_ratios, + anchor_strides, + anchor_offsets, + ) + ): + tiled_anchors = tile_anchors( + grid_height=grid_size[0], + grid_width=grid_size[1], + scales=scales, + aspect_ratios=aspect_ratios, + base_anchor_size=base_anchor_size, + anchor_stride=stride, + anchor_offset=offset, + ) + if self._clip_window is not None: + raise NotImplementedError("Oups!") + num_anchors_in_layer = len(tiled_anchors) + anchor_indices = feature_map_index * \ + torch.ones(num_anchors_in_layer) + anchor_grid_list.append(tiled_anchors) - return anchor_grid_list + return anchor_grid_list diff --git a/vision/classification_and_detection/python/models/base_model_r34.py b/vision/classification_and_detection/python/models/base_model_r34.py index ea224a7ca..9e8a2da61 100644 --- a/vision/classification_and_detection/python/models/base_model_r34.py +++ b/vision/classification_and_detection/python/models/base_model_r34.py @@ -2,7 +2,7 @@ Load the vgg16 weight and save it to special file """ -#from torchvision.models.vgg import vgg16 +# from torchvision.models.vgg import vgg16 import torch.nn as nn import torch.nn.functional as F import torch @@ -11,29 +11,32 @@ from torchvision.models.resnet import resnet18, resnet34, resnet50 + def _ModifyConvStrideDilation(conv, stride=(1, 1), padding=None): conv.stride = stride if padding is not None: conv.padding = padding + def _ModifyBlock(block, bottleneck=False, **kwargs): for m in list(block.children()): if bottleneck: - _ModifyConvStrideDilation(m.conv2, **kwargs) + _ModifyConvStrideDilation(m.conv2, **kwargs) else: - _ModifyConvStrideDilation(m.conv1, **kwargs) + _ModifyConvStrideDilation(m.conv1, **kwargs) if m.downsample is not None: # need to make sure no padding for the 1x1 residual connection - _ModifyConvStrideDilation(list(m.downsample.children())[0], **kwargs) + _ModifyConvStrideDilation( + list(m.downsample.children())[0], **kwargs) + class ResNet18(nn.Module): def __init__(self): super().__init__() rn18 = resnet18(pretrained=True) - # discard last Resnet block, avrpooling and classification FC # layer1 = up to and including conv3 block self.layer1 = nn.Sequential(*list(rn18.children())[:6]) @@ -43,7 +46,7 @@ def __init__(self): # modify conv4 if 
necessary # Always deal with stride in first block modulelist = list(self.layer2.children()) - _ModifyBlock(modulelist[0], stride=(1,1)) + _ModifyBlock(modulelist[0], stride=(1, 1)) def forward(self, data): layer1_activation = self.layer1(data) @@ -53,6 +56,7 @@ def forward(self, data): # Only need the output of conv4 return [layer2_activation] + class ResNet34(nn.Module): def __init__(self): super().__init__() @@ -64,8 +68,7 @@ def __init__(self): # modify conv4 if necessary # Always deal with stride in first block modulelist = list(self.layer2.children()) - _ModifyBlock(modulelist[0], stride=(1,1)) - + _ModifyBlock(modulelist[0], stride=(1, 1)) def forward(self, data): layer1_activation = self.layer1(data) @@ -74,22 +77,28 @@ def forward(self, data): return [layer2_activation] + class L2Norm(nn.Module): """ - Scale shall be learnable according to original paper - scale: initial scale number - chan_num: L2Norm channel number (norm over all channels) + Scale shall be learnable according to original paper + scale: initial scale number + chan_num: L2Norm channel number (norm over all channels) """ + def __init__(self, scale=20, chan_num=512): super(L2Norm, self).__init__() # Scale across channels - self.scale = \ - nn.Parameter(torch.Tensor([scale]*chan_num).view(1, chan_num, 1, 1)) + self.scale = nn.Parameter( + torch.Tensor([scale] * chan_num).view(1, chan_num, 1, 1) + ) def forward(self, data): # normalize accross channel - return self.scale*data*data.pow(2).sum(dim=1, keepdim=True).clamp(min=1e-12).rsqrt() - + return ( + self.scale + * data + * data.pow(2).sum(dim=1, keepdim=True).clamp(min=1e-12).rsqrt() + ) def tailor_module(src_model, src_dir, tgt_model, tgt_dir): @@ -107,22 +116,23 @@ def tailor_module(src_model, src_dir, tgt_model, tgt_dir): for k1, k2 in zip(keys1, keys2): # print(k1, k2) state[k2] = src_state[k1] - #diff_keys = state.keys() - target_model.state_dict().keys() - #print("Different Keys:", diff_keys) + # diff_keys = state.keys() - target_model.state_dict().keys() + # print("Different Keys:", diff_keys) # Remove unecessary keys - #for k in diff_keys: + # for k in diff_keys: # state.pop(k) tgt_model.load_state_dict(state) torch.save(tgt_model.state_dict(), tgt_dir) + # Default vgg16 in pytorch seems different from ssd def make_layers(cfg, batch_norm=False): layers = [] in_channels = 3 for v in cfg: - if v == 'M': + if v == "M": layers += [nn.MaxPool2d(kernel_size=2, stride=2)] - elif v == 'C': + elif v == "C": # Notice ceil_mode is true layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] else: @@ -134,42 +144,51 @@ def make_layers(cfg, batch_norm=False): in_channels = v return layers + class Loss(nn.Module): """ - Implements the loss as the sum of the followings: - 1. Confidence Loss: All labels, with hard negative mining - 2. Localization Loss: Only on positive labels - Suppose input dboxes has the shape 8732x4 + Implements the loss as the sum of the followings: + 1. Confidence Loss: All labels, with hard negative mining + 2. 
Localization Loss: Only on positive labels + Suppose input dboxes has the shape 8732x4 """ def __init__(self, dboxes): super(Loss, self).__init__() - self.scale_xy = 1.0/dboxes.scale_xy - self.scale_wh = 1.0/dboxes.scale_wh + self.scale_xy = 1.0 / dboxes.scale_xy + self.scale_wh = 1.0 / dboxes.scale_wh self.sl1_loss = nn.SmoothL1Loss(reduce=False) - self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim = 0), - requires_grad=False) + self.dboxes = nn.Parameter( + dboxes(order="xywh").transpose(0, 1).unsqueeze(dim=0), requires_grad=False + ) # Two factor are from following links # http://jany.st/post/2017-11-05-single-shot-detector-ssd-from-scratch-in-tensorflow.html self.con_loss = nn.CrossEntropyLoss(reduce=False) def _loc_vec(self, loc): """ - Generate Location Vectors + Generate Location Vectors """ - gxy = self.scale_xy*(loc[:, :2, :] - self.dboxes[:, :2, :])/self.dboxes[:, 2:, ] - gwh = self.scale_wh*(loc[:, 2:, :]/self.dboxes[:, 2:, :]).log() + gxy = ( + self.scale_xy + * (loc[:, :2, :] - self.dboxes[:, :2, :]) + / self.dboxes[ + :, + 2:, + ] + ) + gwh = self.scale_wh * (loc[:, 2:, :] / self.dboxes[:, 2:, :]).log() return torch.cat((gxy, gwh), dim=1).contiguous() def forward(self, ploc, plabel, gloc, glabel): """ - ploc, plabel: Nx4x8732, Nxlabel_numx8732 - predicted location and labels + ploc, plabel: Nx4x8732, Nxlabel_numx8732 + predicted location and labels - gloc, glabel: Nx4x8732, Nx8732 - ground truth location and labels + gloc, glabel: Nx4x8732, Nx8732 + ground truth location and labels """ mask = glabel > 0 @@ -177,7 +196,7 @@ def forward(self, ploc, plabel, gloc, glabel): vec_gd = self._loc_vec(gloc) # sum on four coordinates, and mask sl1 = self.sl1_loss(ploc, vec_gd).sum(dim=1) - sl1 = (mask.float()*sl1).sum(dim=1) + sl1 = (mask.float() * sl1).sum(dim=1) # hard negative mining con = self.con_loss(plabel, glabel) @@ -189,16 +208,15 @@ def forward(self, ploc, plabel, gloc, glabel): _, con_rank = con_idx.sort(dim=1) # number of negative three times positive - neg_num = torch.clamp(3*pos_num, max=mask.size(1)).unsqueeze(-1) + neg_num = torch.clamp(3 * pos_num, max=mask.size(1)).unsqueeze(-1) neg_mask = con_rank < neg_num - closs = (con*(mask.float() + neg_mask.float())).sum(dim=1) + closs = (con * (mask.float() + neg_mask.float())).sum(dim=1) # avoid no object detected total_loss = sl1 + closs num_mask = (pos_num > 0).float() pos_num = pos_num.float().clamp(min=1e-6) - ret = (total_loss*num_mask/pos_num).mean(dim=0) + ret = (total_loss * num_mask / pos_num).mean(dim=0) return ret - diff --git a/vision/classification_and_detection/python/models/convert_tf_weights.py b/vision/classification_and_detection/python/models/convert_tf_weights.py index 02c51bf1b..4d063caef 100644 --- a/vision/classification_and_detection/python/models/convert_tf_weights.py +++ b/vision/classification_and_detection/python/models/convert_tf_weights.py @@ -17,7 +17,7 @@ def remap_tf_base_names(orig_weights): if "batchnorm" not in k and "pointwise_" not in k } - matcher = re.compile("(.*)Conv2d_(\d+)") + matcher = re.compile("(.*)Conv2d_(\\d+)") mapping = {} for k in convs.keys(): l = matcher.match(k).group(2) @@ -52,7 +52,7 @@ def remap_tf_extras(orig_weights): } weights = {k: v for k, v in weights.items() if "pointwise_" in k} - matcher = re.compile("(.*)Conv2d_(\d+)_(\d)x(\d)") + matcher = re.compile("(.*)Conv2d_(\\d+)_(\\d)x(\\d)") mapping = {} for k in weights.keys(): m = matcher.match(k) @@ -75,7 +75,7 @@ def remap_tf_predictors(orig_weights): weights = {k: v for k, v in 
orig_weights.items() if "BoxPredictor" in k} weights = {k: v for k, v in weights.items() if "BoxEncodingPredictor" in k} - matcher = re.compile("BoxPredictor_(\d+)") + matcher = re.compile("BoxPredictor_(\\d+)") for k in weights.keys(): pos = matcher.match(k).group(1) wtype = "weight" if "weights" in k else "bias" @@ -125,13 +125,15 @@ def get_state_dict(weights): def read_tf_weights(frozen_model): import tensorflow as tf from tensorflow.python.framework import tensor_util + weights = {} with tf.Session() as sess: - with tf.gfile.GFile(frozen_model, 'rb') as f: + with tf.gfile.GFile(frozen_model, "rb") as f: graph_def = tf.GraphDef() graph_def.ParseFromString(f.read()) tf.import_graph_def(graph_def) for n in graph_def.node: - if n.op == 'Const': - weights[n.name] = tensor_util.MakeNdarray(n.attr['value'].tensor) + if n.op == "Const": + weights[n.name] = tensor_util.MakeNdarray( + n.attr["value"].tensor) return weights diff --git a/vision/classification_and_detection/python/models/ssd_mobilenet_v1.py b/vision/classification_and_detection/python/models/ssd_mobilenet_v1.py index eead72ae7..97be21982 100644 --- a/vision/classification_and_detection/python/models/ssd_mobilenet_v1.py +++ b/vision/classification_and_detection/python/models/ssd_mobilenet_v1.py @@ -81,7 +81,8 @@ def __init__(self, in_channels, num_classes, num_anchors): self.classification = nn.Conv2d( in_channels, num_classes * num_anchors, kernel_size=1 ) - self.regression = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1) + self.regression = nn.Conv2d( + in_channels, num_anchors * 4, kernel_size=1) self.num_classes = num_classes self.num_anchors = num_anchors @@ -161,13 +162,12 @@ def ssd_model(self, x): self._feature_map_shapes = shapes self.coder_weights = self.coder_weights.to(scores) - if box_regression.dim()==2: + if box_regression.dim() == 2: box_regression = box_regression[None] boxes = decode_boxes(box_regression, self.priors, self.coder_weights) # add a batch dimension return scores, boxes - def forward(self, images): """ Arguments: @@ -175,13 +175,15 @@ def forward(self, images): """ scores, boxes = self.ssd_model(images) - list_boxes=[]; list_labels=[]; list_scores=[] + list_boxes = [] + list_labels = [] + list_scores = [] for b in range(len(scores)): bboxes, blabels, bscores = self.filter_results(scores[b], boxes[b]) list_boxes.append(bboxes) list_labels.append(blabels.long()) list_scores.append(bscores) - #boxes = self.rescale_boxes(boxes, height, width) + # boxes = self.rescale_boxes(boxes, height, width) return [list_boxes, list_labels, list_scores] def filter_results(self, scores, boxes): @@ -190,8 +192,8 @@ def filter_results(self, scores, boxes): # on python. 
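The remapping helpers in convert_tf_weights.py, earlier in this hunk, all lean on the same trick: a compiled regex pulls the layer or predictor index out of a TensorFlow variable name, and the name's suffix decides whether the tensor becomes a PyTorch weight or bias. A self-contained sketch with a made-up variable name, reusing the BoxPredictor pattern from remap_tf_predictors:

import re

# Hypothetical TF variable name, shaped like the ones the regexes expect.
name = "BoxPredictor_2/BoxEncodingPredictor/weights"

matcher = re.compile(r"BoxPredictor_(\d+)")   # same pattern as remap_tf_predictors
pos = matcher.match(name).group(1)
wtype = "weight" if "weights" in name else "bias"
print(pos, wtype)   # 2 weight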
This implementation is faster on the # CPU, which is why we run this part on the CPU cpu_device = torch.device("cpu") - #boxes = boxes[0] - #scores = scores[0] + # boxes = boxes[0] + # scores = scores[0] boxes = boxes.to(cpu_device) scores = scores.to(cpu_device) selected_box_probs = [] @@ -205,7 +207,8 @@ def filter_results(self, scores, boxes): box_probs = nms(box_probs, self.nms_threshold) selected_box_probs.append(box_probs) labels.append( - torch.full((box_probs.size(0),), class_index, dtype=torch.int64) + torch.full( + (box_probs.size(0),), class_index, dtype=torch.int64) ) selected_box_probs = torch.cat(selected_box_probs) labels = torch.cat(labels) diff --git a/vision/classification_and_detection/python/models/ssd_r34.py b/vision/classification_and_detection/python/models/ssd_r34.py index 63e596b4a..e8138e3a1 100644 --- a/vision/classification_and_detection/python/models/ssd_r34.py +++ b/vision/classification_and_detection/python/models/ssd_r34.py @@ -6,102 +6,114 @@ import itertools import torch.nn.functional as F -##Inspired by https://github.com/kuangliu/pytorch-ssd +# Inspired by https://github.com/kuangliu/pytorch-ssd + class Encoder(object): """ - Transform between (bboxes, lables) <-> SSD output - - dboxes: default boxes in size 8732 x 4, - encoder: input ltrb format, output xywh format - decoder: input xywh format, output ltrb format - - decode: - input : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems) - output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes) - criteria : IoU threshold of bboexes - max_output : maximum number of output bboxes + Transform between (bboxes, lables) <-> SSD output + + dboxes: default boxes in size 8732 x 4, + encoder: input ltrb format, output xywh format + decoder: input xywh format, output ltrb format + + decode: + input : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems) + output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes) + criteria : IoU threshold of bboexes + max_output : maximum number of output bboxes """ def __init__(self, dboxes): self.dboxes = dboxes(order="ltrb") self.dboxes_xywh = dboxes(order="xywh").unsqueeze(dim=0) self.nboxes = self.dboxes.size(0) - #print("# Bounding boxes: {}".format(self.nboxes)) + # print("# Bounding boxes: {}".format(self.nboxes)) self.scale_xy = torch.tensor(dboxes.scale_xy) self.scale_wh = torch.tensor(dboxes.scale_wh) - - - def decode_batch(self, bboxes_in, scores_in, criteria = 0.45, max_output=200): + + def decode_batch(self, bboxes_in, scores_in, + criteria=0.45, max_output=200): self.dboxes = self.dboxes.to(bboxes_in) self.dboxes_xywh = self.dboxes_xywh.to(bboxes_in) - bboxes, probs = scale_back_batch(bboxes_in, scores_in,self.scale_xy,self.scale_wh,self.dboxes_xywh) - boxes = []; labels=[]; scores=[] + bboxes, probs = scale_back_batch( + bboxes_in, scores_in, self.scale_xy, self.scale_wh, self.dboxes_xywh + ) + boxes = [] + labels = [] + scores = [] for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)): bbox = bbox.squeeze(0) prob = prob.squeeze(0) - dbox,dlabel,dscore=self.decode_single(bbox, prob, criteria, max_output) + dbox, dlabel, dscore = self.decode_single( + bbox, prob, criteria, max_output) boxes.append(dbox) labels.append(dlabel) scores.append(dscore) - - return [boxes,labels,scores] + + return [boxes, labels, scores] # perform non-maximum suppression - def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200): + def decode_single(self, bboxes_in, scores_in, + criteria, max_output, max_num=200): # 
Reference to https://github.com/amdegroot/ssd.pytorch - - bboxes_out = [] + + bboxes_out = [] scores_out = [] labels_out = [] for i, score in enumerate(scores_in.split(1, 1)): # skip background - if i == 0: continue - + if i == 0: + continue + score = score.squeeze(1) mask = score > 0.05 bboxes, score = bboxes_in[mask, :], score[mask] - if score.size(0) == 0: continue + if score.size(0) == 0: + continue score_sorted, score_idx_sorted = score.sort(dim=0) - + # select max_output indices score_idx_sorted = score_idx_sorted[-max_num:] candidates = [] - + while score_idx_sorted.numel() > 0: idx = score_idx_sorted[-1].item() bboxes_sorted = bboxes[score_idx_sorted, :] bboxes_idx = bboxes[idx, :].unsqueeze(dim=0) - iou_sorted = calc_iou_tensor(bboxes_sorted, bboxes_idx).squeeze() - # we only need iou < criteria + iou_sorted = calc_iou_tensor( + bboxes_sorted, bboxes_idx).squeeze() + # we only need iou < criteria score_idx_sorted = score_idx_sorted[iou_sorted < criteria] candidates.append(idx) bboxes_out.append(bboxes[candidates, :]) scores_out.append(score[candidates]) - labels_out.extend([i]*len(candidates)) - - bboxes_out, labels_out, scores_out = torch.cat(bboxes_out, dim=0), \ - torch.tensor(labels_out, dtype=torch.long), \ - torch.cat(scores_out, dim=0) + labels_out.extend([i] * len(candidates)) + bboxes_out, labels_out, scores_out = ( + torch.cat(bboxes_out, dim=0), + torch.tensor(labels_out, dtype=torch.long), + torch.cat(scores_out, dim=0), + ) _, max_ids = scores_out.sort(dim=0) max_ids = max_ids[-max_output:] return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids] + @torch.jit.script def calc_iou_tensor(box1, box2): - """ Calculation of IoU based on two boxes tensor, - Reference to https://github.com/kuangliu/pytorch-ssd - input: - box1 (N, 4) - box2 (M, 4) - output: - IoU (N, M) + """Calculation of IoU based on two boxes tensor, + Reference to https://github.com/kuangliu/pytorch-ssd + input: + box1 (N, 4) + box2 (M, 4) + output: + IoU (N, M) """ N = box1.size(0) M = box2.size(0) @@ -110,37 +122,42 @@ def calc_iou_tensor(box1, box2): be2 = box2.unsqueeze(0).expand(N, -1, -1) # Left Top & Right Bottom - lt = torch.max(be1[:,:,:2], be2[:,:,:2]) - rb = torch.min(be1[:,:,2:], be2[:,:,2:]) + lt = torch.max(be1[:, :, :2], be2[:, :, :2]) + rb = torch.min(be1[:, :, 2:], be2[:, :, 2:]) delta = rb - lt - delta.clone().masked_fill_(delta < 0,0) - intersect = delta[:,:,0]*delta[:,:,1] - delta1 = be1[:,:,2:] - be1[:,:,:2] - area1 = delta1[:,:,0]*delta1[:,:,1] - delta2 = be2[:,:,2:] - be2[:,:,:2] - area2 = delta2[:,:,0]*delta2[:,:,1] - - iou = intersect/(area1 + area2 - intersect) + delta.clone().masked_fill_(delta < 0, 0) + intersect = delta[:, :, 0] * delta[:, :, 1] + delta1 = be1[:, :, 2:] - be1[:, :, :2] + area1 = delta1[:, :, 0] * delta1[:, :, 1] + delta2 = be2[:, :, 2:] - be2[:, :, :2] + area2 = delta2[:, :, 0] * delta2[:, :, 1] + + iou = intersect / (area1 + area2 - intersect) return iou + @torch.jit.script -def scale_back_batch(bboxes_in, scores_in,scale_xy,scale_wh,dboxes_xywh): +def scale_back_batch(bboxes_in, scores_in, scale_xy, scale_wh, dboxes_xywh): + """ + Do scale and transform from xywh to ltrb + suppose input Nx4xnum_bbox Nxlabel_numxnum_bbox """ - Do scale and transform from xywh to ltrb - suppose input Nx4xnum_bbox Nxlabel_numxnum_bbox - """ bboxes_in = bboxes_in.permute(0, 2, 1) scores_in = scores_in.permute(0, 2, 1) - bboxes_in[:, :, :2] = scale_xy*bboxes_in[:, :, :2] - bboxes_in[:, :, 2:] = scale_wh*bboxes_in[:, :, 2:] - bboxes_in[:, :, :2] = bboxes_in[:, :, 
:2]*dboxes_xywh[:, :, 2:] + dboxes_xywh[:, :, :2] - bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp()*dboxes_xywh[:, :, 2:] - # Transform format to ltrb - l, t, r, b = bboxes_in[:, :, 0] - 0.5*bboxes_in[:, :, 2],\ - bboxes_in[:, :, 1] - 0.5*bboxes_in[:, :, 3],\ - bboxes_in[:, :, 0] + 0.5*bboxes_in[:, :, 2],\ - bboxes_in[:, :, 1] + 0.5*bboxes_in[:, :, 3] + bboxes_in[:, :, :2] = scale_xy * bboxes_in[:, :, :2] + bboxes_in[:, :, 2:] = scale_wh * bboxes_in[:, :, 2:] + bboxes_in[:, :, :2] = ( + bboxes_in[:, :, :2] * dboxes_xywh[:, :, 2:] + dboxes_xywh[:, :, :2] + ) + bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp() * dboxes_xywh[:, :, 2:] + # Transform format to ltrb + l, t, r, b = ( + bboxes_in[:, :, 0] - 0.5 * bboxes_in[:, :, 2], + bboxes_in[:, :, 1] - 0.5 * bboxes_in[:, :, 3], + bboxes_in[:, :, 0] + 0.5 * bboxes_in[:, :, 2], + bboxes_in[:, :, 1] + 0.5 * bboxes_in[:, :, 3], + ) bboxes_in[:, :, 0] = l bboxes_in[:, :, 1] = t bboxes_in[:, :, 2] = r @@ -149,92 +166,116 @@ def scale_back_batch(bboxes_in, scores_in,scale_xy,scale_wh,dboxes_xywh): class DefaultBoxes(object): - def __init__(self, fig_size, feat_size, steps, scales, aspect_ratios, \ - scale_xy=0.1, scale_wh=0.2): + def __init__( + self, + fig_size, + feat_size, + steps, + scales, + aspect_ratios, + scale_xy=0.1, + scale_wh=0.2, + ): self.feat_size = feat_size - self.fig_size_w,self.fig_size_h = fig_size + self.fig_size_w, self.fig_size_h = fig_size self.scale_xy_ = scale_xy self.scale_wh_ = scale_wh - + # According to https://github.com/weiliu89/caffe # Calculation method slightly different from paper self.steps_w = [st[0] for st in steps] self.steps_h = [st[1] for st in steps] self.scales = scales - fkw = self.fig_size_w//np.array(self.steps_w) - fkh = self.fig_size_h//np.array(self.steps_h) + fkw = self.fig_size_w // np.array(self.steps_w) + fkh = self.fig_size_h // np.array(self.steps_h) self.aspect_ratios = aspect_ratios self.default_boxes = [] # size of feature and number of feature for idx, sfeat in enumerate(self.feat_size): - sfeat_w,sfeat_h=sfeat - sk1 = scales[idx][0]/self.fig_size_w - sk2 = scales[idx+1][1]/self.fig_size_h - sk3 = sqrt(sk1*sk2) + sfeat_w, sfeat_h = sfeat + sk1 = scales[idx][0] / self.fig_size_w + sk2 = scales[idx + 1][1] / self.fig_size_h + sk3 = sqrt(sk1 * sk2) all_sizes = [(sk1, sk1), (sk3, sk3)] for alpha in aspect_ratios[idx]: - w, h = sk1*sqrt(alpha), sk1/sqrt(alpha) + w, h = sk1 * sqrt(alpha), sk1 / sqrt(alpha) all_sizes.append((w, h)) all_sizes.append((h, w)) for w, h in all_sizes: for i, j in itertools.product(range(sfeat_w), range(sfeat_h)): - cx, cy = (j+0.5)/fkh[idx], (i+0.5)/fkw[idx] - self.default_boxes.append((cx, cy, w, h)) + cx, cy = (j + 0.5) / fkh[idx], (i + 0.5) / fkw[idx] + self.default_boxes.append((cx, cy, w, h)) self.dboxes = torch.tensor(self.default_boxes) self.dboxes.clamp_(min=0, max=1) # For IoU calculation self.dboxes_ltrb = self.dboxes.clone() - self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5*self.dboxes[:, 2] - self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5*self.dboxes[:, 3] - self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5*self.dboxes[:, 2] - self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5*self.dboxes[:, 3] - + self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5 * self.dboxes[:, 2] + self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5 * self.dboxes[:, 3] + self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5 * self.dboxes[:, 2] + self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5 * self.dboxes[:, 3] + @property def scale_xy(self): return self.scale_xy_ - - @property + + @property def 
scale_wh(self): return self.scale_wh_ def __call__(self, order="ltrb"): - if order == "ltrb": return self.dboxes_ltrb - if order == "xywh": return self.dboxes + if order == "ltrb": + return self.dboxes_ltrb + if order == "xywh": + return self.dboxes + -def dboxes_R34_coco(figsize,strides): +def dboxes_R34_coco(figsize, strides): feat_size = [[50, 50], [25, 25], [13, 13], [7, 7], [3, 3], [3, 3]] - steps=[(int(figsize[0]/fs[0]),int(figsize[1]/fs[1])) for fs in feat_size] - scales = [(int(s*figsize[0]/300),int(s*figsize[1]/300)) for s in [21, 45, 99, 153, 207, 261, 315]] - aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] + steps = [(int(figsize[0] / fs[0]), int(figsize[1] / fs[1])) + for fs in feat_size] + scales = [ + (int(s * figsize[0] / 300), int(s * figsize[1] / 300)) + for s in [21, 45, 99, 153, 207, 261, 315] + ] + aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] dboxes = DefaultBoxes(figsize, feat_size, steps, scales, aspect_ratios) return dboxes + class SSD_R34(nn.Module): """ - Build a SSD module to take 300x300 image input, - and output 8732 per class bounding boxes + Build a SSD module to take 300x300 image input, + and output 8732 per class bounding boxes - vggt: pretrained vgg16 (partial) model - label_num: number of classes (including background 0) + vggt: pretrained vgg16 (partial) model + label_num: number of classes (including background 0) """ - def __init__(self, label_num=81, backbone='resnet34', model_path="./resnet34-333f7ec4.pth",strides=[3,3 ,2 ,2 ,2 ,2],extract_shapes=False): + + def __init__( + self, + label_num=81, + backbone="resnet34", + model_path="./resnet34-333f7ec4.pth", + strides=[3, 3, 2, 2, 2, 2], + extract_shapes=False, + ): super(SSD_R34, self).__init__() self.label_num = label_num self.strides = strides - if backbone == 'resnet34': + if backbone == "resnet34": self.model = ResNet34() out_channels = 256 self.out_chan = [out_channels, 512, 512, 256, 256, 256] else: - raise ValueError('Invalid backbone chosen') + raise ValueError("Invalid backbone chosen") self._build_additional_features(self.out_chan) - self.extract_shapes=extract_shapes + self.extract_shapes = extract_shapes # after l2norm, conv7, conv8_2, conv9_2, conv10_2, conv11_2 # classifer 1, 2, 3, 4, 5 ,6 @@ -242,110 +283,159 @@ def __init__(self, label_num=81, backbone='resnet34', model_path="./resnet34-333 self.loc = [] self.conf = [] for nd, oc in zip(self.num_defaults, self.out_chan): - self.loc.append(nn.Conv2d(oc, nd*4, kernel_size=3, padding=1,stride=self.strides[0])) - self.conf.append(nn.Conv2d(oc, nd*label_num, kernel_size=3, padding=1,stride=self.strides[1])) + self.loc.append( + nn.Conv2d( + oc, + nd * 4, + kernel_size=3, + padding=1, + stride=self.strides[0]) + ) + self.conf.append( + nn.Conv2d( + oc, nd * label_num, kernel_size=3, padding=1, stride=self.strides[1] + ) + ) self.loc = nn.ModuleList(self.loc) self.conf = nn.ModuleList(self.conf) if not extract_shapes: - self.size=(1200,1200) - dboxes = dboxes_R34_coco(list(self.size),[3,3,2,2,2,2]) + self.size = (1200, 1200) + dboxes = dboxes_R34_coco(list(self.size), [3, 3, 2, 2, 2, 2]) self.encoder = Encoder(dboxes) # intitalize all weights self._init_weights() self.device = 1 + def _build_additional_features(self, input_channels): idx = 0 self.additional_blocks = [] - - self.additional_blocks.append(nn.Sequential( - nn.Conv2d(input_channels[idx], 256, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(256, input_channels[idx+1], kernel_size=3, padding=1,stride=self.strides[2]), - nn.ReLU(inplace=True), - )) + + 
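# --- Editor's illustrative sketch (not part of this patch) -------------------
# The reformatted dboxes_R34_coco() above only rescales the canonical 300x300
# SSD anchor settings to the requested input size.  The arithmetic below
# reproduces that rescaling for the 1200x1200 case used by SSD_R34.__init__;
# all names are local to this sketch.
feat_size = [[50, 50], [25, 25], [13, 13], [7, 7], [3, 3], [3, 3]]
figsize = (1200, 1200)
steps = [(int(figsize[0] / fs[0]), int(figsize[1] / fs[1])) for fs in feat_size]
scales = [(int(s * figsize[0] / 300), int(s * figsize[1] / 300))
          for s in [21, 45, 99, 153, 207, 261, 315]]
# steps  == [(24, 24), (48, 48), (92, 92), (171, 171), (400, 400), (400, 400)]
# scales == [(84, 84), (180, 180), (396, 396), (612, 612), (828, 828),
#            (1044, 1044), (1260, 1260)]
# -----------------------------------------------------------------------------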
self.additional_blocks.append( + nn.Sequential( + nn.Conv2d(input_channels[idx], 256, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d( + 256, + input_channels[idx + 1], + kernel_size=3, + padding=1, + stride=self.strides[2], + ), + nn.ReLU(inplace=True), + ) + ) idx += 1 - self.additional_blocks.append(nn.Sequential( - nn.Conv2d(input_channels[idx], 256, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(256, input_channels[idx+1], kernel_size=3, padding=1, stride=self.strides[3]), - nn.ReLU(inplace=True), - )) + self.additional_blocks.append( + nn.Sequential( + nn.Conv2d(input_channels[idx], 256, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d( + 256, + input_channels[idx + 1], + kernel_size=3, + padding=1, + stride=self.strides[3], + ), + nn.ReLU(inplace=True), + ) + ) idx += 1 # conv9_1, conv9_2 - self.additional_blocks.append(nn.Sequential( - nn.Conv2d(input_channels[idx], 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, input_channels[idx+1], kernel_size=3, padding=1, stride=self.strides[4]), - nn.ReLU(inplace=True), - )) + self.additional_blocks.append( + nn.Sequential( + nn.Conv2d(input_channels[idx], 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d( + 128, + input_channels[idx + 1], + kernel_size=3, + padding=1, + stride=self.strides[4], + ), + nn.ReLU(inplace=True), + ) + ) idx += 1 # conv10_1, conv10_2 - self.additional_blocks.append(nn.Sequential( - nn.Conv2d(input_channels[idx], 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, input_channels[idx+1], kernel_size=3,stride=self.strides[5]), - nn.ReLU(inplace=True), - )) + self.additional_blocks.append( + nn.Sequential( + nn.Conv2d(input_channels[idx], 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d( + 128, input_channels[idx + 1], kernel_size=3, stride=self.strides[5] + ), + nn.ReLU(inplace=True), + ) + ) idx += 1 - - # conv11_1, conv11_2 - self.additional_blocks.append(nn.Sequential( - nn.Conv2d(input_channels[idx], 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, input_channels[idx+1], kernel_size=3), - nn.ReLU(inplace=True), - )) + self.additional_blocks.append( + nn.Sequential( + nn.Conv2d(input_channels[idx], 128, kernel_size=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, input_channels[idx + 1], kernel_size=3), + nn.ReLU(inplace=True), + ) + ) self.additional_blocks = nn.ModuleList(self.additional_blocks) def _init_weights(self): - layers = [ - *self.additional_blocks, - *self.loc, *self.conf] + layers = [*self.additional_blocks, *self.loc, *self.conf] for layer in layers: for param in layer.parameters(): - if param.dim() > 1: nn.init.xavier_uniform_(param) + if param.dim() > 1: + nn.init.xavier_uniform_(param) # Shape the classifier to the view of bboxes - def bbox_view(self, src, loc, conf,extract_shapes=False): + def bbox_view(self, src, loc, conf, extract_shapes=False): ret = [] features_shapes = [] for s, l, c in zip(src, loc, conf): - ret.append((l(s).view(s.size(0), 4, -1), c(s).view(s.size(0), self.label_num, -1))) - # extract shapes for prior box initliziation + ret.append( + (l(s).view(s.size(0), 4, -1), + c(s).view(s.size(0), self.label_num, -1)) + ) + # extract shapes for prior box initliziation if extract_shapes: - ls=l(s) - features_shapes.append([ls.shape[2],ls.shape[3]]) + ls = l(s) + features_shapes.append([ls.shape[2], ls.shape[3]]) locs, confs = list(zip(*ret)) - locs, confs = torch.cat(locs, 2).contiguous(), torch.cat(confs, 2).contiguous() - return locs, confs,features_shapes + locs, confs = torch.cat( + locs, 2).contiguous(), 
torch.cat( + confs, 2).contiguous() + return locs, confs, features_shapes def forward(self, data): layers = self.model(data) # last result from network goes into additional blocks x = layers[-1] - + additional_results = [] for i, l in enumerate(self.additional_blocks): - + x = l(x) additional_results.append(x) src = [*layers, *additional_results] - # Feature maps sizes depend on the image size. For 300x300 with strides=[1,1,2,2,2,1] it is 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4 - locs, confs,features_shapes = self.bbox_view(src, self.loc, self.conf,extract_shapes=self.extract_shapes) + # Feature maps sizes depend on the image size. For 300x300 with + # strides=[1,1,2,2,2,1] it is 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, + # 1x1x4 + locs, confs, features_shapes = self.bbox_view( + src, self.loc, self.conf, extract_shapes=self.extract_shapes + ) if self.extract_shapes: - return locs, confs,features_shapes - else: - # For SSD 300 with strides=[1,1,2,2,2,1] , shall return nbatch x 8732 x {nlabels, nlocs} results - results=self.encoder.decode_batch(locs, confs, 0.50, 200) #[0] - return results #locs, confs,features_shapes + return locs, confs, features_shapes + else: + # For SSD 300 with strides=[1,1,2,2,2,1] , shall return nbatch x + # 8732 x {nlabels, nlocs} results + results = self.encoder.decode_batch(locs, confs, 0.50, 200) # [0] + return results # locs, confs,features_shapes diff --git a/vision/classification_and_detection/python/models/utils.py b/vision/classification_and_detection/python/models/utils.py index 332d19c17..232838ef0 100644 --- a/vision/classification_and_detection/python/models/utils.py +++ b/vision/classification_and_detection/python/models/utils.py @@ -15,7 +15,7 @@ def __init__(self, out): self.register_buffer("scale", torch.ones(out)) self.register_buffer("bias", torch.zeros(out)) - #@torch.jit.script_method + # @torch.jit.script_method def forward(self, x): scale = self.scale.view(1, -1, 1, 1) bias = self.bias.view(1, -1, 1, 1) @@ -31,7 +31,7 @@ def __init__(self, out): super(BiasAdd, self).__init__() self.register_buffer("bias", torch.zeros(out)) - #@torch.jit.script_method + # @torch.jit.script_method def forward(self, x): bias = self.bias.view(1, -1, 1, 1) return x + bias @@ -52,14 +52,15 @@ def _compute_padding(self, input, dim): effective_filter_size = (filter_size - 1) * self.dilation[dim] + 1 out_size = (input_size + self.stride[dim] - 1) // self.stride[dim] total_padding = max( - 0, (out_size - 1) * self.stride[dim] + effective_filter_size - input_size + 0, (out_size - 1) * self.stride[dim] + + effective_filter_size - input_size ) additional_padding = int(total_padding % 2 != 0) return additional_padding, total_padding def forward(self, input): - #import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() if self.padding == "valid": return F.conv2d( input, @@ -151,8 +152,8 @@ def decode_boxes(rel_codes, boxes, weights): # type: (torch.Tensor, torch.Tensor, torch.Tensor) -> torch.Tensor # perform some unpacking to make it JIT-fusion friendly - - #rel_codes=rel_codes[0][None] + + # rel_codes=rel_codes[0][None] wx = weights[1] wy = weights[0] ww = weights[3] @@ -163,10 +164,10 @@ def decode_boxes(rel_codes, boxes, weights): boxes_x2 = boxes[:, 3].unsqueeze(1).unsqueeze(0) boxes_y2 = boxes[:, 2].unsqueeze(1).unsqueeze(0) - dx = rel_codes[:,:, 1].unsqueeze(2) - dy = rel_codes[:,:, 0].unsqueeze(2) - dw = rel_codes[:,:, 3].unsqueeze(2) - dh = rel_codes[:,:, 2].unsqueeze(2) + dx = rel_codes[:, :, 1].unsqueeze(2) + dy = rel_codes[:, :, 0].unsqueeze(2) + dw = 
rel_codes[:, :, 3].unsqueeze(2) + dh = rel_codes[:, :, 2].unsqueeze(2) # implementation starts here widths = boxes_x2 - boxes_x1 @@ -180,7 +181,7 @@ def decode_boxes(rel_codes, boxes, weights): dh = dh / wh pred_ctr_x = dx * widths + ctr_x - #import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() pred_ctr_y = dy * heights + ctr_y pred_w = torch.exp(dw) * widths pred_h = torch.exp(dh) * heights @@ -194,5 +195,5 @@ def decode_boxes(rel_codes, boxes, weights): ], dim=2, ) - #import pdb; pdb.set_trace() + # import pdb; pdb.set_trace() return pred_boxes diff --git a/vision/classification_and_detection/python/ncnn_models/__init__.py b/vision/classification_and_detection/python/ncnn_models/__init__.py index 505969635..99211f4bf 100644 --- a/vision/classification_and_detection/python/ncnn_models/__init__.py +++ b/vision/classification_and_detection/python/ncnn_models/__init__.py @@ -1 +1 @@ -from .resnet50 import Resnet50 \ No newline at end of file +from .resnet50 import Resnet50 diff --git a/vision/classification_and_detection/python/ncnn_models/resnet50.py b/vision/classification_and_detection/python/ncnn_models/resnet50.py index a62e4a8ab..d92a07474 100644 --- a/vision/classification_and_detection/python/ncnn_models/resnet50.py +++ b/vision/classification_and_detection/python/ncnn_models/resnet50.py @@ -3,7 +3,9 @@ class Resnet50: - def __init__(self, model_param, model_bin, target_size=224, num_threads=1, use_gpu=False): + def __init__( + self, model_param, model_bin, target_size=224, num_threads=1, use_gpu=False + ): self.target_size = target_size self.num_threads = num_threads @@ -14,11 +16,11 @@ def __init__(self, model_param, model_bin, target_size=224, num_threads=1, use_g def __del__(self): self.net = None - + @property def input_name(self): return "in0" - + @property def output_name(self): return "out0" diff --git a/vision/classification_and_detection/python/openimages.py b/vision/classification_and_detection/python/openimages.py index 449f40824..e593f59f4 100644 --- a/vision/classification_and_detection/python/openimages.py +++ b/vision/classification_and_detection/python/openimages.py @@ -14,8 +14,21 @@ class OpenImages(dataset.Dataset): - def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, - image_format="NHWC", pre_process=None, count=None, cache_dir=None, preprocessed_dir=None, use_label_map=False, threads=os.cpu_count()): + def __init__( + self, + data_path, + image_list, + name, + use_cache=0, + image_size=None, + image_format="NHWC", + pre_process=None, + count=None, + cache_dir=None, + preprocessed_dir=None, + use_label_map=False, + threads=os.cpu_count(), + ): super().__init__() self.image_size = image_size self.image_list = [] @@ -26,7 +39,7 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, self.use_cache = use_cache self.data_path = data_path self.pre_process = pre_process - self.use_label_map=use_label_map + self.use_label_map = use_label_map if not cache_dir: cache_dir = os.getcwd() @@ -34,16 +47,19 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, if preprocessed_dir: self.cache_dir = preprocessed_dir else: - self.cache_dir = os.path.join(cache_dir, "preprocessed", name, image_format) + self.cache_dir = os.path.join( + cache_dir, "preprocessed", name, image_format + ) else: self.cache_dir = cache_dir # input images are in HWC self.need_transpose = True if image_format == "NCHW" else False - not_found = 0 + not_found = 0 empty_80catageories = 0 if image_list is None: # by default 
look for val_map.txt - image_list = os.path.join(data_path, "annotations/openimages-mlperf.json") + image_list = os.path.join( + data_path, "annotations/openimages-mlperf.json") self.annotation_file = image_list if self.use_label_map: # for pytorch @@ -59,28 +75,37 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, with open(image_list, "r") as f: openimages = json.load(f) for i in openimages["images"]: - images[i["id"]] = {"file_name": i["file_name"], - "height": i["height"], - "width": i["width"], - "bbox": [], - "category": []} + images[i["id"]] = { + "file_name": i["file_name"], + "height": i["height"], + "width": i["width"], + "bbox": [], + "category": [], + } for a in openimages["annotations"]: i = images.get(a["image_id"]) if i is None: continue - catagory_ids = label_map[a.get("category_id")] if self.use_label_map else a.get("category_id") + catagory_ids = ( + label_map[a.get("category_id")] + if self.use_label_map + else a.get("category_id") + ) i["category"].append(catagory_ids) i["bbox"].append(a.get("bbox")) for image_id, img in images.items(): image_name = img["file_name"] - if len(img["category"])==0 and self.use_label_map: - #if an image doesn't have any of the 81 categories in it - empty_80catageories += 1 #should be 48 images - thus the validation sert has 4952 images - continue + if len(img["category"]) == 0 and self.use_label_map: + # if an image doesn't have any of the 81 categories in it + empty_80catageories += ( + 1 # should be 48 images - thus the validation sert has 4952 images + ) + continue if not self.pre_process: - if not os.path.exists(os.path.join(data_path, image_name) + ".npy"): + if not os.path.exists(os.path.join( + data_path, image_name) + ".npy"): # if the image does not exists ignore it not_found += 1 continue @@ -90,12 +115,19 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, # if the image does not exists ignore it not_found += 1 continue - os.makedirs(os.path.dirname(os.path.join(self.cache_dir, image_name)), exist_ok=True) + os.makedirs( + os.path.dirname(os.path.join(self.cache_dir, image_name)), + exist_ok=True, + ) dst = os.path.join(self.cache_dir, image_name) if not os.path.exists(dst + ".npy"): # cache a preprocessed version of the image img_org = cv2.imread(src) - processed = self.pre_process(img_org, need_transpose=self.need_transpose, dims=self.image_size) + processed = self.pre_process( + img_org, + need_transpose=self.need_transpose, + dims=self.image_size, + ) np.save(dst, processed) self.image_ids.append(image_id) @@ -114,10 +146,16 @@ def __init__(self, data_path, image_list, name, use_cache=0, image_size=None, if not_found > 0: log.info("reduced image list, %d images not found", not_found) if empty_80catageories > 0: - log.info("reduced image list, %d images without any of the 80 categories", empty_80catageories) + log.info( + "reduced image list, %d images without any of the 80 categories", + empty_80catageories, + ) - log.info("loaded {} images, cache={}, already_preprocessed={}, took={:.1f}sec".format( - len(self.image_list), use_cache, pre_process is None, time_taken)) + log.info( + "loaded {} images, cache={}, already_preprocessed={}, took={:.1f}sec".format( + len(self.image_list), use_cache, pre_process is None, time_taken + ) + ) self.label_list = np.array(self.label_list, dtype=list) @@ -137,6 +175,7 @@ class PostProcessOpenImages: Post processing for open images dataset. Annotations should be exported into coco format. 
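# --- Editor's illustrative sketch (not part of this patch) -------------------
# Each row the post-processors in this file append follows the MLPerf
# detection layout [image_idx, ymin, xmin, ymax, xmax, score, class], with the
# box normalised to [0, 1].  Turning such a row into a COCO-style result (as
# finalize() and the accuracy tools later in this patch do) looks roughly like
# this; the numbers are made up for illustration.
row = [7.0, 0.10, 0.20, 0.60, 0.80, 0.93, 44.0]
height, width = 480, 640                        # taken from the dataset entry
ymin, xmin = row[1] * height, row[2] * width
ymax, xmax = row[3] * height, row[4] * width
coco_detection = {
    "image_id": int(row[0]),                    # later remapped to the COCO id
    "category_id": int(row[6]),
    "bbox": [xmin, ymin, xmax - xmin, ymax - ymin],   # x, y, w, h
    "score": row[5],
}
# -----------------------------------------------------------------------------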
""" + def __init__(self): self.results = [] self.good = 0 @@ -147,14 +186,22 @@ def __init__(self): def add_results(self, results): self.results.extend(results) - def __call__(self, results, ids, expected=None, result_dict=None, ): + def __call__( + self, + results, + ids, + expected=None, + result_dict=None, + ): # results come as: - # tensorflow, ssd-mobilenet: num_detections,detection_boxes,detection_scores,detection_classes + # tensorflow, ssd-mobilenet: + # num_detections,detection_boxes,detection_scores,detection_classes processed_results = [] # batch size bs = len(results[0]) for idx in range(0, bs): - # keep the content_id from loadgen to handle content_id's without results + # keep the content_id from loadgen to handle content_id's without + # results self.content_ids.append(ids[idx]) processed_results.append([]) detection_num = int(results[0][idx]) @@ -166,10 +213,17 @@ def __call__(self, results, ids, expected=None, result_dict=None, ): if detection_class in expected_classes: self.good += 1 box = detection_boxes[detection] - processed_results[idx].append([float(ids[idx]), - box[0], box[1], box[2], box[3], - results[2][idx][detection], - float(detection_class)]) + processed_results[idx].append( + [ + float(ids[idx]), + box[0], + box[1], + box[2], + box[3], + results[2][idx][detection], + float(detection_class), + ] + ) self.total += 1 return processed_results @@ -189,7 +243,7 @@ def finalize(self, result_dict, ds=None, output_dir=None): annotations = json.load(fin) for cnt, cat in enumerate(annotations["categories"]): label_map[cat["id"]] = cnt + 1 - inv_map = {v:k for k,v in label_map.items()} + inv_map = {v: k for k, v in label_map.items()} detections = [] image_indices = [] @@ -200,8 +254,13 @@ def finalize(self, result_dict, ds=None, output_dir=None): # this is the index of the coco image image_idx = int(detection[0]) if image_idx != self.content_ids[batch]: - # working with the coco index/id is error prone - extra check to make sure it is consistent - log.error("image_idx missmatch, lg={} / result={}".format(image_idx, self.content_ids[batch])) + # working with the coco index/id is error prone - extra + # check to make sure it is consistent + log.error( + "image_idx missmatch, lg={} / result={}".format( + image_idx, self.content_ids[batch] + ) + ) # map the index to the coco image id detection[0] = ds.image_ids[image_idx] height, width = ds.image_sizes[image_idx] @@ -219,16 +278,19 @@ def finalize(self, result_dict, ds=None, output_dir=None): cat_id = inv_map.get(int(detection[6]), -1) if cat_id == -1: # FIXME: - log.info("finalize can't map category {}".format(int(detection[6]))) - detection[6] = cat_id + log.info( + "finalize can't map category {}".format( + int(detection[6])) + ) + detection[6] = cat_id detections.append(np.array(detection)) # map indices to coco image id's - image_ids = [ds.image_ids[i] for i in image_indices] + image_ids = [ds.image_ids[i] for i in image_indices] self.results = [] cocoGt = pycoco.COCO(ds.annotation_file) cocoDt = cocoGt.loadRes(np.array(detections)) - cocoEval = COCOeval(cocoGt, cocoDt, iouType='bbox') + cocoEval = COCOeval(cocoGt, cocoDt, iouType="bbox") cocoEval.params.imgIds = image_ids cocoEval.evaluate() cocoEval.accumulate() @@ -240,7 +302,9 @@ class PostProcessOpenImagesRetinanet(PostProcessOpenImages): """ Post processing required by retinanet / pytorch & onnx """ - def __init__(self, use_inv_map, score_threshold, height, width, dict_format=True): + + def __init__(self, use_inv_map, score_threshold, + height, width, 
dict_format=True): """ Args: height (int): Height of the input image @@ -259,9 +323,9 @@ def __call__(self, results, ids, expected=None, result_dict=None): if self.dict_format: # If the output of the model is in dictionary format. This happens # for the model retinanet-pytorch - bboxes_ = [e['boxes'].cpu() for e in results] - labels_ = [e['labels'].cpu() for e in results] - scores_ = [e['scores'].cpu() for e in results] + bboxes_ = [e["boxes"].cpu() for e in results] + labels_ = [e["labels"].cpu() for e in results] + scores_ = [e["scores"].cpu() for e in results] results = [bboxes_, labels_, scores_] else: bboxes_ = [results[0]] @@ -288,8 +352,8 @@ def __call__(self, results, ids, expected=None, result_dict=None): self.good += 1 box = detection_boxes[detection] # box comes from model as: xmin, ymin, xmax, ymax - # box comes with dimentions in the range of [0, height] - # and [0, width] respectively. It is necesary to scale + # box comes with dimentions in the range of [0, height] + # and [0, width] respectively. It is necesary to scale # them in the range [0, 1] processed_results[idx].append( [ diff --git a/vision/classification_and_detection/python/pycoco.py b/vision/classification_and_detection/python/pycoco.py index 931863569..f9d5f2e87 100644 --- a/vision/classification_and_detection/python/pycoco.py +++ b/vision/classification_and_detection/python/pycoco.py @@ -1,5 +1,5 @@ -__author__ = 'tylin' -__version__ = '2.0' +__author__ = "tylin" +__version__ = "2.0" # Interface for accessing the Microsoft COCO dataset. # Microsoft COCO is a large image dataset designed for object detection, @@ -46,9 +46,10 @@ import json import time -#import matplotlib.pyplot as plt -#from matplotlib.collections import PatchCollection -#from matplotlib.patches import Polygon + +# import matplotlib.pyplot as plt +# from matplotlib.collections import PatchCollection +# from matplotlib.patches import Polygon import numpy as np import copy import itertools @@ -56,6 +57,7 @@ import os from collections import defaultdict import sys + PYTHON_VERSION = sys.version_info[0] if PYTHON_VERSION == 2: from urllib import urlretrieve @@ -64,7 +66,7 @@ def _isArrayLike(obj): - return hasattr(obj, '__iter__') and hasattr(obj, '__len__') + return hasattr(obj, "__iter__") and hasattr(obj, "__len__") class COCO: @@ -76,40 +78,42 @@ def __init__(self, annotation_file=None): :return: """ # load dataset - self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict() + self.dataset, self.anns, self.cats, self.imgs = dict(), dict(), dict(), dict() self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list) - if not annotation_file == None: - print('loading annotations into memory...') + if not annotation_file is None: + print("loading annotations into memory...") tic = time.time() - dataset = json.load(open(annotation_file, 'r')) - assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) - print('Done (t={:0.2f}s)'.format(time.time()- tic)) + dataset = json.load(open(annotation_file, "r")) + assert ( + isinstance(dataset, dict) + ), "annotation file format {} not supported".format(type(dataset)) + print("Done (t={:0.2f}s)".format(time.time() - tic)) self.dataset = dataset self.createIndex() def createIndex(self): # create index - print('creating index...') + print("creating index...") anns, cats, imgs = {}, {}, {} - imgToAnns,catToImgs = defaultdict(list),defaultdict(list) - if 'annotations' in self.dataset: - for ann in self.dataset['annotations']: - 
imgToAnns[ann['image_id']].append(ann) - anns[ann['id']] = ann + imgToAnns, catToImgs = defaultdict(list), defaultdict(list) + if "annotations" in self.dataset: + for ann in self.dataset["annotations"]: + imgToAnns[ann["image_id"]].append(ann) + anns[ann["id"]] = ann - if 'images' in self.dataset: - for img in self.dataset['images']: - imgs[img['id']] = img + if "images" in self.dataset: + for img in self.dataset["images"]: + imgs[img["id"]] = img - if 'categories' in self.dataset: - for cat in self.dataset['categories']: - cats[cat['id']] = cat + if "categories" in self.dataset: + for cat in self.dataset["categories"]: + cats[cat["id"]] = cat - if 'annotations' in self.dataset and 'categories' in self.dataset: - for ann in self.dataset['annotations']: - catToImgs[ann['category_id']].append(ann['image_id']) + if "annotations" in self.dataset and "categories" in self.dataset: + for ann in self.dataset["annotations"]: + catToImgs[ann["category_id"]].append(ann["image_id"]) - print('index created!') + print("index created!") # create class members self.anns = anns @@ -123,8 +127,8 @@ def info(self): Print information about the annotation file. :return: """ - for key, value in self.dataset['info'].items(): - print('{}: {}'.format(key, value)) + for key, value in self.dataset["info"].items(): + print("{}: {}".format(key, value)) def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): """ @@ -139,19 +143,33 @@ def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): catIds = catIds if _isArrayLike(catIds) else [catIds] if len(imgIds) == len(catIds) == len(areaRng) == 0: - anns = self.dataset['annotations'] + anns = self.dataset["annotations"] else: if not len(imgIds) == 0: - lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] + lists = [ + self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns + ] anns = list(itertools.chain.from_iterable(lists)) else: - anns = self.dataset['annotations'] - anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] - anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] - if not iscrowd == None: - ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] + anns = self.dataset["annotations"] + anns = ( + anns + if len(catIds) == 0 + else [ann for ann in anns if ann["category_id"] in catIds] + ) + anns = ( + anns + if len(areaRng) == 0 + else [ + ann + for ann in anns + if ann["area"] > areaRng[0] and ann["area"] < areaRng[1] + ] + ) + if not iscrowd is None: + ids = [ann["id"] for ann in anns if ann["iscrowd"] == iscrowd] else: - ids = [ann['id'] for ann in anns] + ids = [ann["id"] for ann in anns] return ids def getCatIds(self, catNms=[], supNms=[], catIds=[]): @@ -167,22 +185,34 @@ def getCatIds(self, catNms=[], supNms=[], catIds=[]): catIds = catIds if _isArrayLike(catIds) else [catIds] if len(catNms) == len(supNms) == len(catIds) == 0: - cats = self.dataset['categories'] + cats = self.dataset["categories"] else: - cats = self.dataset['categories'] - cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] - cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] - cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] - ids = [cat['id'] for cat in cats] + cats = self.dataset["categories"] + cats = ( + cats + if len(catNms) == 0 + else [cat for cat in cats if cat["name"] in catNms] + ) + cats = ( + cats + if 
len(supNms) == 0 + else [cat for cat in cats if cat["supercategory"] in supNms] + ) + cats = ( + cats + if len(catIds) == 0 + else [cat for cat in cats if cat["id"] in catIds] + ) + ids = [cat["id"] for cat in cats] return ids def getImgIds(self, imgIds=[], catIds=[]): - ''' + """ Get img ids that satisfy given filter conditions. :param imgIds (int array) : get imgs for given ids :param catIds (int array) : get imgs with all given cats :return: ids (int array) : integer array of img ids - ''' + """ imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] catIds = catIds if _isArrayLike(catIds) else [catIds] @@ -205,7 +235,7 @@ def loadAnns(self, ids=[]): """ if _isArrayLike(ids): return [self.anns[id] for id in ids] - elif type(ids) == int: + elif isinstance(ids, int): return [self.anns[ids]] def loadCats(self, ids=[]): @@ -216,7 +246,7 @@ def loadCats(self, ids=[]): """ if _isArrayLike(ids): return [self.cats[id] for id in ids] - elif type(ids) == int: + elif isinstance(ids, int): return [self.cats[ids]] def loadImgs(self, ids=[]): @@ -227,7 +257,7 @@ def loadImgs(self, ids=[]): """ if _isArrayLike(ids): return [self.imgs[id] for id in ids] - elif type(ids) == int: + elif isinstance(ids, int): return [self.imgs[ids]] def showAnns(self, anns): @@ -238,61 +268,88 @@ def showAnns(self, anns): """ if len(anns) == 0: return 0 - if 'segmentation' in anns[0] or 'keypoints' in anns[0]: - datasetType = 'instances' - elif 'caption' in anns[0]: - datasetType = 'captions' + if "segmentation" in anns[0] or "keypoints" in anns[0]: + datasetType = "instances" + elif "caption" in anns[0]: + datasetType = "captions" else: - raise Exception('datasetType not supported') - if datasetType == 'instances': + raise Exception("datasetType not supported") + if datasetType == "instances": ax = plt.gca() ax.set_autoscale_on(False) polygons = [] color = [] for ann in anns: - c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] - if 'segmentation' in ann: - if type(ann['segmentation']) == list: + c = (np.random.random((1, 3)) * 0.6 + 0.4).tolist()[0] + if "segmentation" in ann: + if isinstance(ann["segmentation"], list): # polygon - for seg in ann['segmentation']: - poly = np.array(seg).reshape((int(len(seg)/2), 2)) + for seg in ann["segmentation"]: + poly = np.array(seg).reshape( + (int(len(seg) / 2), 2)) polygons.append(Polygon(poly)) color.append(c) else: # mask - t = self.imgs[ann['image_id']] - if type(ann['segmentation']['counts']) == list: - rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width']) + t = self.imgs[ann["image_id"]] + if isinstance(ann["segmentation"]["counts"], list): + rle = maskUtils.frPyObjects( + [ann["segmentation"]], t["height"], t["width"] + ) else: - rle = [ann['segmentation']] + rle = [ann["segmentation"]] m = maskUtils.decode(rle) - img = np.ones( (m.shape[0], m.shape[1], 3) ) - if ann['iscrowd'] == 1: - color_mask = np.array([2.0,166.0,101.0])/255 - if ann['iscrowd'] == 0: + img = np.ones((m.shape[0], m.shape[1], 3)) + if ann["iscrowd"] == 1: + color_mask = np.array([2.0, 166.0, 101.0]) / 255 + if ann["iscrowd"] == 0: color_mask = np.random.random((1, 3)).tolist()[0] for i in range(3): - img[:,:,i] = color_mask[i] - ax.imshow(np.dstack( (img, m*0.5) )) - if 'keypoints' in ann and type(ann['keypoints']) == list: + img[:, :, i] = color_mask[i] + ax.imshow(np.dstack((img, m * 0.5))) + if "keypoints" in ann and isinstance(ann["keypoints"], list): # turn skeleton into zero-based index - sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 - kp = 
np.array(ann['keypoints']) + sks = np.array( + self.loadCats( + ann["category_id"])[0]["skeleton"]) - 1 + kp = np.array(ann["keypoints"]) x = kp[0::3] y = kp[1::3] v = kp[2::3] for sk in sks: - if np.all(v[sk]>0): - plt.plot(x[sk],y[sk], linewidth=3, color=c) - plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2) - plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2) - p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) + if np.all(v[sk] > 0): + plt.plot(x[sk], y[sk], linewidth=3, color=c) + plt.plot( + x[v > 0], + y[v > 0], + "o", + markersize=8, + markerfacecolor=c, + markeredgecolor="k", + markeredgewidth=2, + ) + plt.plot( + x[v > 1], + y[v > 1], + "o", + markersize=8, + markerfacecolor=c, + markeredgecolor=c, + markeredgewidth=2, + ) + p = PatchCollection( + polygons, + facecolor=color, + linewidths=0, + alpha=0.4) ax.add_collection(p) - p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2) + p = PatchCollection( + polygons, facecolor="none", edgecolors=color, linewidths=2 + ) ax.add_collection(p) - elif datasetType == 'captions': + elif datasetType == "captions": for ann in anns: - print(ann['caption']) + print(ann["caption"]) def loadRes(self, resFile): """ @@ -301,69 +358,78 @@ def loadRes(self, resFile): :return: res (obj) : result api object """ res = COCO() - res.dataset['images'] = [img for img in self.dataset['images']] + res.dataset["images"] = [img for img in self.dataset["images"]] - print('Loading and preparing results...') + print("Loading and preparing results...") tic = time.time() - if type(resFile) == str: #or type(resFile) == unicode: + if isinstance(resFile, str): # or type(resFile) == unicode: anns = json.load(open(resFile)) - elif type(resFile) == np.ndarray: + elif isinstance(resFile, np.ndarray): anns = self.loadNumpyAnnotations(resFile) else: anns = resFile - assert type(anns) == list, 'results in not an array of objects' - annsImgIds = [ann['image_id'] for ann in anns] - assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ - 'Results do not correspond to current coco set' - if 'caption' in anns[0]: - imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) - res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] + assert isinstance(anns, list), "results in not an array of objects" + annsImgIds = [ann["image_id"] for ann in anns] + assert set(annsImgIds) == ( + set(annsImgIds) & set(self.getImgIds()) + ), "Results do not correspond to current coco set" + if "caption" in anns[0]: + imgIds = set([img["id"] for img in res.dataset["images"]]) & set( + [ann["image_id"] for ann in anns] + ) + res.dataset["images"] = [ + img for img in res.dataset["images"] if img["id"] in imgIds + ] for id, ann in enumerate(anns): - ann['id'] = id+1 - elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + ann["id"] = id + 1 + elif "bbox" in anns[0] and not anns[0]["bbox"] == []: + res.dataset["categories"] = copy.deepcopy( + self.dataset["categories"]) for id, ann in enumerate(anns): - bb = ann['bbox'] - x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] - if not 'segmentation' in ann: - ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] - ann['area'] = bb[2]*bb[3] - ann['id'] = id+1 - ann['iscrowd'] = 0 - elif 'segmentation' in anns[0]: - res.dataset['categories'] = 
copy.deepcopy(self.dataset['categories']) + bb = ann["bbox"] + x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]] + if not "segmentation" in ann: + ann["segmentation"] = [[x1, y1, x1, y2, x2, y2, x2, y1]] + ann["area"] = bb[2] * bb[3] + ann["id"] = id + 1 + ann["iscrowd"] = 0 + elif "segmentation" in anns[0]: + res.dataset["categories"] = copy.deepcopy( + self.dataset["categories"]) for id, ann in enumerate(anns): - # now only support compressed RLE format as segmentation results - ann['area'] = maskUtils.area(ann['segmentation']) - if not 'bbox' in ann: - ann['bbox'] = maskUtils.toBbox(ann['segmentation']) - ann['id'] = id+1 - ann['iscrowd'] = 0 - elif 'keypoints' in anns[0]: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + # now only support compressed RLE format as segmentation + # results + ann["area"] = maskUtils.area(ann["segmentation"]) + if not "bbox" in ann: + ann["bbox"] = maskUtils.toBbox(ann["segmentation"]) + ann["id"] = id + 1 + ann["iscrowd"] = 0 + elif "keypoints" in anns[0]: + res.dataset["categories"] = copy.deepcopy( + self.dataset["categories"]) for id, ann in enumerate(anns): - s = ann['keypoints'] + s = ann["keypoints"] x = s[0::3] y = s[1::3] - x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y) - ann['area'] = (x1-x0)*(y1-y0) - ann['id'] = id + 1 - ann['bbox'] = [x0,y0,x1-x0,y1-y0] - print('DONE (t={:0.2f}s)'.format(time.time()- tic)) + x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y) + ann["area"] = (x1 - x0) * (y1 - y0) + ann["id"] = id + 1 + ann["bbox"] = [x0, y0, x1 - x0, y1 - y0] + print("DONE (t={:0.2f}s)".format(time.time() - tic)) - res.dataset['annotations'] = anns + res.dataset["annotations"] = anns res.createIndex() return res - def download(self, tarDir = None, imgIds = [] ): - ''' + def download(self, tarDir=None, imgIds=[]): + """ Download COCO images from mscoco.org server. 
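# --- Editor's illustrative sketch (not part of this patch) -------------------
# loadRes() above feeds the standard pycocotools evaluation loop; the accuracy
# tools later in this patch drive it exactly in this order.  The file names
# below are placeholders.
from pycocotools.cocoeval import COCOeval

cocoGt = COCO("annotations/instances_val2017.json")   # this vendored class
cocoDt = cocoGt.loadRes("coco-results.json")          # str, list or ndarray
cocoEval = COCOeval(cocoGt, cocoDt, iouType="bbox")
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()      # cocoEval.stats[0] is mAP@[0.5:0.95]
# -----------------------------------------------------------------------------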
:param tarDir (str): COCO results directory name imgIds (list): images to be downloaded :return: - ''' + """ if tarDir is None: - print('Please specify target directory') + print("Please specify target directory") return -1 if len(imgIds) == 0: imgs = self.imgs.values() @@ -374,10 +440,13 @@ def download(self, tarDir = None, imgIds = [] ): os.makedirs(tarDir) for i, img in enumerate(imgs): tic = time.time() - fname = os.path.join(tarDir, img['file_name']) + fname = os.path.join(tarDir, img["file_name"]) if not os.path.exists(fname): - urlretrieve(img['coco_url'], fname) - print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)) + urlretrieve(img["coco_url"], fname) + print( + "downloaded {}/{} images (t={:0.1f}s)".format(i, + N, time.time() - tic) + ) def loadNumpyAnnotations(self, data): """ @@ -385,21 +454,23 @@ def loadNumpyAnnotations(self, data): :param data (numpy.ndarray) :return: annotations (python nested list) """ - print('Converting ndarray to lists...') - assert(type(data) == np.ndarray) + print("Converting ndarray to lists...") + assert isinstance(data, np.ndarray) print(data.shape) - assert(data.shape[1] == 7) + assert data.shape[1] == 7 N = data.shape[0] ann = [] for i in range(N): if i % 1000000 == 0: - print('{}/{}'.format(i,N)) - ann += [{ - 'image_id' : int(data[i, 0]), - 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ], - 'score' : data[i, 5], - 'category_id': int(data[i, 6]), - }] + print("{}/{}".format(i, N)) + ann += [ + { + "image_id": int(data[i, 0]), + "bbox": [data[i, 1], data[i, 2], data[i, 3], data[i, 4]], + "score": data[i, 5], + "category_id": int(data[i, 6]), + } + ] return ann def annToRLE(self, ann): @@ -407,20 +478,20 @@ def annToRLE(self, ann): Convert annotation which can be polygons, uncompressed RLE to RLE. 
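# --- Editor's illustrative sketch (not part of this patch) -------------------
# What annToRLE()/annToMask() below reduce to for the common polygon case,
# using the same pycocotools helpers this module relies on.  The toy polygon
# and image size are made up.
from pycocotools import mask as maskUtils

h, w = 4, 6                               # image height and width
polygon = [[1, 1, 4, 1, 4, 3, 1, 3]]      # one part: x0, y0, x1, y1, ...
rles = maskUtils.frPyObjects(polygon, h, w)
rle = maskUtils.merge(rles)               # one RLE covering all parts
binary_mask = maskUtils.decode(rle)       # (h, w) uint8 array of 0/1
# -----------------------------------------------------------------------------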
:return: binary mask (numpy 2D array) """ - t = self.imgs[ann['image_id']] - h, w = t['height'], t['width'] - segm = ann['segmentation'] - if type(segm) == list: + t = self.imgs[ann["image_id"]] + h, w = t["height"], t["width"] + segm = ann["segmentation"] + if isinstance(segm, list): # polygon -- a single object might consist of multiple parts # we merge all parts into one mask rle code rles = maskUtils.frPyObjects(segm, h, w) rle = maskUtils.merge(rles) - elif type(segm['counts']) == list: + elif isinstance(segm["counts"], list): # uncompressed RLE rle = maskUtils.frPyObjects(segm, h, w) else: # rle - rle = ann['segmentation'] + rle = ann["segmentation"] return rle def annToMask(self, ann): diff --git a/vision/classification_and_detection/python/version.py b/vision/classification_and_detection/python/version.py index 1152dbb41..570348596 100644 --- a/vision/classification_and_detection/python/version.py +++ b/vision/classification_and_detection/python/version.py @@ -1,3 +1,2 @@ - -version = '0.1.0' -git_version = '05df3bae82ef9fc933277385eb778e3f22cd0c6a' +version = "0.1.0" +git_version = "05df3bae82ef9fc933277385eb778e3f22cd0c6a" diff --git a/vision/classification_and_detection/setup.py b/vision/classification_and_detection/setup.py index c1e2fbcf0..758d874fb 100644 --- a/vision/classification_and_detection/setup.py +++ b/vision/classification_and_detection/setup.py @@ -13,17 +13,20 @@ from setuptools import setup, find_packages, Command TOP_DIR = os.path.realpath(os.path.dirname(__file__)) -SRC_DIR = os.path.join(TOP_DIR, 'python') +SRC_DIR = os.path.join(TOP_DIR, "python") try: - git_version = subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=TOP_DIR).decode('ascii').strip() + git_version = ( + subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=TOP_DIR) + .decode("ascii") + .strip() + ) except (OSError, subprocess.CalledProcessError): git_version = None -with open(os.path.join(TOP_DIR, 'VERSION_NUMBER')) as version_file: - VersionInfo = namedtuple('VersionInfo', ['version', 'git_version'])( - version=version_file.read().strip(), - git_version=git_version +with open(os.path.join(TOP_DIR, "VERSION_NUMBER")) as version_file: + VersionInfo = namedtuple("VersionInfo", ["version", "git_version"])( + version=version_file.read().strip(), git_version=git_version ) @@ -37,49 +40,67 @@ def finalize_options(self): pass def run(self): - with open(os.path.join(SRC_DIR, 'version.py'), 'w') as f: - f.write(dedent(''' + with open(os.path.join(SRC_DIR, "version.py"), "w") as f: + f.write( + dedent( + """ version = '{version}' git_version = '{git_version}' - '''.format(**dict(VersionInfo._asdict())))) + """.format( + **dict(VersionInfo._asdict()) + ) + ) + ) class build_py(setuptools.command.build_py.build_py): def run(self): - self.run_command('create_version') + self.run_command("create_version") setuptools.command.build_py.build_py.run(self) class build(distutils.command.build.build): def run(self): - self.run_command('build_py') + self.run_command("build_py") class develop(setuptools.command.develop.develop): def run(self): - self.run_command('create_version') - self.run_command('build') + self.run_command("create_version") + self.run_command("build") setuptools.command.develop.develop.run(self) cmdclass = { - 'create_version': create_version, - 'build_py': build_py, - 'build': build, - 'develop': develop, + "create_version": create_version, + "build_py": build_py, + "build": build, + "develop": develop, } setup( name="mlperf-inference", version=VersionInfo.version, - 
description='mlperf inference benchmark', - setup_requires=['pytest-runner'], - tests_require=['graphviz', 'parameterized', 'pytest', 'pytest-cov', 'pyyaml'], + description="mlperf inference benchmark", + setup_requires=["pytest-runner"], + tests_require=[ + "graphviz", + "parameterized", + "pytest", + "pytest-cov", + "pyyaml"], cmdclass=cmdclass, packages=find_packages(), - author='guschmue@microsoft.com', - author_email='guschmue@microsoft.com', - url='https://github.com/mlperf/inference', - install_requires=['numpy>=1.14.1', 'onnx>=1.5', 'pybind11', 'Cython', - 'pycocotools', 'mlperf_loadgen', 'opencv-python-headless'] + author="guschmue@microsoft.com", + author_email="guschmue@microsoft.com", + url="https://github.com/mlperf/inference", + install_requires=[ + "numpy>=1.14.1", + "onnx>=1.5", + "pybind11", + "Cython", + "pycocotools", + "mlperf_loadgen", + "opencv-python-headless", + ], ) diff --git a/vision/classification_and_detection/tools/accuracy-coco.py b/vision/classification_and_detection/tools/accuracy-coco.py index 1e15999f1..95e04193b 100644 --- a/vision/classification_and_detection/tools/accuracy-coco.py +++ b/vision/classification_and_detection/tools/accuracy-coco.py @@ -19,15 +19,29 @@ # pylint: disable=missing-docstring + def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() - parser.add_argument("--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json") + parser.add_argument( + "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" + ) parser.add_argument("--coco-dir", required=True, help="coco directory") - parser.add_argument("--verbose", action="store_true", help="verbose messages") - parser.add_argument("--output-file", default="coco-results.json", help="path to output file") - parser.add_argument("--use-inv-map", action="store_true", help="use inverse label map") - parser.add_argument("--remove-48-empty-images", action="store_true", help="used in case you removed 48 empty images while preprocessing the dataset") + parser.add_argument( + "--verbose", + action="store_true", + help="verbose messages") + parser.add_argument( + "--output-file", default="coco-results.json", help="path to output file" + ) + parser.add_argument( + "--use-inv-map", action="store_true", help="use inverse label map" + ) + parser.add_argument( + "--remove-48-empty-images", + action="store_true", + help="used in case you removed 48 empty images while preprocessing the dataset", + ) args = parser.parse_args() return args @@ -35,10 +49,14 @@ def get_args(): def main(): args = get_args() - cocoGt = COCO(os.path.join(args.coco_dir, "annotations/instances_val2017.json")) + cocoGt = COCO( + os.path.join( + args.coco_dir, + "annotations/instances_val2017.json")) if args.use_inv_map: - inv_map = [0] + cocoGt.getCatIds() # First label in inv_map is not used + # First label in inv_map is not used + inv_map = [0] + cocoGt.getCatIds() with open(args.mlperf_accuracy_file, "r") as f: results = json.load(f) @@ -47,7 +65,7 @@ def main(): image_ids = set() seen = set() no_results = 0 - if args.remove_48_empty_images: + if args.remove_48_empty_images: im_ids = [] for i in cocoGt.getCatIds(): im_ids += cocoGt.catToImgs[i] @@ -57,7 +75,7 @@ def main(): image_map = cocoGt.dataset["images"] for j in results: - idx = j['qsl_idx'] + idx = j["qsl_idx"] # de-dupe in case loadgen sends the same image multiple times if idx in seen: continue @@ -66,12 +84,14 @@ def main(): # reconstruct from mlperf accuracy log # what is written by the benchmark is an 
array of float32's: # id, box[0], box[1], box[2], box[3], score, detection_class - # note that id is a index into instances_val2017.json, not the actual image_id - data = np.frombuffer(bytes.fromhex(j['data']), np.float32) + # note that id is a index into instances_val2017.json, not the actual + # image_id + data = np.frombuffer(bytes.fromhex(j["data"]), np.float32) if len(data) < 7: # handle images that had no results image = image_map[idx] - # by adding the id to image_ids we make pycoco aware of the no-result image + # by adding the id to image_ids we make pycoco aware of the + # no-result image image_ids.add(image["id"]) no_results += 1 if args.verbose: @@ -79,11 +99,15 @@ def main(): continue for i in range(0, len(data), 7): - image_idx, ymin, xmin, ymax, xmax, score, label = data[i:i + 7] + image_idx, ymin, xmin, ymax, xmax, score, label = data[i: i + 7] image = image_map[idx] image_idx = int(image_idx) if image_idx != idx: - print("ERROR: loadgen({}) and payload({}) disagree on image_idx".format(idx, image_idx)) + print( + "ERROR: loadgen({}) and payload({}) disagree on image_idx".format( + idx, image_idx + ) + ) image_id = image["id"] height, width = image["height"], image["width"] ymin *= height @@ -95,25 +119,35 @@ def main(): if args.use_inv_map: label = inv_map[label] # pycoco wants {imageID,x1,y1,w,h,score,class} - detections.append({ - "image_id": image_id, - "image_loc": loc, - "category_id": label, - "bbox": [float(xmin), float(ymin), float(xmax - xmin), float(ymax - ymin)], - "score": float(score)}) + detections.append( + { + "image_id": image_id, + "image_loc": loc, + "category_id": label, + "bbox": [ + float(xmin), + float(ymin), + float(xmax - xmin), + float(ymax - ymin), + ], + "score": float(score), + } + ) image_ids.add(image_id) with open(args.output_file, "w") as fp: json.dump(detections, fp, sort_keys=True, indent=4) - cocoDt = cocoGt.loadRes(args.output_file) # Load from file to bypass error with Python3 - cocoEval = COCOeval(cocoGt, cocoDt, iouType='bbox') + cocoDt = cocoGt.loadRes( + args.output_file + ) # Load from file to bypass error with Python3 + cocoEval = COCOeval(cocoGt, cocoDt, iouType="bbox") cocoEval.params.imgIds = list(image_ids) cocoEval.evaluate() cocoEval.accumulate() cocoEval.summarize() - print("mAP={:.3f}%".format(100. 
* cocoEval.stats[0])) + print("mAP={:.3f}%".format(100.0 * cocoEval.stats[0])) if args.verbose: print("found {} results".format(len(results))) print("found {} images".format(len(image_ids))) diff --git a/vision/classification_and_detection/tools/accuracy-imagenet.py b/vision/classification_and_detection/tools/accuracy-imagenet.py index 1879e0f09..a57810891 100644 --- a/vision/classification_and_detection/tools/accuracy-imagenet.py +++ b/vision/classification_and_detection/tools/accuracy-imagenet.py @@ -15,21 +15,32 @@ # pylint: disable=missing-docstring + def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() - parser.add_argument("--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json") - parser.add_argument("--imagenet-val-file", required=True, help="path to imagenet val_map.txt") - parser.add_argument("--verbose", action="store_true", help="verbose messages") - parser.add_argument("--dtype", default="float32", choices=["float32", "int32", "int64"], help="data type of the label") + parser.add_argument( + "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" + ) + parser.add_argument( + "--imagenet-val-file", required=True, help="path to imagenet val_map.txt" + ) + parser.add_argument( + "--verbose", + action="store_true", + help="verbose messages") + parser.add_argument( + "--dtype", + default="float32", + choices=["float32", "int32", "int64"], + help="data type of the label", + ) args = parser.parse_args() return args -dtype_map = { - "float32": np.float32, - "int32": np.int32, - "int64": np.int64 -} + +dtype_map = {"float32": np.float32, "int32": np.int32, "int64": np.int64} + def main(): args = get_args() @@ -46,7 +57,7 @@ def main(): seen = set() good = 0 for j in results: - idx = j['qsl_idx'] + idx = j["qsl_idx"] # de-dupe in case loadgen sends the same image multiple times if idx in seen: @@ -57,7 +68,7 @@ def main(): img, label = imagenet[idx] # reconstruct label from mlperf accuracy log - data = np.frombuffer(bytes.fromhex(j['data']), dtype_map[args.dtype]) + data = np.frombuffer(bytes.fromhex(j["data"]), dtype_map[args.dtype]) found = int(data[0]) if label == found: good += 1 @@ -65,7 +76,11 @@ def main(): if args.verbose: print("{}, expected: {}, found {}".format(img, label, found)) - print("accuracy={:.3f}%, good={}, total={}".format(100. 
* good / len(seen), good, len(seen))) + print( + "accuracy={:.3f}%, good={}, total={}".format( + 100.0 * good / len(seen), good, len(seen) + ) + ) if args.verbose: print("found and ignored {} dupes".format(len(results) - len(seen))) diff --git a/vision/classification_and_detection/tools/accuracy-openimages.py b/vision/classification_and_detection/tools/accuracy-openimages.py index 655ae5c16..0a0831d24 100644 --- a/vision/classification_and_detection/tools/accuracy-openimages.py +++ b/vision/classification_and_detection/tools/accuracy-openimages.py @@ -19,27 +19,43 @@ # pylint: disable=missing-docstring + def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() - parser.add_argument("--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json") - parser.add_argument("--openimages-dir", required=True, help="openimages directory") - parser.add_argument("--verbose", action="store_true", help="verbose messages") - parser.add_argument("--output-file", default="openimages-results.json", help="path to output file") - parser.add_argument("--use-inv-map", action="store_true", help="use inverse label map") + parser.add_argument( + "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" + ) + parser.add_argument( + "--openimages-dir", + required=True, + help="openimages directory") + parser.add_argument( + "--verbose", + action="store_true", + help="verbose messages") + parser.add_argument( + "--output-file", default="openimages-results.json", help="path to output file" + ) + parser.add_argument( + "--use-inv-map", action="store_true", help="use inverse label map" + ) args = parser.parse_args() return args def main(): args = get_args() - annotations_file = os.environ.get('DATASET_ANNOTATIONS_FILE_PATH') + annotations_file = os.environ.get("DATASET_ANNOTATIONS_FILE_PATH") if not annotations_file: - annotations_file = os.path.join(args.openimages_dir, "annotations/openimages-mlperf.json") + annotations_file = os.path.join( + args.openimages_dir, "annotations/openimages-mlperf.json" + ) cocoGt = COCO(annotations_file) if args.use_inv_map: - inv_map = [0] + cocoGt.getCatIds() # First label in inv_map is not used + # First label in inv_map is not used + inv_map = [0] + cocoGt.getCatIds() with open(args.mlperf_accuracy_file, "r") as f: results = json.load(f) @@ -51,7 +67,7 @@ def main(): image_map = cocoGt.dataset["images"] for j in results: - idx = j['qsl_idx'] + idx = j["qsl_idx"] # de-dupe in case loadgen sends the same image multiple times if idx in seen: continue @@ -60,12 +76,14 @@ def main(): # reconstruct from mlperf accuracy log # what is written by the benchmark is an array of float32's: # id, box[0], box[1], box[2], box[3], score, detection_class - # note that id is a index into instances_val2017.json, not the actual image_id - data = np.frombuffer(bytes.fromhex(j['data']), np.float32) + # note that id is a index into instances_val2017.json, not the actual + # image_id + data = np.frombuffer(bytes.fromhex(j["data"]), np.float32) if len(data) < 7: # handle images that had no results image = image_map[idx] - # by adding the id to image_ids we make pycoco aware of the no-result image + # by adding the id to image_ids we make pycoco aware of the + # no-result image image_ids.add(image["id"]) no_results += 1 if args.verbose: @@ -73,41 +91,57 @@ def main(): continue for i in range(0, len(data), 7): - image_idx, ymin, xmin, ymax, xmax, score, label = data[i:i + 7] + image_idx, ymin, xmin, ymax, xmax, score, label = data[i: i + 7] image = 
image_map[idx] image_idx = int(image_idx) if image_idx != idx: - print("ERROR: loadgen({}) and payload({}) disagree on image_idx".format(idx, image_idx)) + print( + "ERROR: loadgen({}) and payload({}) disagree on image_idx".format( + idx, image_idx + ) + ) image_id = image["id"] height, width = image["height"], image["width"] ymin *= height xmin *= width ymax *= height xmax *= width - loc = os.path.join(args.openimages_dir, "validation/data", image["file_name"]) + loc = os.path.join( + args.openimages_dir, "validation/data", image["file_name"] + ) label = int(label) if args.use_inv_map: label = inv_map[label] # pycoco wants {imageID,x1,y1,w,h,score,class} - detections.append({ - "image_id": image_id, - "image_loc": loc, - "category_id": label, - "bbox": [float(xmin), float(ymin), float(xmax - xmin), float(ymax - ymin)], - "score": float(score)}) + detections.append( + { + "image_id": image_id, + "image_loc": loc, + "category_id": label, + "bbox": [ + float(xmin), + float(ymin), + float(xmax - xmin), + float(ymax - ymin), + ], + "score": float(score), + } + ) image_ids.add(image_id) with open(args.output_file, "w") as fp: json.dump(detections, fp, sort_keys=True, indent=4) - cocoDt = cocoGt.loadRes(args.output_file) # Load from file to bypass error with Python3 - cocoEval = COCOeval(cocoGt, cocoDt, iouType='bbox') + cocoDt = cocoGt.loadRes( + args.output_file + ) # Load from file to bypass error with Python3 + cocoEval = COCOeval(cocoGt, cocoDt, iouType="bbox") cocoEval.params.imgIds = list(image_ids) cocoEval.evaluate() cocoEval.accumulate() cocoEval.summarize() - print("mAP={:.3f}%".format(100. * cocoEval.stats[0])) + print("mAP={:.3f}%".format(100.0 * cocoEval.stats[0])) if args.verbose: print("found {} results".format(len(results))) print("found {} images".format(len(image_ids))) diff --git a/vision/classification_and_detection/tools/calibrate_torchvision_model.py b/vision/classification_and_detection/tools/calibrate_torchvision_model.py index 815e5fe20..3b002003a 100644 --- a/vision/classification_and_detection/tools/calibrate_torchvision_model.py +++ b/vision/classification_and_detection/tools/calibrate_torchvision_model.py @@ -12,12 +12,13 @@ class CalibrationDataset(Dataset): def __init__(self, root, files, transform): - with open(files, 'r') as f: - self.files = [os.path.join(root, fn.strip()) for fn in f.readlines()] + with open(files, "r") as f: + self.files = [os.path.join(root, fn.strip()) + for fn in f.readlines()] self.transform = transform def __getitem__(self, idx): - image = Image.open(self.files[idx]).convert('RGB') + image = Image.open(self.files[idx]).convert("RGB") image = self.transform(image) return image @@ -25,7 +26,7 @@ def __len__(self): return len(self.files) -def quantize_model(model, dataloader, backend='fbgemm'): +def quantize_model(model, dataloader, backend="fbgemm"): if backend not in torch.backends.quantized.supported_engines: raise RuntimeError("Quantized backend not supported ") torch.backends.quantized.engine = backend @@ -36,29 +37,40 @@ def quantize_model(model, dataloader, backend='fbgemm'): # Make sure that weight qconfig matches that of the serialized models model.qconfig = torch.quantization.get_default_qconfig(backend) torch.quantization.prepare(model, inplace=True) - print('calibrating...') + print("calibrating...") for x in tqdm(dataloader): model(x) - print('calibration DONE!') + print("calibration DONE!") torch.quantization.convert(model, inplace=True) def main(): parser = argparse.ArgumentParser() - parser.add_argument('--model', type=str, 
default='resnet50') - parser.add_argument('--image-dir', type=str, default='imagenet/val') - parser.add_argument('--image-list', type=str, default='../../calibration/ImageNet/cal_image_list_option_1.txt') + parser.add_argument("--model", type=str, default="resnet50") + parser.add_argument("--image-dir", type=str, default="imagenet/val") + parser.add_argument( + "--image-list", + type=str, + default="../../calibration/ImageNet/cal_image_list_option_1.txt", + ) args = parser.parse_args() print(args) - transform = transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ]) - - dataset = CalibrationDataset(root=args.image_dir, files=args.image_list, transform=transform) + transform = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[ + 0.485, 0.456, 0.406], std=[ + 0.229, 0.224, 0.225]), + ] + ) + + dataset = CalibrationDataset( + root=args.image_dir, files=args.image_list, transform=transform + ) dataloader = DataLoader(dataset, batch_size=1) model = eval(args.model)(pretrained=True, progress=True, quantize=False) @@ -67,11 +79,10 @@ def main(): inp = torch.rand(1, 3, 224, 224) script_module = torch.jit.trace(model, inp) - save_path = f'{args.model}.pt' + save_path = f"{args.model}.pt" torch.jit.save(script_module, save_path) - print(f'saved: {save_path}') + print(f"saved: {save_path}") -if __name__=='__main__': +if __name__ == "__main__": main() - diff --git a/vision/classification_and_detection/tools/coco-analyze.py b/vision/classification_and_detection/tools/coco-analyze.py index 7f22daa84..f58677f9d 100755 --- a/vision/classification_and_detection/tools/coco-analyze.py +++ b/vision/classification_and_detection/tools/coco-analyze.py @@ -36,7 +36,7 @@ def annotate_image(results, cocoGt, output): new_results = collections.defaultdict(list) for result in results: - new_results[result['image_id']].append(result) + new_results[result["image_id"]].append(result) print("Unique images = {}".format(len(new_results))) results = new_results @@ -44,32 +44,40 @@ def annotate_image(results, cocoGt, output): draw = None image = None for v in result: - box = v['bbox'] - score = v['score'] + box = v["bbox"] + score = v["score"] predicted_class = v["category_id"] try: predicted_class = cocoGt.loadCats(predicted_class)[0]["name"] except Exception as ex: - print("category {} not found, image {}".format(predicted_class, v["image_loc"])) + print( + "category {} not found, image {}".format( + predicted_class, v["image_loc"] + ) + ) # predicted_class = self.class_names[c] # "image_loc": "/home/gs/data/coco300/val2017/000000397133.jpg", if not draw: - image = Image.open(v['image_loc']) - if image.mode != 'RGB': - image = image.convert('RGB') + image = Image.open(v["image_loc"]) + if image.mode != "RGB": + image = image.convert("RGB") draw = ImageDraw.Draw(image) # font = ImageFont.truetype(font='FreeMono.ttf', - # size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32')) + # size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32')) try: left, top, w, h = box bottom = top + h right = left + w - top = max(0, np.floor(top + 0.5).astype('int32')) - left = max(0, np.floor(left + 0.5).astype('int32')) - bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32')) - right = min(image.size[0], np.floor(right + 0.5).astype('int32')) - label = '{} {:.2f}'.format(predicted_class, score) + top = max(0, 
np.floor(top + 0.5).astype("int32")) + left = max(0, np.floor(left + 0.5).astype("int32")) + bottom = min( + image.size[1], np.floor( + bottom + 0.5).astype("int32")) + right = min( + image.size[0], np.floor( + right + 0.5).astype("int32")) + label = "{} {:.2f}".format(predicted_class, score) # label_size = draw.textsize(label, font) label_size = draw.textsize(label) @@ -80,11 +88,19 @@ def annotate_image(results, cocoGt, output): color = ImageColor.getrgb("red") thickness = 0 - draw.rectangle([left + thickness, top + thickness, right - thickness, bottom - thickness], outline=color) + draw.rectangle( + [ + left + thickness, + top + thickness, + right - thickness, + bottom - thickness, + ], + outline=color, + ) draw.text(text_origin, label, fill=color) # , font=font) except Exception as ex: - print("{} failed, ex {}".format(v['image_loc'], ex)) - image.save(os.path.join(output, os.path.basename(v['image_loc']))) + print("{} failed, ex {}".format(v["image_loc"], ex)) + image.save(os.path.join(output, os.path.basename(v["image_loc"]))) del draw @@ -93,7 +109,7 @@ def calculate_map(results, cocoGt, output): # x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]] cocoDt = cocoGt.loadRes(results) - cocoEval = COCOeval(cocoGt, cocoDt, iouType='bbox') + cocoEval = COCOeval(cocoGt, cocoDt, iouType="bbox") cocoEval.evaluate() cocoEval.accumulate() cocoEval.summarize() @@ -110,11 +126,11 @@ def calculate_map(results, cocoGt, output): "DetectionBoxes_Recall/AR@100": cocoEval.stats[8], "DetectionBoxes_Recall/AR@100 (small)": cocoEval.stats[9], "DetectionBoxes_Recall/AR@100 (medium)": cocoEval.stats[10], - "DetectionBoxes_Recall/AR@100 (large)": cocoEval.stats[11] + "DetectionBoxes_Recall/AR@100 (large)": cocoEval.stats[11], } - mAP = all_metrics['DetectionBoxes_Precision/mAP'] - recall = all_metrics['DetectionBoxes_Recall/AR@100'] + mAP = all_metrics["DetectionBoxes_Precision/mAP"] + recall = all_metrics["DetectionBoxes_Recall/AR@100"] print("mAP={}, recall={}".format(mAP, recall)) @@ -124,7 +140,8 @@ def main(): with open(args.input, "r") as f: results = json.load(f) - annotation_file = os.path.join(args.coco, "annotations/instances_val2017.json") + annotation_file = os.path.join( + args.coco, "annotations/instances_val2017.json") cocoGt = COCO(annotation_file) annotate_image(results, cocoGt, args.output) calculate_map(args.input, cocoGt, args.output) diff --git a/vision/classification_and_detection/tools/lglog2csv.py b/vision/classification_and_detection/tools/lglog2csv.py index 901bd795d..6f32433f2 100644 --- a/vision/classification_and_detection/tools/lglog2csv.py +++ b/vision/classification_and_detection/tools/lglog2csv.py @@ -36,32 +36,43 @@ def main(): with open(args.input, "r") as fp: mode, mean, latency_90, latency_99, qps = None, 0, 0, 0, 0 for line in fp: - m = re.match("^Scenario\s*:\s*(\w+).*", line) + m = re.match("^Scenario\\s*:\\s*(\\w+).*", line) if m: mode = m.group(1) - m = re.match("^90.00 percentile latency.*:\s*(\d+).*", line) + m = re.match("^90.00 percentile latency.*:\\s*(\\d+).*", line) if m: latency_90 = m.group(1) - m = re.match("^99.00 percentile latency.*:\s*(\d+).*", line) + m = re.match("^99.00 percentile latency.*:\\s*(\\d+).*", line) if m: latency_99 = m.group(1) - m = re.match("^Mean latency.*:\s*(\d+).*", line) + m = re.match("^Mean latency.*:\\s*(\\d+).*", line) if m: mean = m.group(1) - m = re.match("^Completed samples per second.*:\s*(\d+).*", line) + m = re.match("^Completed samples per second.*:\\s*(\\d+).*", line) if m: qps = m.group(1) - m = 
re.match("^QPS w/ loadgen overhead.*:\s*(\d+).*", line) + m = re.match("^QPS w/ loadgen overhead.*:\\s*(\\d+).*", line) if m: qps = m.group(1) - m = re.match("^Samples per second.*:\s*(\d+).*", line) + m = re.match("^Samples per second.*:\\s*(\\d+).*", line) if m: qps = m.group(1) m = re.match("Test Parameters Used.*", line) if m: - print("{},{},{},{},{},{},{},{},{},{}".format( - args.name, now, args.machine, args.runtime, args.model, - mode, qps, mean, latency_90, latency_99)) + print( + "{},{},{},{},{},{},{},{},{},{}".format( + args.name, + now, + args.machine, + args.runtime, + args.model, + mode, + qps, + mean, + latency_90, + latency_99, + ) + ) mode, mean, latency_90, latency_99, qps = None, 0, 0, 0, 0 diff --git a/vision/classification_and_detection/tools/openimages.py b/vision/classification_and_detection/tools/openimages.py index 2d14bdd22..b6e22f3ab 100644 --- a/vision/classification_and_detection/tools/openimages.py +++ b/vision/classification_and_detection/tools/openimages.py @@ -52,17 +52,20 @@ BUCKET_NAME = "open-images-dataset" -BBOX_ANNOTATIONS_URL = "https://storage.googleapis.com/openimages/v5/validation-annotations-bbox.csv" +BBOX_ANNOTATIONS_URL = ( + "https://storage.googleapis.com/openimages/v5/validation-annotations-bbox.csv" +) ANNOTATIONS_FILE = "validation-annotations-bbox.csv" -MAP_CLASSES_URL = "https://storage.googleapis.com/openimages/v5/class-descriptions-boxable.csv" +MAP_CLASSES_URL = ( + "https://storage.googleapis.com/openimages/v5/class-descriptions-boxable.csv" +) MAP_CLASSES_FILE = "class-descriptions-boxable.csv" CHUNK_SIZE = 1024 * 8 def get_args(): parser = argparse.ArgumentParser( - description="Download OpenImages", add_help=True - ) + description="Download OpenImages", add_help=True) parser.add_argument( "--dataset-dir", default="/open-images-v6", @@ -138,20 +141,26 @@ def export_to_coco( image_list_df.columns = ["image_list"] image_list_df[["height", "width"]] = image_list_df.apply( lambda x: extract_dims( - os.path.join(dataset_path, f"{x['image_list']}.jpg") - ), + os.path.join( + dataset_path, + f"{x['image_list']}.jpg")), axis=1, result_type="expand", ) - annotations = pd.merge(annotations, image_list_df, how="inner", left_on="ImageID", right_on="image_list") + annotations = pd.merge( + annotations, + image_list_df, + how="inner", + left_on="ImageID", + right_on="image_list", + ) annotations = annotations.merge(class_map, on="LabelName", how="inner") annotations = annotations.sort_values(by=["ImageID"]) annotations["image_id"] = pd.factorize(annotations["ImageID"].tolist())[0] # Images images_ = [] - for i, row in ( - annotations.groupby(["image_id", "ImageID"]).first().iterrows() - ): + for i, row in annotations.groupby( + ["image_id", "ImageID"]).first().iterrows(): id, ImageID = i images_.append( { @@ -237,8 +246,7 @@ def download_one_image(bucket, split, image_id, download_folder): ) except botocore.exceptions.ClientError as exception: sys.exit( - f"ERROR when downloading image `{split}/{image_id}`: {str(exception)}" - ) + f"ERROR when downloading image `{split}/{image_id}`: {str(exception)}") def download_all_images(args): @@ -285,8 +293,7 @@ def download_all_images(args): if args.max_images is not None: np.random.seed(args.seed) selected_index = np.random.choice( - len(image_list), size=args.max_images - ) + len(image_list), size=args.max_images) image_list = [image_list[i] for i in selected_index] except ValueError as exception: sys.exit(exception) @@ -294,9 +301,7 @@ def download_all_images(args): progress_bar = tqdm.tqdm( 
total=len(image_list), desc="Downloading images", leave=True ) - with futures.ThreadPoolExecutor( - max_workers=args.num_processes - ) as executor: + with futures.ThreadPoolExecutor(max_workers=args.num_processes) as executor: all_futures = [ executor.submit( download_one_image, diff --git a/vision/classification_and_detection/tools/openimages_calibration.py b/vision/classification_and_detection/tools/openimages_calibration.py index 468e488b1..a2a4aa4be 100644 --- a/vision/classification_and_detection/tools/openimages_calibration.py +++ b/vision/classification_and_detection/tools/openimages_calibration.py @@ -35,9 +35,13 @@ BUCKET_NAME = "open-images-dataset" -BBOX_ANNOTATIONS_URL = "https://storage.googleapis.com/openimages/v6/oidv6-train-annotations-bbox.csv" +BBOX_ANNOTATIONS_URL = ( + "https://storage.googleapis.com/openimages/v6/oidv6-train-annotations-bbox.csv" +) ANNOTATIONS_FILE = "oidv6-train-annotations-bbox.csv" -MAP_CLASSES_URL = "https://storage.googleapis.com/openimages/v5/class-descriptions-boxable.csv" +MAP_CLASSES_URL = ( + "https://storage.googleapis.com/openimages/v5/class-descriptions-boxable.csv" +) MAP_CLASSES_FILE = "class-descriptions-boxable.csv" CHUNK_SIZE = 1024 * 8 @@ -116,16 +120,16 @@ def export_to_coco( annotations["image_id"] = pd.factorize(annotations["ImageID"].tolist())[0] annotations[["height", "width"]] = annotations.apply( lambda x: extract_dims( - os.path.join(dataset_path, f"{x['ImageID']}.jpg") - ), + os.path.join( + dataset_path, + f"{x['ImageID']}.jpg")), axis=1, result_type="expand", ) # Images images_ = [] - for i, row in ( - annotations.groupby(["image_id", "ImageID"]).first().iterrows() - ): + for i, row in annotations.groupby( + ["image_id", "ImageID"]).first().iterrows(): id, ImageID = i images_.append( { @@ -197,8 +201,7 @@ def download_one_image(bucket, split, image_id, download_folder): ) except botocore.exceptions.ClientError as exception: sys.exit( - f"ERROR when downloading image `{split}/{image_id}`: {str(exception)}" - ) + f"ERROR when downloading image `{split}/{image_id}`: {str(exception)}") def download_all_images(args, image_list): @@ -216,9 +219,8 @@ def download_all_images(args, image_list): if not os.path.exists(os.path.join(download_folder, "annotations")): os.makedirs(os.path.join(download_folder, "annotations")) - if not os.path.exists( - os.path.join(download_folder, "calibration", "data") - ): + if not os.path.exists(os.path.join( + download_folder, "calibration", "data")): os.makedirs(os.path.join(download_folder, "calibration", "data")) try: @@ -246,9 +248,7 @@ def download_all_images(args, image_list): progress_bar = tqdm.tqdm( total=len(image_list), desc="Downloading images", leave=True ) - with futures.ThreadPoolExecutor( - max_workers=args.num_processes - ) as executor: + with futures.ThreadPoolExecutor(max_workers=args.num_processes) as executor: all_futures = [ executor.submit( download_one_image, @@ -280,8 +280,9 @@ def download_all_images(args, image_list): # Try to find the calibration file in case it was not provided repo_root = os.path.dirname( os.path.dirname( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - ) + os.path.dirname( + os.path.dirname( + os.path.abspath(__file__)))) ) calibration_file = os.path.join( repo_root, diff --git a/vision/classification_and_detection/tools/resnet50_v1_to_ncnn.py b/vision/classification_and_detection/tools/resnet50_v1_to_ncnn.py index 01d7401e5..6a13e9043 100644 --- a/vision/classification_and_detection/tools/resnet50_v1_to_ncnn.py +++ 
b/vision/classification_and_detection/tools/resnet50_v1_to_ncnn.py @@ -1,3 +1,6 @@ +import ncnn +import numpy as np +import resnet50_v1_pnnx import os import torch import torchvision.models as models @@ -22,12 +25,11 @@ os.system("pnnx resnet50_v1.pt inputshape=[1,3,224,224] fp16=0") # pnnx inference -import resnet50_v1_pnnx + b = resnet50_v1_pnnx.test_inference() # ncnn inference -import numpy as np -import ncnn + with ncnn.Net() as net: net.opt.use_fp16_packed = False net.opt.use_fp16_storage = False diff --git a/vision/classification_and_detection/tools/resnet_save.py b/vision/classification_and_detection/tools/resnet_save.py index fca66ea26..0eaa0ef58 100755 --- a/vision/classification_and_detection/tools/resnet_save.py +++ b/vision/classification_and_detection/tools/resnet_save.py @@ -43,256 +43,308 @@ from official.utils.misc import model_helpers - - def image_bytes_serving_input_fn(image_shape, dtype=tf.float32): - """Serving input fn for raw jpeg images.""" - - def _preprocess_image(image_bytes): - """Preprocess a single raw image.""" - # Bounding box around the whole image. - bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=dtype, shape=[1, 1, 4]) - height, width, num_channels = image_shape - image = imagenet_preprocessing.preprocess_image( - image_bytes, bbox, height, width, num_channels, is_training=False) - return image - - image_bytes_list = tf.placeholder( - shape=[None], dtype=tf.string, name='input_tensor') - images = tf.map_fn( - _preprocess_image, image_bytes_list, back_prop=False, dtype=dtype) - return tf.estimator.export.TensorServingInputReceiver( - images, {'image_bytes': image_bytes_list}) - - - - -def resnet_model_fn(features, labels, mode, model_class, - resnet_size, weight_decay, learning_rate_fn, momentum, - data_format, resnet_version, loss_scale, - loss_filter_fn=None, dtype=resnet_model.DEFAULT_DTYPE, - fine_tune=False): - """Shared functionality for different resnet model_fns. - - Initializes the ResnetModel representing the model layers - and uses that model to build the necessary EstimatorSpecs for - the `mode` in question. For training, this means building losses, - the optimizer, and the train op that get passed into the EstimatorSpec. - For evaluation and prediction, the EstimatorSpec is returned without - a train op, but with the necessary parameters for the given mode. - - Args: - features: tensor representing input images - labels: tensor representing class labels for all input images - mode: current estimator mode; should be one of - `tf.estimator.ModeKeys.TRAIN`, `EVALUATE`, `PREDICT` - model_class: a class representing a TensorFlow model that has a __call__ - function. We assume here that this is a subclass of ResnetModel. - resnet_size: A single integer for the size of the ResNet model. - weight_decay: weight decay loss rate used to regularize learned variables. - learning_rate_fn: function that returns the current learning rate given - the current global_step - momentum: momentum term used for optimization - data_format: Input format ('channels_last', 'channels_first', or None). - If set to None, the format is dependent on whether a GPU is available. - resnet_version: Integer representing which version of the ResNet network to - use. See README for details. Valid values: [1, 2] - loss_scale: The factor to scale the loss for numerical stability. A detailed - summary is present in the arg parser help text. 
- loss_filter_fn: function that takes a string variable name and returns - True if the var should be included in loss calculation, and False - otherwise. If None, batch_normalization variables will be excluded - from the loss. - dtype: the TensorFlow dtype to use for calculations. - fine_tune: If True only train the dense layers(final layers). - - Returns: - EstimatorSpec parameterized according to the input params and the - current mode. - """ - - model = model_class(resnet_size, data_format, resnet_version=resnet_version, - dtype=dtype) - - logits = model(features, mode == tf.estimator.ModeKeys.TRAIN) - - # This acts as a no-op if the logits are already in fp32 (provided logits are - # not a SparseTensor). If dtype is is low precision, logits must be cast to - # fp32 for numerical stability. - logits = tf.cast(logits, tf.float32) - - predictions = { - 'classes': tf.argmax(logits, axis=1), - 'probabilities': tf.nn.softmax(logits, name='softmax_tensor') - } - - if mode == tf.estimator.ModeKeys.PREDICT: - # Return the predictions and the specification for serving a SavedModel + """Serving input fn for raw jpeg images.""" + + def _preprocess_image(image_bytes): + """Preprocess a single raw image.""" + # Bounding box around the whole image. + bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=dtype, shape=[1, 1, 4]) + height, width, num_channels = image_shape + image = imagenet_preprocessing.preprocess_image( + image_bytes, bbox, height, width, num_channels, is_training=False + ) + return image + + image_bytes_list = tf.placeholder( + shape=[None], dtype=tf.string, name="input_tensor" + ) + images = tf.map_fn( + _preprocess_image, image_bytes_list, back_prop=False, dtype=dtype + ) + return tf.estimator.export.TensorServingInputReceiver( + images, {"image_bytes": image_bytes_list} + ) + + +def resnet_model_fn( + features, + labels, + mode, + model_class, + resnet_size, + weight_decay, + learning_rate_fn, + momentum, + data_format, + resnet_version, + loss_scale, + loss_filter_fn=None, + dtype=resnet_model.DEFAULT_DTYPE, + fine_tune=False, +): + """Shared functionality for different resnet model_fns. + + Initializes the ResnetModel representing the model layers + and uses that model to build the necessary EstimatorSpecs for + the `mode` in question. For training, this means building losses, + the optimizer, and the train op that get passed into the EstimatorSpec. + For evaluation and prediction, the EstimatorSpec is returned without + a train op, but with the necessary parameters for the given mode. + + Args: + features: tensor representing input images + labels: tensor representing class labels for all input images + mode: current estimator mode; should be one of + `tf.estimator.ModeKeys.TRAIN`, `EVALUATE`, `PREDICT` + model_class: a class representing a TensorFlow model that has a __call__ + function. We assume here that this is a subclass of ResnetModel. + resnet_size: A single integer for the size of the ResNet model. + weight_decay: weight decay loss rate used to regularize learned variables. + learning_rate_fn: function that returns the current learning rate given + the current global_step + momentum: momentum term used for optimization + data_format: Input format ('channels_last', 'channels_first', or None). + If set to None, the format is dependent on whether a GPU is available. + resnet_version: Integer representing which version of the ResNet network to + use. See README for details. Valid values: [1, 2] + loss_scale: The factor to scale the loss for numerical stability. 
A detailed + summary is present in the arg parser help text. + loss_filter_fn: function that takes a string variable name and returns + True if the var should be included in loss calculation, and False + otherwise. If None, batch_normalization variables will be excluded + from the loss. + dtype: the TensorFlow dtype to use for calculations. + fine_tune: If True only train the dense layers(final layers). + + Returns: + EstimatorSpec parameterized according to the input params and the + current mode. + """ + + model = model_class( + resnet_size, data_format, resnet_version=resnet_version, dtype=dtype + ) + + logits = model(features, mode == tf.estimator.ModeKeys.TRAIN) + + # This acts as a no-op if the logits are already in fp32 (provided logits are + # not a SparseTensor). If dtype is is low precision, logits must be cast to + # fp32 for numerical stability. + logits = tf.cast(logits, tf.float32) + + predictions = { + "classes": tf.argmax(logits, axis=1), + "probabilities": tf.nn.softmax(logits, name="softmax_tensor"), + } + + if mode == tf.estimator.ModeKeys.PREDICT: + # Return the predictions and the specification for serving a SavedModel + return tf.estimator.EstimatorSpec( + mode=mode, + predictions=predictions, + export_outputs={ + "predict": tf.estimator.export.PredictOutput(predictions)}, + ) + + # Calculate loss, which includes softmax cross entropy and L2 + # regularization. + cross_entropy = tf.losses.sparse_softmax_cross_entropy( + logits=logits, labels=labels) + + # Create a tensor named cross_entropy for logging purposes. + tf.identity(cross_entropy, name="cross_entropy") + + # If no loss_filter_fn is passed, assume we want the default behavior, + # which is that batch_normalization variables are excluded from loss. + def exclude_batch_norm(name): + return "batch_normalization" not in name + + loss_filter_fn = loss_filter_fn or exclude_batch_norm + + # Add weight decay to the loss. + l2_loss = weight_decay * tf.add_n( + # loss is computed using fp32 for numerical stability. + [ + tf.nn.l2_loss(tf.cast(v, tf.float32)) + for v in tf.trainable_variables() + if loss_filter_fn(v.name) + ] + ) + tf.summary.scalar("l2_loss", l2_loss) + loss = cross_entropy + l2_loss + return tf.estimator.EstimatorSpec( mode=mode, predictions=predictions, - export_outputs={ - 'predict': tf.estimator.export.PredictOutput(predictions) - }) - - # Calculate loss, which includes softmax cross entropy and L2 regularization. - cross_entropy = tf.losses.sparse_softmax_cross_entropy( - logits=logits, labels=labels) - - # Create a tensor named cross_entropy for logging purposes. - tf.identity(cross_entropy, name='cross_entropy') - - # If no loss_filter_fn is passed, assume we want the default behavior, - # which is that batch_normalization variables are excluded from loss. - def exclude_batch_norm(name): - return 'batch_normalization' not in name - loss_filter_fn = loss_filter_fn or exclude_batch_norm - - # Add weight decay to the loss. - l2_loss = weight_decay * tf.add_n( - # loss is computed using fp32 for numerical stability. - [tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables() - if loss_filter_fn(v.name)]) - tf.summary.scalar('l2_loss', l2_loss) - loss = cross_entropy + l2_loss - - return tf.estimator.EstimatorSpec( - mode=mode, - predictions=predictions, - loss=loss, - train_op=train_op, - eval_metric_ops=metrics) - - -def resnet_main( - flags_obj, model_function, input_function, dataset_name, shape=None): - """Shared main loop for ResNet Models. 
- - Args: - flags_obj: An object containing parsed flags. See define_resnet_flags() - for details. - model_function: the function that instantiates the Model and builds the - ops for train/eval. This will be passed directly into the estimator. - input_function: the function that processes the dataset and returns a - dataset that the estimator can train on. This will be wrapped with - all the relevant flags for running and passed to estimator. - dataset_name: the name of the dataset for training and evaluation. This is - used for logging purpose. - shape: list of ints representing the shape of the images used for training. - This is only used if flags_obj.export_dir is passed. - """ - - print("RESNET MAIN") - model_helpers.apply_clean(flags.FLAGS) - - # Ensures flag override logic is only executed if explicitly triggered. - if flags_obj.tf_gpu_thread_mode: - override_flags_and_set_envars_for_gpu_thread_pool(flags_obj) - - # Creates session config. allow_soft_placement = True, is required for - # multi-GPU and is not harmful for other modes. - session_config = tf.ConfigProto(allow_soft_placement=True) - - run_config = tf.estimator.RunConfig( - session_config=session_config, - save_checkpoints_secs=60*60*24) - - # Initializes model with all but the dense layer from pretrained ResNet. - if flags_obj.pretrained_model_checkpoint_path is not None: - warm_start_settings = tf.estimator.WarmStartSettings( - flags_obj.pretrained_model_checkpoint_path, - vars_to_warm_start='^(?!.*dense)') - else: - warm_start_settings = None - - classifier = tf.estimator.Estimator( - model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, - warm_start_from=warm_start_settings, params={ - 'resnet_size': int(flags_obj.resnet_size), - 'data_format': flags_obj.data_format, - 'batch_size': flags_obj.batch_size, - 'resnet_version': int(flags_obj.resnet_version), - 'loss_scale': flags_core.get_loss_scale(flags_obj), - 'dtype': flags_core.get_tf_dtype(flags_obj), - 'fine_tune': flags_obj.fine_tune - }) - - run_params = { - 'batch_size': flags_obj.batch_size, - 'dtype': flags_core.get_tf_dtype(flags_obj), - 'resnet_size': flags_obj.resnet_size, - 'resnet_version': flags_obj.resnet_version, - 'synthetic_data': flags_obj.use_synthetic_data, - 'train_epochs': flags_obj.train_epochs, - } - - def input_fn_eval(): - return input_function( - is_training=False, - data_dir=flags_obj.data_dir, - batch_size=distribution_utils.per_device_batch_size( - flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), - num_epochs=1, - dtype=flags_core.get_tf_dtype(flags_obj)) - - schedule, n_loops = [0], 1 - if flags_obj.export_dir is not None: - # Exports a saved model for the given classifier. - export_dtype = flags_core.get_tf_dtype(flags_obj) - if flags_obj.image_bytes_as_serving_input: - input_receiver_fn = functools.partial( - image_bytes_serving_input_fn, shape, dtype=export_dtype) + loss=loss, + train_op=train_op, + eval_metric_ops=metrics, + ) + + +def resnet_main(flags_obj, model_function, + input_function, dataset_name, shape=None): + """Shared main loop for ResNet Models. + + Args: + flags_obj: An object containing parsed flags. See define_resnet_flags() + for details. + model_function: the function that instantiates the Model and builds the + ops for train/eval. This will be passed directly into the estimator. + input_function: the function that processes the dataset and returns a + dataset that the estimator can train on. This will be wrapped with + all the relevant flags for running and passed to estimator. 
+ dataset_name: the name of the dataset for training and evaluation. This is + used for logging purpose. + shape: list of ints representing the shape of the images used for training. + This is only used if flags_obj.export_dir is passed. + """ + + print("RESNET MAIN") + model_helpers.apply_clean(flags.FLAGS) + + # Ensures flag override logic is only executed if explicitly triggered. + if flags_obj.tf_gpu_thread_mode: + override_flags_and_set_envars_for_gpu_thread_pool(flags_obj) + + # Creates session config. allow_soft_placement = True, is required for + # multi-GPU and is not harmful for other modes. + session_config = tf.ConfigProto(allow_soft_placement=True) + + run_config = tf.estimator.RunConfig( + session_config=session_config, save_checkpoints_secs=60 * 60 * 24 + ) + + # Initializes model with all but the dense layer from pretrained ResNet. + if flags_obj.pretrained_model_checkpoint_path is not None: + warm_start_settings = tf.estimator.WarmStartSettings( + flags_obj.pretrained_model_checkpoint_path, + vars_to_warm_start="^(?!.*dense)", + ) else: - input_receiver_fn = export.build_tensor_serving_input_receiver_fn( - shape, batch_size=flags_obj.batch_size, dtype=export_dtype) - classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn, - strip_default_attrs=True) + warm_start_settings = None + + classifier = tf.estimator.Estimator( + model_fn=model_function, + model_dir=flags_obj.model_dir, + config=run_config, + warm_start_from=warm_start_settings, + params={ + "resnet_size": int(flags_obj.resnet_size), + "data_format": flags_obj.data_format, + "batch_size": flags_obj.batch_size, + "resnet_version": int(flags_obj.resnet_version), + "loss_scale": flags_core.get_loss_scale(flags_obj), + "dtype": flags_core.get_tf_dtype(flags_obj), + "fine_tune": flags_obj.fine_tune, + }, + ) + + run_params = { + "batch_size": flags_obj.batch_size, + "dtype": flags_core.get_tf_dtype(flags_obj), + "resnet_size": flags_obj.resnet_size, + "resnet_version": flags_obj.resnet_version, + "synthetic_data": flags_obj.use_synthetic_data, + "train_epochs": flags_obj.train_epochs, + } + + def input_fn_eval(): + return input_function( + is_training=False, + data_dir=flags_obj.data_dir, + batch_size=distribution_utils.per_device_batch_size( + flags_obj.batch_size, flags_core.get_num_gpus(flags_obj) + ), + num_epochs=1, + dtype=flags_core.get_tf_dtype(flags_obj), + ) + + schedule, n_loops = [0], 1 + if flags_obj.export_dir is not None: + # Exports a saved model for the given classifier. + export_dtype = flags_core.get_tf_dtype(flags_obj) + if flags_obj.image_bytes_as_serving_input: + input_receiver_fn = functools.partial( + image_bytes_serving_input_fn, shape, dtype=export_dtype + ) + else: + input_receiver_fn = export.build_tensor_serving_input_receiver_fn( + shape, batch_size=flags_obj.batch_size, dtype=export_dtype + ) + classifier.export_savedmodel( + flags_obj.export_dir, input_receiver_fn, strip_default_attrs=True + ) def define_resnet_flags(resnet_size_choices=None): - """Add flags and validators for ResNet.""" - flags_core.define_base() - flags_core.define_performance(num_parallel_calls=False, - tf_gpu_thread_mode=True, - datasets_num_private_threads=True, - datasets_num_parallel_batches=True) - flags_core.define_image() - flags_core.define_benchmark() - flags.adopt_module_key_flags(flags_core) - - flags.DEFINE_enum( - name='resnet_version', short_name='rv', default='1', - enum_values=['1', '2'], - help=flags_core.help_wrap( - 'Version of ResNet. 
(1 or 2) See README.md for details.')) - flags.DEFINE_bool( - name='fine_tune', short_name='ft', default=False, - help=flags_core.help_wrap( - 'If True do not train any parameters except for the final layer.')) - flags.DEFINE_string( - name='pretrained_model_checkpoint_path', short_name='pmcp', default=None, - help=flags_core.help_wrap( - 'If not None initialize all the network except the final layer with ' - 'these values')) - flags.DEFINE_boolean( - name='eval_only', default=False, - help=flags_core.help_wrap('Skip training and only perform evaluation on ' - 'the latest checkpoint.')) - flags.DEFINE_boolean( - name='image_bytes_as_serving_input', default=False, - help=flags_core.help_wrap( - 'If True exports savedmodel with serving signature that accepts ' - 'JPEG image bytes instead of a fixed size [HxWxC] tensor that ' - 'represents the image. The former is easier to use for serving at ' - 'the expense of image resize/cropping being done as part of model ' - 'inference. Note, this flag only applies to ImageNet and cannot ' - 'be used for CIFAR.')) - - choice_kwargs = dict( - name='resnet_size', short_name='rs', default='50', - help=flags_core.help_wrap('The size of the ResNet model to use.')) - - if resnet_size_choices is None: - flags.DEFINE_string(**choice_kwargs) - else: - flags.DEFINE_enum(enum_values=resnet_size_choices, **choice_kwargs) + """Add flags and validators for ResNet.""" + flags_core.define_base() + flags_core.define_performance( + num_parallel_calls=False, + tf_gpu_thread_mode=True, + datasets_num_private_threads=True, + datasets_num_parallel_batches=True, + ) + flags_core.define_image() + flags_core.define_benchmark() + flags.adopt_module_key_flags(flags_core) + + flags.DEFINE_enum( + name="resnet_version", + short_name="rv", + default="1", + enum_values=["1", "2"], + help=flags_core.help_wrap( + "Version of ResNet. (1 or 2) See README.md for details." + ), + ) + flags.DEFINE_bool( + name="fine_tune", + short_name="ft", + default=False, + help=flags_core.help_wrap( + "If True do not train any parameters except for the final layer." + ), + ) + flags.DEFINE_string( + name="pretrained_model_checkpoint_path", + short_name="pmcp", + default=None, + help=flags_core.help_wrap( + "If not None initialize all the network except the final layer with " + "these values" + ), + ) + flags.DEFINE_boolean( + name="eval_only", + default=False, + help=flags_core.help_wrap( + "Skip training and only perform evaluation on " "the latest checkpoint." + ), + ) + flags.DEFINE_boolean( + name="image_bytes_as_serving_input", + default=False, + help=flags_core.help_wrap( + "If True exports savedmodel with serving signature that accepts " + "JPEG image bytes instead of a fixed size [HxWxC] tensor that " + "represents the image. The former is easier to use for serving at " + "the expense of image resize/cropping being done as part of model " + "inference. Note, this flag only applies to ImageNet and cannot " + "be used for CIFAR." 
+ ), + ) + + choice_kwargs = dict( + name="resnet_size", + short_name="rs", + default="50", + help=flags_core.help_wrap("The size of the ResNet model to use."), + ) + + if resnet_size_choices is None: + flags.DEFINE_string(**choice_kwargs) + else: + flags.DEFINE_enum(enum_values=resnet_size_choices, **choice_kwargs) diff --git a/vision/classification_and_detection/tools/retinanet_pytorch_to_onnx.py b/vision/classification_and_detection/tools/retinanet_pytorch_to_onnx.py index cee4af6c6..3a0fb2d99 100644 --- a/vision/classification_and_detection/tools/retinanet_pytorch_to_onnx.py +++ b/vision/classification_and_detection/tools/retinanet_pytorch_to_onnx.py @@ -35,7 +35,8 @@ def get_args(): num_classes = 264 image_size = [800, 800] - model = retinanet_from_backbone(backbone, num_classes, image_size=image_size) + model = retinanet_from_backbone( + backbone, num_classes, image_size=image_size) checkpoint = torch.load(args.weights, map_location="cpu") model.load_state_dict(checkpoint["model"]) model.eval() diff --git a/vision/classification_and_detection/tools/ssd-nhwc.py b/vision/classification_and_detection/tools/ssd-nhwc.py index f255fd965..d7ef9599e 100644 --- a/vision/classification_and_detection/tools/ssd-nhwc.py +++ b/vision/classification_and_detection/tools/ssd-nhwc.py @@ -4,33 +4,39 @@ import tensorflow as tf from tensorflow.core.framework import graph_pb2 + def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('pbfile') + parser.add_argument("pbfile") return parser.parse_args() + def insert_transpose(graph, a, b, to_nchw): if not isinstance(b, list): b = [b] trans_perm = graph.node.add() - trans_perm.name = a.name + '/transpose/perm' - trans_perm.op = 'Const' - trans_perm.attr['dtype'].type = 3 # DT_INT32 - trans_perm.attr['value'].tensor.dtype = 3 # DT_INT32 - trans_perm.attr['value'].tensor.tensor_shape.dim.add() - trans_perm.attr['value'].tensor.tensor_shape.dim[0].size = 4 + trans_perm.name = a.name + "/transpose/perm" + trans_perm.op = "Const" + trans_perm.attr["dtype"].type = 3 # DT_INT32 + trans_perm.attr["value"].tensor.dtype = 3 # DT_INT32 + trans_perm.attr["value"].tensor.tensor_shape.dim.add() + trans_perm.attr["value"].tensor.tensor_shape.dim[0].size = 4 if to_nchw: - trans_perm.attr['value'].tensor.tensor_content = b'\000\000\000\000\003\000\000\000\001\000\000\000\002\000\000\000' + trans_perm.attr["value"].tensor.tensor_content = ( + b"\000\000\000\000\003\000\000\000\001\000\000\000\002\000\000\000" + ) else: - trans_perm.attr['value'].tensor.tensor_content = b'\000\000\000\000\002\000\000\000\003\000\000\000\001\000\000\000' - + trans_perm.attr["value"].tensor.tensor_content = ( + b"\000\000\000\000\002\000\000\000\003\000\000\000\001\000\000\000" + ) + trans = graph.node.add() - trans.name = a.name + '/transpose' - trans.op = 'Transpose' + trans.name = a.name + "/transpose" + trans.op = "Transpose" trans.input.append(a.name) trans.input.append(trans_perm.name) - trans.attr['T'].type = 1 - trans.attr['Tperm'].type = 3 + trans.attr["T"].type = 1 + trans.attr["Tperm"].type = 3 for n in b: inputs = [] @@ -45,6 +51,7 @@ def insert_transpose(graph, a, b, to_nchw): for i in range(0, cnt): n.input.append(inputs[i]) + def convert_list_nhwc(l): c = l.i[1] h = l.i[2] @@ -52,102 +59,116 @@ def convert_list_nhwc(l): l.i[1] = h l.i[2] = w l.i[3] = c - + + def convert_conv_nhwc(node_conv): - node_conv.attr['data_format'].s = b'NHWC' - convert_list_nhwc(node_conv.attr['dilations'].list) - convert_list_nhwc(node_conv.attr['strides'].list) + 
node_conv.attr["data_format"].s = b"NHWC" + convert_list_nhwc(node_conv.attr["dilations"].list) + convert_list_nhwc(node_conv.attr["strides"].list) + def convert_general_nhwc(node): - node.attr['data_format'].s = b'NHWC' + node.attr["data_format"].s = b"NHWC" + def convert_mp_nhwc(node_mp): - node_mp.attr['data_format'].s = b'NHWC' - convert_list_nhwc(node_mp.attr['ksize'].list) - convert_list_nhwc(node_mp.attr['strides'].list) + node_mp.attr["data_format"].s = b"NHWC" + convert_list_nhwc(node_mp.attr["ksize"].list) + convert_list_nhwc(node_mp.attr["strides"].list) + def convert_image_nhwc(node_image): - c = node_image.attr['shape'].shape.dim[1].size - del node_image.attr['shape'].shape.dim[1] - d = node_image.attr['shape'].shape.dim.add() + c = node_image.attr["shape"].shape.dim[1].size + del node_image.attr["shape"].shape.dim[1] + d = node_image.attr["shape"].shape.dim.add() d.size = c + def init_node(n): node = {} - node['node'] = n - node['inputs'] = [] - node['outputs'] = [] + node["node"] = n + node["inputs"] = [] + node["outputs"] = [] return node + def connect_nodes(n1, n2): - if n2['node'].name not in n1['outputs']: - n1['outputs'].append(n2['node'].name) - n2['inputs'].append(n1['node'].name) + if n2["node"].name not in n1["outputs"]: + n1["outputs"].append(n2["node"].name) + n2["inputs"].append(n1["node"].name) else: - print('{} -> {} already connected'.format(n1['node'].name, n2['node'].name)) + print( + "{} -> {} already connected".format(n1["node"].name, n2["node"].name)) + def disconnect_nodes(n1, n2): - if n1['node'].name not in n2['inputs'] or n2['node'].name not in n1['outputs']: - print('{} -> {} not connected'.format(n1['node'].name, n2['node'].name)) - for i in range(0, len(n1['outputs'])): - if n1['outputs'][i] == n2['node'].name: - del n1['outputs'][i] + if n1["node"].name not in n2["inputs"] or n2["node"].name not in n1["outputs"]: + print( + "{} -> {} not connected".format(n1["node"].name, n2["node"].name)) + for i in range(0, len(n1["outputs"])): + if n1["outputs"][i] == n2["node"].name: + del n1["outputs"][i] break - for i in range(0, len(n2['inputs'])): - if n2['inputs'][i] == n1['node'].name: - del n2['inputs'][i] + for i in range(0, len(n2["inputs"])): + if n2["inputs"][i] == n1["node"].name: + del n2["inputs"][i] break - + + def build_graph(graph): node_map = {} for n in graph.node: node = init_node(n) node_map[n.name] = node for n in node_map: - for i in node_map[n]['node'].input: - if ':' in i: - i = i[:i.find(':')] - i = i.lstrip('^') + for i in node_map[n]["node"].input: + if ":" in i: + i = i[: i.find(":")] + i = i.lstrip("^") if i not in node_map: - print('node {} not found'.format(i)) + print("node {} not found".format(i)) else: connect_nodes(node_map[i], node_map[n]) return node_map + def trim_const_from_graph(node_map): trim_list = [] for n in node_map: - if node_map[n]['node'].op == 'Const': + if node_map[n]["node"].op == "Const": trim_list.append(n) for n in trim_list: - print('trimming {}'.format(n)) - for o in node_map[n]['outputs']: + print("trimming {}".format(n)) + for o in node_map[n]["outputs"]: disconnect_nodes(node_map[n], node_map[o]) del node_map[n] trim_list = [] for n in node_map: - if node_map[n]['node'].op == 'Identity' and len(node_map[n]['inputs']) == 0: + if node_map[n]["node"].op == "Identity" and len( + node_map[n]["inputs"]) == 0: trim_list.append(n) for n in trim_list: - print('trimming {}'.format(n)) - for o in node_map[n]['outputs']: + print("trimming {}".format(n)) + for o in node_map[n]["outputs"]: 
disconnect_nodes(node_map[n], node_map[o]) del node_map[n] def all_input_in_nhwc(n, node_map, nhwc_nodes): - for i in node_map[n]['inputs']: + for i in node_map[n]["inputs"]: if i not in nhwc_nodes: return False return True + def all_output_in_nhwc(n, node_map, nhwc_nodes): - for o in node_map[n]['outputs']: + for o in node_map[n]["outputs"]: if o not in nhwc_nodes: return False return True + def find_nhwc_region(node_map): transpose_nhwc_nodes = {} transpose_nchw_nodes = {} @@ -156,7 +177,7 @@ def find_nhwc_region(node_map): transpose_nhwc_nodes_append_list = [] transpose_nchw_nodes_append_list = [] for n in node_map: - if node_map[n]['node'].op == 'Conv2D': + if node_map[n]["node"].op == "Conv2D": transpose_nhwc_nodes_append_list.append(n) transpose_nchw_nodes_append_list.append(n) nhwc_nodes.append(n) @@ -168,12 +189,18 @@ def find_nhwc_region(node_map): transpose_nchw_nodes[n] = 1 prev_cnt_nhwc_nodes = len(nhwc_nodes) - nhwc_op_list = ['Conv2D', 'Relu', 'FusedBatchNorm', 'MaxPool', 'BiasAdd', 'Add'] + nhwc_op_list = [ + "Conv2D", + "Relu", + "FusedBatchNorm", + "MaxPool", + "BiasAdd", + "Add"] while True: transpose_nchw_nodes_append_list = [] for n in transpose_nchw_nodes: - for o in node_map[n]['outputs']: - if o not in nhwc_nodes and node_map[o]['node'].op in nhwc_op_list: + for o in node_map[n]["outputs"]: + if o not in nhwc_nodes and node_map[o]["node"].op in nhwc_op_list: if all_input_in_nhwc(o, node_map, nhwc_nodes): nhwc_nodes.append(o) if o not in transpose_nchw_nodes_append_list: @@ -182,15 +209,19 @@ def find_nhwc_region(node_map): transpose_nhwc_nodes_remove_list = [] transpose_nchw_nodes_remove_list = [] for n in transpose_nhwc_nodes: - if (all_input_in_nhwc(n, node_map, nhwc_nodes) and - n not in transpose_nhwc_nodes_remove_list): + if ( + all_input_in_nhwc(n, node_map, nhwc_nodes) + and n not in transpose_nhwc_nodes_remove_list + ): transpose_nhwc_nodes_remove_list.append(n) for n in transpose_nhwc_nodes_remove_list: del transpose_nhwc_nodes[n] for n in transpose_nchw_nodes: - if (all_output_in_nhwc(n, node_map, nhwc_nodes) and - n not in transpose_nchw_nodes_remove_list): + if ( + all_output_in_nhwc(n, node_map, nhwc_nodes) + and n not in transpose_nchw_nodes_remove_list + ): transpose_nchw_nodes_remove_list.append(n) for n in transpose_nchw_nodes_remove_list: del transpose_nchw_nodes[n] @@ -202,66 +233,81 @@ def find_nhwc_region(node_map): if len(nhwc_nodes) == prev_cnt_nhwc_nodes: break prev_cnt_nhwc_nodes = len(nhwc_nodes) - - print('\n\nTranspose to NHWC at nodes:') + + print("\n\nTranspose to NHWC at nodes:") for n in transpose_nhwc_nodes: - print(' {}'.format(n)) - - print('\n\nTranspose to NCHW at nodes:') + print(" {}".format(n)) + + print("\n\nTranspose to NCHW at nodes:") for n in transpose_nchw_nodes: - print(' {}'.format(n)) - + print(" {}".format(n)) + return nhwc_nodes, transpose_nhwc_nodes, transpose_nchw_nodes + def main(): args = get_args() graph = graph_pb2.GraphDef() - with open(args.pbfile, 'rb') as f: + with open(args.pbfile, "rb") as f: graph.ParseFromString(f.read()) node_map = build_graph(graph) trim_const_from_graph(node_map) - nhwc_nodes, transpose_nhwc_nodes, transpose_nchw_nodes = find_nhwc_region(node_map) + nhwc_nodes, transpose_nhwc_nodes, transpose_nchw_nodes = find_nhwc_region( + node_map) - nhwc_op_list = ['Conv2D', 'Relu', 'FusedBatchNorm', 'MaxPool', 'BiasAdd', 'Add'] + nhwc_op_list = [ + "Conv2D", + "Relu", + "FusedBatchNorm", + "MaxPool", + "BiasAdd", + "Add"] for n in nhwc_nodes: - if node_map[n]['node'].op == 'Conv2D': - 
convert_conv_nhwc(node_map[n]['node']) - elif node_map[n]['node'].op in ['FusedBatchNorm', 'BiasAdd']: - convert_general_nhwc(node_map[n]['node']) - elif node_map[n]['node'].op == 'MaxPool': - convert_mp_nhwc(node_map[n]['node']) - + if node_map[n]["node"].op == "Conv2D": + convert_conv_nhwc(node_map[n]["node"]) + elif node_map[n]["node"].op in ["FusedBatchNorm", "BiasAdd"]: + convert_general_nhwc(node_map[n]["node"]) + elif node_map[n]["node"].op == "MaxPool": + convert_mp_nhwc(node_map[n]["node"]) + done_nhwc = False if len(transpose_nhwc_nodes) == 1: for n in transpose_nhwc_nodes: - if len(node_map[n]['inputs']) == 1 and node_map[n]['inputs'][0] == 'image': + if len(node_map[n]["inputs"] + ) == 1 and node_map[n]["inputs"][0] == "image": image_outputs = [] - for o in node_map['image']['outputs']: + for o in node_map["image"]["outputs"]: if o != n: - image_outputs.append(node_map[o]['node']) - insert_transpose(graph, node_map['image']['node'], image_outputs, True) - convert_image_nhwc(node_map['image']['node']) + image_outputs.append(node_map[o]["node"]) + insert_transpose( + graph, + node_map["image"]["node"], + image_outputs, + True) + convert_image_nhwc(node_map["image"]["node"]) done_nhwc = True if not done_nhwc: for n in transpose_nhwc_nodes: - for i in node_map[n]['inputs']: + for i in node_map[n]["inputs"]: if i not in nhwc_nodes: - insert_transpose(graph, node_map[i]['node'], node_map[n]['node'], False) + insert_transpose( + graph, node_map[i]["node"], node_map[n]["node"], False + ) for n in transpose_nchw_nodes: node_outputs = [] - for o in node_map[n]['outputs']: + for o in node_map[n]["outputs"]: if o not in nhwc_nodes: - node_outputs.append(node_map[o]['node']) - insert_transpose(graph, node_map[n]['node'], node_outputs, True) + node_outputs.append(node_map[o]["node"]) + insert_transpose(graph, node_map[n]["node"], node_outputs, True) - with open(args.pbfile+'.patch', 'wb') as f: + with open(args.pbfile + ".patch", "wb") as f: f.write(graph.SerializeToString()) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/vision/medical_imaging/3d-unet-brats19/Task043_BraTS_2019.py b/vision/medical_imaging/3d-unet-brats19/Task043_BraTS_2019.py index 6c196077b..3d942e8a1 100644 --- a/vision/medical_imaging/3d-unet-brats19/Task043_BraTS_2019.py +++ b/vision/medical_imaging/3d-unet-brats19/Task043_BraTS_2019.py @@ -14,8 +14,13 @@ # limitations under the License. # This file is copied from nnUnet/nnunet/dataset_conversion/Task043_BraTS_2019.py, except that -# the validation/test set part is removed and downloaded_data_dir is now configurable. +# the validation/test set part is removed and downloaded_data_dir is now +# configurable. 
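Aside, not part of the patch itself: the raw tensor_content byte strings written by insert_transpose() in ssd-nhwc.py above are easier to read once decoded. A minimal sketch, assuming little-endian int32 encoding (as in the TensorFlow tensor proto) and using numpy only to illustrate the layout change; the 300x300 activation shape is hypothetical.

import numpy as np

# Decode the two permutation constants used by insert_transpose().
to_nchw_perm = np.frombuffer(
    b"\000\000\000\000\003\000\000\000\001\000\000\000\002\000\000\000", dtype="<i4")
to_nhwc_perm = np.frombuffer(
    b"\000\000\000\000\002\000\000\000\003\000\000\000\001\000\000\000", dtype="<i4")
print(to_nchw_perm.tolist())  # [0, 3, 1, 2]: NHWC -> NCHW
print(to_nhwc_perm.tolist())  # [0, 2, 3, 1]: NCHW -> NHWC

# Applying the NCHW -> NHWC permutation to a dummy activation moves channels last.
x = np.zeros((1, 3, 300, 300))                      # hypothetical NCHW tensor
print(np.transpose(x, tuple(to_nhwc_perm)).shape)   # (1, 300, 300, 3)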
+import shutil +import SimpleITK as sitk +from nnunet.paths import nnUNet_raw_data +from batchgenerators.utilities.file_and_folder_operations import * import argparse import numpy as np from collections import OrderedDict @@ -24,27 +29,29 @@ sys.path.insert(0, os.path.join(os.getcwd(), "nnUnet")) -from batchgenerators.utilities.file_and_folder_operations import * -from nnunet.paths import nnUNet_raw_data -import SimpleITK as sitk -import shutil def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--downloaded_data_dir", default="build/MICCAI_BraTS_2019_Data_Training", help="path to MICCAI_BraTS_2019_Data_Training") + parser.add_argument( + "--downloaded_data_dir", + default="build/MICCAI_BraTS_2019_Data_Training", + help="path to MICCAI_BraTS_2019_Data_Training", + ) args = parser.parse_args() return args + def copy_BraTS_segmentation_and_convert_labels(in_file, out_file): # use this for segmentation only!!! - # nnUNet wants the labels to be continuous. BraTS is 0, 1, 2, 4 -> we make that into 0, 1, 2, 3 + # nnUNet wants the labels to be continuous. BraTS is 0, 1, 2, 4 -> we make + # that into 0, 1, 2, 3 img = sitk.ReadImage(in_file) img_npy = sitk.GetArrayFromImage(img) uniques = np.unique(img_npy) for u in uniques: if u not in [0, 1, 2, 4]: - raise RuntimeError('unexpected label') + raise RuntimeError("unexpected label") seg_new = np.zeros_like(img_npy) seg_new[img_npy == 4] = 3 @@ -54,6 +61,7 @@ def copy_BraTS_segmentation_and_convert_labels(in_file, out_file): img_corr.CopyInformation(img) sitk.WriteImage(img_corr, out_file) + def main(): args = get_args() @@ -89,47 +97,65 @@ def main(): flair = join(patdir, p + "_flair.nii.gz") seg = join(patdir, p + "_seg.nii.gz") - assert all([ - isfile(t1), - isfile(t1c), - isfile(t2), - isfile(flair), - isfile(seg) - ]), "%s" % patient_name - - shutil.copy(t1, join(target_imagesTr, patient_name + "_0000.nii.gz")) - shutil.copy(t1c, join(target_imagesTr, patient_name + "_0001.nii.gz")) - shutil.copy(t2, join(target_imagesTr, patient_name + "_0002.nii.gz")) - shutil.copy(flair, join(target_imagesTr, patient_name + "_0003.nii.gz")) - - copy_BraTS_segmentation_and_convert_labels(seg, join(target_labelsTr, patient_name + ".nii.gz")) + assert all( + [isfile(t1), isfile(t1c), isfile(t2), + isfile(flair), isfile(seg)] + ), ("%s" % patient_name) + + shutil.copy( + t1, + join( + target_imagesTr, + patient_name + + "_0000.nii.gz")) + shutil.copy( + t1c, + join( + target_imagesTr, + patient_name + + "_0001.nii.gz")) + shutil.copy( + t2, + join( + target_imagesTr, + patient_name + + "_0002.nii.gz")) + shutil.copy( + flair, + join( + target_imagesTr, + patient_name + + "_0003.nii.gz")) + + copy_BraTS_segmentation_and_convert_labels( + seg, join(target_labelsTr, patient_name + ".nii.gz") + ) json_dict = OrderedDict() - json_dict['name'] = "BraTS2019" - json_dict['description'] = "nothing" - json_dict['tensorImageSize'] = "4D" - json_dict['reference'] = "see BraTS2019" - json_dict['licence'] = "see BraTS2019 license" - json_dict['release'] = "0.0" - json_dict['modality'] = { - "0": "T1", - "1": "T1ce", - "2": "T2", - "3": "FLAIR" - } - json_dict['labels'] = { + json_dict["name"] = "BraTS2019" + json_dict["description"] = "nothing" + json_dict["tensorImageSize"] = "4D" + json_dict["reference"] = "see BraTS2019" + json_dict["licence"] = "see BraTS2019 license" + json_dict["release"] = "0.0" + json_dict["modality"] = {"0": "T1", "1": "T1ce", "2": "T2", "3": "FLAIR"} + json_dict["labels"] = { "0": "background", "1": "edema", "2": "non-enhancing", 
"3": "enhancing", } - json_dict['numTraining'] = len(patient_names) - json_dict['numTest'] = 0 - json_dict['training'] = [{'image': "./imagesTr/%s.nii.gz" % i, "label": "./labelsTr/%s.nii.gz" % i} for i in - patient_names] - json_dict['test'] = [] + json_dict["numTraining"] = len(patient_names) + json_dict["numTest"] = 0 + json_dict["training"] = [ + {"image": "./imagesTr/%s.nii.gz" % + i, "label": "./labelsTr/%s.nii.gz" % i} + for i in patient_names + ] + json_dict["test"] = [] save_json(json_dict, join(target_base, "dataset.json")) + if __name__ == "__main__": main() diff --git a/vision/medical_imaging/3d-unet-brats19/accuracy-brats.py b/vision/medical_imaging/3d-unet-brats19/accuracy-brats.py index 82c9ce6c7..ad2e82146 100644 --- a/vision/medical_imaging/3d-unet-brats19/accuracy-brats.py +++ b/vision/medical_imaging/3d-unet-brats19/accuracy-brats.py @@ -14,6 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nnunet.inference.segmentation_export import save_segmentation_nifti_from_softmax +from nnunet.evaluation.region_based_evaluation import ( + evaluate_regions, + get_brats_regions, +) +from multiprocessing import Pool import argparse import json import numpy as np @@ -23,9 +29,6 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "nnUnet")) -from multiprocessing import Pool -from nnunet.evaluation.region_based_evaluation import evaluate_regions, get_brats_regions -from nnunet.inference.segmentation_export import save_segmentation_nifti_from_softmax dtype_map = { "int8": np.int8, @@ -34,43 +37,101 @@ "int64": np.int64, "float16": np.float16, "float32": np.float32, - "float64": np.float64 + "float64": np.float64, } + def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--log_file", default="build/logs/mlperf_log_accuracy.json", help="Path to accuracy log json file") - parser.add_argument("--output_dtype", default="float16", choices=dtype_map.keys(), help="Output data type") - parser.add_argument("--preprocessed_data_dir", default="build/preprocessed_data", help="Path to the directory containing preprocessed data") - parser.add_argument("--postprocessed_data_dir", default="build/postprocessed_data", help="Path to the directory containing postprocessed data") - parser.add_argument("--label_data_dir", default="build/raw_data/nnUNet_raw_data/Task043_BraTS2019/labelsTr", - help="Path to the directory containing ground truth labels") - parser.add_argument("--num_threads_nifti_save", type=int, default=12, help="Number of threads to run the postprocessing with") + parser.add_argument( + "--log_file", + default="build/logs/mlperf_log_accuracy.json", + help="Path to accuracy log json file", + ) + parser.add_argument( + "--output_dtype", + default="float16", + choices=dtype_map.keys(), + help="Output data type", + ) + parser.add_argument( + "--preprocessed_data_dir", + default="build/preprocessed_data", + help="Path to the directory containing preprocessed data", + ) + parser.add_argument( + "--postprocessed_data_dir", + default="build/postprocessed_data", + help="Path to the directory containing postprocessed data", + ) + parser.add_argument( + "--label_data_dir", + default="build/raw_data/nnUNet_raw_data/Task043_BraTS2019/labelsTr", + help="Path to the directory containing ground truth labels", + ) + parser.add_argument( + "--num_threads_nifti_save", + type=int, + default=12, + help="Number of threads to run the postprocessing with", + ) args = parser.parse_args() return args -def 
save_predictions_MLPerf(predictions, output_folder, output_files, dictionaries, num_threads_nifti_save, all_in_gpu, force_separate_z=None, interp_order=3, interp_order_z=0): + +def save_predictions_MLPerf( + predictions, + output_folder, + output_files, + dictionaries, + num_threads_nifti_save, + all_in_gpu, + force_separate_z=None, + interp_order=3, + interp_order_z=0, +): print("Saving predictions...") pool = Pool(num_threads_nifti_save) results = [] for i, output_filename in enumerate(output_files): print(i, "/", len(output_files)) - output_filename = os.path.join(output_folder, output_filename + ".nii.gz") + output_filename = os.path.join( + output_folder, output_filename + ".nii.gz") softmax_mean = predictions[i] dct = dictionaries[i] bytes_per_voxel = 4 if all_in_gpu: - bytes_per_voxel = 2 # if all_in_gpu then the return value is half (float16) - if np.prod(softmax_mean.shape) > (2e9 / bytes_per_voxel * 0.85): # * 0.85 just to be save + # if all_in_gpu then the return value is half (float16) + bytes_per_voxel = 2 + if np.prod(softmax_mean.shape) > ( + 2e9 / bytes_per_voxel * 0.85 + ): # * 0.85 just to be save print( - "This output is too large for python process-process communication. Saving output temporarily to disk") + "This output is too large for python process-process communication. Saving output temporarily to disk" + ) np.save(output_filename[:-7] + ".npy", softmax_mean) softmax_mean = output_filename[:-7] + ".npy" - results.append(pool.starmap_async(save_segmentation_nifti_from_softmax, - ((softmax_mean, output_filename, dct, interp_order, None, None, None, - None, None, force_separate_z, interp_order_z),) - )) + results.append( + pool.starmap_async( + save_segmentation_nifti_from_softmax, + ( + ( + softmax_mean, + output_filename, + dct, + interp_order, + None, + None, + None, + None, + None, + force_separate_z, + interp_order_z, + ), + ), + ) + ) _ = [i.get() for i in results] pool.close() @@ -78,11 +139,14 @@ def save_predictions_MLPerf(predictions, output_folder, output_files, dictionari del predictions + def load_loadgen_log(log_file, result_dtype, dictionaries): with open(log_file) as f: predictions = json.load(f) - assert len(predictions) == len(dictionaries), "Number of predictions does not match number of samples in validation set!" + assert len(predictions) == len( + dictionaries + ), "Number of predictions does not match number of samples in validation set!" padded_shape = [224, 224, 160] results = [None for i in range(len(predictions))] @@ -92,15 +156,27 @@ def load_loadgen_log(log_file, result_dtype, dictionaries): raw_shape = list(dictionaries[qsl_idx]["size_after_cropping"]) # Remove the padded part pad_before = [(p - r) // 2 for p, r in zip(padded_shape, raw_shape)] - pad_after = [-(p - r - b) for p, r, b in zip(padded_shape, raw_shape, pad_before)] + pad_after = [ + -(p - r - b) for p, r, b in zip(padded_shape, raw_shape, pad_before) + ] result_shape = (4,) + tuple(padded_shape) - result = np.frombuffer(bytes.fromhex(prediction["data"]), result_dtype).reshape(result_shape).astype(np.float16) - results[qsl_idx] = result[:, pad_before[0]:pad_after[0], pad_before[1]:pad_after[1], pad_before[2]:pad_after[2]] + result = ( + np.frombuffer(bytes.fromhex(prediction["data"]), result_dtype) + .reshape(result_shape) + .astype(np.float16) + ) + results[qsl_idx] = result[ + :, + pad_before[0]: pad_after[0], + pad_before[1]: pad_after[1], + pad_before[2]: pad_after[2], + ] assert all([i is not None for i in results]), "Missing some results!" 
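Aside, not part of the patch itself: a small worked example of the pad_before / pad_after arithmetic used in load_loadgen_log() above. The raw shape below is hypothetical; in the real script it comes from each sample's "size_after_cropping" entry.

import numpy as np

padded_shape = [224, 224, 160]
raw_shape = [190, 160, 145]          # hypothetical size_after_cropping
pad_before = [(p - r) // 2 for p, r in zip(padded_shape, raw_shape)]
pad_after = [-(p - r - b) for p, r, b in zip(padded_shape, raw_shape, pad_before)]
print(pad_before, pad_after)         # [17, 32, 7] [-17, -32, -8]

# Slicing with a positive start and a negative stop strips the padding on both sides
# (the before/after amounts differ by one voxel when the size difference is odd)
# and recovers the cropped volume for all four output channels.
result = np.zeros((4, *padded_shape), dtype=np.float16)
cropped = result[:, pad_before[0]:pad_after[0],
                 pad_before[1]:pad_after[1],
                 pad_before[2]:pad_after[2]]
print(cropped.shape)                 # (4, 190, 160, 145)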
return results + def main(): args = get_args() log_file = args.log_file @@ -120,7 +196,10 @@ def main(): preprocessed_files = pickle.load(f) dictionaries = [] for preprocessed_file in preprocessed_files: - with open(os.path.join(preprocessed_data_dir, preprocessed_file + ".pkl"), "rb") as f: + with open( + os.path.join(preprocessed_data_dir, + preprocessed_file + ".pkl"), "rb" + ) as f: dct = pickle.load(f)[1] dictionaries.append(dct) @@ -131,7 +210,17 @@ def main(): # Save predictions # This runs in multiprocess print("Running postprocessing with multiple threads...") - save_predictions_MLPerf(predictions, output_folder, preprocessed_files, dictionaries, num_threads_nifti_save, all_in_gpu, force_separate_z, interp_order, interp_order_z) + save_predictions_MLPerf( + predictions, + output_folder, + preprocessed_files, + dictionaries, + num_threads_nifti_save, + all_in_gpu, + force_separate_z, + interp_order, + interp_order_z, + ) # Run evaluation print("Running evaluation...") @@ -147,10 +236,15 @@ def main(): core = float(words[2]) enhancing = float(words[3]) mean = (whole + core + enhancing) / 3 - print("Accuracy: mean = {:.5f}, whole tumor = {:.4f}, tumor core = {:.4f}, enhancing tumor = {:.4f}".format(mean, whole, core, enhancing)) + print( + "Accuracy: mean = {:.5f}, whole tumor = {:.4f}, tumor core = {:.4f}, enhancing tumor = {:.4f}".format( + mean, whole, core, enhancing + ) + ) break print("Done!") + if __name__ == "__main__": main() diff --git a/vision/medical_imaging/3d-unet-brats19/brats_QSL.py b/vision/medical_imaging/3d-unet-brats19/brats_QSL.py index 7be8e1e3c..810a23cb4 100644 --- a/vision/medical_imaging/3d-unet-brats19/brats_QSL.py +++ b/vision/medical_imaging/3d-unet-brats19/brats_QSL.py @@ -14,21 +14,26 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from nnUnet.nnunet.inference.predict import preprocess_multithreaded +import mlperf_loadgen as lg import os import pickle import sys + sys.path.insert(0, os.getcwd()) -import mlperf_loadgen as lg sys.path.insert(0, os.path.join(os.getcwd(), "nnUnet")) -from nnUnet.nnunet.inference.predict import preprocess_multithreaded -class BraTS_2019_QSL(): + +class BraTS_2019_QSL: def __init__(self, preprocessed_data_dir, perf_count): print("Constructing QSL...") self.preprocessed_data_dir = preprocessed_data_dir - with open(os.path.join(self.preprocessed_data_dir, "preprocessed_files.pkl"), "rb") as f: + with open( + os.path.join(self.preprocessed_data_dir, + "preprocessed_files.pkl"), "rb" + ) as f: self.preprocess_files = pickle.load(f) self.count = len(self.preprocess_files) @@ -37,14 +42,24 @@ def __init__(self, preprocessed_data_dir, perf_count): print("Using performance count = {:d}".format(self.perf_count)) self.loaded_files = {} - self.qsl = lg.ConstructQSL(self.count, self.perf_count, self.load_query_samples, self.unload_query_samples) + self.qsl = lg.ConstructQSL( + self.count, + self.perf_count, + self.load_query_samples, + self.unload_query_samples, + ) print("Finished constructing QSL.") def load_query_samples(self, sample_list): for sample_id in sample_list: file_name = self.preprocess_files[sample_id] print("Loading file {:}".format(file_name)) - with open(os.path.join(self.preprocessed_data_dir, "{:}.pkl".format(file_name)), "rb") as f: + with open( + os.path.join( + self.preprocessed_data_dir, + "{:}.pkl".format(file_name)), + "rb", + ) as f: self.loaded_files[sample_id] = pickle.load(f)[0] def unload_query_samples(self, sample_list): @@ -54,5 +69,7 @@ def unload_query_samples(self, sample_list): def get_features(self, sample_id): return self.loaded_files[sample_id] -def get_brats_QSL(preprocessed_data_dir="build/preprocessed_data", perf_count=None): + +def get_brats_QSL( + preprocessed_data_dir="build/preprocessed_data", perf_count=None): return BraTS_2019_QSL(preprocessed_data_dir, perf_count) diff --git a/vision/medical_imaging/3d-unet-brats19/onnxruntime_SUT.py b/vision/medical_imaging/3d-unet-brats19/onnxruntime_SUT.py index ba5cce411..9398b3640 100644 --- a/vision/medical_imaging/3d-unet-brats19/onnxruntime_SUT.py +++ b/vision/medical_imaging/3d-unet-brats19/onnxruntime_SUT.py @@ -14,19 +14,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +from brats_QSL import get_brats_QSL +import onnxruntime +import numpy as np +import mlperf_loadgen as lg import array import json import os import sys -sys.path.insert(0, os.getcwd()) -import mlperf_loadgen as lg -import numpy as np -import onnxruntime +sys.path.insert(0, os.getcwd()) -from brats_QSL import get_brats_QSL -class _3DUNET_ONNXRuntime_SUT(): +class _3DUNET_ONNXRuntime_SUT: def __init__(self, model_path, preprocessed_data_dir, performance_count): print("Loading ONNX model...") self.sess = onnxruntime.InferenceSession(model_path) @@ -40,19 +40,31 @@ def issue_queries(self, query_samples): for i in range(len(query_samples)): data = self.qsl.get_features(query_samples[i].index) - print("Processing sample id {:d} with shape = {:}".format(query_samples[i].index, data.shape)) + print( + "Processing sample id {:d} with shape = {:}".format( + query_samples[i].index, data.shape + ) + ) # Follow the PyTorch implementation. - # The ONNX file has five outputs, but we only care about the one named "output". 
- output = self.sess.run(["output"], {"input": data[np.newaxis, ...]})[0].squeeze(0).astype(np.float16) + # The ONNX file has five outputs, but we only care about the one + # named "output". + output = ( + self.sess.run(["output"], {"input": data[np.newaxis, ...]})[0] + .squeeze(0) + .astype(np.float16) + ) response_array = array.array("B", output.tobytes()) bi = response_array.buffer_info() - response = lg.QuerySampleResponse(query_samples[i].id, bi[0], bi[1]) + response = lg.QuerySampleResponse( + query_samples[i].id, bi[0], bi[1]) lg.QuerySamplesComplete([response]) def flush_queries(self): pass + def get_onnxruntime_sut(model_path, preprocessed_data_dir, performance_count): - return _3DUNET_ONNXRuntime_SUT(model_path, preprocessed_data_dir, performance_count) \ No newline at end of file + return _3DUNET_ONNXRuntime_SUT( + model_path, preprocessed_data_dir, performance_count) diff --git a/vision/medical_imaging/3d-unet-brats19/ov_SUT.py b/vision/medical_imaging/3d-unet-brats19/ov_SUT.py index cd4254673..4ed732c60 100644 --- a/vision/medical_imaging/3d-unet-brats19/ov_SUT.py +++ b/vision/medical_imaging/3d-unet-brats19/ov_SUT.py @@ -14,6 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from scipy.special import softmax +from openvino.inference_engine import IECore +from brats_QSL import get_brats_QSL +import numpy as np +import mlperf_loadgen as lg import array import json import os @@ -21,21 +26,13 @@ sys.path.insert(0, os.getcwd()) -import mlperf_loadgen as lg -import numpy as np - -from brats_QSL import get_brats_QSL - -from openvino.inference_engine import IECore -from scipy.special import softmax - -class _3DUNET_OV_SUT(): +class _3DUNET_OV_SUT: def __init__(self, model_path, preprocessed_data_dir, performance_count): print("Loading OV model...") model_xml = model_path - model_bin = os.path.splitext(model_xml)[0] + '.bin' + model_bin = os.path.splitext(model_xml)[0] + ".bin" ie = IECore() net = ie.read_network(model=model_xml, weights=model_bin) @@ -49,7 +46,7 @@ def __init__(self, model_path, preprocessed_data_dir, performance_count): if max_channels < net.outputs[output].shape[-1]: _3DUNET_OV_SUT.output_name = output - self.exec_net = ie.load_network(network=net, device_name='CPU') + self.exec_net = ie.load_network(network=net, device_name="CPU") print("Constructing SUT...") self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) @@ -60,17 +57,20 @@ def issue_queries(self, query_samples): for i in range(len(query_samples)): data = self.qsl.get_features(query_samples[i].index) - print("Processing sample id {:d} with shape = {:}".format( - query_samples[i].index, data.shape)) + print( + "Processing sample id {:d} with shape = {:}".format( + query_samples[i].index, data.shape + ) + ) output = self.exec_net.infer( - inputs={self.input_name: data[np.newaxis, ...]})[ - _3DUNET_OV_SUT.output_name].astype(np.float16) + inputs={self.input_name: data[np.newaxis, ...]} + )[_3DUNET_OV_SUT.output_name].astype(np.float16) response_array = array.array("B", output.tobytes()) bi = response_array.buffer_info() - response = lg.QuerySampleResponse(query_samples[i].id, bi[0], - bi[1]) + response = lg.QuerySampleResponse( + query_samples[i].id, bi[0], bi[1]) lg.QuerySamplesComplete([response]) def flush_queries(self): @@ -78,4 +78,4 @@ def flush_queries(self): def get_ov_sut(model_path, preprocessed_data_dir, performance_count): - return _3DUNET_OV_SUT(model_path, preprocessed_data_dir, performance_count) \ No newline at end of 
file + return _3DUNET_OV_SUT(model_path, preprocessed_data_dir, performance_count) diff --git a/vision/medical_imaging/3d-unet-brats19/preprocess.py b/vision/medical_imaging/3d-unet-brats19/preprocess.py index 758d92488..d24625512 100644 --- a/vision/medical_imaging/3d-unet-brats19/preprocess.py +++ b/vision/medical_imaging/3d-unet-brats19/preprocess.py @@ -14,6 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nnunet.inference.predict import preprocess_multithreaded +from nnunet.training.model_restore import load_model_and_checkpoint_files +from batchgenerators.utilities.file_and_folder_operations import subfiles +from batchgenerators.augmentations.utils import pad_nd_image import argparse import numpy import os @@ -23,31 +27,60 @@ sys.path.insert(0, os.path.join(os.getcwd(), "nnUnet")) -from batchgenerators.augmentations.utils import pad_nd_image -from batchgenerators.utilities.file_and_folder_operations import subfiles -from nnunet.training.model_restore import load_model_and_checkpoint_files -from nnunet.inference.predict import preprocess_multithreaded def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--model_dir", default="build/result/nnUNet/3d_fullres/Task043_BraTS2019/nnUNetTrainerV2__nnUNetPlansv2.mlperf.1", - help="Path to the directory containing plans.pkl") - parser.add_argument("--raw_data_dir", default="build/raw_data/nnUNet_raw_data/Task043_BraTS2019/imagesTr", - help="Path to the directory containing raw nii.gz files") - parser.add_argument("--preprocessed_data_dir", default="build/preprocessed_data", help="Path to the directory containing preprocessed data") - parser.add_argument("--validation_fold_file", default="folds/fold1_validation.txt", help="Path to the txt file storing all the sample names for the validation fold") - parser.add_argument("--num_threads_preprocessing", type=int, default=12, help="Number of threads to run the preprocessing with") + parser.add_argument( + "--model_dir", + default="build/result/nnUNet/3d_fullres/Task043_BraTS2019/nnUNetTrainerV2__nnUNetPlansv2.mlperf.1", + help="Path to the directory containing plans.pkl", + ) + parser.add_argument( + "--raw_data_dir", + default="build/raw_data/nnUNet_raw_data/Task043_BraTS2019/imagesTr", + help="Path to the directory containing raw nii.gz files", + ) + parser.add_argument( + "--preprocessed_data_dir", + default="build/preprocessed_data", + help="Path to the directory containing preprocessed data", + ) + parser.add_argument( + "--validation_fold_file", + default="folds/fold1_validation.txt", + help="Path to the txt file storing all the sample names for the validation fold", + ) + parser.add_argument( + "--num_threads_preprocessing", + type=int, + default=12, + help="Number of threads to run the preprocessing with", + ) args = parser.parse_args() return args -def preprocess_MLPerf(model, checkpoint_name, folds, fp16, list_of_lists, output_filenames, preprocessing_folder, num_threads_preprocessing): + +def preprocess_MLPerf( + model, + checkpoint_name, + folds, + fp16, + list_of_lists, + output_filenames, + preprocessing_folder, + num_threads_preprocessing, +): assert len(list_of_lists) == len(output_filenames) print("loading parameters for folds", folds) - trainer, params = load_model_and_checkpoint_files(model, folds, fp16=fp16, checkpoint_name=checkpoint_name) + trainer, params = load_model_and_checkpoint_files( + model, folds, fp16=fp16, checkpoint_name=checkpoint_name + ) print("starting preprocessing 
generator") - preprocessing = preprocess_multithreaded(trainer, list_of_lists, output_filenames, num_threads_preprocessing, None) + preprocessing = preprocess_multithreaded( + trainer, list_of_lists, output_filenames, num_threads_preprocessing, None + ) print("Preprocessing images...") all_output_files = [] @@ -63,11 +96,13 @@ def preprocess_MLPerf(model, checkpoint_name, folds, fp16, list_of_lists, output # Pad to the desired full volume d = pad_nd_image(d, trainer.patch_size, "constant", None, False, None) - with open(os.path.join(preprocessing_folder, output_filename+ ".pkl"), "wb") as f: + with open( + os.path.join(preprocessing_folder, output_filename + ".pkl"), "wb" + ) as f: pickle.dump([d, dct], f) f.close() - return all_output_files + return all_output_files def main(): @@ -82,7 +117,9 @@ def main(): # Make sure the model exists model_dir = args.model_dir model_path = os.path.join(model_dir, "plans.pkl") - assert os.path.isfile(model_path), "Cannot find the model file {:}!".format(model_path) + assert os.path.isfile(model_path), "Cannot find the model file {:}!".format( + model_path + ) checkpoint_name = "model_final_checkpoint" # Other settings @@ -103,14 +140,28 @@ def main(): # Create list of images locations (i.e. 4 images per case => 4 modalities) all_files = subfiles(raw_data_dir, suffix=".nii.gz", join=False, sort=True) - list_of_lists = [[os.path.join(raw_data_dir, i) for i in all_files if i[:len(j)].startswith(j) and - len(i) == (len(j) + 12)] for j in validation_files] + list_of_lists = [ + [ + os.path.join(raw_data_dir, i) + for i in all_files + if i[: len(j)].startswith(j) and len(i) == (len(j) + 12) + ] + for j in validation_files + ] # Preprocess images, returns filenames list # This runs in multiprocess print("Acually preprocessing data...") - preprocessed_files = preprocess_MLPerf(model_dir, checkpoint_name, fold, fp16, list_of_lists, - validation_files, preprocessed_data_dir, num_threads_preprocessing) + preprocessed_files = preprocess_MLPerf( + model_dir, + checkpoint_name, + fold, + fp16, + list_of_lists, + validation_files, + preprocessed_data_dir, + num_threads_preprocessing, + ) print("Saving metadata of the preprocessed data...") with open(os.path.join(preprocessed_data_dir, "preprocessed_files.pkl"), "wb") as f: @@ -119,5 +170,6 @@ def main(): print("Preprocessed data saved to {:}".format(preprocessed_data_dir)) print("Done!") + if __name__ == "__main__": main() diff --git a/vision/medical_imaging/3d-unet-brats19/pytorch_SUT.py b/vision/medical_imaging/3d-unet-brats19/pytorch_SUT.py index 7a9acae72..f68aaad4b 100644 --- a/vision/medical_imaging/3d-unet-brats19/pytorch_SUT.py +++ b/vision/medical_imaging/3d-unet-brats19/pytorch_SUT.py @@ -14,30 +14,44 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from nnunet.training.model_restore import load_model_and_checkpoint_files +from brats_QSL import get_brats_QSL +import torch.nn.functional as F +import torch +import numpy as np +import mlperf_loadgen as lg import array import json import os import sys + sys.path.insert(0, os.getcwd()) -import mlperf_loadgen as lg -import numpy as np -import torch -import torch.nn.functional as F -from brats_QSL import get_brats_QSL sys.path.insert(0, os.path.join(os.getcwd(), "nnUnet")) -from nnunet.training.model_restore import load_model_and_checkpoint_files -class _3DUNET_PyTorch_SUT(): - def __init__(self, model_dir, preprocessed_data_dir, performance_count, folds, checkpoint_name): + +class _3DUNET_PyTorch_SUT: + def __init__( + self, + model_dir, + preprocessed_data_dir, + performance_count, + folds, + checkpoint_name, + ): print("Loading PyTorch model...") model_path = os.path.join(model_dir, "plans.pkl") - assert os.path.isfile(model_path), "Cannot find the model file {:}!".format(model_path) - self.trainer, params = load_model_and_checkpoint_files(model_dir, folds, fp16=False, checkpoint_name=checkpoint_name) + assert os.path.isfile(model_path), "Cannot find the model file {:}!".format( + model_path + ) + self.trainer, params = load_model_and_checkpoint_files( + model_dir, folds, fp16=False, checkpoint_name=checkpoint_name + ) self.trainer.load_checkpoint_ram(params[0], False) - self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + self.device = torch.device( + "cuda:0" if torch.cuda.is_available() else "cpu") print("Constructing SUT...") self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) @@ -49,23 +63,48 @@ def issue_queries(self, query_samples): for i in range(len(query_samples)): data = self.qsl.get_features(query_samples[i].index) - print("Processing sample id {:d} with shape = {:}".format(query_samples[i].index, data.shape)) + print( + "Processing sample id {:d} with shape = {:}".format( + query_samples[i].index, data.shape + ) + ) - image = torch.from_numpy(data[np.newaxis,...]).float().to(self.device) - output = self.trainer.network(image)[0].cpu().numpy().astype(np.float16) + image = torch.from_numpy( + data[np.newaxis, ...]).float().to(self.device) + output = self.trainer.network( + image)[0].cpu().numpy().astype(np.float16) transpose_forward = self.trainer.plans.get("transpose_forward") - transpose_backward = self.trainer.plans.get("transpose_backward") - assert transpose_forward == [0, 1, 2], "Unexpected transpose_forward {:}".format(transpose_forward) - assert transpose_backward == [0, 1, 2], "Unexpected transpose_backward {:}".format(transpose_backward) + transpose_backward = self.trainer.plans.get( + "transpose_backward") + assert transpose_forward == [ + 0, + 1, + 2, + ], "Unexpected transpose_forward {:}".format(transpose_forward) + assert transpose_backward == [ + 0, + 1, + 2, + ], "Unexpected transpose_backward {:}".format(transpose_backward) response_array = array.array("B", output.tobytes()) bi = response_array.buffer_info() - response = lg.QuerySampleResponse(query_samples[i].id, bi[0], bi[1]) + response = lg.QuerySampleResponse( + query_samples[i].id, bi[0], bi[1]) lg.QuerySamplesComplete([response]) def flush_queries(self): pass -def get_pytorch_sut(model_dir, preprocessed_data_dir, performance_count, folds=1, checkpoint_name="model_final_checkpoint"): - return _3DUNET_PyTorch_SUT(model_dir, preprocessed_data_dir, performance_count, folds, checkpoint_name) \ No newline at end of file + +def get_pytorch_sut( + model_dir, + 
preprocessed_data_dir, + performance_count, + folds=1, + checkpoint_name="model_final_checkpoint", +): + return _3DUNET_PyTorch_SUT( + model_dir, preprocessed_data_dir, performance_count, folds, checkpoint_name + ) diff --git a/vision/medical_imaging/3d-unet-brats19/run.py b/vision/medical_imaging/3d-unet-brats19/run.py index e69f2b6e7..23f6e9bca 100644 --- a/vision/medical_imaging/3d-unet-brats19/run.py +++ b/vision/medical_imaging/3d-unet-brats19/run.py @@ -15,48 +15,57 @@ # See the License for the specific language governing permissions and # limitations under the License. +import subprocess +import mlperf_loadgen as lg +import argparse import os import sys -sys.path.insert(0, os.getcwd()) -import argparse -import mlperf_loadgen as lg -import subprocess +sys.path.insert(0, os.getcwd()) def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--backend", - choices=["pytorch", "onnxruntime", "tf", "ov"], - default="pytorch", - help="Backend") + parser.add_argument( + "--backend", + choices=["pytorch", "onnxruntime", "tf", "ov"], + default="pytorch", + help="Backend", + ) parser.add_argument( "--scenario", choices=["SingleStream", "Offline", "Server", "MultiStream"], default="Offline", - help="Scenario") - parser.add_argument("--accuracy", - action="store_true", - help="enable accuracy pass") - parser.add_argument("--mlperf_conf", - default="build/mlperf.conf", - help="mlperf rules config") - parser.add_argument("--user_conf", - default="user.conf", - help="user config for user LoadGen settings such as target QPS") + help="Scenario", + ) + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") + parser.add_argument( + "--mlperf_conf", default="build/mlperf.conf", help="mlperf rules config" + ) + parser.add_argument( + "--user_conf", + default="user.conf", + help="user config for user LoadGen settings such as target QPS", + ) parser.add_argument( "--model_dir", - default= - "build/result/nnUNet/3d_fullres/Task043_BraTS2019/nnUNetTrainerV2__nnUNetPlansv2.mlperf.1", - help="Path to the directory containing plans.pkl") - parser.add_argument("--model", help="Path to the ONNX, OpenVINO, or TF model") - parser.add_argument("--preprocessed_data_dir", - default="build/preprocessed_data", - help="path to preprocessed data") - parser.add_argument("--performance_count", - type=int, - default=16, - help="performance count") + default="build/result/nnUNet/3d_fullres/Task043_BraTS2019/nnUNetTrainerV2__nnUNetPlansv2.mlperf.1", + help="Path to the directory containing plans.pkl", + ) + parser.add_argument( + "--model", + help="Path to the ONNX, OpenVINO, or TF model") + parser.add_argument( + "--preprocessed_data_dir", + default="build/preprocessed_data", + help="path to preprocessed data", + ) + parser.add_argument( + "--performance_count", type=int, default=16, help="performance count" + ) args = parser.parse_args() return args @@ -65,7 +74,7 @@ def get_args(): "SingleStream": lg.TestScenario.SingleStream, "Offline": lg.TestScenario.Offline, "Server": lg.TestScenario.Server, - "MultiStream": lg.TestScenario.MultiStream + "MultiStream": lg.TestScenario.MultiStream, } @@ -74,20 +83,30 @@ def main(): if args.backend == "pytorch": from pytorch_SUT import get_pytorch_sut - sut = get_pytorch_sut(args.model_dir, args.preprocessed_data_dir, - args.performance_count) + + sut = get_pytorch_sut( + args.model_dir, args.preprocessed_data_dir, args.performance_count + ) elif args.backend == "onnxruntime": from onnxruntime_SUT import get_onnxruntime_sut - sut = 
get_onnxruntime_sut(args.model, args.preprocessed_data_dir, - args.performance_count) + + sut = get_onnxruntime_sut( + args.model, args.preprocessed_data_dir, args.performance_count + ) elif args.backend == "tf": from tf_SUT import get_tf_sut - sut = get_tf_sut(args.model, args.preprocessed_data_dir, - args.performance_count) + + sut = get_tf_sut( + args.model, + args.preprocessed_data_dir, + args.performance_count) elif args.backend == "ov": from ov_SUT import get_ov_sut - sut = get_ov_sut(args.model, args.preprocessed_data_dir, - args.performance_count) + + sut = get_ov_sut( + args.model, + args.preprocessed_data_dir, + args.performance_count) else: raise ValueError("Unknown backend: {:}".format(args.backend)) diff --git a/vision/medical_imaging/3d-unet-brats19/tf_SUT.py b/vision/medical_imaging/3d-unet-brats19/tf_SUT.py index 6a38a2ee7..9ceb0a334 100644 --- a/vision/medical_imaging/3d-unet-brats19/tf_SUT.py +++ b/vision/medical_imaging/3d-unet-brats19/tf_SUT.py @@ -14,21 +14,20 @@ # See the License for the specific language governing permissions and # limitations under the License. +from brats_QSL import get_brats_QSL +from tensorflow.core.framework import graph_pb2 +import tensorflow as tf +import numpy as np +import mlperf_loadgen as lg import array import json import os import sys -sys.path.insert(0, os.getcwd()) -import mlperf_loadgen as lg -import numpy as np -import tensorflow as tf -from tensorflow.core.framework import graph_pb2 - -from brats_QSL import get_brats_QSL +sys.path.insert(0, os.getcwd()) -class _3DUNET_TF_SUT(): +class _3DUNET_TF_SUT: def __init__(self, model_path, preprocessed_data_dir, performance_count): print("Loading TF model...") graph_def = graph_pb2.GraphDef() @@ -50,15 +49,20 @@ def issue_queries(self, query_samples): for i in range(len(query_samples)): data = self.qsl.get_features(query_samples[i].index) - print("Processing sample id {:d} with shape = {:}".format( - query_samples[i].index, data.shape)) + print( + "Processing sample id {:d} with shape = {:}".format( + query_samples[i].index, data.shape + ) + ) - output = self.sess.run(self.output, feed_dict={self.input: data[np.newaxis, ...]})[0].astype(np.float16) + output = self.sess.run( + self.output, feed_dict={self.input: data[np.newaxis, ...]} + )[0].astype(np.float16) response_array = array.array("B", output.tobytes()) bi = response_array.buffer_info() - response = lg.QuerySampleResponse(query_samples[i].id, bi[0], - bi[1]) + response = lg.QuerySampleResponse( + query_samples[i].id, bi[0], bi[1]) lg.QuerySamplesComplete([response]) def flush_queries(self): @@ -66,4 +70,4 @@ def flush_queries(self): def get_tf_sut(model_path, preprocessed_data_dir, performance_count): - return _3DUNET_TF_SUT(model_path, preprocessed_data_dir, performance_count) \ No newline at end of file + return _3DUNET_TF_SUT(model_path, preprocessed_data_dir, performance_count) diff --git a/vision/medical_imaging/3d-unet-brats19/unet_onnx_to_tf.py b/vision/medical_imaging/3d-unet-brats19/unet_onnx_to_tf.py index 472bcd61b..3c2768dfa 100644 --- a/vision/medical_imaging/3d-unet-brats19/unet_onnx_to_tf.py +++ b/vision/medical_imaging/3d-unet-brats19/unet_onnx_to_tf.py @@ -14,28 +14,32 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import onnx_tf +import onnx +import argparse import os import sys + sys.path.insert(0, os.getcwd()) -import argparse -import onnx -import onnx_tf def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--onnx_model", - default="build/model/224_224_160.onnx", - help="Path to the ONNX model") - parser.add_argument("--output_name", - default="224_224_160.pb", - help="Name of output model") - parser.add_argument("--output_dir", - default="build/model", - help="Directory to save output model") + parser.add_argument( + "--onnx_model", + default="build/model/224_224_160.onnx", + help="Path to the ONNX model", + ) + parser.add_argument( + "--output_name", default="224_224_160.pb", help="Name of output model" + ) + parser.add_argument( + "--output_dir", default="build/model", help="Directory to save output model" + ) args = parser.parse_args() return args + def main(): args = get_args() diff --git a/vision/medical_imaging/3d-unet-brats19/unet_pytorch_to_onnx.py b/vision/medical_imaging/3d-unet-brats19/unet_pytorch_to_onnx.py index 9652d1a6d..a6785353a 100644 --- a/vision/medical_imaging/3d-unet-brats19/unet_pytorch_to_onnx.py +++ b/vision/medical_imaging/3d-unet-brats19/unet_pytorch_to_onnx.py @@ -14,34 +14,41 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nnunet.training.model_restore import load_model_and_checkpoint_files +import torch +import onnx +import argparse import os import sys + sys.path.insert(0, os.getcwd()) -import argparse -import onnx -import torch sys.path.insert(0, os.path.join(os.getcwd(), "nnUnet")) -from nnunet.training.model_restore import load_model_and_checkpoint_files + def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--model_dir", - default="build/result/nnUNet/3d_fullres/Task043_BraTS2019/nnUNetTrainerV2__nnUNetPlansv2.mlperf.1", - help="Path to the PyTorch model") - parser.add_argument("--output_name", - default="224_224_160.onnx", - help="Name of output model") - parser.add_argument("--dynamic_bs_output_name", - default="224_224_160_dyanmic_bs.onnx", - help="Name of output model") - parser.add_argument("--output_dir", - default="build/model", - help="Directory to save output model") + parser.add_argument( + "--model_dir", + default="build/result/nnUNet/3d_fullres/Task043_BraTS2019/nnUNetTrainerV2__nnUNetPlansv2.mlperf.1", + help="Path to the PyTorch model", + ) + parser.add_argument( + "--output_name", default="224_224_160.onnx", help="Name of output model" + ) + parser.add_argument( + "--dynamic_bs_output_name", + default="224_224_160_dyanmic_bs.onnx", + help="Name of output model", + ) + parser.add_argument( + "--output_dir", default="build/model", help="Directory to save output model" + ) args = parser.parse_args() return args + def main(): args = get_args() @@ -51,26 +58,49 @@ def main(): os.makedirs(args.output_dir) output_path = "./{}/{}".format(args.output_dir, args.output_name) - dynamic_bs_output_path = "./{}/{}".format(args.output_dir, args.dynamic_bs_output_name) + dynamic_bs_output_path = "./{}/{}".format( + args.output_dir, args.dynamic_bs_output_name + ) print("Loading Pytorch model...") checkpoint_name = "model_final_checkpoint" folds = 1 - trainer, params = load_model_and_checkpoint_files(args.model_dir, folds, fp16=False, checkpoint_name=checkpoint_name) + trainer, params = load_model_and_checkpoint_files( + args.model_dir, folds, fp16=False, checkpoint_name=checkpoint_name + ) trainer.load_checkpoint_ram(params[0], False) height = 224 width = 
224 depth = 160 channels = 4 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - dummy_input = torch.rand([1, channels, height, width, depth]).float().to(device) - torch.onnx.export(trainer.network, dummy_input, output_path, opset_version=11, - input_names=['input'], output_names=['output']) - torch.onnx.export(trainer.network, dummy_input, dynamic_bs_output_path, opset_version=11, - input_names=['input'], output_names=['output'], - dynamic_axes=({"input": {0: "batch_size"}, "output": {0: "batch_size"}})) + dummy_input = torch.rand( + [1, channels, height, width, depth]).float().to(device) + torch.onnx.export( + trainer.network, + dummy_input, + output_path, + opset_version=11, + input_names=["input"], + output_names=["output"], + ) + torch.onnx.export( + trainer.network, + dummy_input, + dynamic_bs_output_path, + opset_version=11, + input_names=["input"], + output_names=["output"], + dynamic_axes=({"input": {0: "batch_size"}, + "output": {0: "batch_size"}}), + ) + + print( + "Successfully exported model {} and {}".format( + output_path, dynamic_bs_output_path + ) + ) - print("Successfully exported model {} and {}".format(output_path, dynamic_bs_output_path)) if __name__ == "__main__": main() diff --git a/vision/medical_imaging/3d-unet-kits19/accuracy_kits.py b/vision/medical_imaging/3d-unet-kits19/accuracy_kits.py index 93ab11adc..1beb60155 100644 --- a/vision/medical_imaging/3d-unet-kits19/accuracy_kits.py +++ b/vision/medical_imaging/3d-unet-kits19/accuracy_kits.py @@ -41,11 +41,11 @@ Accuracy check from MLPerf-Inference accuracy log file: python3 accuracy_kits.py or - python3 accuracy_kits.py --log_file $(LOG_DIR)/$(ACCURACY_LOG_FILENAME) + python3 accuracy_kits.py --log_file $(LOG_DIR)/$(ACCURACY_LOG_FILENAME) --output_dtype $(DTYPE) - --preprocessed_data_dir $(PREPROCESSED_DATA_DIR) + --preprocessed_data_dir $(PREPROCESSED_DATA_DIR) --postprocessed_data_dir $(POSTPROCESSED_DATA_DIR) - --num_proc $(NUMBER_PROCESSES) + --num_proc $(NUMBER_PROCESSES) """ # $(DTYPE) mapping to numpy dtype @@ -57,7 +57,7 @@ "uint8": np.uint8, "float16": np.float16, "float32": np.float32, - "float64": np.float64 + "float64": np.float64, } @@ -65,25 +65,36 @@ def get_args(): """ Args used for postprocessing """ - parser = argparse.ArgumentParser(description=__doc__, - formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument("--log_file", - default="build/logs/mlperf_log_accuracy.json", - help="Path to accuracy log json file") - parser.add_argument("--output_dtype", - default="uint8", - choices=dtype_map.keys(), - help="Output data type") - parser.add_argument("--preprocessed_data_dir", - default="build/preprocessed_data", - help="Path to the directory containing preprocessed data") - parser.add_argument("--postprocessed_data_dir", - default="build/postprocessed_data", - help="Path to the directory containing postprocessed data") - parser.add_argument("--num_proc", - type=int, - default=4, - help="Number of processors running postprocessing") + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawTextHelpFormatter + ) + parser.add_argument( + "--log_file", + default="build/logs/mlperf_log_accuracy.json", + help="Path to accuracy log json file", + ) + parser.add_argument( + "--output_dtype", + default="uint8", + choices=dtype_map.keys(), + help="Output data type", + ) + parser.add_argument( + "--preprocessed_data_dir", + default="build/preprocessed_data", + help="Path to the directory containing preprocessed data", + ) + parser.add_argument( + 
"--postprocessed_data_dir", + default="build/postprocessed_data", + help="Path to the directory containing postprocessed data", + ) + parser.add_argument( + "--num_proc", + type=int, + default=4, + help="Number of processors running postprocessing", + ) args = parser.parse_args() return args @@ -104,7 +115,7 @@ def prepare_one_hot(my_array, num_classes): Reinterprets my_array into one-hot encoded, for classes as many as num_classes """ res = np.eye(num_classes)[np.array(my_array).reshape(-1)] - return res.reshape(list(my_array.shape)+[num_classes]) + return res.reshape(list(my_array.shape) + [num_classes]) def get_dice_score(case, prediction, target): @@ -128,10 +139,12 @@ def get_dice_score(case, prediction, target): prediction = prediction[:, 1:] # calculate dice score - assert target.shape == prediction.shape, \ - f"Different shape -- target: {target.shape}, prediction: {prediction.shape}" - assert target.dtype == np.float64 and prediction.dtype == np.float64, \ - f"Unexpected dtype -- target: {target.dtype}, prediction: {prediction.dtype}" + assert ( + target.shape == prediction.shape + ), f"Different shape -- target: {target.shape}, prediction: {prediction.shape}" + assert ( + target.dtype == np.float64 and prediction.dtype == np.float64 + ), f"Unexpected dtype -- target: {target.dtype}, prediction: {prediction.dtype}" # intersection for numerator; target/prediction sum for denominator # easy b/c one-hot encoded format @@ -140,24 +153,28 @@ def get_dice_score(case, prediction, target): prediction_sum = np.sum(prediction, axis=reduce_axis) # get DICE score for each class - dice_val = (2.0 * intersection + smooth_nr) / \ - (target_sum + prediction_sum + smooth_dr) + dice_val = (2.0 * intersection + smooth_nr) / ( + target_sum + prediction_sum + smooth_dr + ) # return after removing batch dim return (case, dice_val[0]) -def evaluate(target_files, preprocessed_data_dir, postprocessed_data_dir, num_proc): +def evaluate(target_files, preprocessed_data_dir, + postprocessed_data_dir, num_proc): """ Collects and summarizes DICE scores of all the predicted files using multi-processes """ bundle = list() for case in target_files: - groundtruth_path = Path(preprocessed_data_dir, - "nifti", case, "segmentation.nii.gz").absolute() - prediction_path = Path(postprocessed_data_dir, - case, "prediction.nii.gz").absolute() + groundtruth_path = Path( + preprocessed_data_dir, "nifti", case, "segmentation.nii.gz" + ).absolute() + prediction_path = Path( + postprocessed_data_dir, case, "prediction.nii.gz" + ).absolute() groundtruth = nib.load(groundtruth_path).get_fdata().astype(np.uint8) prediction = nib.load(prediction_path).get_fdata().astype(np.uint8) @@ -165,9 +182,11 @@ def evaluate(target_files, preprocessed_data_dir, postprocessed_data_dir, num_pr groundtruth = np.expand_dims(groundtruth, 0) prediction = np.expand_dims(prediction, 0) - assert groundtruth.shape == prediction.shape,\ - "{} -- groundtruth: {} and prediction: {} have different shapes".format( - case, groundtruth.shape, prediction.shape) + assert ( + groundtruth.shape == prediction.shape + ), "{} -- groundtruth: {} and prediction: {} have different shapes".format( + case, groundtruth.shape, prediction.shape + ) bundle.append((case, groundtruth, prediction)) @@ -190,12 +209,9 @@ def save_evaluation_summary(postprocessed_data_dir, dice_scores): tumor = arr[1] composite = np.mean(arr) df = df.append( - { - "case": case, - "kidney": kidney, - "tumor": tumor, - "composite": composite - }, ignore_index=True) + {"case": case, "kidney": kidney, 
"tumor": tumor, "composite": composite}, + ignore_index=True, + ) df.set_index("case", inplace=True) # consider NaN as a crash hence zero @@ -211,16 +227,18 @@ def save_nifti(bundle): # Note that affine has to be valid, otherwise NIFTI image will look weird image, affine, path_to_file = bundle if len(image.shape) != 3: - assert len(image.shape) == 4 and image.shape[0] == 1,\ - "Unexpected image: {}".format(image.shape) + assert ( + len(image.shape) == 4 and image.shape[0] == 1 + ), "Unexpected image: {}".format(image.shape) image = np.squeeze(image, 0) nifti_image = nib.Nifti1Image(image, affine=affine) path_to_file.parent.mkdir(parents=True, exist_ok=True) nib.save(nifti_image, path_to_file) -def save_predictions(predictions, output_dir, preprocessed_data_dir, - preprocessed_files, aux, num_proc): +def save_predictions( + predictions, output_dir, preprocessed_data_dir, preprocessed_files, aux, num_proc +): """ Saves all the segmentation result from inference into NIFTI files using affine matrices Affine matrices were stored for input images during preprocessing @@ -230,8 +248,10 @@ def save_predictions(predictions, output_dir, preprocessed_data_dir, bundle = list() for case, case_d in predictions.items(): pred_file_path = Path(output_dir, case, "prediction.nii.gz") - bundle.append((case_d['prediction'], aux[case] - ['reshaped_affine'], pred_file_path)) + bundle.append( + (case_d["prediction"], aux[case] + ["reshaped_affine"], pred_file_path) + ) with Pool(num_proc) as p: p.map(save_nifti, bundle) @@ -249,8 +269,9 @@ def load_loadgen_log(log_file, result_dtype, file_list, aux): with open(log_file) as f: predictions = json.load(f) - assert len(predictions) == len(aux.keys()),\ - "Number of predictions does not match number of samples in validation set!" + assert len(predictions) == len( + aux.keys() + ), "Number of predictions does not match number of samples in validation set!" results = dict() for prediction in predictions: @@ -258,12 +279,10 @@ def load_loadgen_log(log_file, result_dtype, file_list, aux): case = file_list[qsl_idx] assert qsl_idx >= 0 and qsl_idx < len(predictions), "Invalid qsl_idx!" result_shape = np.array(list(aux[case]["image_shape"])) - result = np.frombuffer(bytes.fromhex( - prediction["data"]), result_dtype).reshape(result_shape) - results[case] = { - 'qsl_idx': qsl_idx, - 'prediction': result - } + result = np.frombuffer(bytes.fromhex(prediction["data"]), result_dtype).reshape( + result_shape + ) + results[case] = {"qsl_idx": qsl_idx, "prediction": result} assert len(results) == len(predictions), "Missing some results!" @@ -287,8 +306,8 @@ def main(): print("Loading necessary metadata...") with open(Path(preprocessed_data_dir, "preprocessed_files.pkl"), "rb") as f: preprocessed_files_content = pickle.load(f) - target_files = preprocessed_files_content['file_list'] - aux = preprocessed_files_content['cases'] + target_files = preprocessed_files_content["file_list"] + aux = preprocessed_files_content["cases"] # Load predictions from loadgen accuracy log. 
print("Loading loadgen accuracy log...") @@ -296,23 +315,35 @@ def main(): # Save predictions print("Running postprocessing...") - save_predictions(predictions, postprocessed_data_dir, preprocessed_data_dir, - target_files, aux, num_proc) + save_predictions( + predictions, + postprocessed_data_dir, + preprocessed_data_dir, + target_files, + aux, + num_proc, + ) # Run evaluation print("Running evaluation...") - evaluate(target_files, preprocessed_data_dir, - postprocessed_data_dir, num_proc) + evaluate( + target_files, + preprocessed_data_dir, + postprocessed_data_dir, + num_proc) # Finalize evaluation from evaluation summary print("Processing evaluation summary...") df = pd.read_csv(Path(postprocessed_data_dir, "summary.csv")) - final = df.loc[df['case'] == 'mean'] - composite = float(final['composite']) - kidney = float(final['kidney']) - tumor = float(final['tumor']) - print("Accuracy: mean = {:.5f}, kidney = {:.4f}, tumor = {:.4f}".format( - composite, kidney, tumor)) + final = df.loc[df["case"] == "mean"] + composite = float(final["composite"]) + kidney = float(final["kidney"]) + tumor = float(final["tumor"]) + print( + "Accuracy: mean = {:.5f}, kidney = {:.4f}, tumor = {:.4f}".format( + composite, kidney, tumor + ) + ) print("Done!") diff --git a/vision/medical_imaging/3d-unet-kits19/base_SUT.py b/vision/medical_imaging/3d-unet-kits19/base_SUT.py index 2208fbb63..d94319b77 100644 --- a/vision/medical_imaging/3d-unet-kits19/base_SUT.py +++ b/vision/medical_imaging/3d-unet-kits19/base_SUT.py @@ -118,31 +118,39 @@ def infer_single_query(self, query, mystr): # prepare arrays image = query[np.newaxis, ...] result, norm_map, norm_patch = infu.prepare_arrays(image, ROI_SHAPE) - t_image, t_result, t_norm_map, t_norm_patch =\ - self.to_tensor(image), self.to_tensor(result), self.to_tensor( - norm_map), self.to_tensor(norm_patch) + t_image, t_result, t_norm_map, t_norm_patch = ( + self.to_tensor(image), + self.to_tensor(result), + self.to_tensor(norm_map), + self.to_tensor(norm_patch), + ) # sliding window inference subvol_cnt = 0 - for i, j, k in infu.get_slice_for_sliding_window(t_image, ROI_SHAPE, SLIDE_OVERLAP_FACTOR): + for i, j, k in infu.get_slice_for_sliding_window( + t_image, ROI_SHAPE, SLIDE_OVERLAP_FACTOR + ): subvol_cnt += 1 result_slice = t_result[ ..., - i:(ROI_SHAPE[0] + i), - j:(ROI_SHAPE[1] + j), - k:(ROI_SHAPE[2] + k)] + i: (ROI_SHAPE[0] + i), + j: (ROI_SHAPE[1] + j), + k: (ROI_SHAPE[2] + k), + ] input_slice = t_image[ ..., - i:(ROI_SHAPE[0] + i), - j:(ROI_SHAPE[1] + j), - k:(ROI_SHAPE[2] + k)] + i: (ROI_SHAPE[0] + i), + j: (ROI_SHAPE[1] + j), + k: (ROI_SHAPE[2] + k), + ] norm_map_slice = t_norm_map[ ..., - i:(ROI_SHAPE[0] + i), - j:(ROI_SHAPE[1] + j), - k:(ROI_SHAPE[2] + k)] + i: (ROI_SHAPE[0] + i), + j: (ROI_SHAPE[1] + j), + k: (ROI_SHAPE[2] + k), + ] result_slice += self.do_infer(input_slice) * t_norm_patch @@ -169,7 +177,8 @@ def issue_queries(self, query_samples): for qsi in range(total): query = self.qsl.get_features(query_samples[qsi].index) mystr = "{:5d}/{:5d} -- Processing sample id {:2d} with shape = {:}".format( - qsi+1, total, query_samples[qsi].index, query.shape) + qsi + 1, total, query_samples[qsi].index, query.shape + ) final_result, mystr = self.infer_single_query(query, mystr) if mystr: print(mystr) diff --git a/vision/medical_imaging/3d-unet-kits19/global_vars.py b/vision/medical_imaging/3d-unet-kits19/global_vars.py index 883d2f778..78c5e1235 100644 --- a/vision/medical_imaging/3d-unet-kits19/global_vars.py +++ 
b/vision/medical_imaging/3d-unet-kits19/global_vars.py @@ -24,33 +24,41 @@ """ __all__ = [ - 'CHECKSUM_INFER_FILE', - 'CHECKSUM_CALIB_FILE', - 'TARGET_CASES', - 'CALIB_CASES', - 'MEAN_VAL', - 'STDDEV_VAL', - 'MIN_CLIP_VAL', - 'MAX_CLIP_VAL', - 'PADDING_VAL', - 'TARGET_SPACING', - 'ROI_SHAPE', - 'SLIDE_OVERLAP_FACTOR', + "CHECKSUM_INFER_FILE", + "CHECKSUM_CALIB_FILE", + "TARGET_CASES", + "CALIB_CASES", + "MEAN_VAL", + "STDDEV_VAL", + "MIN_CLIP_VAL", + "MAX_CLIP_VAL", + "PADDING_VAL", + "TARGET_SPACING", + "ROI_SHAPE", + "SLIDE_OVERLAP_FACTOR", ] # file pointers and sanity checks INFERENCE_CASE_FILE = Path( - Path.cwd(), 'meta', 'inference_cases.json').absolute() + Path.cwd(), + "meta", + "inference_cases.json").absolute() CALIBRATION_CASE_FILE = Path( - Path.cwd(), 'meta', 'calibration_cases.json').absolute() + Path.cwd(), + "meta", + "calibration_cases.json").absolute() CHECKSUM_INFER_FILE = Path( - Path.cwd(), 'meta', 'checksum_inference.json').absolute() + Path.cwd(), + "meta", + "checksum_inference.json").absolute() CHECKSUM_CALIB_FILE = Path( - Path.cwd(), 'meta', 'checksum_calibration.json').absolute() -assert INFERENCE_CASE_FILE.is_file(), 'inference_cases.json is not found' -assert CALIBRATION_CASE_FILE.is_file(), 'calibration_cases.json is not found' -assert CHECKSUM_INFER_FILE.is_file(), 'checksum_inference.json is not found' -assert CHECKSUM_CALIB_FILE.is_file(), 'checksum_calibration.json is not found' + Path.cwd(), + "meta", + "checksum_calibration.json").absolute() +assert INFERENCE_CASE_FILE.is_file(), "inference_cases.json is not found" +assert CALIBRATION_CASE_FILE.is_file(), "calibration_cases.json is not found" +assert CHECKSUM_INFER_FILE.is_file(), "checksum_inference.json is not found" +assert CHECKSUM_CALIB_FILE.is_file(), "checksum_calibration.json is not found" # cases used for inference and calibration TARGET_CASES = json.load(open(INFERENCE_CASE_FILE)) @@ -65,11 +73,16 @@ TARGET_SPACING = [1.6, 1.2, 1.2] ROI_SHAPE = [128, 128, 128] SLIDE_OVERLAP_FACTOR = 0.5 -assert isinstance(TARGET_SPACING, list) and \ - len(TARGET_SPACING) == 3 and any(TARGET_SPACING), \ - f"Need proper target spacing: {TARGET_SPACING}" -assert isinstance(ROI_SHAPE, list) and len(ROI_SHAPE) == 3 and any(ROI_SHAPE), \ - f"Need proper ROI shape: {ROI_SHAPE}" -assert isinstance(SLIDE_OVERLAP_FACTOR, float) and \ - SLIDE_OVERLAP_FACTOR > 0 and SLIDE_OVERLAP_FACTOR < 1, \ - f"Need sliding window overlap factor in (0,1): {SLIDE_OVERLAP_FACTOR}" +assert ( + isinstance(TARGET_SPACING, list) + and len(TARGET_SPACING) == 3 + and any(TARGET_SPACING) +), f"Need proper target spacing: {TARGET_SPACING}" +assert ( + isinstance(ROI_SHAPE, list) and len(ROI_SHAPE) == 3 and any(ROI_SHAPE) +), f"Need proper ROI shape: {ROI_SHAPE}" +assert ( + isinstance(SLIDE_OVERLAP_FACTOR, float) + and SLIDE_OVERLAP_FACTOR > 0 + and SLIDE_OVERLAP_FACTOR < 1 +), f"Need sliding window overlap factor in (0,1): {SLIDE_OVERLAP_FACTOR}" diff --git a/vision/medical_imaging/3d-unet-kits19/inference_utils.py b/vision/medical_imaging/3d-unet-kits19/inference_utils.py index 5f7b66cd9..333f9846c 100644 --- a/vision/medical_imaging/3d-unet-kits19/inference_utils.py +++ b/vision/medical_imaging/3d-unet-kits19/inference_utils.py @@ -103,28 +103,34 @@ def prepare_arrays(image, roi_shape=ROI_SHAPE): - norm_map where normal map is constructed upon - norm_patch, a gaussian kernel that is applied to each sub-volume inference result """ - assert isinstance(roi_shape, list) and len(roi_shape) == 3 and any(roi_shape),\ - f"Need proper ROI shape: 
{roi_shape}" + assert ( + isinstance(roi_shape, list) and len(roi_shape) == 3 and any(roi_shape) + ), f"Need proper ROI shape: {roi_shape}" image_shape = list(image.shape[2:]) result = np.zeros(shape=(1, 3, *image_shape), dtype=image.dtype) norm_map = np.zeros_like(result) - norm_patch = gaussian_kernel( - roi_shape[0], 0.125*roi_shape[0]).astype(norm_map.dtype) + norm_patch = gaussian_kernel(roi_shape[0], 0.125 * roi_shape[0]).astype( + norm_map.dtype + ) return result, norm_map, norm_patch -def get_slice_for_sliding_window(image, roi_shape=ROI_SHAPE, overlap=SLIDE_OVERLAP_FACTOR): +def get_slice_for_sliding_window( + image, roi_shape=ROI_SHAPE, overlap=SLIDE_OVERLAP_FACTOR +): """ Returns indices for image stride, to fulfill sliding window inference Stride is determined by roi_shape and overlap """ - assert isinstance(roi_shape, list) and len(roi_shape) == 3 and any(roi_shape),\ - f"Need proper ROI shape: {roi_shape}" - assert isinstance(overlap, float) and overlap > 0 and overlap < 1,\ - f"Need sliding window overlap factor in (0,1): {overlap}" + assert ( + isinstance(roi_shape, list) and len(roi_shape) == 3 and any(roi_shape) + ), f"Need proper ROI shape: {roi_shape}" + assert ( + isinstance(overlap, float) and overlap > 0 and overlap < 1 + ), f"Need sliding window overlap factor in (0,1): {overlap}" image_shape = list(image.shape[2:]) dim = len(image_shape) @@ -150,7 +156,7 @@ def get_latency(*args, **kw): ts = time.time() result, mystr = function(*args, **kw) te = time.time() - print('{:86} took {:>10.5f} sec'.format(mystr, te - ts)) + print("{:86} took {:>10.5f} sec".format(mystr, te - ts)) return result, "" return get_latency diff --git a/vision/medical_imaging/3d-unet-kits19/kits_QSL.py b/vision/medical_imaging/3d-unet-kits19/kits_QSL.py index 59a7b31d9..85ffb80f4 100644 --- a/vision/medical_imaging/3d-unet-kits19/kits_QSL.py +++ b/vision/medical_imaging/3d-unet-kits19/kits_QSL.py @@ -26,12 +26,12 @@ class KiTS_2019_QSL: """ A class to represent QSL (Query Sample Library) for MLPerf. 
- This populates preprocessed KiTS19 inference data set into LoadGen compatible QSL + This populates preprocessed KiTS19 inference data set into LoadGen compatible QSL Attributes ---------- preprocessed_data_dir: str - path to directory containing preprocessed data + path to directory containing preprocessed data preprocessed_files: list of str list of KiTS19 cases that are used in inference count: int @@ -62,8 +62,10 @@ def __init__(self, preprocessed_data_dir, perf_count): """ print("Constructing QSL...") self.preprocessed_data_dir = preprocessed_data_dir - with open(Path(self.preprocessed_data_dir, "preprocessed_files.pkl"), "rb") as f: - self.preprocess_files = pickle.load(f)['file_list'] + with open( + Path(self.preprocessed_data_dir, "preprocessed_files.pkl"), "rb" + ) as f: + self.preprocess_files = pickle.load(f)["file_list"] self.count = len(self.preprocess_files) self.perf_count = perf_count if perf_count is not None else self.count @@ -72,7 +74,11 @@ def __init__(self, preprocessed_data_dir, perf_count): self.loaded_files = {} self.qsl = lg.ConstructQSL( - self.count, self.perf_count, self.load_query_samples, self.unload_query_samples) + self.count, + self.perf_count, + self.load_query_samples, + self.unload_query_samples, + ) print("Finished constructing QSL.") def load_query_samples(self, sample_list): @@ -82,7 +88,10 @@ def load_query_samples(self, sample_list): for sample_id in sample_list: file_name = self.preprocess_files[sample_id] print("Loading file {:}".format(file_name)) - with open(Path(self.preprocessed_data_dir, "{:}.pkl".format(file_name)), "rb") as f: + with open( + Path(self.preprocessed_data_dir, + "{:}.pkl".format(file_name)), "rb" + ) as f: self.loaded_files[sample_id] = pickle.load(f)[0] def unload_query_samples(self, sample_list): @@ -99,5 +108,6 @@ def get_features(self, sample_id): return self.loaded_files[sample_id] -def get_kits_QSL(preprocessed_data_dir="build/preprocessed_data", perf_count=None): +def get_kits_QSL( + preprocessed_data_dir="build/preprocessed_data", perf_count=None): return KiTS_2019_QSL(preprocessed_data_dir, perf_count) diff --git a/vision/medical_imaging/3d-unet-kits19/onnxruntime_SUT.py b/vision/medical_imaging/3d-unet-kits19/onnxruntime_SUT.py index 37fe5fd14..4c6418544 100644 --- a/vision/medical_imaging/3d-unet-kits19/onnxruntime_SUT.py +++ b/vision/medical_imaging/3d-unet-kits19/onnxruntime_SUT.py @@ -23,6 +23,7 @@ import onnxruntime import os + class _3DUNET_ONNXRuntime_SUT(BASE_3DUNET_SUT): """ A class to represent SUT (System Under Test) for MLPerf. 
@@ -56,17 +57,24 @@ def __init__(self, model_path, preprocessed_data_dir, performance_count): preprocessed_data_dir: str or PosixPath path to directory containing preprocessed data performance_count: int - number of query samples guaranteed to fit in memory + number of query samples guaranteed to fit in memory """ super().__init__(preprocessed_data_dir, performance_count) print("Loading ONNX model...") - assert Path(model_path).is_file( - ), "Cannot find the model file {:}!".format(model_path) + assert Path(model_path).is_file(), "Cannot find the model file {:}!".format( + model_path + ) opt = onnxruntime.SessionOptions() - if len(onnxruntime.get_all_providers()) > 1 and os.environ.get("USE_GPU", "yes").lower() not in [ "0", "false", "off", "no" ]: - self.sess = onnxruntime.InferenceSession(model_path, opt, providers=["CUDAExecutionProvider"]) + if len(onnxruntime.get_all_providers()) > 1 and os.environ.get( + "USE_GPU", "yes" + ).lower() not in ["0", "false", "off", "no"]: + self.sess = onnxruntime.InferenceSession( + model_path, opt, providers=["CUDAExecutionProvider"] + ) else: - self.sess = onnxruntime.InferenceSession(model_path, opt, providers=["CPUExecutionProvider"]) + self.sess = onnxruntime.InferenceSession( + model_path, opt, providers=["CPUExecutionProvider"] + ) def do_infer(self, input_tensor): """ @@ -79,4 +87,5 @@ def get_sut(model_path, preprocessed_data_dir, performance_count): """ Redirect the call for instantiating SUT to ONNX Runtime specific SUT """ - return _3DUNET_ONNXRuntime_SUT(model_path, preprocessed_data_dir, performance_count) + return _3DUNET_ONNXRuntime_SUT( + model_path, preprocessed_data_dir, performance_count) diff --git a/vision/medical_imaging/3d-unet-kits19/preprocess.py b/vision/medical_imaging/3d-unet-kits19/preprocess.py index e71c96da1..ce7ea33c4 100644 --- a/vision/medical_imaging/3d-unet-kits19/preprocess.py +++ b/vision/medical_imaging/3d-unet-kits19/preprocess.py @@ -67,11 +67,11 @@ class Stats: collects average values in the preprocessed images std: list collects standard deviation of values in the preprocessed images - d: + d: collects depths of the preprocessed images - h: + h: collects heights of the preprocessed images - w: + w: collects widths of the preprocessed images Methods @@ -94,11 +94,11 @@ def __init__(self): collects average values in the preprocessed images std: list collects standard deviation of values in the preprocessed images - d: + d: collects depths of the preprocessed images - h: + h: collects heights of the preprocessed images - w: + w: collects widths of the preprocessed images """ self.mean = [] @@ -182,7 +182,7 @@ class Preprocessor: pad_to_min_shape(image, label): pads image/label so that the shape is equal or larger than ROI shape load_and_resample(case): - gets a pair of CT-imaging/segmentation data for the case, then, + gets a pair of CT-imaging/segmentation data for the case, then, resample to the same, predetermined common voxel spacing normalize_intensity(image): normalize intensity for a given target stats @@ -245,9 +245,11 @@ def collect_cases(self): all_set = set([f for f in os.listdir(self.data_dir) if "case" in f]) target_set = set(self.target_cases) collected_set = all_set & target_set - assert collected_set == target_set,\ - "Some of the target inference cases were found: {}".format( - target_set - collected_set) + assert ( + collected_set == target_set + ), "Some of the target inference cases were found: {}".format( + target_set - collected_set + ) return sorted(list(collected_set)) def 
print_stats(self): @@ -279,7 +281,7 @@ def preprocess_case(self, case): image, label = self.pad_to_min_shape(image, label) image, label = self.adjust_shape_for_sliding_window(image, label) self.save(image, label, aux) - aux['image_shape'] = image.shape + aux["image_shape"] = image.shape return aux @staticmethod @@ -290,8 +292,9 @@ def pad_to_min_shape(image, label): current_shape = image.shape[1:] bounds = [max(0, ROI_SHAPE[i] - current_shape[i]) for i in range(3)] paddings = [(0, 0)] - paddings.extend([(bounds[i] // 2, bounds[i] - bounds[i] // 2) - for i in range(3)]) + paddings.extend( + [(bounds[i] // 2, bounds[i] - bounds[i] // 2) for i in range(3)] + ) image = np.pad(image, paddings, mode="edge") label = np.pad(label, paddings, mode="edge") @@ -307,9 +310,13 @@ def load_and_resample(self, case: str): aux = dict() image = nibabel.load( - Path(self.data_dir, case, "imaging.nii.gz").absolute()) + Path( + self.data_dir, + case, + "imaging.nii.gz").absolute()) label = nibabel.load( - Path(self.data_dir, case, "segmentation.nii.gz").absolute()) + Path(self.data_dir, case, "segmentation.nii.gz").absolute() + ) image_spacings = image.header["pixdim"][1:4].tolist() original_affine = image.affine @@ -329,15 +336,27 @@ def load_and_resample(self, case: str): reshaped_affine[i][idx] = targ_arr[idx] * sign if image_spacings != self.target_spacing: - image = zoom(image, zoom_factor, order=1, - mode='constant', cval=image.min(), grid_mode=False) - label = zoom(label, zoom_factor, order=0, - mode='constant', cval=label.min(), grid_mode=False) - - aux['original_affine'] = original_affine - aux['reshaped_affine'] = reshaped_affine - aux['zoom_factor'] = zoom_factor - aux['case'] = case + image = zoom( + image, + zoom_factor, + order=1, + mode="constant", + cval=image.min(), + grid_mode=False, + ) + label = zoom( + label, + zoom_factor, + order=0, + mode="constant", + cval=label.min(), + grid_mode=False, + ) + + aux["original_affine"] = original_affine + aux["reshaped_affine"] = reshaped_affine + aux["zoom_factor"] = zoom_factor + aux["case"] = case image = np.expand_dims(image, 0) label = np.expand_dims(label, 0) @@ -369,40 +388,57 @@ def adjust_shape_for_sliding_window(self, image, label): strides = [int(roi_shape[i] * (1 - overlap)) for i in range(dim)] bounds = [image_shape[i] % strides[i] for i in range(dim)] - bounds = [bounds[i] if bounds[i] < - strides[i] // 2 else 0 for i in range(dim)] - image = image[..., - bounds[0] // 2: image_shape[0] - (bounds[0] - bounds[0] // 2), - bounds[1] // 2: image_shape[1] - (bounds[1] - bounds[1] // 2), - bounds[2] // 2: image_shape[2] - (bounds[2] - bounds[2] // 2)] - label = label[..., - bounds[0] // 2: image_shape[0] - (bounds[0] - bounds[0] // 2), - bounds[1] // 2: image_shape[1] - (bounds[1] - bounds[1] // 2), - bounds[2] // 2: image_shape[2] - (bounds[2] - bounds[2] // 2)] + bounds = [ + bounds[i] if bounds[i] < strides[i] // + 2 else 0 for i in range(dim)] + image = image[ + ..., + bounds[0] // 2: image_shape[0] - (bounds[0] - bounds[0] // 2), + bounds[1] // 2: image_shape[1] - (bounds[1] - bounds[1] // 2), + bounds[2] // 2: image_shape[2] - (bounds[2] - bounds[2] // 2), + ] + label = label[ + ..., + bounds[0] // 2: image_shape[0] - (bounds[0] - bounds[0] // 2), + bounds[1] // 2: image_shape[1] - (bounds[1] - bounds[1] // 2), + bounds[2] // 2: image_shape[2] - (bounds[2] - bounds[2] // 2), + ] image, paddings = self.constant_pad_volume( - image, roi_shape, strides, self.padding_val) + image, roi_shape, strides, self.padding_val + ) label, paddings = 
self.constant_pad_volume( label, roi_shape, strides, 0) return image, label - def constant_pad_volume(self, volume, roi_shape, strides, padding_val, dim=3): + def constant_pad_volume(self, volume, roi_shape, + strides, padding_val, dim=3): """ Helper padding volume symmetrically with value of padding_val Padded volume becomes ROI shape friendly """ - bounds = [(strides[i] - volume.shape[1:][i] % strides[i]) % - strides[i] for i in range(dim)] - bounds = [bounds[i] if (volume.shape[1:][i] + bounds[i]) >= roi_shape[i] else - bounds[i] + strides[i] - for i in range(dim)] - paddings = [(0, 0), - (bounds[0] // 2, bounds[0] - bounds[0] // 2), - (bounds[1] // 2, bounds[1] - bounds[1] // 2), - (bounds[2] // 2, bounds[2] - bounds[2] // 2)] + bounds = [ + (strides[i] - volume.shape[1:][i] % strides[i]) % strides[i] + for i in range(dim) + ] + bounds = [ + ( + bounds[i] + if (volume.shape[1:][i] + bounds[i]) >= roi_shape[i] + else bounds[i] + strides[i] + ) + for i in range(dim) + ] + paddings = [ + (0, 0), + (bounds[0] // 2, bounds[0] - bounds[0] // 2), + (bounds[1] // 2, bounds[1] - bounds[1] // 2), + (bounds[2] // 2, bounds[2] - bounds[2] // 2), + ] padded_volume = np.pad( - volume, paddings, mode='constant', constant_values=[padding_val]) + volume, paddings, mode="constant", constant_values=[padding_val] + ) return padded_volume, paddings def save(self, image, label, aux): @@ -416,35 +452,51 @@ def save(self, image, label, aux): - case name Preprocessed imaging/segmentation data saved as NIFTI """ - case = aux['case'] - reshaped_affine = aux['reshaped_affine'] + case = aux["case"] + reshaped_affine = aux["reshaped_affine"] image = image.astype(np.float32) label = label.astype(np.uint8) mean, std = np.round(np.mean(image, (1, 2, 3)), 2), np.round( - np.std(image, (1, 2, 3)), 2) + np.std(image, (1, 2, 3)), 2 + ) self.stats.append( - mean, std, image.shape[1], image.shape[2], image.shape[3]) + mean, + std, + image.shape[1], + image.shape[2], + image.shape[3]) pickle_file_path = Path(self.results_dir, f"{case}.pkl").absolute() with open(pickle_file_path, "wb") as f: pickle.dump([image, label], f) f.close() print( - f"Saved {str(pickle_file_path)} -- shape {image.shape} mean {mean} std {std}") + f"Saved {str(pickle_file_path)} -- shape {image.shape} mean {mean} std {std}" + ) if not self.calibration: path_to_nifti_dir = Path( self.results_dir, "nifti", case).absolute() path_to_nifti_dir.mkdir(parents=True, exist_ok=True) nifti_image = nibabel.Nifti1Image( - np.squeeze(image, 0), affine=reshaped_affine) + np.squeeze(image, 0), affine=reshaped_affine + ) nifti_label = nibabel.Nifti1Image( - np.squeeze(label, 0), affine=reshaped_affine) - nibabel.save(nifti_image, Path( - path_to_nifti_dir / "imaging.nii.gz")) - nibabel.save(nifti_label, Path( - path_to_nifti_dir / "segmentation.nii.gz")) - assert nifti_image.shape == nifti_label.shape, \ - "While saving NIfTI files to {}, image: {} and label: {} have different shape".format( - path_to_nifti_dir, nifti_image.shape, nifti_label.shape) + np.squeeze(label, 0), affine=reshaped_affine + ) + nibabel.save( + nifti_image, + Path( + path_to_nifti_dir / + "imaging.nii.gz")) + nibabel.save( + nifti_label, + Path( + path_to_nifti_dir / + "segmentation.nii.gz")) + assert ( + nifti_image.shape == nifti_label.shape + ), "While saving NIfTI files to {}, image: {} and label: {} have different shape".format( + path_to_nifti_dir, nifti_image.shape, nifti_label.shape + ) def preprocess_multiproc_helper(preproc, case): @@ -459,10 +511,12 @@ def 
save_preprocessed_info(preproc_dir, aux, targets): """ Saves list of preprocessed files and the associated aux info into preprocessed_files.pkl """ - assert len(targets) == len(aux['cases']),\ - "Error in number of preprocessed files:\nExpected:{}\nProcessed:{}".format( - targets, list(aux['cases'].keys())) - with open(os.path.join(preproc_dir, 'preprocessed_files.pkl'), 'wb') as f: + assert len(targets) == len( + aux["cases"] + ), "Error in number of preprocessed files:\nExpected:{}\nProcessed:{}".format( + targets, list(aux["cases"].keys()) + ) + with open(os.path.join(preproc_dir, "preprocessed_files.pkl"), "wb") as f: pickle.dump(aux, f) f.close() @@ -473,16 +527,14 @@ def preprocess_with_multiproc(args): """ preproc = Preprocessor(args) cases = preproc.collect_cases() - aux = { - 'file_list': preproc.target_cases, - 'cases': dict() - } + aux = {"file_list": preproc.target_cases, "cases": dict()} with Pool(args.num_proc) as p: - pool_out = p.starmap(preprocess_multiproc_helper, - zip([preproc]*len(cases), cases)) + pool_out = p.starmap( + preprocess_multiproc_helper, zip([preproc] * len(cases), cases) + ) for _d in pool_out: - aux['cases'][_d['case']] = _d + aux["cases"][_d["case"]] = _d save_preprocessed_info(preproc.results_dir, aux, preproc.target_cases) p.join() p.close() @@ -492,7 +544,7 @@ def generate_hash_from_volume(vol_path): """ Generates MD5 hash from a single preprocessed file """ - with open(vol_path, 'rb') as f: + with open(vol_path, "rb") as f: data = f.read() md5_hash = hashlib.md5(data).hexdigest() f.close() @@ -507,19 +559,21 @@ def generate_hash_from_dataset(args): num_proc = args.num_proc checksum = dict() CHECKSUM_FILE = CHECKSUM_CALIB_FILE if args.calibration else CHECKSUM_INFER_FILE - results = [f for f in os.listdir(results_dir) if f.startswith( - 'case') and f.endswith('pkl')] + results = [ + f for f in os.listdir(results_dir) if f.startswith("case") and f.endswith("pkl") + ] vol_path = [os.path.join(results_dir, v) for v in results] print( - f"Generating checksum file checksum.json from preprocessed data in {results_dir}...") + f"Generating checksum file checksum.json from preprocessed data in {results_dir}..." 
+ ) with Pool(num_proc) as p: pool_out = p.map(generate_hash_from_volume, vol_path) for vol, md5 in pool_out: checksum[vol] = md5 - with open(CHECKSUM_FILE, 'w') as f: + with open(CHECKSUM_FILE, "w") as f: json.dump(dict(sorted(checksum.items())), f, indent=4, sort_keys=True) f.close() @@ -535,8 +589,9 @@ def verify_dataset(args): results_dir = args.results_dir num_proc = args.num_proc CHECKSUM_FILE = CHECKSUM_CALIB_FILE if args.calibration else CHECKSUM_INFER_FILE - results = [f for f in os.listdir(results_dir) if f.startswith( - 'case') and f.endswith('pkl')] + results = [ + f for f in os.listdir(results_dir) if f.startswith("case") and f.endswith("pkl") + ] vol_path = [os.path.join(results_dir, v) for v in results] violations = dict() @@ -544,9 +599,11 @@ def verify_dataset(args): with open(CHECKSUM_FILE) as f: source = json.load(f) f.close() - assert len(source) == len(results),\ - "checksum.json has {} entries while {} volumes found".format( - len(source), len(results)) + assert len(source) == len( + results + ), "checksum.json has {} entries while {} volumes found".format( + len(source), len(results) + ) with Pool(num_proc) as p: pool_out = p.map(generate_hash_from_volume, vol_path) @@ -558,9 +615,9 @@ def verify_dataset(args): if any(violations): for vol, (res, ref) in violations.items(): print(f"{vol} -- Invalid hash, got {res} while expecting {ref}") - assert False,\ - "Verification failed, {}/{} mismatch(es) found".format( - len(violations), len(results)) + assert False, "Verification failed, {}/{} mismatch(es) found".format( + len(violations), len(results) + ) p.join() p.close() @@ -571,34 +628,45 @@ def parse_args(): """ Args used for preprocessing """ - PARSER = argparse.ArgumentParser(description=__doc__, - formatter_class=argparse.RawTextHelpFormatter) - - PARSER.add_argument('--raw_data_dir', - dest='data_dir', - required=True, - help="Dir where KiTS19 GIT repo is cloned") - PARSER.add_argument('--results_dir', - dest='results_dir', - required=True, - help="Dir to store preprocessed data") - PARSER.add_argument('--mode', - dest='mode', - choices=["preprocess", "verify", "gen_hash"], - default="preprocess", - help="""preprocess for generating inference dataset, - gen_hash for generating new checksum file, - verify for verifying the checksums against stored checksum file""") - PARSER.add_argument('--calibration', - dest='calibration', - action='store_true', - help="Preprocess calibration dataset instead of inference dataset") - PARSER.add_argument('--num_proc', - dest='num_proc', - type=int, - choices=list(range(1, 17)), - default=4, - help="Number of processes to be used") + PARSER = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawTextHelpFormatter + ) + + PARSER.add_argument( + "--raw_data_dir", + dest="data_dir", + required=True, + help="Dir where KiTS19 GIT repo is cloned", + ) + PARSER.add_argument( + "--results_dir", + dest="results_dir", + required=True, + help="Dir to store preprocessed data", + ) + PARSER.add_argument( + "--mode", + dest="mode", + choices=["preprocess", "verify", "gen_hash"], + default="preprocess", + help="""preprocess for generating inference dataset, + gen_hash for generating new checksum file, + verify for verifying the checksums against stored checksum file""", + ) + PARSER.add_argument( + "--calibration", + dest="calibration", + action="store_true", + help="Preprocess calibration dataset instead of inference dataset", + ) + PARSER.add_argument( + "--num_proc", + dest="num_proc", + type=int, + choices=list(range(1, 
17)), + default=4, + help="Number of processes to be used", + ) args = PARSER.parse_args() @@ -622,5 +690,5 @@ def main(): verify_dataset(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/vision/medical_imaging/3d-unet-kits19/pytorch_SUT.py b/vision/medical_imaging/3d-unet-kits19/pytorch_SUT.py index af3051ada..cd136adf1 100644 --- a/vision/medical_imaging/3d-unet-kits19/pytorch_SUT.py +++ b/vision/medical_imaging/3d-unet-kits19/pytorch_SUT.py @@ -63,12 +63,13 @@ def __init__(self, model_path, preprocessed_data_dir, performance_count): preprocessed_data_dir: str or PosixPath path to directory containing preprocessed data performance_count: int - number of query samples guaranteed to fit in memory + number of query samples guaranteed to fit in memory """ super().__init__(preprocessed_data_dir, performance_count) print("Loading PyTorch model...") - assert Path(model_path).is_file( - ), "Cannot find the model file {:}!".format(model_path) + assert Path(model_path).is_file(), "Cannot find the model file {:}!".format( + model_path + ) self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self.model = torch.jit.load(model_path, map_location=self.device) @@ -98,4 +99,5 @@ def get_sut(model_path, preprocessed_data_dir, performance_count): """ Redirect the call for instantiating SUT to PyTorch/TorchScript specific SUT """ - return _3DUNET_PyTorch_SUT(model_path, preprocessed_data_dir, performance_count) + return _3DUNET_PyTorch_SUT( + model_path, preprocessed_data_dir, performance_count) diff --git a/vision/medical_imaging/3d-unet-kits19/pytorch_checkpoint_SUT.py b/vision/medical_imaging/3d-unet-kits19/pytorch_checkpoint_SUT.py index a4a047510..1453d5cf5 100644 --- a/vision/medical_imaging/3d-unet-kits19/pytorch_checkpoint_SUT.py +++ b/vision/medical_imaging/3d-unet-kits19/pytorch_checkpoint_SUT.py @@ -38,6 +38,7 @@ "none": lambda _, __: nn.Identity(), } + def _normalization(norm_type, num_features, num_groups=16): """ A helper redirecting normalization function used in 3D-UNet @@ -46,6 +47,7 @@ def _normalization(norm_type, num_features, num_groups=16): return normalizations[norm_type](num_features, num_groups) raise ValueError(f"Unknown normalization {norm_type}") + def _activation(activation): """ A helper redirecting activation function used in 3D-UNet @@ -54,33 +56,53 @@ def _activation(activation): return activations[activation] raise ValueError(f"Unknown activation {activation}") -def conv_block_factory(in_channels, out_channels, - kernel_size=3, stride=1, padding=1, - conv_type="regular", - normalization="instancenorm", activation="relu"): + +def conv_block_factory( + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + conv_type="regular", + normalization="instancenorm", + activation="relu", +): """ A method used for building basic 3D Convolution block of 3D-UNet """ conv = convolutions[conv_type] - conv = conv(in_channels, out_channels, kernel_size=kernel_size, stride=stride, - padding=padding, bias=normalization=="none") + conv = conv( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=normalization == "none", + ) normalization = _normalization(normalization, out_channels) activation = _activation(activation) - return nn.Sequential(conv, normalization, activation) + return nn.Sequential(conv, normalization, activation) class DownsampleBlock(nn.Module): """ A class building encoder block of 3D-UNet """ + def __init__(self, in_channels, out_channels): 
super(DownsampleBlock, self).__init__() - self.conv1 = conv_block_factory(in_channels, out_channels, stride=2, - normalization="instancenorm", activation="relu") - self.conv2 = conv_block_factory(out_channels, out_channels, - normalization="instancenorm", activation="relu") + self.conv1 = conv_block_factory( + in_channels, + out_channels, + stride=2, + normalization="instancenorm", + activation="relu", + ) + self.conv2 = conv_block_factory( + out_channels, out_channels, normalization="instancenorm", activation="relu" + ) def forward(self, x): x = self.conv1(x) @@ -92,20 +114,32 @@ class UpsampleBlock(nn.Module): """ A class building decoder block of 3D-UNet """ + def __init__(self, in_channels, out_channels): super(UpsampleBlock, self).__init__() self.in_channels = in_channels self.out_channels = out_channels - self.upsample_conv = conv_block_factory(in_channels, out_channels, - kernel_size=2, stride=2, padding=0, - conv_type="transpose", - normalization="none", activation="none") + self.upsample_conv = conv_block_factory( + in_channels, + out_channels, + kernel_size=2, + stride=2, + padding=0, + conv_type="transpose", + normalization="none", + activation="none", + ) - self.conv1 = conv_block_factory(2 * out_channels, out_channels, - normalization="instancenorm", activation="relu") - self.conv2 = conv_block_factory(out_channels, out_channels, - normalization="instancenorm", activation="relu") + self.conv1 = conv_block_factory( + 2 * out_channels, + out_channels, + normalization="instancenorm", + activation="relu", + ) + self.conv2 = conv_block_factory( + out_channels, out_channels, normalization="instancenorm", activation="relu" + ) def forward(self, x, skip): x = self.upsample_conv(x) @@ -119,12 +153,15 @@ class InputBlock(nn.Module): """ A class building the very first input block of 3D-UNet """ + def __init__(self, in_channels, out_channels): super(InputBlock, self).__init__() - self.conv1 = conv_block_factory(in_channels, out_channels, - normalization="instancenorm", activation="relu") - self.conv2 = conv_block_factory(out_channels, out_channels, - normalization="instancenorm", activation="relu") + self.conv1 = conv_block_factory( + in_channels, out_channels, normalization="instancenorm", activation="relu" + ) + self.conv2 = conv_block_factory( + out_channels, out_channels, normalization="instancenorm", activation="relu" + ) def forward(self, x): x = self.conv1(x) @@ -136,10 +173,17 @@ class OutputLayer(nn.Module): """ A class building final output block of 3D-UNet """ + def __init__(self, in_channels, n_class): super(OutputLayer, self).__init__() - self.conv = conv_block_factory(in_channels, n_class, kernel_size=1, padding=0, - activation="none", normalization="none") + self.conv = conv_block_factory( + in_channels, + n_class, + kernel_size=1, + padding=0, + activation="none", + normalization="none", + ) def forward(self, x): return self.conv(x) @@ -157,12 +201,13 @@ class Unet3D(nn.Module): n_class: int number of classes the segmentation ends up for """ + def __init__(self, in_channels=1, n_class=3): """ Constructs 3D-UNet as in MLPerf-Training 3D-UNet: https://github.com/mlcommons/training/blob/master/image_segmentation/pytorch - """ + """ super(Unet3D, self).__init__() filters = [32, 64, 128, 256, 320] @@ -175,18 +220,21 @@ def __init__(self, in_channels=1, n_class=3): self.input_block = InputBlock(in_channels, input_dim) self.downsample = nn.ModuleList( - [DownsampleBlock(i, o) - for (i, o) in zip(self.inp, self.out)] + [DownsampleBlock(i, o) for (i, o) in zip(self.inp, self.out)] 
) self.bottleneck = DownsampleBlock(filters[-1], filters[-1]) upsample = [UpsampleBlock(filters[-1], filters[-1])] - upsample.extend([UpsampleBlock(i, o) - for (i, o) in zip(reversed(self.out), reversed(self.inp))]) + upsample.extend( + [ + UpsampleBlock(i, o) + for (i, o) in zip(reversed(self.out), reversed(self.inp)) + ] + ) self.upsample = nn.ModuleList(upsample) self.output = OutputLayer(input_dim, n_class) for name, v in self.named_parameters(): - if 'weight' in name or 'bias' in name: + if "weight" in name or "bias" in name: v.data *= 1.0 def forward(self, x): @@ -246,18 +294,19 @@ def __init__(self, model_path, preprocessed_data_dir, performance_count): preprocessed_data_dir: str or PosixPath path to directory containing preprocessed data performance_count: int - number of query samples guaranteed to fit in memory + number of query samples guaranteed to fit in memory """ super().__init__(preprocessed_data_dir, performance_count) print("Loading PyTorch model...") - assert Path(model_path).is_file( - ), "Cannot find the model file {:}!".format(model_path) + assert Path(model_path).is_file(), "Cannot find the model file {:}!".format( + model_path + ) self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self.model = Unet3D() self.model.to(self.device) checkpoint = torch.load(model_path, map_location=self.device) - self.model.load_state_dict(checkpoint['best_model_state_dict']) + self.model.load_state_dict(checkpoint["best_model_state_dict"]) self.model.eval() def do_infer(self, input_tensor): @@ -284,4 +333,6 @@ def get_sut(model_path, preprocessed_data_dir, performance_count): """ Redirect the call for instantiating SUT to PyTorch/TorchScript specific SUT """ - return _3DUNET_PyTorch_CHECKPOINT_SUT(model_path, preprocessed_data_dir, performance_count) + return _3DUNET_PyTorch_CHECKPOINT_SUT( + model_path, preprocessed_data_dir, performance_count + ) diff --git a/vision/medical_imaging/3d-unet-kits19/run.py b/vision/medical_imaging/3d-unet-kits19/run.py index c4517e85c..9d48f9928 100644 --- a/vision/medical_imaging/3d-unet-kits19/run.py +++ b/vision/medical_imaging/3d-unet-kits19/run.py @@ -16,15 +16,14 @@ # limitations under the License. +import mlperf_loadgen as lg +from pathlib import Path +import subprocess +import argparse import os import sys -sys.path.insert(0, os.getcwd()) -import argparse -import subprocess - -from pathlib import Path -import mlperf_loadgen as lg +sys.path.insert(0, os.getcwd()) __doc__ = """ @@ -41,7 +40,7 @@ $(SCENARIO): Offline, SingleStream, MultiStream, or Server (Note: MultiStream may be deprecated) $(MODEL) should point to correct model for the chosen backend -If run for the accuracy, DICE scores will be summarized and printed at the end of the test, and +If run for the accuracy, DICE scores will be summarized and printed at the end of the test, and inference results will be stored as NIFTI files. 
Performance run can be more specific as: @@ -65,39 +64,52 @@ def get_args(): """ Args used for running 3D UNet KITS19 """ - parser = argparse.ArgumentParser(description=__doc__, - formatter_class=argparse.RawTextHelpFormatter) - - parser.add_argument("--backend", - choices=["pytorch", "pytorch_checkpoint", "onnxruntime", "tensorflow"], - default="pytorch", - help="Backend") - parser.add_argument("--scenario", - choices=["SingleStream", "Offline"], - default="Offline", - help="Scenario") - parser.add_argument("--accuracy", - action="store_true", - help="enable accuracy pass") - parser.add_argument("--mlperf_conf", - default="build/mlperf.conf", - help="mlperf rules config") - parser.add_argument("--user_conf", - default="user.conf", - help="user config for user LoadGen settings such as target QPS") - parser.add_argument("--audit_conf", - default="audit.conf", - help="audit config for LoadGen settings during compliance runs") - parser.add_argument("--model", - default="build/model/3dunet_kits19_pytorch.ptc", - help="Path to PyTorch, ONNX, or TF model") - parser.add_argument("--preprocessed_data_dir", - default="build/preprocessed_data", - help="path to preprocessed data") - parser.add_argument("--performance_count", - type=int, - default=None, - help="performance count") + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawTextHelpFormatter + ) + + parser.add_argument( + "--backend", + choices=["pytorch", "pytorch_checkpoint", "onnxruntime", "tensorflow"], + default="pytorch", + help="Backend", + ) + parser.add_argument( + "--scenario", + choices=["SingleStream", "Offline"], + default="Offline", + help="Scenario", + ) + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") + parser.add_argument( + "--mlperf_conf", default="build/mlperf.conf", help="mlperf rules config" + ) + parser.add_argument( + "--user_conf", + default="user.conf", + help="user config for user LoadGen settings such as target QPS", + ) + parser.add_argument( + "--audit_conf", + default="audit.conf", + help="audit config for LoadGen settings during compliance runs", + ) + parser.add_argument( + "--model", + default="build/model/3dunet_kits19_pytorch.ptc", + help="Path to PyTorch, ONNX, or TF model", + ) + parser.add_argument( + "--preprocessed_data_dir", + default="build/preprocessed_data", + help="path to preprocessed data", + ) + parser.add_argument( + "--performance_count", type=int, default=None, help="performance count" + ) args = parser.parse_args() return args @@ -118,7 +130,7 @@ def main(): "SingleStream": lg.TestScenario.SingleStream, "Offline": lg.TestScenario.Offline, "Server": lg.TestScenario.Server, - "MultiStream": lg.TestScenario.MultiStream + "MultiStream": lg.TestScenario.MultiStream, } args = get_args() @@ -134,8 +146,10 @@ def main(): from tensorflow_SUT import get_sut else: raise ValueError("Unknown backend: {:}".format(args.backend)) - sut = get_sut(args.model, args.preprocessed_data_dir, - args.performance_count) + sut = get_sut( + args.model, + args.preprocessed_data_dir, + args.performance_count) # setup LoadGen settings = lg.TestSettings() @@ -148,7 +162,9 @@ def main(): settings.mode = lg.TestMode.PerformanceOnly # set up mlperf logger - log_path = Path(os.environ.get("LOG_PATH", os.path.join("build", "logs"))).absolute() + log_path = Path( + os.environ.get("LOG_PATH", os.path.join("build", "logs")) + ).absolute() log_path.mkdir(parents=True, exist_ok=True) log_output_settings = lg.LogOutputSettings() log_output_settings.outdir 
= str(log_path) @@ -158,10 +174,12 @@ def main(): # start running test, from LoadGen print("Running Loadgen test...") - lg.StartTestWithLogSettings(sut.sut, sut.qsl.qsl, settings, log_settings, args.audit_conf) + lg.StartTestWithLogSettings( + sut.sut, sut.qsl.qsl, settings, log_settings, args.audit_conf + ) # if needed check accuracy - if args.accuracy and not os.environ.get('SKIP_VERIFY_ACCURACY', False): + if args.accuracy and not os.environ.get("SKIP_VERIFY_ACCURACY", False): print("Checking accuracy...") cmd = f"python3 accuracy_kits.py --preprocessed_data_dir={args.preprocessed_data_dir} --log_file={os.path.join(log_path, 'mlperf_log_accuracy.json')}" subprocess.check_call(cmd, shell=True) diff --git a/vision/medical_imaging/3d-unet-kits19/tensorflow_SUT.py b/vision/medical_imaging/3d-unet-kits19/tensorflow_SUT.py index 4e6a843ba..d2ba1bc52 100644 --- a/vision/medical_imaging/3d-unet-kits19/tensorflow_SUT.py +++ b/vision/medical_imaging/3d-unet-kits19/tensorflow_SUT.py @@ -61,8 +61,9 @@ def __init__(self, model_path, preprocessed_data_dir, performance_count): """ super().__init__(preprocessed_data_dir, performance_count) print("Loading TensorFlow model...") - assert Path(model_path, "saved_model.pb").is_file(),\ - "Cannot find the model file {:}!".format(model_path) + assert Path( + model_path, "saved_model.pb" + ).is_file(), "Cannot find the model file {:}!".format(model_path) loaded_model = tf.saved_model.load(model_path) self.model = loaded_model.signatures["serving_default"] self.output_name = list(self.model.structured_outputs)[0] @@ -78,4 +79,5 @@ def get_sut(model_path, preprocessed_data_dir, performance_count): """ Redirect the call for instantiating SUT to TensorFlow specific SUT """ - return _3DUNET_TensorFlow_SUT(model_path, preprocessed_data_dir, performance_count) + return _3DUNET_TensorFlow_SUT( + model_path, preprocessed_data_dir, performance_count) diff --git a/vision/medical_imaging/3d-unet-kits19/unet_onnx_to_tensorflow.py b/vision/medical_imaging/3d-unet-kits19/unet_onnx_to_tensorflow.py index 72d8fa3dd..4539e0b98 100644 --- a/vision/medical_imaging/3d-unet-kits19/unet_onnx_to_tensorflow.py +++ b/vision/medical_imaging/3d-unet-kits19/unet_onnx_to_tensorflow.py @@ -22,6 +22,7 @@ import argparse import os import sys + sys.path.insert(0, os.getcwd()) @@ -49,18 +50,23 @@ def get_args(): """ Args used for converting ONNX to TensorFlow model """ - parser = argparse.ArgumentParser(description=__doc__, - formatter_class=argparse.RawTextHelpFormatter) - - parser.add_argument("--model", - default="build/model/3dunet_kits19_128x128x128.onnx", - help="Path to the ONNX model") - parser.add_argument("--output_name", - default="3dunet_kits19_128x128x128.tf", - help="Name of output model") - parser.add_argument("--output_dir", - default="build/model", - help="Directory to save output model") + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawTextHelpFormatter + ) + + parser.add_argument( + "--model", + default="build/model/3dunet_kits19_128x128x128.onnx", + help="Path to the ONNX model", + ) + parser.add_argument( + "--output_name", + default="3dunet_kits19_128x128x128.tf", + help="Name of output model", + ) + parser.add_argument( + "--output_dir", default="build/model", help="Directory to save output model" + ) args = parser.parse_args() diff --git a/vision/medical_imaging/3d-unet-kits19/unet_pytorch_to_onnx.py b/vision/medical_imaging/3d-unet-kits19/unet_pytorch_to_onnx.py index 3c9e2ad1f..8164d7d05 100644 --- 
a/vision/medical_imaging/3d-unet-kits19/unet_pytorch_to_onnx.py +++ b/vision/medical_imaging/3d-unet-kits19/unet_pytorch_to_onnx.py @@ -16,17 +16,14 @@ # limitations under the License. +from global_vars import * +from pathlib import Path +import torch +import argparse import os import sys -sys.path.insert(0, os.getcwd()) - -import argparse -import torch - -from pathlib import Path - -from global_vars import * +sys.path.insert(0, os.getcwd()) __doc__ = """ @@ -50,21 +47,28 @@ def get_args(): """ Args used for converting PyTorch/TorchScript to ONNX model """ - parser = argparse.ArgumentParser(description=__doc__, - formatter_class=argparse.RawTextHelpFormatter) - - parser.add_argument("--model", - default="build/model/3dunet_kits19_pytorch.ptc", - help="Path to the PyTorch model") - parser.add_argument("--output_name", - default="3dunet_kits19_128x128x128.onnx", - help="Name of output model") - parser.add_argument("--dynamic_bs_output_name", - default="3dunet_kits19_128x128x128_dynbatch.onnx", - help="Name of output model") - parser.add_argument("--output_dir", - default="build/model", - help="Directory to save output model") + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawTextHelpFormatter + ) + + parser.add_argument( + "--model", + default="build/model/3dunet_kits19_pytorch.ptc", + help="Path to the PyTorch model", + ) + parser.add_argument( + "--output_name", + default="3dunet_kits19_128x128x128.onnx", + help="Name of output model", + ) + parser.add_argument( + "--dynamic_bs_output_name", + default="3dunet_kits19_128x128x128_dynbatch.onnx", + help="Name of output model", + ) + parser.add_argument( + "--output_dir", default="build/model", help="Directory to save output model" + ) args = parser.parse_args() @@ -85,11 +89,13 @@ def main(): output_path = Path(args.output_dir, args.output_name).absolute() dynamic_bs_output_path = Path( - args.output_dir, args.dynamic_bs_output_name).absolute() + args.output_dir, args.dynamic_bs_output_name + ).absolute() print("Loading PyTorch model...") - assert Path(model_path).is_file( - ), "Cannot find the model file {:}!".format(model_path) + assert Path(model_path).is_file(), "Cannot find the model file {:}!".format( + model_path + ) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model = torch.jit.load(model_path, map_location=device) @@ -100,24 +106,45 @@ def main(): output_channels = 3 depth, height, width = ROI_SHAPE - dummy_input = torch.rand( - [batchsize, input_channels, height, width, depth]).float().to(device) - dummy_output = torch.rand( - [batchsize, output_channels, height, width, depth]).float().to(device) + dummy_input = ( + torch.rand([batchsize, input_channels, height, + width, depth]).float().to(device) + ) + dummy_output = ( + torch.rand([batchsize, output_channels, height, width, depth]) + .float() + .to(device) + ) # using opset version 12 - torch.onnx.export(model, dummy_input, output_path, opset_version=12, - do_constant_folding=False, input_names=['input'], output_names=['output'], - example_outputs=dummy_output) - - torch.onnx.export(model, dummy_input, dynamic_bs_output_path, opset_version=12, - do_constant_folding=False, input_names=['input'], output_names=['output'], - dynamic_axes={"input": {0: "batch_size"}, - "output": {0: "batch_size"}}, - example_outputs=dummy_output) - - print("Successfully exported model:\n {}\nand\n {}".format( - output_path, dynamic_bs_output_path)) + torch.onnx.export( + model, + dummy_input, + output_path, + opset_version=12, + 
do_constant_folding=False, + input_names=["input"], + output_names=["output"], + example_outputs=dummy_output, + ) + + torch.onnx.export( + model, + dummy_input, + dynamic_bs_output_path, + opset_version=12, + do_constant_folding=False, + input_names=["input"], + output_names=["output"], + dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}}, + example_outputs=dummy_output, + ) + + print( + "Successfully exported model:\n {}\nand\n {}".format( + output_path, dynamic_bs_output_path + ) + ) if __name__ == "__main__":
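
The reformatted hunk above preserves the script's two torch.onnx.export calls: one producing a fixed-batch ONNX graph and one passing dynamic_axes so the batch dimension stays symbolic. A minimal standalone sketch of that same export pattern follows; the toy module, file names, and shapes are illustrative assumptions and are not part of the patch, which loads the real 3D-UNet TorchScript checkpoint instead.

    import torch
    import torch.nn as nn

    # Toy stand-in for the 3D-UNet model; the real script loads a TorchScript
    # checkpoint with torch.jit.load and feeds a [1, 1, 128, 128, 128] input.
    model = nn.Conv3d(in_channels=1, out_channels=3, kernel_size=1)
    dummy_input = torch.rand(1, 1, 128, 128, 128)

    # Fixed-batch export, mirroring the first call in the patch.
    torch.onnx.export(
        model,
        dummy_input,
        "unet_fixed_batch.onnx",
        opset_version=12,
        do_constant_folding=False,
        input_names=["input"],
        output_names=["output"],
    )

    # Dynamic-batch export: dynamic_axes keeps dimension 0 symbolic, so the
    # resulting graph accepts any batch size at inference time.
    torch.onnx.export(
        model,
        dummy_input,
        "unet_dynamic_batch.onnx",
        opset_version=12,
        do_constant_folding=False,
        input_names=["input"],
        output_names=["output"],
        dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},
    )

The example_outputs argument that appears in the patch was used by older torch releases when exporting ScriptModules; it is omitted in this sketch because the toy module is a plain nn.Module.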