mlcommons · sarthakpati · May 22, 2023 · May 22, 2023 · May 22, 2023 · May 22, 2023
@@ -20,11 +20,17 @@
 def parse_gandlf_csv(fpath):
     df, _ = parseTrainingCSV(fpath, train=False)
     df = df.drop_duplicates()
+    # NaNs could be dropped automatically (e.g. with df.dropna(axis=1, how='all')),
+    # but we deliberately fail instead so that the user inspects and fixes the CSV:
+    # labels may have been accidentally removed for some subjects, but not all
+    assert (
+        df.isnull().values.any() == False
+    ), "Data CSV contains null/nan values, please check."
     for _, row in df.iterrows():
         if "Label" in row:
             yield row["SubjectID"], row["Channel_0"], row["Label"]
         else:
-            yield row["SubjectID"], row["Channel_0"]
+            yield row["SubjectID"], row["Channel_0"], None
 
 
 def patch_extraction(input_path, output_path, config=None):
@@ -63,7 +69,8 @@ def patch_extraction(input_path, output_path, config=None):
     for sid, slide, label in parse_gandlf_csv(input_path):
         # Create new instance of slide manager
         manager = PatchManager(slide, os.path.join(output_path, str(sid)))
-        manager.set_label_map(label)
+        if label is not None:
+            manager.set_label_map(label)
         manager.set_subjectID(str(sid))
         manager.set_image_header("Channel_0")
         manager.set_mask_header("Label")

@@ -435,9 +435,8 @@ def mine_patches(self, config, output_csv=None):
                 new_df_rows.append(new_row)
 
             new_df = pd.DataFrame(new_df_rows)
-            output_df = pd.concat(
-                [output_df, new_df]
-            )  # Concatenate in case there is a pre-existing dataframe
+            # Concatenate in case there is a pre-existing dataframe
+            output_df = pd.concat([output_df, new_df])
 
         output_df.to_csv(csv_filename, index=False)
 

@@ -200,13 +200,14 @@ def convert_relative_paths_in_dataframe(input_dataframe, headers, path_root):
         if (loc == headers["labelHeader"]) or (loc in headers["channelHeaders"]):
             # These entries can be considered as paths to files
             for index, entry in enumerate(input_dataframe[column]):
-                this_path = pathlib.Path(entry)
-                start_path = pathlib.Path(path_root)
-                if start_path.is_file():
-                    start_path = start_path.parent
-                if not this_path.is_file():
-                    if not this_path.is_absolute():
-                        input_dataframe.loc[index, column] = str(
-                            start_path.joinpath(this_path)
-                        )
+                if isinstance(entry, str):
+                    this_path = pathlib.Path(entry)
+                    start_path = pathlib.Path(path_root)
+                    if start_path.is_file():
+                        start_path = start_path.parent
+                    if not this_path.is_file():
+                        if not this_path.is_absolute():
+                            input_dataframe.loc[index, column] = str(
+                                start_path.joinpath(this_path)
+                            )
     return input_dataframe
@@ -6,6 +6,7 @@ import os
 import argparse
 import ast
 import sys
+import traceback
 
 from GANDLF import version
 from GANDLF.cli import main_run, copyrightMessage
@@ -136,7 +137,7 @@ if __name__ == "__main__":
             args.reset,
             args.outputdir,
         )
-    except Exception as e:
-        sys.exit("ERROR: " + str(e))
+    except Exception:
+        sys.exit("ERROR: " + traceback.format_exc())
 
     print("Finished.")