Fix std issue in samples: align every file to the union of all compound indices instead of only the first file's index

mpecchi · Mar 19, 2024 · 18fe33d · 18fe33d
1 parent 48185bd
commit 18fe33d
Show file tree

Hide file tree

Showing 8 changed files with 17 additions and 8 deletions.
diff --git a/example/example_gcms_data_analysis.py b/example/example_gcms_data_analysis.py
@@ -5,7 +5,7 @@
 )  # Import the Project class from the gcms_data_analysis package
 
 # Define the folder path where your data is located. Change this path to where you've stored your data files.
-folder_path = plib.Path(plib.Path(__file__).parent, "example\data")
+# folder_path = plib.Path(plib.Path(__file__).parent, "example\data")
 folder_path = plib.Path(
     r"C:\Users\mp933\OneDrive - Cornell University\Python\gcms_data_analysis\example\data"
 )

diff --git a/example/output/files_info.xlsx b/example/output/files_info.xlsx
diff --git a/example/output/samples/samples_info.xlsx b/example/output/samples/samples_info.xlsx
diff --git a/example/output/samples/samples_info_std.xlsx b/example/output/samples/samples_info_std.xlsx
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "gcms_data_analysis"
-version = "1.0.6"
+version = "1.0.7"
 authors = [{ name = "Matteo Pecchi" }]
 description = "Automatic analysis of GC-MS data"
 readme = "README.md"

diff --git a/src/gcms_data_analysis/main.py b/src/gcms_data_analysis/main.py
@@ -2631,12 +2631,21 @@ def _create_sample_from_files(
             non_num_columns = ["iupac_name", "compound_used_for_calibration"]
         else:
             non_num_columns = ["iupac_name"]
+        # Step 1: Create a comprehensive index of all unique compounds
+        all_compounds = pd.Index([])
+        for df in files_in_sample:
+            all_compounds = all_compounds.union(df.index)
+
+        # Step 2: Align all DataFrames to the comprehensive index
         aligned_dfs: list[pd.DataFrame] = [
-            df.align(files_in_sample[0], join="outer", axis=0)[0]
-            for df in files_in_sample
-        ]  # Align indices
+            df.reindex(all_compounds) for df in files_in_sample
+        ]
+        # aligned_dfs = [
+        #     df.align(files_in_sample[0], join="outer", axis=0)[0]
+        #     for df in files_in_sample
+        # ]  # Align indices
         # Fill NaN values for numerical columns after alignment and before concatenation
-        filled_dfs = [df.fillna(0) for df in aligned_dfs]
+        filled_dfs = [df.fillna(0.0) for df in aligned_dfs]
         # Keep non-numerical data separately and ensure no duplicates
         non_num_data: pd.DataFrame = pd.concat(
             [df[non_num_columns].drop_duplicates() for df in files_in_sample]
@@ -3038,8 +3047,8 @@ def plot_ave_std(
         aggr: bool = False,
         min_y_thresh: float | None = None,
         only_samples_to_plot: list[str] = None,
-        rename_samples: list[str] =None,
-        reorder_samples: list[str] =None,
+        rename_samples: list[str] = None,
+        reorder_samples: list[str] = None,
         item_to_color_to_hatch: pd.DataFrame | None = None,
         paper_col=0.8,
         fig_hgt_mlt=1.5,

diff --git a/tests/data_for_testing/compounds_properties.xlsx b/tests/data_for_testing/compounds_properties.xlsx
diff --git a/tests/data_for_testing/deriv_compounds_properties.xlsx b/tests/data_for_testing/deriv_compounds_properties.xlsx