From 55cff2f679e712aa412a78ddbf8b2eeac153932d Mon Sep 17 00:00:00 2001
From: mpecchi
Date: Sat, 4 May 2024 19:03:41 -0400
Subject: [PATCH] initial commits with new projects (derivation is not implemented)

---
 src/gcms_data_analysis/gcms.py          | 345 ++++++++++++++++++++++++
 tests/data_minimal_case/files_info.xlsx | Bin 9100 -> 8999 bytes
 tests/test_project_class.py             |  50 ++++
 3 files changed, 395 insertions(+)
 create mode 100644 src/gcms_data_analysis/gcms.py
 create mode 100644 tests/test_project_class.py

diff --git a/src/gcms_data_analysis/gcms.py b/src/gcms_data_analysis/gcms.py
new file mode 100644
index 0000000..5c881df
--- /dev/null
+++ b/src/gcms_data_analysis/gcms.py
@@ -0,0 +1,345 @@
+from __future__ import annotations
+from typing import Literal
+import pathlib as plib
+import pandas as pd
+
+
+class Project:
+    """
+    Represents a project (identified by the folder where the data is stored)
+    for GC-MS data analysis.
+    """
+
+    def __init__(
+        self,
+        folder_path: plib.Path | str,
+        name: str | None = None,
+        apply_semi_calibration: bool = True,
+        tanimoto_similarity_threshold: float = 0.4,
+        delta_mol_weight_threshold: int = 100,
+        file_load_skiprows: int = 8,
+        file_load_delimiter: Literal["\t", ",", ";"] = "\t",
+        file_load_format: Literal[".txt", ".csv"] = ".txt",
+        column_to_sort_values_in_samples: Literal[
+            "retention_time", "area", "height"
+        ] = "retention_time",
+        plot_font: Literal["Dejavu Sans", "Times New Roman"] = "Dejavu Sans",
+        plot_grid: bool = False,
+        auto_save_reports: bool = True,
+        columns_to_rename_and_keep_in_files: dict[str, str] | None = None,
+        compounds_to_rename_in_files: dict[str, str] | None = None,
+        param_to_axis_label: dict[str, str] | None = None,
+        string_in_deriv_names: list[str] | None = None,
+    ):
+        self.folder_path = plib.Path(folder_path)
+        self.out_path = plib.Path(self.folder_path, "output")
+        if name is None:
+            self.name = self.folder_path.parts[-1]
+        else:
+            self.name = name
+        self.apply_semi_calibration = apply_semi_calibration
+        self.tanimoto_similarity_threshold = tanimoto_similarity_threshold
+        self.delta_mol_weight_threshold = delta_mol_weight_threshold
+        self.file_load_skiprows = file_load_skiprows
+        self.file_load_delimiter = file_load_delimiter
+        self.file_load_format = file_load_format
+        self.column_to_sort_values_in_samples = column_to_sort_values_in_samples
+        self.plot_font = plot_font
+        self.plot_grid = plot_grid
+        self.auto_save_reports = auto_save_reports
+
+        if columns_to_rename_and_keep_in_files is None:
+            self.columns_to_rename_and_keep_in_files = {
+                "Ret.Time": "retention_time",
+                "Height": "height",
+                "Area": "area",
+                "Name": "comp_name",
+            }
+        else:
+            self.columns_to_rename_and_keep_in_files = (
+                columns_to_rename_and_keep_in_files
+            )
+        if compounds_to_rename_in_files is None:
+            self.compounds_to_rename_in_files = {}
+        else:
+            self.compounds_to_rename_in_files = compounds_to_rename_in_files
+        if param_to_axis_label is None:
+            self.param_to_axis_label = {
+                "height": "Peak Height [-]",
+                "area": "Peak Area [-]",
+                "area_if_undiluted": "Peak Area [-]",
+                "conc_vial_mg_L": "conc. [mg/L] (ppm)",
+                "conc_vial_if_undiluted_mg_L": "conc. [mg/L] (ppm)",
[mg/L] (ppm)", + "fraction_of_sample_fr": "mass fraction [g/g$_{sample}$]", + "fraction_of_feedstock_fr": "mass fraction [g/g$_{feedstock}$]", + } + else: + self.param_to_axis_label = param_to_axis_label + if string_in_deriv_names is None: + self.string_in_deriv_names = [ + "deriv", + "tms", + "tbms", + "trimethylsilyl", + ] + else: + self.string_in_deriv_names = string_in_deriv_names + # this does not depend on initialization, static default + self.files_info_defauls_columns = [ + "dilution_factor", + "total_sample_conc_in_vial_mg_L", + "sample_yield_on_feedstock_basis_fr", + ] + + self.files_info: pd.DataFrame | None = None + self.class_code_frac: pd.DataFrame | None = None + self.dict_classes_to_codes: dict[str, str] | None = None + self.dict_classes_to_mass_fractions: dict[str, float] | None = None + + self.list_of_all_compounds: list[str] | None = None + self.compounds_properties: pd.DataFrame | None = None + + self.deriv_list_of_all_compounds: list[str] | None = None + self.deriv_files_present: bool = False + self.deriv_is_calibrations: dict[str:bool] = {} + self.deriv_compounds_properties: pd.DataFrame | None = None + self.deriv_is_files: dict[str, bool] | None = None + + self.samples_info: pd.DataFrame | None = None + self.samples_info_std: pd.DataFrame | None = None + self.samples: dict[str, pd.DataFrame] | None = None + self.samples_std: dict[str, pd.DataFrame] | None = None + + self.list_of_files_param_reports = [] + self.list_of_files_param_aggrreps = [] + self.list_of_samples_param_reports = [] + self.list_of_samples_param_aggrreps = [] + self.files: dict[str, pd.DataFrame] = {} + self.calibrations: dict[str : pd.DataFrame] = {} + self.files_reports = {} + self.files_aggrreps = {} + self.samples_reports = {} + self.samples_reports_std = {} + self.samples_aggrreps = {} + self.samples_aggrreps_std = {} + + self.columns_to_keep_in_files: list[str] = list( + self.columns_to_rename_and_keep_in_files.keys() + ) + self.acceptable_params: list[str] = list(self.param_to_axis_label.keys()) + + def load_files_info(self, update_saved_files_info: bool = True) -> pd.DataFrame: + """ """ + files_info_path = plib.Path(self.folder_path, "files_info.xlsx") + if files_info_path.exists(): + files_info = pd.read_excel( + files_info_path, engine="openpyxl", index_col="filename" + ) + self.files_info = self._add_default_to_files_info(files_info) + print("Info: files_info loaded") + else: + print("Info: files_info not found") + self.files_info = self.create_files_info() + if update_saved_files_info: + self.files_info.to_excel(plib.Path(self.folder_path, "files_info.xlsx")) + return self.files_info + + def create_files_info(self, update_saved_files_info: bool = False) -> pd.DataFrame: + """ """ + filename: list[str] = [ + a.parts[-1].split(".")[0] for a in list(self.folder_path.glob("**/*.txt")) + ] + samplename = [f.split("_")[0] for f in filename] + replicatenumber = [int(f.split("_")[1]) for f in filename] + files_info_unsorted = pd.DataFrame( + index=filename, + data={ + "samplename": samplename, + "replicatenumber": replicatenumber, + }, + ) + files_info = files_info_unsorted.sort_index() + files_info.index.name = "filename" + self.files_info = self._add_default_to_files_info(files_info) + if update_saved_files_info: + self.files_info.to_excel(plib.Path(self.folder_path, "files_info.xlsx")) + return self.files_info + + def _add_default_to_files_info( + self, files_info_no_defaults: pd.DataFrame + ) -> pd.DataFrame: + """ """ + if "derivatized" not in list(files_info_no_defaults): + 
files_info_no_defaults["derivatized"] = False + if "calibration_file" not in list(files_info_no_defaults): + files_info_no_defaults["calibration_file"] = False + for col in self.files_info_defauls_columns: + if col not in list(files_info_no_defaults): + files_info_no_defaults[col] = 1 + return files_info_no_defaults + + def create_samples_info(self): + """Creates a summary 'samples_info' DataFrame from 'files_info', + aggregating data for each sample, and updates the 'samples_info' + attribute with this summarized data.""" + if self.files_info is None: + _ = self.load_files_info() + self.samples_info = ( + self.files_info.reset_index().groupby("samplename").agg(list) + ) + # self.samples_info.reset_index(inplace=True) + self.samples_info.set_index("samplename", drop=True, inplace=True) + print("Info: create_samples_info: samples_info created") + return self.samples_info + + def load_all_files(self): + """Loads all files listed in 'files_info' into a dictionary, where keys are + filenames. Each file is processed to clean and standardize data. It updates the + 'files' attribute with data frames of file contents and 'is_files_deriv' with + derivative information. Marks 'files_loaded' as True after loading.""" + print("Info: load_all_files: loop started") + if self.files_info is None: + self.load_files_info() + for filename in self.files_info.index: + file = self.load_single_file(filename) + self.files[filename] = file + print("Info: load_all_files: files loaded") + return self.files + + def load_single_file(self, filename) -> pd.DataFrame: + """Loads a single GCMS file by its name, cleans, and processes the data according + to project settings (e.g., delimiter, columns to keep). It sums areas for duplicated + compound names and handles dilution factors. Updates the file's data with iupac names + and reorders columns. Logs the process and returns the cleaned DataFrame.""" + file = pd.read_csv( + plib.Path(self.folder_path, filename + self.file_load_format), + delimiter=self.file_load_delimiter, + index_col=0, + skiprows=self.file_load_skiprows, + ) + columns_to_drop = [ + cl for cl in file.columns if cl not in self.columns_to_keep_in_files + ] + file.drop(columns_to_drop, axis=1, inplace=True) + file.rename( + self.columns_to_rename_and_keep_in_files, inplace=True, axis="columns" + ) + + file["comp_name"] = file["comp_name"].fillna("unidentified") + sum_areas_in_file = file.groupby("comp_name")["area"].sum() + # the first ret time is kept for each duplicated Name + file.drop_duplicates(subset="comp_name", keep="first", inplace=True) + file.set_index("comp_name", inplace=True) # set the cas as the index + file["area"] = sum_areas_in_file # used summed areas as areas + + file["area_if_undiluted"] = ( + file["area"] * self.files_info.loc[filename, "dilution_factor"] + ) + file["iupac_name"] = "n.a." 
+ new_cols_order = ["iupac_name"] + [ + col for col in file.columns if col != "iupac_name" + ] + file = file[new_cols_order] + file.index.name = filename + file.index = file.index.map(lambda x: x.lower()) + file.rename(self.compounds_to_rename_in_files, inplace=True) + print("\tInfo: load_single_file ", filename) + return file + + def load_class_code_frac(self) -> pd.DataFrame: + """ """ + class_code_frac_path = plib.Path( + self.folder_path, "classifications_codes_fractions.xlsx" + ) + if class_code_frac_path.exists(): + self.class_code_frac = pd.read_excel(class_code_frac_path) + else: + raise FileNotFoundError( + '"classifications_codes_fractions.xlsx" not found in folder_path' + ) + all_classes = self.class_code_frac.classes.tolist() + codes = self.class_code_frac.codes.tolist() # list of code for each class + mfs = self.class_code_frac.mfs.tolist() # list of mass fraction of each class + self.dict_classes_to_codes = dict(zip(all_classes, codes)) # dictionaries + self.dict_classes_to_mass_fractions = dict( + zip(all_classes, mfs) + ) # dictionaries + return self.class_code_frac + + def load_calibrations(self): + """Loads calibration data from Excel files specified in the 'files_info' DataFrame, + handles missing files, and coerces non-numeric values to NaN in calibration data + columns. It ensures each calibration file is loaded once, updates the 'calibrations' + attribute with calibration data, and sets 'calibrations_loaded' and + 'calibrations_not_present' flags based on the presence of calibration files.""" + if self.files_info is None: + self.load_files_info() + + if any(self.files_info["calibration_file"]): + for cal_name in set(self.files_info["calibration_file"].tolist()): + cal_path = plib.Path(self.folder_path, cal_name + ".xlsx") + if cal_path.exists(): + cal_file = pd.read_excel(cal_path, index_col=0) + else: + raise FileNotFoundError(f"{cal_name=} not found in folder_path") + + cal_file.index.name = "comp_name" + cols_cal_area = [c for c in list(cal_file) if "Area" in c] + cols_cal_ppms = [c for c in list(cal_file) if "PPM" in c] + cal_file[cols_cal_area + cols_cal_ppms] = cal_file[ + cols_cal_area + cols_cal_ppms + ].apply(pd.to_numeric, errors="coerce") + self.calibrations[cal_name] = cal_file + return self.calibrations + + def create_list_of_all_compounds(self): + """Compiles a list of all unique compounds across all loaded files and calibrations, + only for underivatized compounds. It ensures all files + are loaded before compiling the list, excludes 'unidentified' compounds, and updates + the 'list_of_all_compounds' attribute. 
+    def create_list_of_all_compounds(self):
+        """Compiles a list of all unique compounds across all loaded files and calibrations,
+        only for underivatized compounds. It ensures all files
+        are loaded before compiling the list, excludes 'unidentified' compounds, and updates
+        the 'list_of_all_compounds' attribute. Logs completion and returns the list."""
+        if not self.files:
+            self.load_all_files()
+        if not self.calibrations:
+            self.load_calibrations()
+        all_dfs_with_comps = []
+        for file in self.files.values():
+            all_dfs_with_comps.append(file)
+        for calib in self.calibrations.values():
+            all_dfs_with_comps.append(calib)
+        # non-derivatized compounds
+        all_compounds: pd.DataFrame = pd.concat(all_dfs_with_comps)
+
+        set_of_all_compounds = pd.Index(all_compounds.index.unique())
+        # use a set comprehension to remove unwanted elements
+        filtered_compounds = {
+            compound.strip()  # remove leading/trailing spaces
+            for compound in set_of_all_compounds
+            if compound not in ["unidentified", None, False, "", " ", "''"]
+        }
+        # convert the filtered set to a list
+        self.list_of_all_compounds = list(filtered_compounds)
+        print(f"Info: created {len(self.list_of_all_compounds) = }")
+        return self.list_of_all_compounds
+
+    def load_compounds_properties(self):
+        """Attempts to load the 'compounds_properties.xlsx' file containing physical
+        and chemical properties of compounds. If not found, it creates a new properties
+        DataFrame and updates the 'compounds_properties_created' attribute."""
+        compounds_properties_path = plib.Path(
+            self.folder_path, "compounds_properties.xlsx"
+        )
+        if compounds_properties_path.exists():
+            cpdf = pd.read_excel(
+                compounds_properties_path,
+                index_col="comp_name",
+            )
+            # cpdf = _order_columns_in_compounds_properties(cpdf)
+            # cpdf = cpdf.fillna(0)
+            self.compounds_properties = cpdf
+            self.compounds_properties_created = True
+            print("Info: compounds_properties loaded")
+        else:
+            print("Warning: compounds_properties.xlsx not found, creating it")
+            # create_compounds_properties is not yet defined in this commit
+            self.compounds_properties = self.create_compounds_properties()
+        return self.compounds_properties
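+
+
+# Expected contents of folder_path (a sketch inferred from the loaders above;
+# names in angle brackets are placeholders, not fixed filenames):
+#   files_info.xlsx                        optional, recreated by create_files_info
+#   classifications_codes_fractions.xlsx   required by load_class_code_frac
+#                                          (columns: classes, codes, mfs)
+#   compounds_properties.xlsx              optional, loaded by load_compounds_properties
+#   <calibration_name>.xlsx                one per entry in files_info["calibration_file"]
+#   <samplename>_<replicate>.txt           one GC-MS export per run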
diff --git a/tests/data_minimal_case/files_info.xlsx b/tests/data_minimal_case/files_info.xlsx
index 687f990a5c691c3a0fbed64a49bf1427d4ff9970..72a589a045d37be4027affaa3b9323ce8300dcdc 100644
GIT binary patch
(base85 binary delta omitted: files_info.xlsx, 9100 -> 8999 bytes)
diff --git a/tests/test_project_class.py b/tests/test_project_class.py
new file mode 100644
index 0000000..4029cee
--- /dev/null
+++ b/tests/test_project_class.py
@@ -0,0 +1,50 @@
+# %%
+import pytest
+import pathlib as plib
+from gcms_data_analysis.gcms import Project
+from pandas.testing import assert_frame_equal
+
+
+folder_path: plib.Path = plib.Path(__file__).parent / "data_minimal_case"
+
+# %%
+proj = Project(
+    folder_path=folder_path,
+    auto_save_reports=False,
+    compounds_to_rename_in_files={"phenol": "renamed_phenol"},
+)
+
+# check a couple of defaults
+assert proj.column_to_sort_values_in_samples == "retention_time"
+assert proj.delta_mol_weight_threshold == 100
+assert proj.acceptable_params == [
+    "height",
+    "area",
+    "area_if_undiluted",
+    "conc_vial_mg_L",
+    "conc_vial_if_undiluted_mg_L",
+    "fraction_of_sample_fr",
+    "fraction_of_feedstock_fr",
+]
+# %%
+fic = proj.create_files_info(update_saved_files_info=False)
+fil = proj.load_files_info(update_saved_files_info=False)
+
+fic.calibration_file = fil.calibration_file  # this cannot be updated automatically
+assert_frame_equal(fil, fic, check_exact=False, atol=1e-5, rtol=1e-5)
+# print(fil.columns)
+# print(fic.columns)
+# print(fil.index)
+# print(fic.index)
+# print(fil == fic)
+# %%
+files = proj.load_all_files()
+# %%
+ccf = proj.load_class_code_frac()
+
+# %%
+cal = proj.load_calibrations()
+print(cal)
+# %%
+lac = proj.create_list_of_all_compounds()
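+# %%
+# illustrative follow-up (not asserted): samples_info aggregates the rows of
+# files_info by samplename, so the minimal case should yield one row per sample
+sai = proj.create_samples_info()
+print(sai)
+# %%
+# load_compounds_properties would be the next step, but its fallback
+# (create_compounds_properties) is not implemented in this commit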