From 55cff2f679e712aa412a78ddbf8b2eeac153932d Mon Sep 17 00:00:00 2001
From: mpecchi
Date: Sat, 4 May 2024 19:03:41 -0400
Subject: [PATCH] initial commits with new projects (derivation is not implemented)

---
 src/gcms_data_analysis/gcms.py          | 345 ++++++++++++++++++++++++
 tests/data_minimal_case/files_info.xlsx | Bin 9100 -> 8999 bytes
 tests/test_project_class.py             |  50 ++++
 3 files changed, 395 insertions(+)
 create mode 100644 src/gcms_data_analysis/gcms.py
 create mode 100644 tests/test_project_class.py

diff --git a/src/gcms_data_analysis/gcms.py b/src/gcms_data_analysis/gcms.py
new file mode 100644
index 0000000..5c881df
--- /dev/null
+++ b/src/gcms_data_analysis/gcms.py
@@ -0,0 +1,345 @@
+from __future__ import annotations
+from typing import Literal
+import pathlib as plib
+import pandas as pd
+
+
+class Project:
+    """
+    Represents a project (identified by the folder where the data is stored)
+    for GC-MS data analysis.
+    """
+
+    def __init__(
+        self,
+        folder_path: plib.Path | str,
+        name: str | None = None,
+        apply_semi_calibration: bool = True,
+        tanimoto_similarity_threshold: float = 0.4,
+        delta_mol_weight_threshold: int = 100,
+        file_load_skiprows: int = 8,
+        file_load_delimiter: Literal["\t", ",", ";"] = "\t",
+        file_load_format: Literal[".txt", ".csv"] = ".txt",
+        column_to_sort_values_in_samples: Literal[
+            "retention_time", "area", "height"
+        ] = "retention_time",
+        plot_font: Literal["Dejavu Sans", "Times New Roman"] = "Dejavu Sans",
+        plot_grid: bool = False,
+        auto_save_reports: bool = True,
+        columns_to_rename_and_keep_in_files: dict[str, str] | None = None,
+        compounds_to_rename_in_files: dict[str, str] | None = None,
+        param_to_axis_label: dict[str, str] | None = None,
+        string_in_deriv_names: list[str] | None = None,
+    ):
+        self.folder_path = plib.Path(folder_path)
+        self.out_path = plib.Path(self.folder_path, "output")
+        if name is None:
+            self.name = self.folder_path.parts[-1]
+        else:
+            self.name = name
+        self.apply_semi_calibration = apply_semi_calibration
+        self.tanimoto_similarity_threshold = tanimoto_similarity_threshold
+        self.delta_mol_weight_threshold = delta_mol_weight_threshold
+        self.file_load_skiprows = file_load_skiprows
+        self.file_load_delimiter = file_load_delimiter
+        self.file_load_format = file_load_format
+        self.column_to_sort_values_in_samples = column_to_sort_values_in_samples
+        self.plot_font = plot_font
+        self.plot_grid = plot_grid
+        self.auto_save_reports = auto_save_reports
+
+        if columns_to_rename_and_keep_in_files is None:
+            self.columns_to_rename_and_keep_in_files = {
+                "Ret.Time": "retention_time",
+                "Height": "height",
+                "Area": "area",
+                "Name": "comp_name",
+            }
+        else:
+            self.columns_to_rename_and_keep_in_files = (
+                columns_to_rename_and_keep_in_files
+            )
+        if compounds_to_rename_in_files is None:
+            self.compounds_to_rename_in_files = {}
+        else:
+            self.compounds_to_rename_in_files = compounds_to_rename_in_files
+        if param_to_axis_label is None:
+            self.param_to_axis_label = {
+                "height": "Peak Height [-]",
+                "area": "Peak Area [-]",
+                "area_if_undiluted": "Peak Area [-]",
+                "conc_vial_mg_L": "conc. [mg/L] (ppm)",
+                "conc_vial_if_undiluted_mg_L": "conc. [mg/L] (ppm)",
[mg/L] (ppm)", + "fraction_of_sample_fr": "mass fraction [g/g$_{sample}$]", + "fraction_of_feedstock_fr": "mass fraction [g/g$_{feedstock}$]", + } + else: + self.param_to_axis_label = param_to_axis_label + if string_in_deriv_names is None: + self.string_in_deriv_names = [ + "deriv", + "tms", + "tbms", + "trimethylsilyl", + ] + else: + self.string_in_deriv_names = string_in_deriv_names + # this does not depend on initialization, static default + self.files_info_defauls_columns = [ + "dilution_factor", + "total_sample_conc_in_vial_mg_L", + "sample_yield_on_feedstock_basis_fr", + ] + + self.files_info: pd.DataFrame | None = None + self.class_code_frac: pd.DataFrame | None = None + self.dict_classes_to_codes: dict[str, str] | None = None + self.dict_classes_to_mass_fractions: dict[str, float] | None = None + + self.list_of_all_compounds: list[str] | None = None + self.compounds_properties: pd.DataFrame | None = None + + self.deriv_list_of_all_compounds: list[str] | None = None + self.deriv_files_present: bool = False + self.deriv_is_calibrations: dict[str:bool] = {} + self.deriv_compounds_properties: pd.DataFrame | None = None + self.deriv_is_files: dict[str, bool] | None = None + + self.samples_info: pd.DataFrame | None = None + self.samples_info_std: pd.DataFrame | None = None + self.samples: dict[str, pd.DataFrame] | None = None + self.samples_std: dict[str, pd.DataFrame] | None = None + + self.list_of_files_param_reports = [] + self.list_of_files_param_aggrreps = [] + self.list_of_samples_param_reports = [] + self.list_of_samples_param_aggrreps = [] + self.files: dict[str, pd.DataFrame] = {} + self.calibrations: dict[str : pd.DataFrame] = {} + self.files_reports = {} + self.files_aggrreps = {} + self.samples_reports = {} + self.samples_reports_std = {} + self.samples_aggrreps = {} + self.samples_aggrreps_std = {} + + self.columns_to_keep_in_files: list[str] = list( + self.columns_to_rename_and_keep_in_files.keys() + ) + self.acceptable_params: list[str] = list(self.param_to_axis_label.keys()) + + def load_files_info(self, update_saved_files_info: bool = True) -> pd.DataFrame: + """ """ + files_info_path = plib.Path(self.folder_path, "files_info.xlsx") + if files_info_path.exists(): + files_info = pd.read_excel( + files_info_path, engine="openpyxl", index_col="filename" + ) + self.files_info = self._add_default_to_files_info(files_info) + print("Info: files_info loaded") + else: + print("Info: files_info not found") + self.files_info = self.create_files_info() + if update_saved_files_info: + self.files_info.to_excel(plib.Path(self.folder_path, "files_info.xlsx")) + return self.files_info + + def create_files_info(self, update_saved_files_info: bool = False) -> pd.DataFrame: + """ """ + filename: list[str] = [ + a.parts[-1].split(".")[0] for a in list(self.folder_path.glob("**/*.txt")) + ] + samplename = [f.split("_")[0] for f in filename] + replicatenumber = [int(f.split("_")[1]) for f in filename] + files_info_unsorted = pd.DataFrame( + index=filename, + data={ + "samplename": samplename, + "replicatenumber": replicatenumber, + }, + ) + files_info = files_info_unsorted.sort_index() + files_info.index.name = "filename" + self.files_info = self._add_default_to_files_info(files_info) + if update_saved_files_info: + self.files_info.to_excel(plib.Path(self.folder_path, "files_info.xlsx")) + return self.files_info + + def _add_default_to_files_info( + self, files_info_no_defaults: pd.DataFrame + ) -> pd.DataFrame: + """ """ + if "derivatized" not in list(files_info_no_defaults): + 
files_info_no_defaults["derivatized"] = False + if "calibration_file" not in list(files_info_no_defaults): + files_info_no_defaults["calibration_file"] = False + for col in self.files_info_defauls_columns: + if col not in list(files_info_no_defaults): + files_info_no_defaults[col] = 1 + return files_info_no_defaults + + def create_samples_info(self): + """Creates a summary 'samples_info' DataFrame from 'files_info', + aggregating data for each sample, and updates the 'samples_info' + attribute with this summarized data.""" + if self.files_info is None: + _ = self.load_files_info() + self.samples_info = ( + self.files_info.reset_index().groupby("samplename").agg(list) + ) + # self.samples_info.reset_index(inplace=True) + self.samples_info.set_index("samplename", drop=True, inplace=True) + print("Info: create_samples_info: samples_info created") + return self.samples_info + + def load_all_files(self): + """Loads all files listed in 'files_info' into a dictionary, where keys are + filenames. Each file is processed to clean and standardize data. It updates the + 'files' attribute with data frames of file contents and 'is_files_deriv' with + derivative information. Marks 'files_loaded' as True after loading.""" + print("Info: load_all_files: loop started") + if self.files_info is None: + self.load_files_info() + for filename in self.files_info.index: + file = self.load_single_file(filename) + self.files[filename] = file + print("Info: load_all_files: files loaded") + return self.files + + def load_single_file(self, filename) -> pd.DataFrame: + """Loads a single GCMS file by its name, cleans, and processes the data according + to project settings (e.g., delimiter, columns to keep). It sums areas for duplicated + compound names and handles dilution factors. Updates the file's data with iupac names + and reorders columns. Logs the process and returns the cleaned DataFrame.""" + file = pd.read_csv( + plib.Path(self.folder_path, filename + self.file_load_format), + delimiter=self.file_load_delimiter, + index_col=0, + skiprows=self.file_load_skiprows, + ) + columns_to_drop = [ + cl for cl in file.columns if cl not in self.columns_to_keep_in_files + ] + file.drop(columns_to_drop, axis=1, inplace=True) + file.rename( + self.columns_to_rename_and_keep_in_files, inplace=True, axis="columns" + ) + + file["comp_name"] = file["comp_name"].fillna("unidentified") + sum_areas_in_file = file.groupby("comp_name")["area"].sum() + # the first ret time is kept for each duplicated Name + file.drop_duplicates(subset="comp_name", keep="first", inplace=True) + file.set_index("comp_name", inplace=True) # set the cas as the index + file["area"] = sum_areas_in_file # used summed areas as areas + + file["area_if_undiluted"] = ( + file["area"] * self.files_info.loc[filename, "dilution_factor"] + ) + file["iupac_name"] = "n.a." 
+ new_cols_order = ["iupac_name"] + [ + col for col in file.columns if col != "iupac_name" + ] + file = file[new_cols_order] + file.index.name = filename + file.index = file.index.map(lambda x: x.lower()) + file.rename(self.compounds_to_rename_in_files, inplace=True) + print("\tInfo: load_single_file ", filename) + return file + + def load_class_code_frac(self) -> pd.DataFrame: + """ """ + class_code_frac_path = plib.Path( + self.folder_path, "classifications_codes_fractions.xlsx" + ) + if class_code_frac_path.exists(): + self.class_code_frac = pd.read_excel(class_code_frac_path) + else: + raise FileNotFoundError( + '"classifications_codes_fractions.xlsx" not found in folder_path' + ) + all_classes = self.class_code_frac.classes.tolist() + codes = self.class_code_frac.codes.tolist() # list of code for each class + mfs = self.class_code_frac.mfs.tolist() # list of mass fraction of each class + self.dict_classes_to_codes = dict(zip(all_classes, codes)) # dictionaries + self.dict_classes_to_mass_fractions = dict( + zip(all_classes, mfs) + ) # dictionaries + return self.class_code_frac + + def load_calibrations(self): + """Loads calibration data from Excel files specified in the 'files_info' DataFrame, + handles missing files, and coerces non-numeric values to NaN in calibration data + columns. It ensures each calibration file is loaded once, updates the 'calibrations' + attribute with calibration data, and sets 'calibrations_loaded' and + 'calibrations_not_present' flags based on the presence of calibration files.""" + if self.files_info is None: + self.load_files_info() + + if any(self.files_info["calibration_file"]): + for cal_name in set(self.files_info["calibration_file"].tolist()): + cal_path = plib.Path(self.folder_path, cal_name + ".xlsx") + if cal_path.exists(): + cal_file = pd.read_excel(cal_path, index_col=0) + else: + raise FileNotFoundError(f"{cal_name=} not found in folder_path") + + cal_file.index.name = "comp_name" + cols_cal_area = [c for c in list(cal_file) if "Area" in c] + cols_cal_ppms = [c for c in list(cal_file) if "PPM" in c] + cal_file[cols_cal_area + cols_cal_ppms] = cal_file[ + cols_cal_area + cols_cal_ppms + ].apply(pd.to_numeric, errors="coerce") + self.calibrations[cal_name] = cal_file + return self.calibrations + + def create_list_of_all_compounds(self): + """Compiles a list of all unique compounds across all loaded files and calibrations, + only for underivatized compounds. It ensures all files + are loaded before compiling the list, excludes 'unidentified' compounds, and updates + the 'list_of_all_compounds' attribute. 
+    def create_list_of_all_compounds(self):
+        """Compiles a list of all unique compounds across all loaded files and calibrations,
+        only for underivatized compounds. It ensures all files
+        are loaded before compiling the list, excludes 'unidentified' compounds, and updates
+        the 'list_of_all_compounds' attribute. Logs completion and returns the list."""
+        if not self.files:
+            self.load_all_files()
+        if not self.calibrations:
+            self.load_calibrations()
+        all_dfs_with_comps = []
+        for file in self.files.values():
+            all_dfs_with_comps.append(file)
+        for calib in self.calibrations.values():
+            all_dfs_with_comps.append(calib)
+        # non-derivatized compounds
+        all_compounds: pd.DataFrame = pd.concat(all_dfs_with_comps)
+
+        set_of_all_compounds = pd.Index(all_compounds.index.unique())
+        # use a set comprehension to remove unwanted elements
+        filtered_compounds = {
+            compound.strip()  # remove leading/trailing spaces
+            for compound in set_of_all_compounds
+            if compound not in ["unidentified", None, False, "", " ", "''"]
+        }
+        # convert the filtered set to a list
+        self.list_of_all_compounds = list(filtered_compounds)
+        print(f"Info: created {len(self.list_of_all_compounds) = }")
+        return self.list_of_all_compounds
+
+    def load_compounds_properties(self):
+        """Attempts to load the 'compounds_properties.xlsx' file containing physical
+        and chemical properties of compounds. If not found, it creates a new properties
+        DataFrame and updates the 'compounds_properties_created' attribute."""
+        compounds_properties_path = plib.Path(
+            self.folder_path, "compounds_properties.xlsx"
+        )
+        if compounds_properties_path.exists():
+            cpdf = pd.read_excel(
+                compounds_properties_path,
+                index_col="comp_name",
+            )
+            # cpdf = _order_columns_in_compounds_properties(cpdf)
+            # cpdf = cpdf.fillna(0)
+            self.compounds_properties = cpdf
+            self.compounds_properties_created = True
+            print("Info: compounds_properties loaded")
+        else:
+            print("Warning: compounds_properties.xlsx not found, creating it")
+            # create_compounds_properties is not yet defined in this commit
+            self.compounds_properties = self.create_compounds_properties()
+        return self.compounds_properties
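+
+
+# Expected contents of folder_path (a sketch inferred from the loaders above;
+# names in angle brackets are placeholders, not fixed filenames):
+#   files_info.xlsx                        optional, recreated by create_files_info
+#   classifications_codes_fractions.xlsx   required by load_class_code_frac
+#                                          (columns: classes, codes, mfs)
+#   compounds_properties.xlsx              optional, loaded by load_compounds_properties
+#   <calibration_name>.xlsx                one per entry in files_info["calibration_file"]
+#   <samplename>_<replicate>.txt           one GC-MS export per run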
diff --git a/tests/data_minimal_case/files_info.xlsx b/tests/data_minimal_case/files_info.xlsx
index 687f990a5c691c3a0fbed64a49bf1427d4ff9970..72a589a045d37be4027affaa3b9323ce8300dcdc 100644
GIT binary patch
(base85 binary delta omitted: files_info.xlsx, 9100 -> 8999 bytes)
diff --git a/tests/test_project_class.py b/tests/test_project_class.py
new file mode 100644
index 0000000..4029cee
--- /dev/null
+++ b/tests/test_project_class.py
@@ -0,0 +1,50 @@
+# %%
+import pytest
+import pathlib as plib
+from gcms_data_analysis.gcms import Project
+from pandas.testing import assert_frame_equal
+
+
+folder_path: plib.Path = plib.Path(__file__).parent / "data_minimal_case"
+
+# %%
+proj = Project(
+    folder_path=folder_path,
+    auto_save_reports=False,
+    compounds_to_rename_in_files={"phenol": "renamed_phenol"},
+)
+
+# check a couple of defaults
+assert proj.column_to_sort_values_in_samples == "retention_time"
+assert proj.delta_mol_weight_threshold == 100
+assert proj.acceptable_params == [
+    "height",
+    "area",
+    "area_if_undiluted",
+    "conc_vial_mg_L",
+    "conc_vial_if_undiluted_mg_L",
+    "fraction_of_sample_fr",
+    "fraction_of_feedstock_fr",
+]
+# %%
+fic = proj.create_files_info(update_saved_files_info=False)
+fil = proj.load_files_info(update_saved_files_info=False)
+
+fic.calibration_file = fil.calibration_file  # this cannot be updated automatically
+assert_frame_equal(fil, fic, check_exact=False, atol=1e-5, rtol=1e-5)
+# print(fil.columns)
+# print(fic.columns)
+# print(fil.index)
+# print(fic.index)
+# print(fil == fic)
+# %%
+files = proj.load_all_files()
+# %%
+ccf = proj.load_class_code_frac()
+
+# %%
+cal = proj.load_calibrations()
+print(cal)
+# %%
+lac = proj.create_list_of_all_compounds()
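+# %%
+# illustrative follow-up (not asserted): samples_info aggregates the rows of
+# files_info by samplename, so the minimal case should yield one row per sample
+sai = proj.create_samples_info()
+print(sai)
+# %%
+# load_compounds_properties would be the next step, but its fallback
+# (create_compounds_properties) is not implemented in this commit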