From a61e3f988786fd780f7e0cae659ffc6c1691efe2 Mon Sep 17 00:00:00 2001 From: mpecchi Date: Mon, 20 May 2024 19:36:42 +0200 Subject: [PATCH] added automatic method calling for all methods that depend on previous methods in the Project --- .../example_name_to_properties.py | 4 +- src/gcms_data_analysis/fragmenter.py | 4 +- src/gcms_data_analysis/gcms.py | 40 +- .../compounds_properties.xlsx | Bin 6380 -> 6380 bytes tests/data_minimal_case/files_info.xlsx | Bin 5223 -> 5222 bytes tests/test_project_class.py | 831 +++++++++--------- 6 files changed, 457 insertions(+), 422 deletions(-) diff --git a/example/name_to_properties/example_name_to_properties.py b/example/name_to_properties/example_name_to_properties.py index a55f707..2c1426e 100644 --- a/example/name_to_properties/example_name_to_properties.py +++ b/example/name_to_properties/example_name_to_properties.py @@ -8,11 +8,11 @@ import pubchempy as pcp from gcms_data_analysis.fragmenter import Fragmenter -from gcms_data_analysis import name_to_properties +from gcms_data_analysis.gcms import name_to_properties folder_path = plib.Path( - r"C:\Users\mp933\OneDrive - Cornell University\Python\gcms_data_analysis\tests\data_name_to_properties" + r"/Users/matteo/Projects/gcms_data_analysis/example/name_to_properties/data_name_to_properties" ) # %% classifications_codes_fractions = pd.read_excel( diff --git a/src/gcms_data_analysis/fragmenter.py b/src/gcms_data_analysis/fragmenter.py index c89e20c..a97541b 100644 --- a/src/gcms_data_analysis/fragmenter.py +++ b/src/gcms_data_analysis/fragmenter.py @@ -2,9 +2,9 @@ from rdkit import Chem from rdkit.Chem import DataStructs from rdkit.Chem import rdmolops -from rdkit.Chem.AllChem import ( +from rdkit.Chem.AllChem import ( # pylint: disable=no-name-in-module GetMorganFingerprintAsBitVect, -) # pylint: disable=no-name-in-module +) class Fragmenter: diff --git a/src/gcms_data_analysis/gcms.py b/src/gcms_data_analysis/gcms.py index 5640ee8..a00dfd2 100644 --- a/src/gcms_data_analysis/gcms.py +++ b/src/gcms_data_analysis/gcms.py @@ -266,7 +266,7 @@ def load_all_files(self): for filename in self.files_info.index: file = self.load_single_file(filename) self.files[filename] = file - print("Info: load_all_files: files loaded") + print(f"Info: load_all_files: {len(self.files)} files loaded") return self.files def load_single_file(self, filename) -> pd.DataFrame: @@ -464,7 +464,10 @@ def create_tanimoto_and_molecular_weight_similarity_dfs( self.load_compounds_properties() if self.dict_names_to_iupacs is None: self.create_dict_names_to_iupacs() - if "iupac_name" not in list(self.files.values())[0].columns: + if ( + "iupac_name" not in list(self.files.values())[0].columns + or "iupac_name" not in list(self.calibrations.values())[0].columns + ): self.add_iupac_to_files_and_calibrations() prop_index_iupac = self.compounds_properties.set_index("iupac_name") prop_index_iupac = prop_index_iupac[ @@ -542,12 +545,23 @@ def apply_calibration_to_files(self): in the loaded files, adjusting concentrations based on calibration data, and updates the 'files' attribute with calibrated data.""" print("Info: apply_calibration_to_files: loop started") - if "iupac_name" not in list(self.files.values())[0].columns: + if not self.files: + self.load_all_files() + if not self.calibrations: + self.load_calibrations() + if self.compounds_properties is None: + self.load_compounds_properties() + if self.dict_names_to_iupacs is None: + self.create_dict_names_to_iupacs() + if ( + "iupac_name" not in list(self.files.values())[0].columns + or "iupac_name" not in list(self.calibrations.values())[0].columns + ): self.add_iupac_to_files_and_calibrations() if self.use_semi_calibration and not self.semi_calibration_dict: self.create_semi_calibration_dict() - for filename in self.files.keys(): + for filename in self.files: self.files[filename] = self.apply_calib_to_single_file(filename) return self.files @@ -630,7 +644,8 @@ def add_stats_to_files_info(self) -> pd.DataFrame: DataFrame, such as maximum height, area, and concentrations, updating the 'files_info' with these statistics.""" print("Info: add_stats_to_files_info: started") - + if not self.files: + self.load_all_files() numeric_columns = [ col for col in self.acceptable_params @@ -658,8 +673,8 @@ def create_samples_info(self): """Creates a summary 'samples_info' DataFrame from 'files_info', aggregating data for each sample, and updates the 'samples_info' attribute with this summarized data.""" - if self.files_info is None: - self.load_files_info() + if not self.files: + self.load_all_files() numeric_columns = [ col for col in self.acceptable_params @@ -801,6 +816,12 @@ def create_files_param_report(self, param="conc_vial_mg_L"): self.load_all_files() if param not in self.acceptable_params: raise ValueError(f"{param = } is not an acceptable param") + self.load_calibrations() + if self.calibrations: + self.apply_calibration_to_files() + for filename in self.files_info.index: + if param not in self.files[filename].columns: + raise ValueError(f"{param = } not found in {filename = }") # Create a dictionary of Series, each Series named after the file and containing the 'param' values series_dict = { filename: self.files[filename][param].rename(filename) @@ -829,6 +850,8 @@ def create_files_param_aggrrep(self, param="conc_vial_mg_L"): raise ValueError(f"{param = } is not an acceptable param") if param not in self.files_reports: self.create_files_param_report(param) + if self.compounds_properties is None: + self.load_compounds_properties() # create a df with iupac name index and fg_mf columns (underiv and deriv) comps_df = self.compounds_properties # .set_index("iupac_name") # comps_df = comps_df[~comps_df.index.duplicated(keep="first")] @@ -872,6 +895,9 @@ def create_samples_param_report(self, param: str = "conc_vial_mg_L"): print(f"Info: create_samples_param_report: {param = }") if param not in self.acceptable_params: raise ValueError(f"{param = } is not an acceptable param") + self.load_calibrations() + if self.calibrations: + self.apply_calibration_to_files() if param not in self.files_reports: self.create_files_param_report(param) file_to_sample_rename = dict( diff --git a/tests/data_minimal_case/compounds_properties.xlsx b/tests/data_minimal_case/compounds_properties.xlsx index 7136ef0f0443b360d30279f99373e3dbfffc1e58..5e84c024990c83d209c93b7a07c9d89d10d2a1b5 100644 GIT binary patch delta 2327 zcmY+GcTf{p7sZ3HK%|8lkY+-aPDDTuq(woBfP~(pNeLoVN-z`&NJP3A45*Yy2}K}8 zx>N%wp@SM&0Rd_2vPf?qvg0?i@1NhiGk4~_Iq%LpcU5^+na$joo`DAd05Ah`p4FwZ zRdAlbR;W0FVdX^Zrvm^uGwHxQ=;HKtvrZ*ejEm=TOX%lp2;52Gxq9rwE^!c<0Gy^jmxkoAZXpt>^<^)E`&yI z_YujT0XWYYi87ysSbt3-4H0ki5FV&Xf{WYjmagh)0a?$CeJhz^j(FOMPVqdeaP&z@ z;^L?4nzx)LJg0(YF)2aeQIROz)0_U>c4~%?%2;o(7B#T>Hd9{gH^v2YFT?iBb-a{K z(okDF1e`=)O6sSZ27c&duX%pSTdyIP1HWe9O9+S&StTbZa{-V=S^_He^cu>X+zo#R2L+i)fhf zgi)LNC+#Zh!1Y7-jCsX1MxTQ{*Js-D{ZmX88?~kfS&uV(CZ4I^r%3A=km zh9X~5+Ikal)5>9Dl0N^`R5-n`!bVUw&}C{l*f6mY)QuVsXKP8fosCP6GUUqTVx^9c zR6g%$`E(Oqj$!CG4p-l9g90+8sc6Ddfn&X3SzwK!$lM}PXhAXW+hT540QhRRC+dOq zH$^EpX>y}BBJ|DkXmU7zer36NO5d45o&{Oz;;wIS+Y_5!yK;s*WgHG0FGf?%bkw=fR+mSnAreKz z425b;HJg*0k`tyKDbt`XmrLS1t!z*jCZ9l6YEA=IQ-hNCzFYrjtqF^|pXNAa8Bo-7 zeR!%pSExIzK}!p>#PMkr&EKf$D^K~j&tSR@%q?J?SogA1n3J>9yrpvJQb)OtH5|04 zbX=9HD=ZNn^4^vzSW9}#wJzII_Qiz?M3{2rlUm0&kkJ4?826#uAU*f$s4hzr(Y>#$ z5;RSUy>y8MIP+>0Zw%*EWBT(vrjhVU$I?{%PA&tUQ=#ti9^V`EsD8CJ-wQfDA?PG? z`(Gd=QhkK=Yd14n76_!j(qjhFyv6XmL0R7wsi=fGi%eJK`YwTAc0&(5LgAAJ*|Qb= zBLqk}##Tg;?@Hb<*+%dtU5$~Pg+Pe!qhv@ynfq8WD+1ExNAD-2@Q7f!Zhk>hj_EE) z(`kgA)5J)?Tm+4Yt5wU@W8JI<{p6elv}2f6^{O?uuBquI#PVh6#>k~W!X%kh;ng_7 ziz?mQxw9n%)qamPV!r6^3m8;XEJJu{d0IOHSXI0%e2})SuyDs28?#{Qc3~k-@V=pK zWLe=;84aOy?v5sySKX+`Bobr!Fk@|*8%QF#+)dkULKE_If@J9vY5S$isvl3*Ot(HXftcj%+7=q~N`dXIJ>f>~ye?8^ZK)DU`@$7o`^dWYdRW0AY5Fwwmvy$0C%2S9S8LA> zoj!X;l87mlmsnv>VCNm74T!sYWB}t#Igwro3Ost=MhJ>J(3hP*l65%y$gN0$WRcoPzHG=Hq4P zS|(Nk%Ae4n+FKxEYL{P3mtWGP5M=UKKk(|?iW?f}4aHg0Za_1y0_S;fAqzdM@?|Z2 zybmgIIWBW@PFBY;LA~Y?;nbCobYlNcWm7z2XaR975|xbuqjy)~3e6O-;_x~&$N5_y z)-ftwP;qz@N_H+^G{#2;^N^STvp(Nyd=*4l3#ZBr-+ zzPS_`l_8Pp%5ad#R6o1f&ruc7hpNr|uJ0`Mbl3NOgUQ#W(h^3%(X++*BU6dZ!wK_& zWBUJuXssg$^$h(SVM2wuFPBj? zYATTlWlK&#ela-0{|cl z=+%{z#pzO;z~O_atGrE-4FUiu3J5MuqLG~sJ=GUW+34mZO*d}8RICe!YGh{SZJU6p zkHY9bu>qXfF-i^{+@2A*W^pbISt#oBGqvxPP&!_pesF~O%nb)mDrSMQhD&Z%c3)zT z585HMEI&zHI4aa)6M|rZ($t#NL@0G)Y4xUBO=*3BujCRnIqN%;D zSW}2d-vY|e^6a?y$doM_S^8CZ>$I_{Zo&RwJ zNA)RCg$go8K_-Hv-`2Q`-b6|Y0YIh-03bFkN(j@7KNU@g2?+`!#^`OqKsO;gtk5eo zX2KM@j?}Vvc{H45w#WJMQGP;J=Eca%pS-(4>dk}xgQlH5wa4C3kL#dM=Fzl8{#x{+ zAFO54BERKxH^OoqW$aE>6WhW~a?tfZU!M&BLc%m%d+;)>WKiyO&o?)puem>u5STCA zS}W|U8!ul>e&8O}5K&NcbZ%*3fHKo!Pif+Pu^ajFM!WYh$t`qwIO5LpT9cF)0}U0U zA%#U@6B;Pjj)W4Cl^{ZJFu8`4cnd#W;S@XUR=t zqe)DuY>UdHFVkMz)@*EcrGDZRFbjz@;xgH$0qx#mk8e#C+OE2i1|9#f|$y3 zc9126WqF5w62Ia$A>gs^8Vo)^?W2L4VXUGvkYHO@y_2LzN-KOAQB_Th$- zJge=#3`a<)Jj@x|e`(5VoUd#`jRKR;z*c6|WPAV#iUiChTd-jP^7;yW)pOQLm5 znR}nmoej~8SNtORZX_Q5GeWOr&AyE+r(_>5M2LotaF-XMWtMn`&DYtACSt^>dPpm# zA7Rwo1#>Jn3qJRp)O+TX2fjU5~NZ#0iA&&H}U}cSLhz?ehzIGPeGlt@CD+uV-KbY>= zBf#A8-jy)&wc$W!KMl)UIUaOZQ9oP>w+ws~KcATnj4C8&kY1uiN@MIQNDL@M4%b2u zK;(P9dUaL!u&+B?EH(9wR?qtsvG)4Hj|k1?*j7&%+th9LQP#Si*XI(n=2zj|#kEV0 ziKbP*Y;nkbn&pW`JL@@JO!5+_G_>h9T=2sR-b@VZufSMdm0>ZB>&n!Wt&KisxobBz zMuHK%y_gFmoQlg;TabF{xv%;l%zj|nW?mL}wrL?SBf)x}V(9kW0V=UA&@5$8=ugm$ z6DVx*x!C#nLYC>7lbA`i5t|{@^lMy7_B-~~vBd<-{oh{27M2|5h3#$2c8$2LO~0$T zK4Xr$J-;Gk6;6w?35|>QwBx&abk=ZAdv@yF^4LX!=ATDZa<1rU<29+a7jXB+a2K}^ z%42dH8jeOW8NB?9;9agdc`n0qkLei+kRNZsbr#TC5VTg=KW3YMJYa0>QS?8iKH$P-HVm2G`gr@(rI)=n|)u}Vp|t0O{}e}EA9Phay^`yK(2&S zTcClEv=1=YM_8}x_V;eWlG6h>`a0jWYR)0AbL$0abaRzZWyYW77?1+su%{Z;b%=v(Lurx4sVkz20nh19XTb^m zC~@TveAPl_tpHaIr|N6CK&ke*fvI2fHtu{`gL+3_rN2S|e;YtvcBfZt$D+^e48&ch zRV7#JW;dNqmfB9>mcZK{Q3Xn=c~yW4aa}ZzS=Uvl%`MunjEJ_%c|9aM%d}XOBC(H1 zOT715j9$iI<>UoT`0@)2E5OQl&gh+yE1Du9b=QwWTC!Zn_#7Qr_|&lOw|^U z`Bz7b$8jrtGA>(aFJ7s*rv2fEK%-2rwM6%mV}TqY@068fyRE5VFCb^6AQ?MW#*DW{ zym?7IJ7*}kn+m*@W}PSl=ees|6D!!HD(nS*+EaJ_EitYM>S5Hc?3xF5m5H2#jTw87uw1{gcrEP@n+-J2x}=ef-2S)byeQ>7Tw|+UHQaQIlW^lLDk03a=z~CSu@qtdH{C`T$!S7nU}f`#q!~|6-YujA<~;!N zxF-J*Qnv$z`}c4CoXeRQ7;?E87o+3iV$0l2gsIdtNfz$y23WwQf delta 548 zcmaE+@mzy9z?+#xgn@y9gW=Pf8(R7fzwV4s85(M zgXNK!<-M@AnTwCCxO@BZw6)9!bC3V{|L10yNL6Re{!HBtrj}V2 zPV=4?B+hK={%_`U(&t=S0-{$??DdLA&PP+TAellmz)Yt82lNWIZ-dXaI(!vtnr z<2ny!OkTnJ3@k8_FNzVwnEa0a5m?}|fHy=yQOFt*nv+w7%)rX#2}v`an!HO$3Cz0> z~F)-wEGcfQ1BLxN;fI1w7WxzTTgrynhOs)}@0`q1HOWU{d zF)&o*=$GdgWhdq5XX{nu<^*^%GKnz5<2Azg_2XS@fu{XtWMGhjnF^#E7$5OZ{>v{j YnO}qlY@VEmG~@BfRw8O_!a^W*0HHSHK>z>% diff --git a/tests/test_project_class.py b/tests/test_project_class.py index 06faf74..ab597ae 100644 --- a/tests/test_project_class.py +++ b/tests/test_project_class.py @@ -10,13 +10,15 @@ folder_path: plib.Path = plib.Path(__file__).parent folder_path = r"/Users/matteo/Projects/gcms_data_analysis/tests/data_minimal_case" -# %% + proj = Project( folder_path=folder_path, auto_save_to_excel=False, - compounds_to_rename_in_files={"almost oleic acid": "oleic acid"}, + compounds_to_rename_in_files={ + "almost oleic acid": "oleic acid", + "dichlorobenzene": "p-dichlorobenzene", + }, ) - # %% files_info_created = proj.create_files_info(update_saved_files_info=False) print(files_info_created.T) @@ -79,38 +81,55 @@ samples, samples_std = proj.create_samples_from_files() # %% reph = proj.create_files_param_report(param="height") -repc = proj.create_files_param_report(param="conc_vial_mg_L") print(reph) + +repc = proj.create_files_param_report(param="conc_vial_mg_L") print(repc) # %% repsh, repsh_d = proj.create_samples_param_report(param="height") -repsc, repsc_d = proj.create_samples_param_report(param="conc_vial_mg_L") print(repsh) +repsc, repsc_d = proj.create_samples_param_report(param="conc_vial_mg_L") print(repsc) # %% aggh = proj.create_files_param_aggrrep(param="height") -aggc = proj.create_files_param_aggrrep(param="conc_vial_mg_L") print(aggh) +# %% +aggc = proj.create_files_param_aggrrep(param="conc_vial_mg_L") + print(aggc) # %% aggsh, aggsh_d = proj.create_samples_param_aggrrep(param="height") -aggsc, aggsc_d = proj.create_samples_param_aggrrep(param="conc_vial_mg_L") print(aggsh) print(aggsh_d) +# %% +aggsc, aggsc_d = proj.create_samples_param_aggrrep(param="conc_vial_mg_L") + print(aggsc) print(aggsc_d) # %% proj.save_files_samples_reports() # %% -from __future__ import annotations -from typing import Literal -from myfigure.myfigure import MyFigure, colors, hatches +proj.plot_report() -def plot_ave_std( - project: Project, - files_or_samples: Literal["files", "samples"] = "samples", - parameter: Literal[ +# %% + + +@pytest.fixture +def project(): + test_project = Project( + folder_path=folder_path, + auto_save_to_excel=False, + compounds_to_rename_in_files={"almost oleic acid": "oleic acid"}, + ) + return test_project + + +# Test default parameters +def test_default_parameters(project): + assert proj.column_to_sort_values_in_samples == "retention_time" + assert proj.delta_mol_weight_threshold == 100 + assert proj.acceptable_params == [ "height", "area", "area_if_undiluted", @@ -118,415 +137,50 @@ def plot_ave_std( "conc_vial_if_undiluted_mg_L", "fraction_of_sample_fr", "fraction_of_feedstock_fr", - ] = "conc_vial_mg_L", - aggregate: bool = False, - show_total_in_twinx: bool = False, - min_y_thresh: float | None = None, - only_samples_to_plot: list[str] | None = None, - rename_samples: list[str] | None = None, - reorder_samples: list[str] | None = None, - item_to_color_to_hatch: pd.DataFrame | None = None, - yt_sum_label: str = "total\n(right axis)", - **kwargs, -) -> MyFigure: - """ """ - if show_total_in_twinx: - plot_twinx: bool = True - else: - plot_twinx: bool = None - default_kwargs = { - "filename": "plot" + parameter, - "out_path": proj.out_path, - "height": 4, - "width": 4, - "grid": proj.plot_grid, - "text_font": proj.plot_font, - "y_lab": project.parameter_to_axis_label[parameter], - "yt_lab": project.parameter_to_axis_label[parameter], - "twinx": plot_twinx, - "masked_unsignificant_data": True, - # "legend": False, - } - # Update kwargs with the default key-value pairs if the key is not present in kwargs - kwargs = {**default_kwargs, **kwargs} - # create folder where Plots are stored - out_path = plib.Path(project.out_path, "plots", files_or_samples) - out_path.mkdir(parents=True, exist_ok=True) - if not aggregate: # then use compounds reports - if files_or_samples == "files": - df_ave = proj.files_reports[parameter].T - df_std = pd.DataFrame() - elif files_or_samples == "samples": - df_ave = proj.samples_reports[parameter].T - df_std = proj.samples_reports_std[parameter].T - else: # use aggregated reports - if files_or_samples == "files": - df_ave = proj.files_aggrreps[parameter].T - df_std = pd.DataFrame() - elif files_or_samples == "samples": - df_ave = proj.samples_aggrreps[parameter].T - df_std = proj.samples_aggrreps_std[parameter].T + ] + assert proj.compounds_to_rename_in_files == {"almost oleic acid": "oleic acid"} - if only_samples_to_plot is not None: - df_ave = df_ave.loc[only_samples_to_plot, :].copy() - if files_or_samples == "samples": - df_std = df_std.loc[only_samples_to_plot, :].copy() - if rename_samples is not None: - df_ave.index = rename_samples - if files_or_samples == "samples": - df_std.index = rename_samples +# Test the `load_files_info` method +def test_load_files_info(project): + files_info = proj.load_files_info() + assert isinstance(files_info, pd.DataFrame) + assert len(files_info) > 0 - if reorder_samples is not None: - filtered_reorder_samples = [ - idx for idx in reorder_samples if idx in df_ave.index - ] - df_ave = df_ave.reindex(filtered_reorder_samples) - if files_or_samples == "samples": - df_std = df_std.reindex(filtered_reorder_samples) - if min_y_thresh is not None: - df_ave = df_ave.loc[:, (df_ave > min_y_thresh).any(axis=0)].copy() - if files_or_samples == "samples": - df_std = df_std.loc[:, df_ave.columns].copy() +# Test the `load_all_files` method +def test_load_all_files(project): + files = proj.load_all_files() + assert isinstance(files, dict) + assert len(files) > 0 - if item_to_color_to_hatch is not None: # specific color and hatches to each fg - plot_colors = [ - item_to_color_to_hatch.loc[item, "clr"] for item in df_ave.columns - ] - plot_hatches = [ - item_to_color_to_hatch.loc[item, "htch"] for item in df_ave.columns - ] - else: # no specific colors and hatches specified - plot_colors = colors - plot_hatches = hatches - myfig = MyFigure( - rows=1, - cols=1, - **kwargs, - ) - if df_std.isna().all().all() or df_std.empty: # means that no std is provided - df_ave.plot( - ax=myfig.axs[0], - kind="bar", - width=0.9, - edgecolor="k", - legend=False, - capsize=3, - color=colors, - ) - else: # no legend is represented but non-significant values are shaded - mask = (df_ave.abs() > df_std.abs()) | df_std.isna() - df_ave[mask].plot( - ax=myfig.axs[0], - kind="bar", - width=0.9, - edgecolor="k", - legend=False, - yerr=df_std[mask], - capsize=3, - color=colors, - label="_nolegend_", - ) +# Test the `load_class_code_frac` method +def test_load_class_code_frac(project): + class_code_frac = proj.load_class_code_frac() + assert isinstance(class_code_frac, pd.DataFrame) + assert len(class_code_frac) > 0 - df_ave[~mask].plot( - ax=myfig.axs[0], - kind="bar", - width=0.9, - legend=False, - edgecolor="grey", - color=colors, - alpha=0.5, - label="_nolegend_", - ) - if show_total_in_twinx: - myfig.axts[0].scatter( - df_ave.index, - df_ave.sum(axis=1).values, - color="k", - linestyle="None", - edgecolor="k", - facecolor="grey", - s=100, - label=yt_sum_label, - alpha=0.5, - ) - if not df_std.empty: - myfig.axts[0].errorbar( - df_ave.index, - df_ave.sum(axis=1).values, - df_std.sum(axis=1).values, - capsize=3, - linestyle="None", - color="grey", - ecolor="k", - label="_nolegend_", - ) - myfig.save_figure() - return myfig +# Test the `load_calibrations` method +def test_load_calibrations(project): + calibrations = proj.load_calibrations() + assert isinstance(calibrations, dict) + assert len(calibrations) > 0 -def plot_df_ave_std( - proj: Project, - df_ave: pd.DataFrame, - df_std: pd.DataFrame = pd.DataFrame(), - filename: str = "plot", - show_total_in_twinx: bool = False, - annotate_outliers: bool = True, - min_y_thresh: float | None = None, - only_samples_to_plot: list[str] | None = None, - rename_samples: list[str] | None = None, - reorder_samples: list[str] | None = None, - item_to_color_to_hatch: pd.DataFrame | None = None, - yt_sum_label: str = "total\n(right axis)", - **kwargs, -) -> MyFigure: +# Test the `create_list_of_all_compounds` method +def test_create_list_of_all_compounds(project): + compounds = proj.create_list_of_all_compounds() + assert isinstance(compounds, list) + assert len(compounds) > 0 - # create folder where Plots are stored - out_path = plib.Path(Project.out_path, "df_plots") - out_path.mkdir(parents=True, exist_ok=True) - if only_samples_to_plot is not None: - df_ave = df_ave.loc[only_samples_to_plot, :].copy() - if not df_std.empty: - df_std = df_std.loc[only_samples_to_plot, :].copy() - if rename_samples is not None: - df_ave.index = rename_samples - if not df_std.empty: - df_std.index = rename_samples - - if reorder_samples is not None: - filtered_reorder_samples = [ - idx for idx in reorder_samples if idx in df_ave.index - ] - df_ave = df_ave.reindex(filtered_reorder_samples) - if not df_std.empty: - df_std = df_std.reindex(filtered_reorder_samples) - if reorder_samples is not None: - filtered_reorder_samples = [ - idx for idx in reorder_samples if idx in df_ave.index - ] - df_ave = df_ave.reindex(filtered_reorder_samples) - if not df_std.empty: - df_std = df_std.reindex(filtered_reorder_samples) - - if min_y_thresh is not None: - df_ave = df_ave.loc[:, (df_ave > min_y_thresh).any(axis=0)].copy() - if not df_std.empty: - df_std = df_std.loc[:, df_ave.columns].copy() - - if item_to_color_to_hatch is not None: # specific color and hatches to each fg - colors = [item_to_color_to_hatch.loc[item, "clr"] for item in df_ave.columns] - hatches = [item_to_color_to_hatch.loc[item, "htch"] for item in df_ave.columns] - else: # no specific colors and hatches specified - colors = sns.color_palette(color_palette, df_ave.shape[1]) - hatches = htchs - - if show_total_in_twinx: - plot_twinx: bool = True - else: - plot_twinx: bool = False - - if show_total_in_twinx: - legend_x_anchor += 0.14 - yt_lab = y_lab - - myfig = MyFigure( - rows=1, - cols=1, - twinx=plot_twinx, - text_font=Project.plot_font, - y_lab=y_lab, - yt_lab=yt_lab, - y_lim=y_lim, - legend=False, - grid=Project.plot_grid, - **kwargs, - ) - if df_std.isna().all().all() or df_std.empty: # means that no std is provided - df_ave.plot( - ax=myfig.axs[0], - kind="bar", - rot=x_label_rotation, - width=0.9, - edgecolor="k", - legend=False, - capsize=3, - color=colors, - ) - bars = myfig.axs[0].patches # needed to add patches to the bars - n_different_hatches = int(len(bars) / df_ave.shape[0]) - else: # no legend is represented but non-significant values are shaded - mask = (df_ave.abs() > df_std.abs()) | df_std.isna() - - df_ave[mask].plot( - ax=myfig.axs[0], - kind="bar", - rot=x_label_rotation, - width=0.9, - edgecolor="k", - legend=False, - yerr=df_std[mask], - capsize=3, - color=colors, - label="_nolegend", - ) - df_ave[~mask].plot( - ax=myfig.axs[0], - kind="bar", - rot=x_label_rotation, - width=0.9, - legend=False, - edgecolor="grey", - color=colors, - alpha=0.5, - label="_nolegend", - ) - bars = myfig.axs[0].patches # needed to add patches to the bars - n_different_hatches = int(len(bars) / df_ave.shape[0] / 2) - if show_total_in_twinx: - myfig.axts[0].scatter( - df_ave.index, - df_ave.sum(axis=1).values, - color="k", - linestyle="None", - edgecolor="k", - facecolor="grey", - s=100, - label=yt_sum_label, - alpha=0.5, - ) - if not df_std.empty: - myfig.axts[0].errorbar( - df_ave.index, - df_ave.sum(axis=1).values, - df_std.sum(axis=1).values, - capsize=3, - linestyle="None", - color="grey", - ecolor="k", - ) - bar_hatches = [] - # get a list with the hatches - for h in hatches[:n_different_hatches] + hatches[:n_different_hatches]: - for n in range(df_ave.shape[0]): # htcs repeated for samples - bar_hatches.append(h) # append based on samples number - for bar, hatch in zip(bars, bar_hatches): # assign hatches to each bar - bar.set_hatch(hatch) - myfig.axs[0].set(xlabel=None) - if x_label_rotation != 0: - myfig.axs[0].set_xticklabels( - df_ave.index, rotation=x_label_rotation, ha="right", rotation_mode="anchor" - ) - if legend_location is not None: - hnd_ax, lab_ax = myfig.axs[0].get_legend_handles_labels() - if not df_std.empty: - hnd_ax = hnd_ax[: len(hnd_ax) // 2] - lab_ax = lab_ax[: len(lab_ax) // 2] - if legend_labelspacing > 0.5: # large legend spacing for molecules - myfig.axs[0].plot(np.nan, np.nan, "-", color="None", label=" ") - hhhh, aaaa = myfig.axs[0].get_legend_handles_labels() - hnd_ax.append(hhhh[0]) - lab_ax.append(aaaa[0]) - if show_total_in_twinx: - hnd_axt, lab_axt = myfig.axts[0].get_legend_handles_labels() - else: - hnd_axt, lab_axt = [], [] - if legend_location == "outside": # legend goes outside of plot area - myfig.axs[0].legend( - hnd_ax + hnd_axt, - lab_ax + lab_axt, - loc="upper left", - ncol=legend_columns, - bbox_to_anchor=(legend_x_anchor, legend_y_anchor), - labelspacing=legend_labelspacing, - ) - else: # legend is inside of plot area - myfig.axs[0].legend( - hnd_ax + hnd_axt, - lab_ax + lab_axt, - loc=legend_location, - ncol=legend_columns, - labelspacing=legend_labelspacing, - ) - # annotate ave+-std at the top of outliers bar (exceeding y_lim) - if annotate_outliers and (y_lim is not None): # and (not df_std.empty): - _annotate_outliers_in_plot(myfig.axs[0], df_ave, df_std, y_lim) - myfig.save_figure(filename, out_path) - return myfig - - -# %% - - -@pytest.fixture -def project(): - test_project = Project( - folder_path=folder_path, - auto_save_to_excel=False, - compounds_to_rename_in_files={"almost oleic acid": "oleic acid"}, - ) - return test_project - - -# Test default parameters -def test_default_parameters(project): - assert proj.column_to_sort_values_in_samples == "retention_time" - assert proj.delta_mol_weight_threshold == 100 - assert proj.acceptable_params == [ - "height", - "area", - "area_if_undiluted", - "conc_vial_mg_L", - "conc_vial_if_undiluted_mg_L", - "fraction_of_sample_fr", - "fraction_of_feedstock_fr", - ] - assert proj.compounds_to_rename_in_files == {"almost oleic acid": "oleic acid"} - - -# Test the `load_files_info` method -def test_load_files_info(project): - files_info = proj.load_files_info() - assert isinstance(files_info, pd.DataFrame) - assert len(files_info) > 0 - - -# Test the `load_all_files` method -def test_load_all_files(project): - files = proj.load_all_files() - assert isinstance(files, dict) - assert len(files) > 0 - - -# Test the `load_class_code_frac` method -def test_load_class_code_frac(project): - class_code_frac = proj.load_class_code_frac() - assert isinstance(class_code_frac, pd.DataFrame) - assert len(class_code_frac) > 0 - - -# Test the `load_calibrations` method -def test_load_calibrations(project): - calibrations = proj.load_calibrations() - assert isinstance(calibrations, dict) - assert len(calibrations) > 0 - - -# Test the `create_list_of_all_compounds` method -def test_create_list_of_all_compounds(project): - compounds = proj.create_list_of_all_compounds() - assert isinstance(compounds, list) - assert len(compounds) > 0 - - -# Test the `create_compounds_properties` method -def test_create_compounds_properties(project): - compounds_properties = proj.create_compounds_properties() - assert isinstance(compounds_properties, pd.DataFrame) - assert len(compounds_properties) > 0 +# Test the `create_compounds_properties` method +def test_create_compounds_properties(project): + compounds_properties = proj.create_compounds_properties() + assert isinstance(compounds_properties, pd.DataFrame) + assert len(compounds_properties) > 0 assert_frame_equal( @@ -742,3 +396,358 @@ def test_save_files_samples_reports(project): # %% +# %% +from __future__ import annotations +from typing import Literal +from myfigure.myfigure import MyFigure, colors, hatches + + +def plot_ave_std( + project: Project, + files_or_samples: Literal["files", "samples"] = "samples", + parameter: Literal[ + "height", + "area", + "area_if_undiluted", + "conc_vial_mg_L", + "conc_vial_if_undiluted_mg_L", + "fraction_of_sample_fr", + "fraction_of_feedstock_fr", + ] = "conc_vial_mg_L", + aggregate: bool = False, + show_total_in_twinx: bool = False, + min_y_thresh: float | None = None, + only_samples_to_plot: list[str] | None = None, + rename_samples: list[str] | None = None, + reorder_samples: list[str] | None = None, + item_to_color_to_hatch: pd.DataFrame | None = None, + yt_sum_label: str = "total\n(right axis)", + **kwargs, +) -> MyFigure: + """ """ + if show_total_in_twinx: + plot_twinx: bool = True + else: + plot_twinx: bool = None + default_kwargs = { + "filename": "plot" + parameter, + "out_path": proj.out_path, + "height": 4, + "width": 4, + "grid": proj.plot_grid, + "text_font": proj.plot_font, + "y_lab": project.parameter_to_axis_label[parameter], + "yt_lab": project.parameter_to_axis_label[parameter], + "twinx": plot_twinx, + "masked_unsignificant_data": True, + # "legend": False, + } + # Update kwargs with the default key-value pairs if the key is not present in kwargs + kwargs = {**default_kwargs, **kwargs} + # create folder where Plots are stored + out_path = plib.Path(project.out_path, "plots", files_or_samples) + out_path.mkdir(parents=True, exist_ok=True) + if not aggregate: # then use compounds reports + if files_or_samples == "files": + df_ave = proj.files_reports[parameter].T + df_std = pd.DataFrame() + elif files_or_samples == "samples": + df_ave = proj.samples_reports[parameter].T + df_std = proj.samples_reports_std[parameter].T + else: # use aggregated reports + if files_or_samples == "files": + df_ave = proj.files_aggrreps[parameter].T + df_std = pd.DataFrame() + elif files_or_samples == "samples": + df_ave = proj.samples_aggrreps[parameter].T + df_std = proj.samples_aggrreps_std[parameter].T + + if only_samples_to_plot is not None: + df_ave = df_ave.loc[only_samples_to_plot, :].copy() + if files_or_samples == "samples": + df_std = df_std.loc[only_samples_to_plot, :].copy() + + if rename_samples is not None: + df_ave.index = rename_samples + if files_or_samples == "samples": + df_std.index = rename_samples + + if reorder_samples is not None: + filtered_reorder_samples = [ + idx for idx in reorder_samples if idx in df_ave.index + ] + df_ave = df_ave.reindex(filtered_reorder_samples) + if files_or_samples == "samples": + df_std = df_std.reindex(filtered_reorder_samples) + + if min_y_thresh is not None: + df_ave = df_ave.loc[:, (df_ave > min_y_thresh).any(axis=0)].copy() + if files_or_samples == "samples": + df_std = df_std.loc[:, df_ave.columns].copy() + + if item_to_color_to_hatch is not None: # specific color and hatches to each fg + plot_colors = [ + item_to_color_to_hatch.loc[item, "clr"] for item in df_ave.columns + ] + plot_hatches = [ + item_to_color_to_hatch.loc[item, "htch"] for item in df_ave.columns + ] + else: # no specific colors and hatches specified + plot_colors = colors + plot_hatches = hatches + + myfig = MyFigure( + rows=1, + cols=1, + **kwargs, + ) + if df_std.isna().all().all() or df_std.empty: # means that no std is provided + df_ave.plot( + ax=myfig.axs[0], + kind="bar", + width=0.9, + edgecolor="k", + legend=False, + capsize=3, + color=colors, + ) + else: # no legend is represented but non-significant values are shaded + mask = (df_ave.abs() > df_std.abs()) | df_std.isna() + df_ave[mask].plot( + ax=myfig.axs[0], + kind="bar", + width=0.9, + edgecolor="k", + legend=False, + yerr=df_std[mask], + capsize=3, + color=colors, + label="_nolegend_", + ) + + df_ave[~mask].plot( + ax=myfig.axs[0], + kind="bar", + width=0.9, + legend=False, + edgecolor="grey", + color=colors, + alpha=0.5, + label="_nolegend_", + ) + if show_total_in_twinx: + myfig.axts[0].scatter( + df_ave.index, + df_ave.sum(axis=1).values, + color="k", + linestyle="None", + edgecolor="k", + facecolor="grey", + s=100, + label=yt_sum_label, + alpha=0.5, + ) + if not df_std.empty: + myfig.axts[0].errorbar( + df_ave.index, + df_ave.sum(axis=1).values, + df_std.sum(axis=1).values, + capsize=3, + linestyle="None", + color="grey", + ecolor="k", + label="_nolegend_", + ) + + myfig.save_figure() + return myfig + + +def plot_df_ave_std( + proj: Project, + df_ave: pd.DataFrame, + df_std: pd.DataFrame = pd.DataFrame(), + filename: str = "plot", + show_total_in_twinx: bool = False, + annotate_outliers: bool = True, + min_y_thresh: float | None = None, + only_samples_to_plot: list[str] | None = None, + rename_samples: list[str] | None = None, + reorder_samples: list[str] | None = None, + item_to_color_to_hatch: pd.DataFrame | None = None, + yt_sum_label: str = "total\n(right axis)", + **kwargs, +) -> MyFigure: + + # create folder where Plots are stored + out_path = plib.Path(Project.out_path, "df_plots") + out_path.mkdir(parents=True, exist_ok=True) + if only_samples_to_plot is not None: + df_ave = df_ave.loc[only_samples_to_plot, :].copy() + if not df_std.empty: + df_std = df_std.loc[only_samples_to_plot, :].copy() + + if rename_samples is not None: + df_ave.index = rename_samples + if not df_std.empty: + df_std.index = rename_samples + + if reorder_samples is not None: + filtered_reorder_samples = [ + idx for idx in reorder_samples if idx in df_ave.index + ] + df_ave = df_ave.reindex(filtered_reorder_samples) + if not df_std.empty: + df_std = df_std.reindex(filtered_reorder_samples) + if reorder_samples is not None: + filtered_reorder_samples = [ + idx for idx in reorder_samples if idx in df_ave.index + ] + df_ave = df_ave.reindex(filtered_reorder_samples) + if not df_std.empty: + df_std = df_std.reindex(filtered_reorder_samples) + + if min_y_thresh is not None: + df_ave = df_ave.loc[:, (df_ave > min_y_thresh).any(axis=0)].copy() + if not df_std.empty: + df_std = df_std.loc[:, df_ave.columns].copy() + + if item_to_color_to_hatch is not None: # specific color and hatches to each fg + colors = [item_to_color_to_hatch.loc[item, "clr"] for item in df_ave.columns] + hatches = [item_to_color_to_hatch.loc[item, "htch"] for item in df_ave.columns] + else: # no specific colors and hatches specified + colors = sns.color_palette(color_palette, df_ave.shape[1]) + hatches = htchs + + if show_total_in_twinx: + plot_twinx: bool = True + else: + plot_twinx: bool = False + + if show_total_in_twinx: + legend_x_anchor += 0.14 + yt_lab = y_lab + + myfig = MyFigure( + rows=1, + cols=1, + twinx=plot_twinx, + text_font=Project.plot_font, + y_lab=y_lab, + yt_lab=yt_lab, + y_lim=y_lim, + legend=False, + grid=Project.plot_grid, + **kwargs, + ) + if df_std.isna().all().all() or df_std.empty: # means that no std is provided + df_ave.plot( + ax=myfig.axs[0], + kind="bar", + rot=x_label_rotation, + width=0.9, + edgecolor="k", + legend=False, + capsize=3, + color=colors, + ) + bars = myfig.axs[0].patches # needed to add patches to the bars + n_different_hatches = int(len(bars) / df_ave.shape[0]) + else: # no legend is represented but non-significant values are shaded + mask = (df_ave.abs() > df_std.abs()) | df_std.isna() + + df_ave[mask].plot( + ax=myfig.axs[0], + kind="bar", + rot=x_label_rotation, + width=0.9, + edgecolor="k", + legend=False, + yerr=df_std[mask], + capsize=3, + color=colors, + label="_nolegend", + ) + df_ave[~mask].plot( + ax=myfig.axs[0], + kind="bar", + rot=x_label_rotation, + width=0.9, + legend=False, + edgecolor="grey", + color=colors, + alpha=0.5, + label="_nolegend", + ) + bars = myfig.axs[0].patches # needed to add patches to the bars + n_different_hatches = int(len(bars) / df_ave.shape[0] / 2) + if show_total_in_twinx: + myfig.axts[0].scatter( + df_ave.index, + df_ave.sum(axis=1).values, + color="k", + linestyle="None", + edgecolor="k", + facecolor="grey", + s=100, + label=yt_sum_label, + alpha=0.5, + ) + if not df_std.empty: + myfig.axts[0].errorbar( + df_ave.index, + df_ave.sum(axis=1).values, + df_std.sum(axis=1).values, + capsize=3, + linestyle="None", + color="grey", + ecolor="k", + ) + bar_hatches = [] + # get a list with the hatches + for h in hatches[:n_different_hatches] + hatches[:n_different_hatches]: + for n in range(df_ave.shape[0]): # htcs repeated for samples + bar_hatches.append(h) # append based on samples number + for bar, hatch in zip(bars, bar_hatches): # assign hatches to each bar + bar.set_hatch(hatch) + myfig.axs[0].set(xlabel=None) + if x_label_rotation != 0: + myfig.axs[0].set_xticklabels( + df_ave.index, rotation=x_label_rotation, ha="right", rotation_mode="anchor" + ) + if legend_location is not None: + hnd_ax, lab_ax = myfig.axs[0].get_legend_handles_labels() + if not df_std.empty: + hnd_ax = hnd_ax[: len(hnd_ax) // 2] + lab_ax = lab_ax[: len(lab_ax) // 2] + if legend_labelspacing > 0.5: # large legend spacing for molecules + myfig.axs[0].plot(np.nan, np.nan, "-", color="None", label=" ") + hhhh, aaaa = myfig.axs[0].get_legend_handles_labels() + hnd_ax.append(hhhh[0]) + lab_ax.append(aaaa[0]) + if show_total_in_twinx: + hnd_axt, lab_axt = myfig.axts[0].get_legend_handles_labels() + else: + hnd_axt, lab_axt = [], [] + if legend_location == "outside": # legend goes outside of plot area + myfig.axs[0].legend( + hnd_ax + hnd_axt, + lab_ax + lab_axt, + loc="upper left", + ncol=legend_columns, + bbox_to_anchor=(legend_x_anchor, legend_y_anchor), + labelspacing=legend_labelspacing, + ) + else: # legend is inside of plot area + myfig.axs[0].legend( + hnd_ax + hnd_axt, + lab_ax + lab_axt, + loc=legend_location, + ncol=legend_columns, + labelspacing=legend_labelspacing, + ) + # annotate ave+-std at the top of outliers bar (exceeding y_lim) + if annotate_outliers and (y_lim is not None): # and (not df_std.empty): + _annotate_outliers_in_plot(myfig.axs[0], df_ave, df_std, y_lim) + myfig.save_figure(filename, out_path) + return myfig