Skip to content

Commit

Permalink
added reports creation, saving
Browse files Browse the repository at this point in the history
  • Loading branch information
mpecchi committed May 7, 2024
1 parent 925d535 commit 01044f1
Show file tree
Hide file tree
Showing 5 changed files with 241 additions and 50 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ example/data/deriv_compounds_properties.xlsx
tests/data_for_testing/output/
RCSdata/data/output/
RCSdata/data/output_tanimoto/
**/output/
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
193 changes: 147 additions & 46 deletions src/gcms_data_analysis/gcms.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,6 @@ def __init__(
self.samples_info: pd.DataFrame | None = None
self.samples_info_std: pd.DataFrame | None = None

self.list_of_files_param_reports = []
self.list_of_files_param_aggrreps = []
self.list_of_samples_param_reports = []
self.list_of_samples_param_aggrreps = []
self.files: dict[str, pd.DataFrame] = {}
self.samples: dict[str, pd.DataFrame] = {}
self.samples_std: dict[str, pd.DataFrame] = {}
Expand Down Expand Up @@ -806,55 +802,160 @@ def create_files_param_report(self, param="conc_vial_mg_L"):
# Reindex the DataFrame to include all unique indices, filling missing values with 0
rep = rep.fillna(0)
rep = rep.sort_index(key=rep.max(axis=1).get, ascending=False)
rep = rep.loc[:, rep.any(axis=0)]
# remove null columns from rep
rep = rep.loc[rep.any(axis=1), :]
# Save and return the report
self.files_reports[param] = rep
self.list_of_files_param_reports.append(param)
return self.files_reports[param]

def create_samples_param_report(self, param="conc_vial_mg_L"):
"""
Create two reports that consolidate the average and standard deviation of a specified parameter
from different sample DataFrames, assuming both sets of DataFrames share the same indices.
:param param: The parameter to extract from each DataFrame. Defaults to "conc_vial_mg_L".
:return: A tuple of two DataFrames containing the consolidated averages and standard deviations.
"""
if not self.samples:
self.create_samples_from_files()

series_dict = {
samplename: self.samples[samplename][param].rename(samplename)
for samplename in self.samples_info.index
if param in self.samples[samplename].columns
}
series_dict_std = {
samplename: self.samples_std[samplename][param].rename(samplename)
for samplename in self.samples_info.index
if param in self.samples_std[samplename].columns
}
# Get the union of all indices from the individual sample DataFrames (assuming indices are the same for std and avg)
rep = pd.concat(
series_dict.values(), axis=1, keys=series_dict.keys(), join="outer"
)
rep_std = pd.concat(
series_dict_std.values(), axis=1, keys=series_dict_std.keys(), join="outer"
def create_files_param_aggrrep(self, param="conc_vial_mg_L"):
    """Aggregate the files report of ``param`` by functional group.

    Each compound contributes to every functional group in proportion to
    the mass fraction of that group it carries: the ``fg_mf_*`` columns of
    ``self.compounds_properties`` (``fg_mf_total`` excluded) act as weights
    on the values of ``self.files_reports[param]``. Rows labelled
    "unidentified" in the report have no functional-group information and
    are left out of the aggregation.

    :param param: parameter to aggregate; must be in ``self.acceptable_params``.
    :return: DataFrame with functional groups as rows and files (index of
        ``self.files_info``) as columns; rows containing only zeros are
        dropped and the remaining rows are sorted by descending row maximum.
        The same object is stored in ``self.files_aggrreps[param]``.
    :raises ValueError: if ``param`` is not an acceptable parameter.
    :raises KeyError: if an identified compound of the report has no row in
        ``self.compounds_properties`` or a file has no column in the report.
    """
    print("Info: create_param_aggrrep: ", param)
    if param not in self.acceptable_params:
        raise ValueError(f"{param = } is not an acceptable param")
    if param not in self.files_reports:
        self.create_files_param_report(param)

    report = self.files_reports[param]
    filenames = self.files_info.index.tolist()
    # keep one row per compound so weights and signals stay aligned
    comps_df = self.compounds_properties
    comps_df = comps_df[~comps_df.index.duplicated(keep="first")]

    # fg = functional group, mf = mass fraction
    fg_mf_labs = [
        c for c in comps_df.columns if c.startswith("fg_mf_") and c != "fg_mf_total"
    ]
    fg_labs = [c[len("fg_mf_"):] for c in fg_mf_labs]

    # "unidentified" has no entry in the compound properties: exclude it
    # from both the signals and the weights instead of looking it up
    identified = report.index != "unidentified"
    signals = report.loc[identified, filenames].astype(float).fillna(0.0)
    # weights are always float: casting them to an integer signal dtype
    # would truncate fractional mass fractions
    weights = comps_df.loc[signals.index, fg_mf_labs].astype(float).fillna(0.0)

    # (groups x compounds) @ (compounds x files) -> weighted sum per group/file
    aggrrep = pd.DataFrame(
        weights.to_numpy().T @ signals.to_numpy(),
        index=fg_labs,
        columns=filenames,
    )
    aggrrep = aggrrep.loc[aggrrep.any(axis=1), :]  # drop rows with only 0
    order = aggrrep.max(axis=1).sort_values(ascending=False, kind="stable").index
    aggrrep = aggrrep.loc[order]
    self.files_aggrreps[param] = aggrrep
    return aggrrep

def create_samples_param_report(self, param: str = "conc_vial_mg_L"):
    """Build the per-sample average and standard-deviation reports of ``param``.

    The files report of ``param`` is created if missing, its columns are
    renamed from file names to the ``samplename`` given in
    ``self.files_info``, and columns sharing a sample name (replicates) are
    reduced to their mean and their standard deviation.

    :param param: parameter to report; must be in ``self.acceptable_params``.
    :return: tuple ``(average, std)`` of DataFrames with compounds as rows
        and sample names as columns; both are also stored in
        ``self.samples_reports[param]`` and ``self.samples_reports_std[param]``.
    :raises ValueError: if ``param`` is not an acceptable parameter.
    """
    print(f"Info: create_samples_param_report: {param = }")
    if param not in self.acceptable_params:
        raise ValueError(f"{param = } is not an acceptable param")
    if param not in self.files_reports:
        self.create_files_param_report(param)
    file_to_sample_rename = dict(
        zip(self.files_info.index.tolist(), self.files_info["samplename"])
    )
    # rename() returns a new frame, so the stored files report is untouched
    filerep = self.files_reports[param].rename(columns=file_to_sample_rename)
    # transpose so replicates are rows, then group rows by sample name
    replicates = filerep.T.groupby(by=filerep.columns)
    self.samples_reports[param] = replicates.mean().T
    self.samples_reports_std[param] = replicates.std().T
    return self.samples_reports[param], self.samples_reports_std[param]

def create_samples_param_aggrrep(self, param: str = "conc_vial_mg_L"):
    """Build the per-sample average and standard deviation of the aggregated report.

    The files aggregated report of ``param`` is created if missing, its
    columns are renamed from file names to the ``samplename`` given in
    ``self.files_info``, and columns sharing a sample name are reduced to
    their mean and their standard deviation.

    :param param: parameter to aggregate; must be in ``self.acceptable_params``.
    :return: tuple ``(average, std)`` of DataFrames, also stored in
        ``self.samples_aggrreps[param]`` and ``self.samples_aggrreps_std[param]``.
    :raises ValueError: if ``param`` is not an acceptable parameter.
    """
    print(f"Info: create_samples_param_aggrrep: {param = }")
    if param not in self.acceptable_params:
        raise ValueError(f"{param = } is not an acceptable param")
    if param not in self.files_aggrreps:
        self.create_files_param_aggrrep(param)

    samplename_of_file = {
        filename: samplename
        for filename, samplename in zip(
            self.files_info.index.tolist(), self.files_info["samplename"]
        )
    }
    # rename() hands back a new frame; the files-level report is not modified
    per_file = self.files_aggrreps[param].rename(columns=samplename_of_file)
    sample_labels = per_file.columns

    average = per_file.T.groupby(by=sample_labels).mean().T
    self.samples_aggrreps[param] = average
    deviation = per_file.T.groupby(by=sample_labels).std().T
    self.samples_aggrreps_std[param] = deviation
    return self.samples_aggrreps[param], self.samples_aggrreps_std[param]

def save_files_samples_reports(self):
    """Write every available table of the project to Excel under ``self.out_path``.

    Creates the output folder and its subfolders (``files``, ``samples``,
    ``files_reports``, ``files_aggrreps``, ``samples_reports``,
    ``samples_aggrreps``) if missing, then saves:

    * ``files_info``, ``samples_info`` and ``samples_info_std`` (each only
      when it is not None) in the output folder;
    * one workbook per file and per sample (``<name>.xlsx`` and
      ``<name>_std.xlsx``);
    * one workbook per parameter for every report and aggregated report
      (``report_files_<param>.xlsx``, ``aggrrep_samples_<param>_std.xlsx``, ...).

    Empty collections produce no files.
    """
    out_path = plib.Path(self.out_path)
    for subfolder in (
        "",
        "files",
        "samples",
        "files_reports",
        "files_aggrreps",
        "samples_reports",
        "samples_aggrreps",
    ):
        plib.Path(out_path, subfolder).mkdir(parents=True, exist_ok=True)

    # info tables go to the general output folder; each one is checked on
    # its own so a missing std table cannot raise on a None attribute
    for stem in ("files_info", "samples_info", "samples_info_std"):
        table = getattr(self, stem)
        if table is not None:
            table.to_excel(out_path / f"{stem}.xlsx")

    # (subfolder, {key: DataFrame}, file name prefix, file name suffix)
    exports = (
        ("files", self.files, "", ""),
        ("samples", self.samples, "", ""),
        ("samples", self.samples_std, "", "_std"),
        ("files_reports", self.files_reports, "report_files_", ""),
        ("files_aggrreps", self.files_aggrreps, "aggrrep_files_", ""),
        ("samples_reports", self.samples_reports, "report_samples_", ""),
        ("samples_reports", self.samples_reports_std, "report_samples_", "_std"),
        ("samples_aggrreps", self.samples_aggrreps, "aggrrep_samples_", ""),
        ("samples_aggrreps", self.samples_aggrreps_std, "aggrrep_samples_", "_std"),
    )
    for subfolder, tables, prefix, suffix in exports:
        for key, df in tables.items():
            df.to_excel(out_path / subfolder / f"{prefix}{key}{suffix}.xlsx")


def create_tanimoto_similarity_dict(
comp_smiles: str, calib_smiless: list[str]
Expand Down
6 changes: 3 additions & 3 deletions tests/data_minimal_case/T_3.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ Output Time 11:00:20 AM
# of Peaks 3
Mass TIC
Peak# Ret.Time Proc.From Proc.To Mass Area Height A/H Conc. Mark Name Ret. Index Area% Height% SI CAS #
1 13.703 13.580 13.900 TIC 0 0 1 0 V Phenol 2.59 3.01 97 108-95-2
2 20.942 20.767 21.020 TIC 150 150 1 0 V Naphthalene 8.11 9.20 98 91-20-3
3 21.426 21.373 21.500 TIC 1500 1500 1 0 V Dodecane 0.36 0.57 96 112-40-3
1 13.703 13.580 13.900 TIC 10 10 1 0 V Phenol 2.59 3.01 97 108-95-2
2 20.942 20.767 21.020 TIC 100 1000 1 0 V Naphthalene 8.11 9.20 98 91-20-3
3 21.426 21.373 21.500 TIC 1000 1000 1 0 V Dodecane 0.36 0.57 96 112-40-3
Binary file modified tests/data_minimal_case/compounds_properties.xlsx
Binary file not shown.
91 changes: 90 additions & 1 deletion tests/test_project_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,104 @@
si_ave, si_std = proj.create_samples_info()
# %%
s_ave, s_std = proj.create_single_sample_from_files(
    files_in_sample=[s1, s2], samplename="S"
)
# The sample built from s1 and s2 must report, for every acceptable param,
# the mean and the sample standard deviation (ddof=1) of the two replicates.
# A compound missing from s2 counts as 0 in that replicate.
for param in proj.acceptable_params:
    print(f"{param = }")
    # check that the average between s1 and s2 is the same as s_ave
    for compound in s1.index.drop("notvalidcomp").drop("dichlorobenzene"):
        print(f"\t {compound = }")
        if compound not in s2.index:
            assert np.isclose(
                s_ave.loc[compound, param],
                (s1.loc[compound, param] + 0) / 2,
            )
        else:
            assert np.isclose(
                s_ave.loc[compound, param],
                (s1.loc[compound, param] + s2.loc[compound, param]) / 2,
            )
    # do the same for the standard deviation
    for compound in s1.index.drop("notvalidcomp").drop("dichlorobenzene"):
        if compound not in s2.index:
            assert np.isclose(
                s_std.loc[compound, param],
                np.std((s1.loc[compound, param], 0), ddof=1),
            )
        else:
            assert np.isclose(
                s_std.loc[compound, param],
                np.std((s1.loc[compound, param], s2.loc[compound, param]), ddof=1),
            )

# %%

# %%
samples, samples_std = proj.create_samples_from_files()

# %%
reph = proj.create_files_param_report(param="height")
repc = proj.create_files_param_report(param="conc_vial_mg_L")
# %%
# Files reports: each value must equal the one in the originating file table.
# A compound absent from the report is acceptable only when its original
# value is NaN or 0.
for param in proj.acceptable_params:
    print(f"{param=}")
    rep = proj.create_files_param_report(param)
    for filename, file in files.items():
        print(f"\t{filename=}")
        for compound in file.index:
            print(f"\t\t{compound=}")
            original_values = file.loc[compound, param]
            try:
                report_values = rep.loc[compound, filename]
            except KeyError:
                assert np.isnan(original_values) or original_values == 0
            else:
                assert np.allclose(original_values, report_values)
# %%
# Samples reports: averages and standard deviations must equal the values
# stored in the per-sample tables, with the same tolerance for absent rows.
for param in proj.acceptable_params:
    print(f"{param=}")
    rep, rep_std = proj.create_samples_param_report(param)
    for samplename, sample in samples.items():
        sample_std = proj.samples_std[samplename]
        print(f"\t{samplename=}")
        for compound in sample.index:
            print(f"\t\t{compound=}")
            original_values = sample.loc[compound, param]
            original_values_std = sample_std.loc[compound, param]
            try:
                report_values = rep.loc[compound, samplename]
                report_values_std = rep_std.loc[compound, samplename]
            except KeyError:
                assert np.isnan(original_values) or original_values == 0
            else:
                assert np.allclose(original_values, report_values)
                assert np.allclose(original_values_std, report_values_std)


# %%

# Show the files reports computed above.
for _table in (reph, repc):
    print(_table)
# %%
# Per-sample reports (average, standard deviation) for two parameters.
repsh, repsh_d = proj.create_samples_param_report(param="height")
repsc, repsc_d = proj.create_samples_param_report(param="conc_vial_mg_L")
for _table in (repsh, repsc):
    print(_table)
# %%
# Functional-group aggregation, first per file and then per sample.
aggh = proj.create_files_param_aggrrep(param="height")
aggc = proj.create_files_param_aggrrep(param="conc_vial_mg_L")
for _table in (aggh, aggc):
    print(_table)

aggsh, aggsh_d = proj.create_samples_param_aggrrep(param="height")
aggsc, aggsc_d = proj.create_samples_param_aggrrep(param="conc_vial_mg_L")
for _table in (aggsh, aggsh_d, aggsc, aggsc_d):
    print(_table)
# %%
# Write everything produced so far to the project output folder.
proj.save_files_samples_reports()

0 comments on commit 01044f1

Please sign in to comment.