
fixed RCS example based on new version
mpecchi committed Mar 29, 2024
1 parent 661f0ff commit f14c6f7
Showing 1 changed file with 199 additions and 103 deletions: RCSdata/RCS_gcms_data_analysis.py
# ... opening lines of the file (start of the module docstring) collapsed in the diff view ...
@author Matteo Pecchi (mp933@cornell.edu).
"""

# =============================================================================
# # necessary packages, install them using conda (not pip)
# =============================================================================
import pathlib as plib
from itertools import combinations

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import DataStructs
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from gcms_data_analysis import Project
from gcms_data_analysis.plotting import plot_ave_std, MyFigure

folder_path = plib.Path(plib.Path(__file__).parent, "data")


# %%
def get_calibration_error(name0, name1, calibration, xrange=[10, 200], steps=100):
    cols_cal_area = [c for c in list(calibration) if "Area" in c]
    cols_cal_ppms = [c for c in list(calibration) if "PPM" in c]
    calibration[cols_cal_area + cols_cal_ppms] = calibration[
        cols_cal_area + cols_cal_ppms
    ].apply(pd.to_numeric, errors="coerce")
    cal_areas0 = calibration.loc[name0, cols_cal_area].to_numpy(dtype=float)
    cal_ppms0 = calibration.loc[name0, cols_cal_ppms].to_numpy(dtype=float)
    # linear fit of calibration curve (exclude nan), get ppm from area
    fit0 = np.polyfit(
        cal_areas0[~np.isnan(cal_areas0)], cal_ppms0[~np.isnan(cal_ppms0)], 1
    )
    cal_areas1 = calibration.loc[name1, cols_cal_area].to_numpy(dtype=float)
    cal_ppms1 = calibration.loc[name1, cols_cal_ppms].to_numpy(dtype=float)
    # linear fit of calibration curve (exclude nan), get ppm from area
    fit1 = np.polyfit(
        cal_areas1[~np.isnan(cal_areas1)], cal_ppms1[~np.isnan(cal_ppms1)], 1
    )
    # evaluate both fits on the same area grid (steps is used as the arange step size)
    x = np.arange(xrange[0], xrange[1], steps)
    line0 = np.poly1d(fit0)(x)
    line1 = np.poly1d(fit1)(x)
    # despite the name, mse holds the average percent difference between the two
    # calibration lines (relative to the second compound's fit)
    mse = np.average(abs(line0 - line1) / line1) * 100
    return mse
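
# A minimal, self-contained sketch (not part of the original script) of what
# get_calibration_error returns: the average percent gap between the linear
# area-to-ppm fits of two calibration compounds, evaluated over a shared area
# range. The tiny DataFrame below is hypothetical and only mimics the
# "Area n" / "PPM n" column layout the function expects.
def _calibration_error_demo():
    demo_cal = pd.DataFrame(
        {
            "Area 1": [1000.0, 1000.0],
            "Area 2": [2000.0, 2000.0],
            "PPM 1": [10.0, 12.0],
            "PPM 2": [20.0, 24.0],
        },
        index=["compound_a", "compound_b"],
    )
    # compound_b responds 1.2x higher than compound_a at every area, so the
    # error reported here is ~16.7 % (the gap measured relative to compound_b)
    return get_calibration_error("compound_a", "compound_b", demo_cal)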

# %%

Project.set_folder_path(folder_path)
# Set the base folder path for the project's data files
# ... unchanged lines collapsed in the diff view (the `gcms` Project instance
# used below is presumably created in this hidden section) ...

# Load calibration data for standard and derivatized samples, and determine if they are derivatized
calibrations, is_calibr_deriv = gcms.load_calibrations()
c1, c2 = calibrations["calibration88"], calibrations["deriv_calibration11"]

# Generate a comprehensive list of all compounds found across samples
list_of_all_compounds = gcms.create_list_of_all_compounds()
# ... unchanged lines collapsed in the diff view ...
files, is_files_deriv = gcms.apply_calibration_to_files()

# Extract specific files for detailed analysis or further operations
f11, f22, f33 = files["A_1"], files["Ader_1"], files["B_1"]

# Add statistical information to the files_info DataFrame, such as mean, median, and standard deviation for each file
files_info = gcms.add_stats_to_files_info()
# ... unchanged lines collapsed in the diff view ...

# Create samples and their standard deviations from the files, storing the results in dictionaries
samples, samples_std = gcms.create_samples_from_files()
s1, s2, s3 = samples["A"], samples["Ader"], samples["B"]
sd1, sd2, sd3 = samples_std["A"], samples_std["Ader"], samples_std["B"]
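
# Added note (assumption, not part of the original script): each entry of
# `samples` appears to be the replicate-averaged table for one sample (e.g.
# files A_1, A_2, ... averaged into sample "A"), while `samples_std` holds the
# matching standard deviations used for the error bars in the plots below.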

# Generate reports for specific parameters (e.g., concentration, mass fraction) for files and samples
rep_files_conc = gcms.create_files_param_report(param="conc_vial_mg_L")
rep_files_fr = gcms.create_files_param_report(param="fraction_of_sample_fr")
rep_samples_conc, rep_samples_conc_std = gcms.create_samples_param_report(
    param="conc_vial_mg_L"
)
rep_samples_fr, rep_samples_fr_std = gcms.create_samples_param_report(
    param="fraction_of_sample_fr"
)

# Generate aggregated reports based on functional groups for files and samples, for specific parameters
agg_files_conc = gcms.create_files_param_aggrrep(param="conc_vial_mg_L")
agg_files_fr = gcms.create_files_param_aggrrep(param="fraction_of_sample_fr")
agg_samples_conc, agg_samples_conc_std = gcms.create_samples_param_aggrrep(
    param="conc_vial_mg_L"
)
agg_samples_fr, agg_samples_fr_std = gcms.create_samples_param_aggrrep(
    param="fraction_of_sample_fr"
)

# %% Plotting results based on the generated reports, allowing for visual comparison of average values and standard deviations
# Plot results for individual files or samples

mf = plot_ave_std(
    gcms,
    width=8,
    height=4.5,
    param="fraction_of_sample_fr",
    min_y_thresh=0.05,
    files_or_samples="files",
    legend_location="best",
    x_label_rotation=30,
    filename="sample_fraction_files",
    # only_samples_to_plot=['A_1', 'A_2', 'Ader_1', 'B_2'],
    y_lim=[0, 0.4],
    annotate_lttrs="a",
    annotate_lttrs_xy=(-0.08, -0.08),
)
# %%
mf = plot_ave_std(
    gcms,
    width=7,
    height=4.5,
    param="fraction_of_sample_fr",
    min_y_thresh=0.05,
    files_or_samples="samples",
    legend_location="outside",
    filename="sample_fraction_samples",
    # only_samples_to_plot=['A_1', 'A_2', 'Ader_1', 'B_2'],
    y_lim=[0, 0.3],
    annotate_lttrs="b",
    annotate_lttrs_xy=(-0.2, -0.05),
)
# %%
# plot results based on the aggregated report
mf = plot_ave_std(
    gcms,
    width=7,
    height=4.5,
    param="fraction_of_sample_fr",
    aggr=True,
    files_or_samples="files",
    filename="sample_fraction_aggr_files",
    x_label_rotation=30,
    annotate_lttrs="c",
    min_y_thresh=0.01,  # yt_sum=True,
    y_lim=[0, 1],
    color_palette="Set2",
    annotate_lttrs_xy=(-0.08, -0.08),
)
# %%
mf = plot_ave_std(
    gcms,
    width=4.5,
    height=4.5,
    param="fraction_of_sample_fr",
    aggr=True,
    files_or_samples="samples",
    filename="sample_fraction_aggr_samples",
    annotate_lttrs="d",
    min_y_thresh=0.01,  # yt_sum=True,
    y_lim=[0, 1],
    color_palette="Set2",
    annotate_lttrs_xy=(-0.15, -0.05),
)
# %%
mf = plot_ave_std(
    gcms,
    width=8,
    height=4.5,
    param="fraction_of_sample_fr",
    min_y_thresh=0.01,
    legend_location="outside",
    only_samples_to_plot=["A", "Ader", "B"],
    y_lim=[0, 0.3],
)
# %% plot results based on the aggregated report
mf = plot_ave_std(
    gcms,
    width=4.5,
    height=4.5,
    param="fraction_of_sample_fr",
    aggr=True,
    min_y_thresh=0.01,
    y_lim=[0, 0.5],
    color_palette="Set2",
)
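
# Added note (assumption, not part of the original script): the annotate_lttrs
# values ("a"-"d") appear to label the individual panels so the exported
# figures can be assembled into a multi-panel figure, and annotate_lttrs_xy
# seems to offset the letter position in axes coordinates.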

# %%
run_tanimoto_analysis = True
if run_tanimoto_analysis:
    in_path = folder_path
    out_path_cal = plib.Path(folder_path, "output_tanimoto")
    out_path_cal.mkdir(parents=True, exist_ok=True)
    calibration = pd.read_excel(
        plib.Path(in_path, "calibration88.xlsx"), engine="openpyxl", index_col="Name"
    )

    combs = combinations(calibration.index.tolist(), 2)
    tanimoto_error = pd.DataFrame(
        columns=["CalErr", "DistMW", "TanimS"], index=range(3915)
    )
    for c, (name0, name1) in enumerate(combs):
        tanimoto_error.loc[c, "CalErr"] = get_calibration_error(
            name0, name1, calibration
        )
        tanimoto_error.loc[c, "DistMW"] = abs(
            calibration.loc[name0, "MW"] - calibration.loc[name1, "MW"]
        )
        try:
            smis = [
                calibration.loc[name0, "canonical_smiles"],
                calibration.loc[name1, "canonical_smiles"],
            ]
            mols = [Chem.MolFromSmiles(smi) for smi in smis]
            fps = [GetMorganFingerprintAsBitVect(mol, 2, nBits=1024) for mol in mols]
            # compute the Tanimoto similarity between the two fingerprints
            tanimoto_error.loc[c, "TanimS"] = DataStructs.TanimotoSimilarity(
                fps[0], fps[1]
            )
        except TypeError:
            tanimoto_error.loc[c, "TanimS"] = np.nan
    tanimoto_error.to_excel(plib.Path(out_path_cal, "tanimoto_error.xlsx"))
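
    # Added aside (not part of the original script): the fingerprint comparison
    # used in the loop above, shown for a single pair of SMILES (tetradecanoic
    # vs. dodecanoic acid, hardcoded here purely as an illustration). Morgan
    # fingerprints (radius 2, 1024 bits) are compared with the Tanimoto
    # similarity, which ranges from 0 (no bits in common) to 1 (identical).
    _demo_smis = ["CCCCCCCCCCCCCC(=O)O", "CCCCCCCCCCCC(=O)O"]
    _demo_fps = [
        GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smi), 2, nBits=1024)
        for smi in _demo_smis
    ]
    _demo_similarity = DataStructs.TanimotoSimilarity(_demo_fps[0], _demo_fps[1])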
    myfig = MyFigure(
        rows=1,
        cols=1,
        width=7,
        height=6,
        x_lab="Tanimoto Similarity [-]",
        x_lim=[0, 1],
        y_lab="Average error [%]",
    )
    aa = myfig.axs[0].scatter(
        tanimoto_error["TanimS"].values,
        tanimoto_error["CalErr"].values,
        c=tanimoto_error["DistMW"].values,
    )
    myfig.axs[0].set_yscale("log")
    plt.colorbar(aa, label=r"$\Delta$MW [atomic mass unit]")
    plt.hlines(y=100, xmin=0, xmax=1, color="grey", linestyle="dotted")
    plt.vlines(x=0.4, ymin=0, ymax=100, color="grey", linestyle="dashed")
    myfig.axs[0].annotate(
        "default\nsetting",
        ha="left",
        va="bottom",
        xycoords="axes fraction",
        xy=(0.3, 0.01),
    )
    myfig.axs[0].annotate(
        "Error = 100%", ha="left", va="bottom", xycoords="axes fraction", xy=(0.8, 0.6)
    )
    myfig.save_figure(filename="tanimoto_error", out_path=out_path_cal)

    # create and export the similarity table for tetradecanoic acid
    cpmnds = gcms.compounds_properties.set_index("iupac_name")
    cpmnds = cpmnds[~cpmnds.index.duplicated(keep="first")].copy()
    iupac = cpmnds.index[0]
    mws = [cpmnds.loc[iupac, "molecular_weight"]]
    smis = [cpmnds.loc[iupac, "canonical_smiles"]]
    names_cal = [iupac]
    # then add the properties of the next five compounds in the list
    for c in cpmnds.index.tolist()[1:6]:
        names_cal.append(c)
        smis.append(cpmnds.loc[c, "canonical_smiles"])
        mws.append(cpmnds.loc[c, "molecular_weight"])
    # calculate the delta mw with all calib compounds
    delta_mw = np.abs(np.asarray(mws)[0] - np.asarray(mws)[1:])
    # get mols and fingerprints from rdkit for each comp
    mols = [Chem.MolFromSmiles(smi) for smi in smis]
    fps = [GetMorganFingerprintAsBitVect(ml, 2, nBits=1024) for ml in mols]
    # perform Tanimoto similarity between the first and all other compounds
    s = DataStructs.BulkTanimotoSimilarity(fps[0], fps[1:])
    # create a df with results
    df_sim = pd.DataFrame(
        data={
            "name": names_cal[1:],
            "smiles": smis[1:],
            "Similarity": s,
            "delta_mw": delta_mw,
        }
    )
    # use the reference compound as the index title
    df_sim.set_index("name", inplace=True)
    df_sim.index.name = iupac
    # sort values based on similarity and delta mw
    df_sim = df_sim.sort_values(["Similarity", "delta_mw"], ascending=[False, True])
    df_sim.to_excel(plib.Path(out_path_cal, "similarity_table_tetradecanoic.xlsx"))
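
    # Added note (assumption, not part of the original script): after sorting,
    # the first row of df_sim is the calibrated compound most chemically
    # similar to the reference (highest Tanimoto similarity, ties broken by the
    # smallest molecular-weight difference), which is the natural pick for a
    # calibration surrogate.
    _closest_match = df_sim.index[0]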
