Skip to content

Commit

Permalink
updated similarity
Browse files Browse the repository at this point in the history
  • Loading branch information
mpecchi committed May 5, 2024
1 parent 029c898 commit 9b74ef9
Show file tree
Hide file tree
Showing 6 changed files with 129 additions and 17 deletions.
60 changes: 54 additions & 6 deletions src/gcms_data_analysis/gcms.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ def __init__(

self.list_of_all_compounds: list[str] | None = None
self.compounds_properties: pd.DataFrame | None = None
self.dict_names_to_iupacs: dict[str, str] | None = None

self.deriv_list_of_all_compounds: list[str] | None = None
self.deriv_files_present: bool = False
Expand Down Expand Up @@ -306,11 +307,9 @@ def create_list_of_all_compounds(self):
self.load_all_files()
if not self.calibrations:
self.load_calibrations()
all_dfs_with_comps = []
for file in self.files.values():
all_dfs_with_comps.append(file)
for calib in self.calibrations.values():
all_dfs_with_comps.append(calib)
all_dfs_with_comps = [f for f in self.files.values()] + [
f for f in self.calibrations.values()
]
# non-derivatized compounds
all_compounds: pd.DataFrame = pd.concat(all_dfs_with_comps)

Expand All @@ -336,7 +335,7 @@ def create_compounds_properties(

if self.dict_classes_to_codes is None:
self.load_class_code_frac()
if not self.list_of_all_compounds is None:
if self.list_of_all_compounds is None:
self.create_list_of_all_compounds()
# cpdf = pd.DataFrame(index=pd.Index(self.list_of_all_compounds))
#
Expand Down Expand Up @@ -373,6 +372,55 @@ def load_compounds_properties(self) -> pd.DataFrame:
cpdf = self.create_compounds_properties()
return self.compounds_properties

def create_dict_names_to_iupacs(self) -> dict[str, str]:
    """Build and cache the mapping from compound names to IUPAC names.

    The compounds properties are loaded first when they are not available
    yet. Their ``iupac_name`` column is then converted to a dictionary
    keyed by the dataframe index and stored in ``self.dict_names_to_iupacs``.

    :return: dictionary mapping each index label of the compounds
        properties to the value of its ``iupac_name`` column.
    """
    if self.compounds_properties is None:
        self.load_compounds_properties()
    iupac_column = self.compounds_properties["iupac_name"]
    names_to_iupacs = iupac_column.to_dict()
    # keep the mapping on the instance so later steps can reuse it
    self.dict_names_to_iupacs = names_to_iupacs
    return names_to_iupacs

def add_iupac_to_files_and_calibrations(self):
    """Add an ``iupac_name`` column to every loaded file and calibration.

    The compound names in the index of each dataframe are translated
    through ``self.dict_names_to_iupacs``. Files, calibrations, compounds
    properties and the name-to-IUPAC dictionary are loaded or created
    first when they are not available yet.

    :return: tuple ``(self.files, self.calibrations)`` holding the
        updated dataframes.
    """
    if not self.files:
        self.load_all_files()
    # calibrations are iterated below, so they must be available as well
    if not self.calibrations:
        self.load_calibrations()
    if self.compounds_properties is None:
        self.load_compounds_properties()
    # Index.map(None) raises a TypeError: build the mapping before using it
    if self.dict_names_to_iupacs is None:
        self.create_dict_names_to_iupacs()
    for dataframe in (*self.files.values(), *self.calibrations.values()):
        dataframe["iupac_name"] = dataframe.index.map(self.dict_names_to_iupacs)
    return self.files, self.calibrations

# def apply_calibration_to_files(self):
# """Applies the appropriate calibration curve to each compound
# in the loaded files, adjusting concentrations based on calibration
# data, and updates the 'files' attribute with calibrated data."""
# print("Info: apply_calibration_to_files: loop started")
# if not self.files:
# self.load_all_files()
# if not self.calibrations:
# self.load_calibrations()
# if not self.iupac_to_files_added:
# _, _ = self.add_iupac_to_files()

# for filename, _ in self.files.items():
# calibration_name = self.files_info.loc[filename, "calibration_file"]
# calibration = self.calibrations[calibration_name]
# if not self.is_files_deriv[filename]:
# df_comps = self.compounds_properties
# else:
# df_comps = self.deriv_compounds_properties
# file = self._apply_calib_to_file(filename, calibration, df_comps)
# if Project.auto_save_to_excel:
# self.save_file(file, filename)
# self.calibration_to_files_applied = True
# return self.files, self.is_files_deriv


def create_tanimoto_matrix(smiles_list: list[str]):
    """Placeholder for building a Tanimoto matrix from ``smiles_list``.

    Not implemented yet: the argument is ignored and ``None`` is returned.
    """
    return None


def get_compound_from_pubchempy(comp_name: str) -> pcp.Compound:
if not isinstance(comp_name, str) or comp_name.isspace():
Expand Down
3 changes: 2 additions & 1 deletion tests/data_minimal_case/S_1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Output Time 11:00:20 AM
# of Peaks 3
Mass TIC
Peak# Ret.Time Proc.From Proc.To Mass Area Height A/H Conc. Mark Name Ret. Index Area% Height% SI CAS #
1 13.703 13.580 13.900 TIC 20 20 1 0 V Phenol 2.59 3.01 97 108-95-2
1 13.703 13.580 13.900 TIC 20 20 1 0 V Capric Acid 2.59 3.01 97 108-95-2
2 20.942 20.767 21.020 TIC 200 200 1 0 V Naphthalene 8.11 9.20 98 91-20-3
3 21.426 21.373 21.500 TIC 2000 2000 1 0 V Dodecane 0.36 0.57 96 112-40-3
4 22.426 22.373 22.500 TIC 1000 1000 1 0 V NotValidComp 0.36 0.57 96 112-40-3
3 changes: 2 additions & 1 deletion tests/data_minimal_case/S_2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Output Time 11:00:20 AM
# of Peaks 3
Mass TIC
Peak# Ret.Time Proc.From Proc.To Mass Area Height A/H Conc. Mark Name Ret. Index Area% Height% SI CAS #
1 13.703 13.580 13.900 TIC 40 40 1 0 V Phenol 2.59 3.01 97 108-95-2
1 13.703 13.580 13.900 TIC 40 40 1 0 V Capric Acid 2.59 3.01 97 108-95-2
2 20.942 20.767 21.020 TIC 400 400 1 0 V Naphthalene 8.11 9.20 98 91-20-3
3 21.426 21.373 21.500 TIC 4000 4000 1 0 V Dodecane 0.36 0.57 96 112-40-3
4 24.426 26.373 21.500 TIC 1000 1000 1 0 V Almost Oleic Acid 0.36 0.57 96 112-40-3
Binary file modified tests/data_minimal_case/cal_minimal.xlsx
Binary file not shown.
Binary file modified tests/data_minimal_case/compounds_properties.xlsx
Binary file not shown.
80 changes: 71 additions & 9 deletions tests/test_project_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# Project under test: built from folder_path with Excel auto-saving disabled
# and a rename rule mapping "almost oleic acid" to "oleic acid".
proj = Project(
folder_path=folder_path,
auto_save_to_excel=False,
# compounds_to_rename_in_files={"phenol": "catechol"},
compounds_to_rename_in_files={"almost oleic acid": "oleic acid"},
)

# check a couple of defaults
Expand All @@ -33,25 +33,87 @@

fic.calibration_file = fil.calibration_file # this cannot be updated automatically
assert_frame_equal(fil, fic, check_exact=False, atol=1e-5, rtol=1e-5)
# print(fil.columns)
# print(fic.columns)
# print(fil.index)
# print(fic.index)
# print(fil==fic)

# %%
# Run the loading steps of the project one by one (one step per cell).
files = proj.load_all_files()
# %%
ccf = proj.load_class_code_frac()

# %%
cal = proj.load_calibrations()
print(cal)
# %%
lac = proj.create_list_of_all_compounds()

# %%
# NOTE(review): create_compounds_properties is called twice in a row and the
# first result is overwritten — confirm that the first call is still wanted.
cpc = proj.create_compounds_properties(update_saved_files_info=False)
cpc = proj.create_compounds_properties(update_saved_files_info=True)
cpl = proj.load_compounds_properties()

# The created and the loaded compounds properties must match, within
# tolerance and without comparing dtypes.
assert_frame_equal(cpc, cpl, check_exact=False, atol=1e-5, rtol=1e-5, check_dtype=False)
# %%
# The name -> IUPAC dictionary must contain the renamed compound and the
# invalid compound as keys, and "decanoic acid" among the IUPAC names
# (presumably resolved from "Capric Acid" in the sample files — confirm).
dni = proj.create_dict_names_to_iupacs()
assert "oleic acid" in dni.keys()
assert "notvalidcomp" in dni.keys()
assert "decanoic acid" in dni.values()
# %%
files_iupac, calibration_iupac = proj.add_iupac_to_files_and_calibrations()
# %%
import pandas as pd
from rdkit import Chem
from rdkit.Chem import DataStructs
from rdkit.Chem.AllChem import (
GetMorganFingerprintAsBitVect,
) # pylint: disable=no-name-in-module


def create_tanimoto_similarity_dict(
    comp_smiles: str, calib_smiless: list[str]
) -> list[float]:
    """Compute the Tanimoto similarity of one compound to each calibration compound.

    Morgan fingerprints (radius 2, 1024 bits) are compared. The result is
    aligned with ``calib_smiless``: entries that are not strings or cannot
    be parsed as SMILES yield ``nan`` instead of being dropped, so the
    result can be assigned to a row with one column per calibration compound.

    :param comp_smiles: SMILES of the compound to compare.
    :param calib_smiless: SMILES of the calibration compounds.
    :return: list with one similarity per entry of ``calib_smiless``.
    :raises ValueError: if ``comp_smiles`` cannot be parsed as SMILES.
    """
    mol_comp = Chem.MolFromSmiles(comp_smiles)
    if mol_comp is None:
        raise ValueError(f"Invalid SMILES for compound: {comp_smiles!r}")
    fp_comp = GetMorganFingerprintAsBitVect(mol_comp, 2, nBits=1024)

    # Fingerprint only the valid calibration SMILES and remember where each
    # one sits in the input list, so the output keeps the input alignment.
    valid_positions = []
    fps_cal = []
    for position, smi in enumerate(calib_smiless):
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            continue
        valid_positions.append(position)
        fps_cal.append(GetMorganFingerprintAsBitVect(mol, 2, nBits=1024))

    # Tanimoto similarity between the compound and all valid calibration compounds
    valid_similarities = DataStructs.BulkTanimotoSimilarity(fp_comp, fps_cal)

    similarity = [float("nan")] * len(calib_smiless)
    for position, value in zip(valid_positions, valid_similarities):
        similarity[position] = value
    return similarity


# Calibrated compounds (index labels of the calibration table) and the SMILES
# stored for them in the compounds properties table.
calib_comp_iupacs = proj.calibrations["cal_minimal"].index.tolist()
calib_smiless = proj.compounds_properties.loc[
    calib_comp_iupacs, "canonical_smiles"
].tolist()
# One row per compound (labelled by IUPAC name), one column per calibrated compound.
tanimoto_similarity_df: pd.DataFrame = pd.DataFrame(
    index=proj.compounds_properties["iupac_name"],
    columns=calib_comp_iupacs,
)
for iupac, smiles in proj.compounds_properties[
    ["iupac_name", "canonical_smiles"]
].itertuples(index=False, name=None):
    # compounds without a SMILES string keep an empty row
    if not isinstance(smiles, str):
        continue
    sim = create_tanimoto_similarity_dict(smiles, calib_smiless)
    tanimoto_similarity_df.loc[iupac, :] = sim
# %%
# Similarity table labelled by IUPAC names on both axes.
# create_tanimoto_similarity_dict parses its arguments as SMILES, so the
# SMILES of each compound is looked up and passed instead of its IUPAC name.
compounds_properties = proj.compounds_properties
calibration = proj.calibrations["cal_minimal"]
calib_comp_iupacs = calibration["iupac_name"].tolist()
calib_smiless = compounds_properties.loc[
    calibration.index.tolist(), "canonical_smiles"
].tolist()
tanimoto_similarity_df: pd.DataFrame = pd.DataFrame(
    index=compounds_properties.iupac_name.tolist(),
    columns=calib_comp_iupacs,
)
for comp, smiles in zip(
    compounds_properties.iupac_name.tolist(),
    compounds_properties.canonical_smiles.tolist(),
):
    # compounds without a SMILES string cannot be fingerprinted: leave the row empty
    if not isinstance(smiles, str):
        continue
    print(comp)
    s = create_tanimoto_similarity_dict(smiles, calib_smiless)
    tanimoto_similarity_df.loc[comp, :] = s
# %%

# %%

0 comments on commit 9b74ef9

Please sign in to comment.