In [1]:
import os
os.chdir("..")
os.getcwd()

'/Users/laurasisson/odor-pair'

In [2]:
import json
with open("dataset/full_large.json") as f:
    full_data = json.load(f)
len(full_data), full_data[0]

(268963,
 {'mol1': 'CC1=CC2C(C2(C)C)CC1C(=O)C',
  'mol2': 'CCCC(CC)O',
  'blend_notes': ['herbal']})

In [3]:
import collections
from tqdm.notebook import tqdm
all_smiles = collections.Counter()
for d in tqdm(full_data):
    all_smiles[d["mol1"]] += 1
    all_smiles[d["mol2"]] += 1
len(all_smiles), all_smiles.most_common(3)

  0%|          | 0/268963 [00:00<?, ?it/s]

(3430,
 [('', 2206),
  ('CC(CC1=CC2=C(C=C1)OCO2)C=O', 963),
  ('OC(C1OC(\\C=C)(CC1)C)(C)C', 927)])

In [4]:
from ogb.utils import smiles2graph

valid_smiles = set()
for smiles in tqdm(all_smiles.keys()):
    try:
        smiles2graph(smiles)
        valid_smiles.add(smiles)
    except AttributeError:
        print(smiles)
        continue
len(valid_smiles)

  0%|          | 0/3430 [00:00<?, ?it/s]

[15:16:15] SMILES Parse Error: syntax error while parsing: (C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC
[15:16:15] SMILES Parse Error: Failed parsing SMILES '(C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC' for input: '(C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC'
[15:16:15] SMILES Parse Error: syntax error while parsing: InChI=1/C7H8S/c1-6-4-2-3-5-7(6)8/h2-5,8H,1H3
[15:16:15] SMILES Parse Error: Failed parsing SMILES 'InChI=1/C7H8S/c1-6-4-2-3-5-7(6)8/h2-5,8H,1H3' for input: 'InChI=1/C7H8S/c1-6-4-2-3-5-7(6)8/h2-5,8H,1H3'


(C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC
InChI=1/C7H8S/c1-6-4-2-3-5-7(6)8/h2-5,8H,1H3
CC(=O)c1ccc(C)n1


[15:16:17] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 8


3427

In [5]:
from pubchempy import get_cids

smiles_to_cid = {}
for smiles in tqdm(valid_smiles):
    try:
        # Retrieve the first CID for the SMILES string
        cids = get_cids(smiles, namespace='smiles')
        if not cids:
            continue
        smiles_to_cid[smiles] = cids[0]
    except Exception as e:
        print(f"Error processing SMILES {smiles}: {e}")
next(iter(smiles_to_cid.items()))

  0%|          | 0/3427 [00:00<?, ?it/s]

Error processing SMILES : identifier/cid cannot be None


('CC(=O)OCC\\C=C\\C/C=C\\CC', 71587638)

In [6]:
import json

prediction_targets = []
for d in tqdm(full_data):
    try:
        cid1 = smiles_to_cid[d["mol1"]]
        cid2 = smiles_to_cid[d["mol2"]]
        descriptors = str(d["blend_notes"])
        prediction_targets.append({"CID.1":cid1,"CID.2":cid2,"Descriptors":descriptors})
    except KeyError:
        continue
prediction_targets[0]

  0%|          | 0/268963 [00:00<?, ?it/s]

{'CID.1': 44079, 'CID.2': 12178, 'Descriptors': "['herbal']"}

In [7]:
import pandas as pd
prediction_targets_df = pd.DataFrame(prediction_targets).sort_values(by=["CID.1","CID.2"])
prediction_targets_df.to_csv("pyrfume/prediction_targets_sisson_2024.csv",index=False)
prediction_targets_df

Unnamed: 0,CID.1,CID.2,Descriptors
28137,0,177,"['floral', 'pungent']"
28171,0,957,"['floral', 'waxy']"
247415,0,2879,['floral']
28042,0,3893,"['fatty', 'floral']"
28051,0,6544,"['floral', 'green']"
...,...,...,...
13126,152743294,26447,['minty']
13123,152743294,29025,['camphoreous']
9418,152743294,439263,['mentholic']
13125,152743294,11615984,['cooling']


In [8]:
from ast import literal_eval

df = pd.read_csv("pyrfume/prediction_targets_sisson_2024.csv")
df["Descriptors"] = df["Descriptors"].apply(literal_eval)
df

Unnamed: 0,CID.1,CID.2,Descriptors
0,0,177,"[floral, pungent]"
1,0,957,"[floral, waxy]"
2,0,2879,[floral]
3,0,3893,"[fatty, floral]"
4,0,6544,"[floral, green]"
...,...,...,...
266445,152743294,26447,[minty]
266446,152743294,29025,[camphoreous]
266447,152743294,439263,[mentholic]
266448,152743294,11615984,[cooling]


In [9]:
# Generate index
cid_to_smiles = {v:k for k,v in smiles_to_cid.items()}
# Build dataframe
molecules_df = pd.DataFrame(list(cid_to_smiles.items()),columns=["CID","SMILES"]).set_index("CID")
# Count frequency
molecules_df["Frequency"] = molecules_df["SMILES"].map(all_smiles)
# Order columns
molecules_df = molecules_df[["Frequency","SMILES"]].sort_index()
molecules_df.to_csv("pyrfume/molecules.csv",index=False)
molecules_df

Unnamed: 0_level_0,Frequency,SMILES
CID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,12,O[C@H]([C@@H](O)C(=O)NCCO)[C@@H](O)[C@H](O)CO
4,5,CC(CN)O
58,175,CCC(=O)C(=O)O
107,35,C1=CC=C(C=C1)CCC(=O)O
119,70,C(CC(=O)O)CN
...,...,...
118437152,11,CC(C)C1CCC(S1)(C)CCOC(=O)C
124155846,94,CC1CC(C(O1)C)SC(=O)C
131751266,13,CC1/N=C(\SC1CC)C(C)CC
131751267,13,CC1/N=C(/CC(C)C)SC1CC
