# Predicting odorant/odorless molecules
We will just load these molecules from https://github.com/pyrfume/pyrfume-data/tree/main/sharma_2021b

Then build a random forest to predict odor quality from Morgan Fingerprints.

You could also adapt this code to build on top of OpenPOM embeddings.

### Load data

In [1]:
import pandas as pd

print("Odorant Molecules")
odorants = pd.read_csv("sharma_2021b/odorants.csv")
# Assign target property
odorants["Odorant"] = 1
display(odorants)

print("Odorless Molecules")
odorless = pd.read_csv("sharma_2021b/odorless.csv")
odorless["Odorant"] = 0
display(odorless)

Odorant Molecules


Unnamed: 0,Sr.No.,Name,SMILES,CAS No,Mol. Wt.,Mol. Formula,PubChem,ZINC,#OR's,#Odors,Odorant
0,1,1-Aminopropan-2-ol,CC(CN)O,78-96-6,75.11,C3H9NO,4,ZINC4658592,0,1,1
1,2,3-methyl-2-oxobutanoic acid,CC(C)C(=O)C(=O)O,759-05-7,116.12,C5H8O3,49,ZINC1532553,0,1,1
2,3,2-Oxobutyric acid,CCC(=O)C(=O)O,600-18-0,102.09,C4H6O3,58,ZINC1532540,0,5,1
3,4,4-methyl-2-oxopentanoic acid,CC(C)CC(=O)C(=O)O,816-66-0,130.14,C6H10O3,70,ZINC1532578,0,1,1
4,5,"3,4-dihydroxybenzoic acid",C1=CC(=C(C=C1C(=O)O)O)O,99-50-3,154.12,C7H6O4,72,ZINC13246,0,3,1
...,...,...,...,...,...,...,...,...,...,...,...
3980,3981,"N-Acetyl-2,3-dihydro-1H-pyrrole",CC(=O)N1CCC=C1,23105-58-0,111.14,C6H9NO,10909522,ZINC13480247,0,1,1
3981,3982,"2,4-Heptadienal",CCC=CC=CC=O,4313-03--5,110.15,C7H10O,52833210,ZINC2567938,0,7,1
3982,3983,"1,3-Pentadiene",CC=CC=C,504-60-9,68.12,C5H8,62204,ZINC1699364,0,2,1
3983,3984,"trans-2,3-Epoxydecanal",CCCCCCCC1C(O1)C=O,CAS-5,170.25,C10H18O2,6429290,ZINC100076041,0,1,1


Odorless Molecules


Unnamed: 0,Sr.No.,Name,SMILES,CAS No,Mol. Wt.,Mol. Formula,PubChem,ZINC,Odorant
0,1,"2,3-Dihydroxybenzoic acid",C1=CC(=C(C(=C1)O)O)C(=O)O,303-38-8,154.12,C7H6O4,19,ZINC388166,0
1,2,Carbon dioxide,C(=O)=O,124-38-9,44.01,CO2,280,,0
2,3,Carbastat,C[N+](C)(C)CCOC(=O)N,462-58-8,147.20,C6H15N2O2+,2551,ZINC3079342,0
3,4,Z-Dimethomorph,COC1=C(C=C(C=C1)/C(=CC(=O)N2CCOCC2)/C3=CC=C(C=...,110488-70-5,324.42,C21H22ClNO4,5463781,ZINC26891906,0
4,5,Nepetalic acid,C[C@H]1CC[C@@H]([C@@H]1C(=O)O)[C@@H](C)C=O,524-06-1,387.86,C10H16O3,5486616,ZINC6037654,0
...,...,...,...,...,...,...,...,...,...
1119,1120,Pimelic acid,C(CCC(=O)O)CCC(=O)O,111-16-0,160.17,C7H12O4,385,,0
1120,1121,Malic acid,C(C(C(=O)O)O)C(=O)O,6915-15-7,134.09,C4H6O5,525,,0
1121,1122,DL-Alanine,CC(C(=O)O)N,302-72-7,89.09,C3H7NO2,602,,0
1122,1123,Nonadecane,CCCCCCCCCCCCCCCCCCC,629-92-5,268.52,C19H40,12401,,0


### Combine the datasets and select relevant columns

In [2]:
all_data = pd.concat([odorants,odorless],axis=0)[["Name","SMILES","Odorant"]]
all_data

Unnamed: 0,Name,SMILES,Odorant
0,1-Aminopropan-2-ol,CC(CN)O,1
1,3-methyl-2-oxobutanoic acid,CC(C)C(=O)C(=O)O,1
2,2-Oxobutyric acid,CCC(=O)C(=O)O,1
3,4-methyl-2-oxopentanoic acid,CC(C)CC(=O)C(=O)O,1
4,"3,4-dihydroxybenzoic acid",C1=CC(=C(C=C1C(=O)O)O)O,1
...,...,...,...
1119,Pimelic acid,C(CCC(=O)O)CCC(=O)O,0
1120,Malic acid,C(C(C(=O)O)O)C(=O)O,0
1121,DL-Alanine,CC(C(=O)O)N,0
1122,Nonadecane,CCCCCCCCCCCCCCCCCCC,0


### Generate MFPs
I reuse the same utility across many projects, but perhaps there is a more standard library or https://github.com/osmoai/osmordred may be quite powerful.

In [3]:
import fingerprint

mfpgen = fingerprint.make_mfpgen()
all_data["MFP"] = all_data["SMILES"].apply(lambda smiles: fingerprint.smiles_to_embed(mfpgen, smiles))
# Drop molecules we couldn't generate fingerprints for
all_data = all_data.dropna(how="any")
print(f"Found {len(all_data)} valid molecules.")
all_data.iloc[0]

Found 5097 valid molecules.


Name                                      1-Aminopropan-2-ol
SMILES                                               CC(CN)O
Odorant                                                    1
MFP        [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: 0, dtype: object

### Split data into train/test

In [4]:
import sklearn
import sklearn.model_selection

# Stratify splits based on Odorant label
train_data, test_data = sklearn.model_selection.train_test_split(all_data,stratify=all_data["Odorant"])
print(f"Split with {len(train_data)} Train and {len(test_data)} Test Molecules")
print(f"Stratification: Train = {train_data['Odorant'].mean():.3f} Odorant & Test = {test_data['Odorant'].mean():.3f} Odorant")

Split with 3822 Train and 1275 Test Molecules
Stratification: Train = 0.780 Odorant & Test = 0.780 Odorant


### Train Random Forest
We could do further tuning to improve score or explore class imbalance ratio scoring

In [5]:
import sklearn.ensemble
import numpy as np

train_X, test_X = np.stack(train_data["MFP"].tolist()), np.stack(test_data["MFP"].tolist())
train_y, test_y = train_data["Odorant"].to_numpy(), test_data["Odorant"].to_numpy()

clf = sklearn.ensemble.RandomForestClassifier()
clf.fit(train_X, train_y)
print(f"Test set accuracy: {clf.score(test_X, test_y):.3f}")

Test set accuracy: 0.946


### Write to file

In [6]:
import pickle

model_fname = "odorant_classifier.pkl"

with open(model_fname,"wb") as f:
    pickle.dump(clf,f)

### Read from file and sanity-check accuracy

In [7]:
with open(model_fname,"rb") as f:
    clf = pickle.load(f)

print(f"Test set accuracy: {clf.score(test_X, test_y):.3f}")

Test set accuracy: 0.946
