In [None]:
import os
os.chdir("..")

In [2]:
import json
import torch
import tqdm
import torchmetrics
import graph.utils
import single.utils

import numpy as np
import analysis.fingerprint
import sklearn
import sklearn.model_selection
import warnings

with open("dataset/full.json") as f:
    full_data = json.load(f)

all_blend_notes = set()
all_single_notes = set()
for d in full_data:
    all_blend_notes.update(d["blend_notes"])
    all_single_notes.update(d["mol1_notes"])
    all_single_notes.update(d["mol2_notes"])

# Convert to list so indexing is faster.
print(f"Found {len(all_blend_notes)} notes in blends.")
all_blend_notes = list(graph.utils.canonize(all_blend_notes))
print(f"Canonized down to {len(all_blend_notes)} notes in blends.")
print()
print(f"Found {len(all_single_notes)} notes for single molecules.")
print("Not canonizing single notes")

Found 109 notes in blends.
Canonized down to 77 notes in blends.

Found 496 notes for single molecules.
Not canonizing single notes


In [3]:
from InstructorEmbedding import INSTRUCTOR
model = INSTRUCTOR('hkunlp/instructor-large')
instruction = "Represent the Odor descriptor for canonicalization:"
blend_embeddings = model.encode([[instruction, n] for n in all_blend_notes])
single_embeddings = model.encode([[instruction, n] for n in all_single_notes])

load INSTRUCTOR_Transformer
max_seq_length  512


In [4]:
import collections
import scipy
sims = dict()
for i, note in enumerate(all_single_notes):
    sims[note] = collections.Counter()
    for j, other in enumerate(all_blend_notes):
        sims[note][other] = (1-scipy.spatial.distance.cosine(single_embeddings[i],blend_embeddings[j]))

In [5]:
canonize = {n:cntr.most_common(1)[0][0] for n,cntr in sims.items()}
canonize["pine"]

'pine'

In [6]:
def canonize_notes(notes):
    return [canonize[n] for n in notes]

In [7]:
ys = []
xs = []
empty = 0
for d in tqdm.tqdm(full_data):
    blnd = graph.utils.canonize(d["blend_notes"])
    if len(blnd) == 0:
        empty+=1
        continue
    ys.append(graph.utils.multi_hot(blnd))
    assert ys[-1].sum() == len(blnd)

    n1 = canonize_notes(d["mol1_notes"])
    n2 = canonize_notes(d["mol2_notes"])
    xs.append(torch.concat([graph.utils.multi_hot(n1),graph.utils.multi_hot(n2)]))
    assert xs[-1].sum() == len(n1+n2)

print(empty)
ys = torch.stack(ys).int()
xs = torch.stack(xs)

100%|█████████████████████████████████████| 166814/166814 [00:11<00:00, 14672.30it/s]


981


In [8]:
import sklearn.linear_model
class LogitRegression(sklearn.linear_model.RidgeCV):
    EPS = 1e-5

    def __init__(self,**kwargs):
        super().__init__(**kwargs)

    def _clip01(self,arr):
        return np.asarray(arr).clip(self.EPS,1-self.EPS)

    def fit(self, x, p):
        p = self._clip01(p)
        y = scipy.special.logit(p)
        return super().fit(x, y)
    
    def _is_multitask(self):
        return True

    def predict(self, x):
        y = super().predict(x)
        return scipy.special.expit(y)

In [10]:
warnings.filterwarnings("ignore", ".*samples in target*")
auroc = torchmetrics.classification.MultilabelAUROC(ys.shape[1],average=None)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(xs, ys)
print(f"Train:{X_train.shape}->{y_train.shape}")
print(f"Test:{X_test.shape}->{y_test.shape}")
# Try going bak to RidgeCV with alpha=alphas=[1e-3, 1e-2, 1e-1, 1]
lgr = LogitRegression().fit(X_train,y_train)
test_pred = torch.from_numpy(lgr.predict(X_test))
scores = auroc(test_pred,y_test).numpy()
print("AUROC for model from blend -> blend:",np.mean(scores))
idcs = np.flip(np.argsort(scores))
print(list(zip(np.array(all_blend_notes)[idcs],scores[idcs])))

Train:torch.Size([124374, 154])->torch.Size([124374, 77])
Test:torch.Size([41459, 154])->torch.Size([41459, 77])
AUROC for model from blend -> blend: 0.96860343
[('thujonic', 0.9994554), ('mossy', 0.9993726), ('fresh', 0.99932647), ('sour', 0.9983013), ('solvent', 0.99821025), ('mustard', 0.9977543), ('licorice', 0.9977073), ('fusel', 0.9972534), ('cheesy', 0.99660116), ('coumarinic', 0.99656326), ('tonka', 0.9958565), ('mentholic', 0.9955012), ('clean', 0.99515915), ('marine', 0.9950988), ('acidic', 0.9947129), ('orris', 0.994557), ('vanilla', 0.99437743), ('animal', 0.9942964), ('chocolate', 0.99420017), ('coconut', 0.99377936), ('musk', 0.99372214), ('rummy', 0.9936969), ('jammy', 0.9935465), ('roasted', 0.99275625), ('anise', 0.9922807), ('alliaceous', 0.9920552), ('meaty', 0.99185413), ('alcoholic', 0.99171835), ('cocoa', 0.9909704), ('amber', 0.9879905), ('caramellic', 0.98621315), ('honey', 0.9851037), ('berry', 0.98485374), ('buttery', 0.9820218), ('camphoreous', 0.98147976), (

In [12]:
import math
print(f"To have alpha>.9, we can do blends of size {math.log(.9,np.mean(scores)):.2f}.")

To have alpha>.9, we can do blends of size 3.30.


In [13]:
note_translator = LogitRegression().fit(single_embeddings.T,blend_embeddings.T)

In [14]:
ys = []
n1s = []
n2s = []
empty = 0
for d in tqdm.tqdm(full_data):
    blnd = graph.utils.canonize(d["blend_notes"])
    if len(blnd) == 0:
        empty+=1
        continue
    ys.append(graph.utils.multi_hot(blnd))
    assert ys[-1].sum() == len(blnd)

    n1s.append(graph.utils.multi_hot(d["mol1_notes"],underyling_list=list(all_single_notes)))
    n2s.append(graph.utils.multi_hot(d["mol2_notes"],underyling_list=list(all_single_notes)))

n1s = note_translator.predict(n1s)
n2s = note_translator.predict(n2s)
ys = torch.stack(ys).int()
xs = np.concatenate([n1s,n2s],axis=1)

100%|██████████████████████████████████████| 166814/166814 [00:23<00:00, 7129.51it/s]


In [15]:
warnings.filterwarnings("ignore", ".*samples in target*")
auroc = torchmetrics.classification.MultilabelAUROC(ys.shape[1],average=None)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(xs, ys)
print(f"Train:{X_train.shape}->{y_train.shape}")
print(f"Test:{X_test.shape}->{y_test.shape}")
# Try going bak to RidgeCV with alpha=alphas=[1e-3, 1e-2, 1e-1, 1]
lgr = LogitRegression().fit(X_train,y_train)
test_pred = torch.from_numpy(lgr.predict(X_test))
scores = auroc(test_pred,y_test).numpy()
print("AUROC for model from blend -> blend:",np.mean(scores))
idcs = np.flip(np.argsort(scores))
print(list(zip(np.array(all_blend_notes)[idcs],scores[idcs])))

Train:(124374, 154)->torch.Size([124374, 77])
Test:(41459, 154)->torch.Size([41459, 77])
AUROC for model from blend -> blend: 0.9641541
[('fresh', 0.99982446), ('fusel', 0.9998221), ('alcoholic', 0.99954265), ('sour', 0.9995146), ('thujonic', 0.9992993), ('pine', 0.9990592), ('solvent', 0.9982702), ('acidic', 0.9982362), ('mentholic', 0.99767286), ('orris', 0.9975846), ('meaty', 0.99745953), ('vanilla', 0.9971478), ('coumarinic', 0.99686146), ('cheesy', 0.99678963), ('tonka', 0.9955317), ('roasted', 0.99455297), ('camphoreous', 0.99433887), ('jammy', 0.9939801), ('clean', 0.9936921), ('mustard', 0.99341905), ('animal', 0.9933197), ('musk', 0.9922268), ('coconut', 0.9910068), ('mossy', 0.98904395), ('buttery', 0.98865634), ('phenolic', 0.9872745), ('dairy', 0.98716354), ('sulfurous', 0.98709965), ('fungal', 0.9869147), ('vegetable', 0.98689055), ('amber', 0.9854644), ('chocolate', 0.98477143), ('anise', 0.9833919), ('coffee', 0.9831932), ('onion', 0.98213166), ('cocoa', 0.9813953), ('ru

In [40]:
import analysis.fingerprint
mfp = analysis.fingerprint.make_mfpgen()

ys = []
n1s = []
fp1s = []
n2s = []
fp2s = []
empty = 0
for d in tqdm.tqdm(full_data):
    try:
        blnd = graph.utils.canonize(d["blend_notes"])
        if len(blnd) == 0:
            empty+=1
            continue

        fp1 = analysis.fingerprint.smiles_to_embed(mfp,d["mol1"])
        mt1 = graph.utils.multi_hot(d["mol1_notes"],underyling_list=list(all_single_notes))
        
        fp2 = analysis.fingerprint.smiles_to_embed(mfp,d["mol2"])
        mt2 = graph.utils.multi_hot(d["mol2_notes"],underyling_list=list(all_single_notes))
    except TypeError:
        continue
        
    ys.append(graph.utils.multi_hot(blnd))
    assert ys[-1].sum() == len(blnd)
    n1s.append(mt1)
    n2s.append(mt2)
    fp1s.append(fp1)
    fp2s.append(fp2)

n1s = note_translator.predict(n1s)
n2s = note_translator.predict(n2s)
ys = torch.stack(ys).int()
xs = np.concatenate([n1s,n2s],axis=1)

100%|██████████████████████████████████████| 166814/166814 [00:39<00:00, 4274.72it/s]


In [41]:
warnings.filterwarnings("ignore", ".*samples in target*")
auroc = torchmetrics.classification.MultilabelAUROC(ys.shape[1],average=None)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(xs, ys)
print(f"Train:{X_train.shape}->{y_train.shape}")
print(f"Test:{X_test.shape}->{y_test.shape}")
# Try going bak to RidgeCV with alpha=alphas=[1e-3, 1e-2, 1e-1, 1]
lgr = LogitRegression().fit(X_train,y_train)
test_pred = torch.from_numpy(lgr.predict(X_test))
scores = auroc(test_pred,y_test).numpy()
print("AUROC for model from blend -> blend:",np.mean(scores))
idcs = np.flip(np.argsort(scores))
print(list(zip(np.array(all_blend_notes)[idcs],scores[idcs])))

Train:(124316, 154)->torch.Size([124316, 77])
Test:(41439, 154)->torch.Size([41439, 77])
AUROC for model from blend -> blend: 0.9638883
[('thujonic', 0.9998699), ('fusel', 0.99958193), ('licorice', 0.99910927), ('solvent', 0.9988805), ('clean', 0.99861366), ('pine', 0.9985922), ('mustard', 0.99843836), ('meaty', 0.99795306), ('acidic', 0.9975142), ('cheesy', 0.99748904), ('sour', 0.99734473), ('vanilla', 0.9973091), ('alcoholic', 0.9957749), ('tonka', 0.99574125), ('mentholic', 0.9955141), ('roasted', 0.9941913), ('camphoreous', 0.9937157), ('coconut', 0.9935384), ('orris', 0.9926795), ('coumarinic', 0.99248725), ('mossy', 0.99228007), ('jammy', 0.9920994), ('cocoa', 0.991244), ('fresh', 0.99088013), ('phenolic', 0.990392), ('animal', 0.98994017), ('musk', 0.9890408), ('coffee', 0.98681813), ('marine', 0.98646426), ('sulfurous', 0.9859447), ('rummy', 0.985246), ('anise', 0.9844454), ('buttery', 0.984146), ('amber', 0.98326427), ('vegetable', 0.98245746), ('chocolate', 0.9821696), ('oni