In [1]:
import os
# Step one directory up so that the relative data path ("dataset/...") and the
# project-local imports (analysis.*, graph.*) used below resolve.
# Presumably the notebook file lives in a sub-folder of the repository root - confirm.
# NOTE: not idempotent - re-running this cell climbs one more level each time.
os.chdir("..")
os.getcwd()

'/Users/laurasisson/odor-pair'

In [2]:
import json
# Read the stored split and display how many entries each top-level key holds.
with open("dataset/single_fold.json") as fold_file:
    split_data = json.load(fold_file)
dict((name, len(entries)) for name, entries in split_data.items())

{'train': 43992, 'test': 39554, 'covered_notes': 77}

In [3]:
import analysis.fingerprint

# Create the generator once; it is passed to smiles_to_embed for every molecule below.
# Presumably a Morgan-fingerprint generator (see the "MFPs" headings) - confirm in analysis.fingerprint.
mfpgen = analysis.fingerprint.make_mfpgen()

In [4]:
# Distinct molecules that appear in at least one pair of each split.
train_mols = {mol for pair in split_data["train"] for mol in pair["edge"]}
test_mols = {mol for pair in split_data["test"] for mol in pair["edge"]}

len(train_mols), len(test_mols)

(1471, 1468)

In [5]:
# No molecule may occur in both splits.
assert train_mols.isdisjoint(test_mols)

In [6]:
from tqdm.notebook import tqdm

# Embed every molecule from either split. A molecule whose conversion raises is
# printed and left out of the mapping.
mol_to_embed = {}
for smiles in tqdm(train_mols | test_mols):
    try:
        embedding = analysis.fingerprint.smiles_to_embed(mfpgen, smiles)
    except Exception:
        print(smiles)
    else:
        mol_to_embed[smiles] = embedding

# Peek at one (molecule, embedding) entry.
next(iter(mol_to_embed.items()))

  0%|          | 0/2939 [00:00<?, ?it/s]

(C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC
CC(=O)c1ccc(C)n1
InChI=1/C7H8S/c1-6-4-2-3-5-7(6)8/h2-5,8H,1H3


[10:42:16] SMILES Parse Error: syntax error while parsing: (C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC
[10:42:16] SMILES Parse Error: Failed parsing SMILES '(C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC' for input: '(C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC'
[10:42:16] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 8
[10:42:16] SMILES Parse Error: syntax error while parsing: InChI=1/C7H8S/c1-6-4-2-3-5-7(6)8/h2-5,8H,1H3
[10:42:16] SMILES Parse Error: Failed parsing SMILES 'InChI=1/C7H8S/c1-6-4-2-3-5-7(6)8/h2-5,8H,1H3' for input: 'InChI=1/C7H8S/c1-6-4-2-3-5-7(6)8/h2-5,8H,1H3'


('C[C@@H](CCC=C(C)C)C1CCC(=C)C=C1',
 array([0, 1, 0, ..., 0, 0, 0], dtype=uint8))

In [7]:
# Label vocabulary for the multi-hot targets. The position of a note in this
# list is the column index of that label, so the order must be stable:
# list(set_of_str) changes order between interpreter sessions (string hashes are
# randomized per process), which is why the notes are sorted here.
all_blend_notes = sorted({
    'oily', 'ethereal', 'fermented', 'bitter', 'soapy', 'phenolic', 'winey', 'roasted',
    'spicy', 'fusel', 'tropical', 'anise', 'honey', 'aromatic', 'meaty', 'fresh',
    'woody', 'melon', 'mentholic', 'clean', 'camphoreous', 'nutty', 'herbal', 'jammy',
    'earthy', 'vegetable', 'caramellic', 'coconut', 'orris', 'bready', 'citrus', 'chemical',
    'burnt', 'dairy', 'cheesy', 'fatty', 'floral', 'fruity', 'green', 'marine',
    'coumarinic', 'licorice', 'mossy', 'tonka', 'creamy', 'waxy', 'animal', 'acidic',
    'brown', 'cocoa', 'chocolate', 'sweet', 'rummy', 'sour', 'balsamic', 'coffee',
    'solvent', 'fungal', 'berry', 'amber', 'cooling', 'onion', 'buttery', 'estery',
    'powdery', 'musk', 'aldehydic', 'medicinal', 'alliaceous', 'minty', 'vanilla', 'thujonic',
    'sulfurous', 'musty',
})

In [8]:
import numpy as np
import graph.utils

def make_notes_vectors(dataset,should_concat=True):
    """Turn blend records into a feature matrix and a label matrix.

    Reads the notebook-level ``mol_to_embed`` and ``all_blend_notes``.

    Parameters
    ----------
    dataset : iterable of dict
        Records with an ``"edge"`` entry (exactly two molecules) and a
        ``"blend_notes"`` entry.
    should_concat : bool, default True
        If True the two molecule embeddings are concatenated; otherwise they
        are added element-wise.

    Returns
    -------
    tuple of numpy.ndarray
        ``(xs, ys)``: the kept feature vectors and their label vectors, stacked
        in dataset order.

    Notes
    -----
    Two kinds of records are dropped, and both counts are printed:
    records whose feature or label vector sums to zero ("empty"), and records
    where at least one molecule has no entry in ``mol_to_embed``.
    """
    xs = []
    ys = []
    empty = 0
    missing = 0
    for d in tqdm(dataset):
        blnd = graph.utils.canonize(d["blend_notes"])

        mol1, mol2 = d["edge"]
        if mol1 not in mol_to_embed or mol2 not in mol_to_embed:
            # At least one molecule could not be embedded earlier; count the
            # drop so the final row count can be reconciled with the input.
            missing += 1
            continue

        if should_concat:
            x = np.concatenate([mol_to_embed[mol1],mol_to_embed[mol2]])
        else:
            x = mol_to_embed[mol1] + mol_to_embed[mol2]
        # Presumably a 0/1 vector aligned with all_blend_notes - confirm in graph.utils.
        y = graph.utils.multi_hot(blnd,all_blend_notes)

        if x.sum() == 0 or y.sum() == 0:
            empty += 1
            continue

        xs.append(x)
        ys.append(y)

    print(f"Found {empty} empty blends.")
    if missing:
        print(f"Skipped {missing} blends with a molecule that has no embedding.")
    return np.array(xs), np.array(ys)

# Feature/label matrices for both splits, using concatenated embeddings.
train_x, train_y = make_notes_vectors(split_data["train"], should_concat=True)
test_x, test_y = make_notes_vectors(split_data["test"], should_concat=True)
tuple(matrix.shape for matrix in (train_x, test_x, train_y, test_y))

  0%|          | 0/43992 [00:00<?, ?it/s]

Found 262 empty blends.


  0%|          | 0/39554 [00:00<?, ?it/s]

Found 286 empty blends.


((43721, 4096), (39237, 4096), (43721, 74), (39237, 74))

### Using concatenation of MFPs

In [9]:
def checksum(vector):
    """Return whether a 2-D label matrix has no all-zero column and no all-zero row.

    Column sums are checked first (every label occurs in some sample), then row
    sums (every sample carries some label).
    """
    label_totals = vector.sum(axis=0)
    all_labels_used = (label_totals > 0).all()
    return all_labels_used and (vector.sum(axis=1) > 0).all()
# Sanity check on both label matrices before any model is fitted: every label
# column and every sample row must have a positive sum.
assert checksum(train_y)
assert checksum(test_y)
"Data looks good!"

'Data looks good!'

In [None]:
import sklearn.metrics
import sklearn.ensemble
import sklearn.linear_model
import random

def train_evaluate_model_clf(base_classifier, fraction=1, seed=None):
    """Fit one binary classifier per label and print the macro AUROC on the test set.

    Reads the notebook-level ``train_x``, ``train_y``, ``test_x``, ``test_y``
    and ``all_blend_notes``.

    Parameters
    ----------
    base_classifier : estimator
        Unfitted classifier exposing ``fit`` and ``predict_proba``. It is cloned
        once per label column, so the instance passed in is never fitted.
    fraction : float, default 1
        Share of the training rows, drawn without replacement, used for fitting.
    seed : int or None, default None
        Seed for the row subsample. ``None`` keeps the previous behavior of
        drawing from the global ``random`` state.

    Returns
    -------
    None
        The result is printed as ``<fraction> macro AUROC: <value>``.

    Notes
    -----
    Exceptions raised by the estimator or by ``roc_auc_score`` are not caught
    here.
    """
    n_train = len(train_x)
    rng = random if seed is None else random.Random(seed)
    frac_idcs = rng.sample(range(n_train), int(fraction * n_train))
    frac_x, frac_y = train_x[frac_idcs], train_y[frac_idcs]

    # One independent classifier per label column (fitted in a manual loop so a
    # progress bar can be attached; the bar is currently disabled).
    estimators = []
    for i in tqdm(range(train_y.shape[1]), desc="Training classifiers for each label",disable=True):
        classifier = sklearn.base.clone(base_classifier)
        estimators.append(classifier.fit(frac_x, frac_y[:, i]))

    # Column 1 of predict_proba is the probability of the positive class.
    test_pred = np.zeros_like(test_y, dtype=float)
    for i in range(test_y.shape[1]):
        test_pred[:, i] = estimators[i].predict_proba(test_x)[:,1]

    auroc = sklearn.metrics.roc_auc_score(test_y, test_pred, average='macro')
    # Report the fraction this call was given, not a same-named notebook global.
    print(fraction, 'macro', "AUROC:", auroc)

    # Per-label scores; currently unused because the return below is disabled.
    auroc_per_label = {}
    for i, label in enumerate(all_blend_notes):
        auroc_per_label[label] = sklearn.metrics.roc_auc_score(test_y[:, i], test_pred[:, i])

    # return auroc_per_label
    
# Sweep the training-set fraction over ten evenly spaced values from 0.1 to 1.
for frac in tqdm(np.linspace(.1, 1, 10),smoothing=0):
    try:
        train_evaluate_model_clf(sklearn.linear_model.LogisticRegression(max_iter=1000),frac)
    except ValueError as e:
        # Keep the sweep going, but say which fraction failed and why, so a
        # missing line in the results is not mistaken for a run that never happened.
        print(frac, "skipped:", e)

  0%|          | 0/10 [00:00<?, ?it/s]

0.2 macro AUROC: 0.7603564314714192
0.30000000000000004 macro AUROC: 0.7635943275822386
0.4 macro AUROC: 0.7613288908796166
0.5 macro AUROC: 0.754887285791242


In [11]:
import torch
import numpy as np
import scipy
import sklearn.linear_model
import sklearn.metrics

class LogitRegression:
    """Wrap a regressor so it is trained on logit-transformed targets.

    ``fit`` clips the targets into ``[EPS, 1 - EPS]``, maps them through the
    logit and fits the wrapped model on the result; ``predict`` maps the wrapped
    model's output back through the logistic sigmoid.
    """

    # Margin that keeps targets away from exactly 0 and 1, where the logit is infinite.
    EPS = 1e-5

    def __init__(self, model):
        self.model = model

    def _clip01(self, arr):
        """Return ``arr`` as an array limited to ``[EPS, 1 - EPS]``."""
        lower, upper = self.EPS, 1 - self.EPS
        return np.clip(np.asarray(arr), lower, upper)

    def fit(self, x, p):
        """Fit the wrapped model on ``logit(clip(p))``; returns what its ``fit`` returns."""
        logits = scipy.special.logit(self._clip01(p))
        return self.model.fit(x, logits)

    def predict(self, x):
        """Return the sigmoid of the wrapped model's prediction for ``x``."""
        return scipy.special.expit(self.model.predict(x))

def train_evaluate_model_logit(base_classifier):
    """Fit a logit-target regressor on all labels at once and print its macro AUROC.

    ``base_classifier`` is wrapped in ``LogitRegression``, fitted on the
    notebook-level ``train_x``/``train_y`` and scored on ``test_x``/``test_y``.
    Nothing is returned.
    """
    print("Training the model...")
    regressor = LogitRegression(base_classifier)
    regressor.fit(train_x, train_y)

    print("Predicting probabilities for AUROC calculation...")
    test_pred = regressor.predict(test_x)

    # Both averages are computed (micro first); only the macro value is printed.
    auc_by_average = {
        average: sklearn.metrics.roc_auc_score(test_y, test_pred, average=average)
        for average in ('micro', 'macro')
    }
    print("Sklearn Macro-averaged AUROC:", auc_by_average['macro'])

    # Per-label scores; unused while the return below stays disabled.
    auroc_per_label = {
        label: sklearn.metrics.roc_auc_score(test_y[:, col], test_pred[:, col])
        for col, label in enumerate(all_blend_notes)
    }

    # return auroc_per_label
    
# Baseline: ordinary least squares fitted on logit-transformed labels.
# NOTE(review): the recorded macro AUROC for this run (~0.49) is at chance level - worth investigating.
train_evaluate_model_logit(sklearn.linear_model.LinearRegression())

Training the model...
Predicting probabilities for AUROC calculation...
Sklearn Macro-averaged AUROC: 0.49064521690036866


In [12]:
# Rebuild the matrices with the two embeddings added element-wise instead of concatenated.
# NOTE: this overwrites train_x/train_y/test_x/test_y, which the evaluation functions
# read as globals, so every evaluation run after this cell uses the summed features.
train_x, train_y = make_notes_vectors(split_data["train"],should_concat=False)
test_x, test_y = make_notes_vectors(split_data["test"],should_concat=False)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)
# Same sanity check as for the concatenated features.
assert checksum(train_y)
assert checksum(test_y)
"Data looks good!"

  0%|          | 0/43992 [00:00<?, ?it/s]

Found 262 empty blends.


  0%|          | 0/39554 [00:00<?, ?it/s]

Found 286 empty blends.
(43721, 2048) (39237, 2048) (43721, 74) (39237, 74)


'Data looks good!'

In [13]:
# Run both baselines again; at this point the global matrices hold the summed features.
print("Logistic Regression")
train_evaluate_model_clf(sklearn.linear_model.LogisticRegression(max_iter=1000))
print("Linear Regression")
train_evaluate_model_logit(sklearn.linear_model.LinearRegression())

Logistic Regression
Training the model...


Training classifiers for each label:   0%|          | 0/74 [00:00<?, ?it/s]

Predicting probabilities for AUROC calculation...


Predicting probabilities:   0%|          | 0/74 [00:00<?, ?it/s]

macro AUROC: 0.7392836214911928
Linear Regression
Training the model...
Predicting probabilities for AUROC calculation...
Sklearn Macro-averaged AUROC: 0.49414529054634415


In [14]:
# import sklearn.ensemble

# print("Random Forest")
# train_evaluate_model(sklearn.ensemble.RandomForestClassifier())

In [15]:
# import sklearn.svm
# import sklearn.multioutput

# print("SVM")
# train_evaluate_model(sklearn.svm.SVC(probability=True))

### Using summation of MFPs

In [16]:
# train_x, train_y = make_notes_vectors(split_data["train"],split_data["covered_notes"],should_concat=False)
# test_x, test_y = make_notes_vectors(split_data["test"],split_data["covered_notes"],should_concat=False)
# print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)
# assert checksum(train_y)
# assert checksum(test_y)
# "Data looks good!"

In [17]:
# print("Logistic Regression")
# train_evaluate_model(sklearn.linear_model.LinearRegression())
# print()
# print("Random Forest")
# train_evaluate_model(sklearn.ensemble.RandomForestClassifier())
# print()
# print("SVM")
# train_evaluate_model(sklearn.svm.SVC(probability=True))