In [1]:
import os
# Later cells open files through relative paths such as "dataset/full.json",
# so the working directory has to be the one that contains "dataset".
# The guard keeps this cell idempotent: an unconditional os.chdir("..") climbs
# one more level every time the cell is re-run and breaks those paths.
# NOTE(review): assumes the notebook's own directory has no "dataset" folder.
if not os.path.isdir("dataset"):
    os.chdir("..")
os.getcwd()

'/Users/laurasisson/odor-pair'

In [2]:
import json
# Parse the complete pair dataset, then display its size and first record.
with open("dataset/full.json") as handle:
    full_data = json.load(handle)
(len(full_data), full_data[0])

(166814,
 {'mol1': 'CCCCC/C=C/C(=O)OC',
  'mol1_notes': ['violet',
   'sweet',
   'oily',
   'melon',
   'pear',
   'hairy',
   'costus',
   'fruity',
   'violet leaf',
   'waxy',
   'fresh',
   'green'],
  'mol2': 'CCCCCOC(=O)CCC',
  'mol2_notes': ['cherry',
   'sweet',
   'pineapple',
   'fruity',
   'banana',
   'tropical'],
  'blend_notes': ['animal', 'fruity', 'waxy']})

In [3]:
from tqdm.notebook import tqdm
# Lookup from molecule string to that molecule's own note list.
# A molecule that occurs in several pairs keeps the list from the last pair seen.
mol_to_notes = {}
for pair in full_data:
    for mol_key, notes_key in (("mol1", "mol1_notes"), ("mol2", "mol2_notes")):
        mol_to_notes[pair[mol_key]] = pair[notes_key]
len(mol_to_notes)

2971

In [4]:
import single.utils
import graph.utils

# Collect every raw note label attached to either molecule of any pair.
raw_single_notes = set()
for d in full_data:
    raw_single_notes.update(d["mol1_notes"])
    raw_single_notes.update(d["mol2_notes"])

# Record the raw vocabulary size before canonicalizing, so the "before" figure
# cannot be affected by whatever canonize does with its argument.
raw_single_note_count = len(raw_single_notes)
all_single_notes = list(single.utils.canonize(raw_single_notes))
# The "Before" count previously printed len(all_single_notes), i.e. the
# canonicalized size, so both numbers were always identical.
f"Before Canonicalization: |Single Notes| = {raw_single_note_count}. After Canonicalization: |Single Notes| = {len(all_single_notes)}."

'Before Canonicalization: |Single Notes| = 398. After Canonicalization: |Single Notes| = 398.'

In [5]:
import json
# Parse the pre-computed split file and display how many entries each key holds.
with open("dataset/single_fold.json") as handle:
    split_data = json.load(handle)
{name: len(entries) for name, entries in split_data.items()}

{'train': 43992, 'test': 39554, 'covered_notes': 77}

In [6]:
# Display the first training record to inspect the layout of a split entry.
split_data["train"][0]

{'edge': ['CCCCCCCCCCCC(OC)OC', 'CCC/C=C\\CO'], 'blend_notes': ['green']}

In [7]:
def get_seen_single_notes(dataset):
    """Return the canonicalized single-molecule notes reachable from ``dataset``.

    Args:
        dataset: iterable of records whose ``"edge"`` entry unpacks into exactly
            two molecule keys, both present in the module-level ``mol_to_notes``.

    Returns:
        A ``set`` built from ``single.utils.canonize`` applied to the union of
        the raw notes of every molecule on every edge.
    """
    raw_notes = set()
    for record in dataset:
        first, second = record["edge"]
        raw_notes |= set(mol_to_notes[first])
        raw_notes |= set(mol_to_notes[second])
    canonical = single.utils.canonize(raw_notes)
    return set(canonical)
train_singles = get_seen_single_notes(split_data["train"])
test_singles = get_seen_single_notes(split_data["test"])
# common_singles defines the input feature columns. Sorting pins the column
# order; list(set) would follow set iteration order, which for strings can
# change from one kernel start to the next.
# NOTE(review): assumes canonicalized notes are mutually comparable (strings).
common_singles = sorted(train_singles & test_singles)
len(train_singles), len(test_singles), len(common_singles)

(361, 369, 332)

In [10]:
# Blend-note label vocabulary, written as a set literal.
all_blend_notes = {'oily', 'ethereal', 'fermented', 'bitter', 'soapy', 'phenolic', 'winey', 'roasted', 'spicy', 'fusel', 'tropical', 'anise', 'honey', 'aromatic', 'meaty', 'fresh', 'woody', 'melon', 'mentholic', 'clean', 'camphoreous', 'nutty', 'herbal', 'jammy', 'earthy', 'vegetable', 'caramellic', 'coconut', 'orris', 'bready', 'citrus', 'chemical', 'burnt', 'dairy', 'cheesy', 'fatty', 'floral', 'fruity', 'green', 'marine', 'coumarinic', 'licorice', 'mossy', 'tonka', 'creamy', 'waxy', 'animal', 'acidic', 'brown', 'cocoa', 'chocolate', 'sweet', 'rummy', 'sour', 'balsamic', 'coffee', 'solvent', 'fungal', 'berry', 'amber', 'cooling', 'onion', 'buttery', 'estery', 'powdery', 'musk', 'aldehydic', 'medicinal', 'alliaceous', 'minty', 'vanilla', 'thujonic', 'sulfurous', 'musty'}
# This list fixes the label column order. Sort it instead of relying on set
# iteration order, which for strings can differ between interpreter sessions.
all_blend_notes = sorted(all_blend_notes)

In [11]:
import numpy as np

def make_notes_vectors(dataset):
    """Encode each record of ``dataset`` as an (input, label) vector pair.

    For every record the input is the sum of the two molecules'
    ``graph.utils.multi_hot`` encodings over ``common_singles``, and the label
    is the ``multi_hot`` encoding of the canonicalized blend notes over
    ``all_blend_notes``. Records whose input or label sums to zero are dropped
    and counted; the count is printed once the loop finishes.

    Args:
        dataset: iterable of records with an ``"edge"`` pair of molecule keys
            (looked up in ``mol_to_notes``) and a ``"blend_notes"`` entry.

    Returns:
        ``(np.array(xs), np.array(ys))`` holding the kept inputs and labels in
        dataset order.
    """
    # NOTE(review): multi_hot is assumed to return a numeric array that
    # supports "+" and ".sum()" - confirm in graph.utils.
    xs, ys = [], []
    empty = 0
    for record in tqdm(dataset):
        blend = graph.utils.canonize(record["blend_notes"])

        first, second = record["edge"]
        first_notes = set(single.utils.canonize(mol_to_notes[first]))
        second_notes = set(single.utils.canonize(mol_to_notes[second]))

        first_vec = graph.utils.multi_hot(first_notes, common_singles)
        second_vec = graph.utils.multi_hot(second_notes, common_singles)
        x = first_vec + second_vec
        y = graph.utils.multi_hot(blend, all_blend_notes)

        # Keep a pair only when both its input and its label are non-empty.
        if x.sum() != 0 and y.sum() != 0:
            xs.append(x)
            ys.append(y)
        else:
            empty += 1

    print(f"Found {empty} empty blends.")
    return np.array(xs), np.array(ys)

# Vectorize both folds, then display the four matrix shapes side by side.
train_x, train_y = make_notes_vectors(split_data["train"])
test_x, test_y = make_notes_vectors(split_data["test"])
tuple(matrix.shape for matrix in (train_x, test_x, train_y, test_y))

  0%|          | 0/43992 [00:00<?, ?it/s]

Found 262 empty blends.


  0%|          | 0/39554 [00:00<?, ?it/s]

Found 286 empty blends.


((43730, 332), (39268, 332), (43730, 74), (39268, 74))

In [12]:
def checksum(vector):
    """Check that a 2-D matrix has no all-zero column and no all-zero row.

    Args:
        vector: array supporting ``.sum(axis=0)`` and ``.sum(axis=1)``.

    Returns:
        A truthy value only when every column sum and every row sum is
        strictly positive.
    """
    # Column check first: every label/feature must occur in at least one sample.
    columns_ok = (vector.sum(axis=0) > 0).all()
    if not columns_ok:
        return columns_ok
    # Row check: every sample must carry at least one label/feature.
    return (vector.sum(axis=1) > 0).all()
# Validate the four matrices in the order train_x, train_y, test_x, test_y;
# evaluation stops at the first matrix that fails the check.
assert all(checksum(matrix) for matrix in (train_x, train_y, test_x, test_y))
"Data looks good!"

'Data looks good!'

In [13]:
import numpy as np
import scipy
import sklearn.linear_model
import sklearn.metrics

class LogitRegression:
    """Wrap a regressor so it is trained in logit space and predicts in (0, 1).

    Targets passed to ``fit`` are clipped into ``[EPS, 1 - EPS]`` and mapped
    through the logit function before reaching the wrapped model; ``predict``
    maps the wrapped model's output back through the logistic sigmoid.
    """

    # Clipping margin that keeps logit() finite for targets of exactly 0 or 1.
    EPS = 1e-5

    def __init__(self, model):
        """Store the wrapped regressor, which must offer ``fit`` and ``predict``."""
        self.model = model

    def _clip01(self, arr):
        """Return ``arr`` as an array with values limited to ``[EPS, 1 - EPS]``."""
        lower, upper = self.EPS, 1 - self.EPS
        return np.asarray(arr).clip(lower, upper)

    def fit(self, x, p):
        """Fit the wrapped model on ``x`` against logit-transformed ``p``.

        Returns whatever the wrapped model's ``fit`` returns.
        """
        logit_targets = scipy.special.logit(self._clip01(p))
        return self.model.fit(x, logit_targets)

    def predict(self, x):
        """Return the wrapped model's predictions for ``x`` passed through expit."""
        return scipy.special.expit(self.model.predict(x))

def train_evaluate_model(base_classifier):
    """Fit ``base_classifier`` in logit space and print its test-set AUROC.

    The estimator is wrapped in ``LogitRegression``, trained on the
    module-level ``train_x`` / ``train_y``, and scored on ``test_x`` /
    ``test_y`` with micro- and macro-averaged ``roc_auc_score``.

    Args:
        base_classifier: regressor exposing ``fit`` and ``predict``.

    Returns:
        None. Progress messages, the prediction shape and both scores are printed.
    """
    print("Training the model...")
    wrapped = LogitRegression(base_classifier)
    wrapped.fit(train_x, train_y)

    print("Predicting probabilities for AUROC calculation...")
    test_pred = wrapped.predict(test_x)
    print(test_pred.shape)

    # Both averages are computed (micro first) before either one is printed.
    scores = {
        averaging: sklearn.metrics.roc_auc_score(test_y, test_pred, average=averaging)
        for averaging in ("micro", "macro")
    }

    print("Micro-averaged AUROC:", scores["micro"])
    print("Macro-averaged AUROC:", scores["macro"])

# Baseline run: ordinary least squares as the base estimator (train_evaluate_model wraps it in LogitRegression).
train_evaluate_model(sklearn.linear_model.LinearRegression())

Training the model...
Predicting probabilities for AUROC calculation...
(39268, 74)
Micro-averaged AUROC: 0.8729345741985826
Macro-averaged AUROC: 0.7486614677589748


In [14]:
import sklearn.ensemble

print("Random Forest")
# Random forests are stochastic. Pin the seed so a re-run of this cell
# reports the same AUROC values instead of slightly different ones each time.
train_evaluate_model(sklearn.ensemble.RandomForestRegressor(random_state=0))

Random Forest
Training the model...
Predicting probabilities for AUROC calculation...
(39268, 74)
Micro-averaged AUROC: 0.9064059789592686
Macro-averaged AUROC: 0.7013881040721929


In [None]:
import sklearn.svm
import sklearn.multioutput

print("SVM")
# MultiOutputRegressor fits one independent SVR per label column. n_jobs=-1
# runs those fits in parallel on all available cores rather than one by one;
# the fitted models are the same, only the wall-clock time changes.
train_evaluate_model(sklearn.multioutput.MultiOutputRegressor(sklearn.svm.SVR(), n_jobs=-1))

SVM
Training the model...
