In [1]:
import os
os.chdir("..")
os.getcwd()

'/Users/laurasisson/odor-pair'

In [3]:
import json
with open("dataset/full.json") as f:
    full_data = json.load(f)
len(full_data), full_data[0]

(166814,
 {'mol1': 'CCCCC/C=C/C(=O)OC',
  'mol1_notes': ['violet',
   'sweet',
   'oily',
   'melon',
   'pear',
   'hairy',
   'costus',
   'fruity',
   'violet leaf',
   'waxy',
   'fresh',
   'green'],
  'mol2': 'CCCCCOC(=O)CCC',
  'mol2_notes': ['cherry',
   'sweet',
   'pineapple',
   'fruity',
   'banana',
   'tropical'],
  'blend_notes': ['animal', 'fruity', 'waxy']})

In [56]:
from tqdm.notebook import tqdm
mol_to_notes = dict()
for d in full_data:
    mol_to_notes[d["mol1"]] = d["mol1_notes"]
    mol_to_notes[d["mol2"]] = d["mol2_notes"]    
len(mol_to_notes)

2971

In [12]:
import single.utils
import graph.utils

raw_single_notes = set()
for d in data:
    raw_single_notes.update(d["mol1_notes"])
    raw_single_notes.update(d["mol2_notes"])

all_single_notes = list(single.utils.canonize(raw_single_notes))
f"Before Canonicalization: |Single Notes| = {len(all_single_notes)}. After Canonicalization: |Single Notes| = {len(all_single_notes)}."

'Before Canonicalization: |Single Notes| = 398. After Canonicalization: |Single Notes| = 398.'

In [6]:
import json
with open("dataset/single_fold.json") as f:
    split_data = json.load(f)
{k:len(v) for k, v in split_data.items()}

{'train': 43992, 'test': 39554, 'covered_notes': 77}

In [10]:
split_data["train"][0]

{'edge': ['CCCCCCCCCCCC(OC)OC', 'CCC/C=C\\CO'], 'blend_notes': ['green']}

In [62]:
def get_seen_single_notes(dataset):
    seen = set()
    for d in dataset:
        mol1, mol2 = d["edge"]
        seen.update(mol_to_notes[mol1])
        seen.update(mol_to_notes[mol2])
    return set(single.utils.canonize(seen))
train_singles = get_seen_single_notes(split_data["train"])
test_singles = get_seen_single_notes(split_data["test"])
common_singles = list(train_singles.intersection(test_singles))
len(train_singles), len(test_singles), len(common_singles)

(361, 369, 332)

In [63]:
import numpy as np

def make_notes_vectors(dataset,all_blend_notes):
    xs = []
    ys = []
    empty = 0
    for d in tqdm(dataset):
        blnd = graph.utils.canonize(d["blend_notes"])
        
        mol1, mol2 = d["edge"]
        n1 = set(single.utils.canonize(mol_to_notes[mol1]))
        n2 = set(single.utils.canonize(mol_to_notes[mol2]))
        
        x = graph.utils.multi_hot(n1,common_singles)+graph.utils.multi_hot(n2,common_singles)
        y = graph.utils.multi_hot(blnd,all_blend_notes)

        if x.sum() == 0 or y.sum() == 0:
            empty += 1
            continue

        xs.append(x)
        ys.append(y)
        
    print(f"Found {empty} empty blends.")        
    return np.array(xs), np.array(ys)

train_x, train_y = make_notes_vectors(split_data["train"],split_data["covered_notes"])
test_x, test_y = make_notes_vectors(split_data["test"],split_data["covered_notes"])
train_x.shape, test_x.shape, train_y.shape, test_y.shape

  0%|          | 0/43992 [00:00<?, ?it/s]

Found 256 empty blends.


  0%|          | 0/39554 [00:00<?, ?it/s]

Found 243 empty blends.


((43736, 332), (39311, 332), (43736, 77), (39311, 77))

In [65]:
def checksum(vector):
    # Assert every label has a sample and Assert every sample has a label
    return (vector.sum(axis=0) > 0).all() and (vector.sum(axis=1) > 0).all()
assert checksum(train_x)
assert checksum(train_y)
assert checksum(test_x)
assert checksum(test_y)
"Data looks good!"

'Data looks good!'

In [90]:
import sklearn.metrics
import sklearn.ensemble
import sklearn.linear_model

def train_evaluate_model(base_classifier):
    print("Training the model...")
    # Hack because I want progress bars
    estimators = []
    for i in tqdm(range(train_y.shape[1]), desc="Training classifiers for each label"):
        estimators.append(base_classifier.fit(train_x, train_y[:, i]))
        
    print("Predicting probabilities for AUROC calculation...")
    test_pred = np.zeros_like(test_y, dtype=float)
    for i in tqdm(range(test_y.shape[1]), desc="Predicting probabilities"):
        test_pred[:, i] = estimators[i].predict_proba(test_x)[:,1]
    
    # Calculate AUROC for micro and macro averaging
    auc_micro = sklearn.metrics.roc_auc_score(test_y, test_pred, average='micro')
    auc_macro = sklearn.metrics.roc_auc_score(test_y, test_pred, average='macro')

    print("Micro-averaged AUROC:", auc_micro)
    print("Macro-averaged AUROC:", auc_macro)

    model.estimators_ = estimators

print("Logistic Regression")
train_evaluate_model(sklearn.linear_model.LogisticRegression(max_iter=1000))

Logistic Regression
Training the model...


Training classifiers for each label:   0%|          | 0/77 [00:00<?, ?it/s]

Predicting probabilities for AUROC calculation...


Predicting probabilities:   0%|          | 0/77 [00:00<?, ?it/s]

Micro-averaged AUROC: 0.48962054009741324
Macro-averaged AUROC: 0.5134447611351038


In [91]:
print("Random Forest")
train_evaluate_model(sklearn.ensemble.RandomForestClassifier())

Random Forest
Training the model...


Training classifiers for each label:   0%|          | 0/77 [00:00<?, ?it/s]

Predicting probabilities for AUROC calculation...


Predicting probabilities:   0%|          | 0/77 [00:00<?, ?it/s]

Micro-averaged AUROC: 0.4988200219226068
Macro-averaged AUROC: 0.5313256547262031
