In [None]:
import os
os.chdir("..")
os.getcwd()

In [2]:
import json
with open("dataset/single_fold.json") as f:
    split_data = json.load(f)
{k:len(v) for k, v in split_data.items()}

{'train': 43992, 'test': 39554, 'covered_notes': 77}

In [3]:
import analysis.fingerprint

mfpgen = analysis.fingerprint.make_mfpgen(radius=2,fpSize=1024)

In [4]:
train_mols = set()
for d in split_data["train"]:
    train_mols.update(d["edge"])

test_mols = set()
for d in split_data["test"]:
    test_mols.update(d["edge"])

len(train_mols), len(test_mols)

(1471, 1468)

In [5]:
from tqdm.notebook import tqdm

mol_to_embed = dict()
for m in tqdm(train_mols.union(test_mols)):
    try:
        mol_to_embed[m] = analysis.fingerprint.smiles_to_embed(mfpgen,m)
    except Exception as e:
        print(m)
        continue
        
    
next(iter(mol_to_embed.items()))

  0%|          | 0/2939 [00:00<?, ?it/s]

InChI=1/C7H8S/c1-6-4-2-3-5-7(6)8/h2-5,8H,1H3
(C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC
CC(=O)c1ccc(C)n1


[17:35:40] SMILES Parse Error: syntax error while parsing: InChI=1/C7H8S/c1-6-4-2-3-5-7(6)8/h2-5,8H,1H3
[17:35:40] SMILES Parse Error: Failed parsing SMILES 'InChI=1/C7H8S/c1-6-4-2-3-5-7(6)8/h2-5,8H,1H3' for input: 'InChI=1/C7H8S/c1-6-4-2-3-5-7(6)8/h2-5,8H,1H3'
[17:35:40] SMILES Parse Error: syntax error while parsing: (C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC
[17:35:40] SMILES Parse Error: Failed parsing SMILES '(C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC' for input: '(C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC'
[17:35:40] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 8


('C=CCOC(=O)CCCC1CCCCC1', array([0, 0, 1, ..., 0, 0, 0], dtype=uint8))

In [6]:
all_blend_notes = {'oily', 'ethereal', 'fermented', 'bitter', 'soapy', 'phenolic', 'winey', 'roasted', 'spicy', 'fusel', 'tropical', 'anise', 'honey', 'aromatic', 'meaty', 'fresh', 'woody', 'melon', 'mentholic', 'clean', 'camphoreous', 'nutty', 'herbal', 'jammy', 'earthy', 'vegetable', 'caramellic', 'coconut', 'orris', 'bready', 'citrus', 'chemical', 'burnt', 'dairy', 'cheesy', 'fatty', 'floral', 'fruity', 'green', 'marine', 'coumarinic', 'licorice', 'mossy', 'tonka', 'creamy', 'waxy', 'animal', 'acidic', 'brown', 'cocoa', 'chocolate', 'sweet', 'rummy', 'sour', 'balsamic', 'coffee', 'solvent', 'fungal', 'berry', 'amber', 'cooling', 'onion', 'buttery', 'estery', 'powdery', 'musk', 'aldehydic', 'medicinal', 'alliaceous', 'minty', 'vanilla', 'thujonic', 'sulfurous', 'musty'}
all_blend_notes = list(all_blend_notes)

In [7]:
import numpy as np
import graph.utils

def make_notes_vectors(dataset,should_concat=True):
    xs = []
    ys = []
    empty = 0
    for d in tqdm(dataset):
        blnd = graph.utils.canonize(d["blend_notes"])
        
        mol1, mol2 = d["edge"]
        if not mol1 in mol_to_embed or not mol2 in mol_to_embed:
            continue
        
        if should_concat:
            x = np.concatenate([mol_to_embed[mol1],mol_to_embed[mol2]])
        else:
            x = mol_to_embed[mol1] + mol_to_embed[mol2]
        y = graph.utils.multi_hot(blnd,all_blend_notes)

        if x.sum() == 0 or y.sum() == 0:
            empty += 1
            continue

        xs.append(x)
        ys.append(y)
        
    print(f"Found {empty} empty blends.")        
    return np.array(xs), np.array(ys)

train_x, train_y = make_notes_vectors(split_data["train"])
test_x, test_y = make_notes_vectors(split_data["test"])
train_x.shape, test_x.shape, train_y.shape, test_y.shape

  0%|          | 0/43992 [00:00<?, ?it/s]

Found 262 empty blends.


  0%|          | 0/39554 [00:00<?, ?it/s]

Found 286 empty blends.


((43721, 2048), (39237, 2048), (43721, 74), (39237, 74))

### Using concatenation of MFPs

In [8]:
def checksum(vector):
    # Assert every label has a sample and Assert every sample has a label
    return (vector.sum(axis=0) > 0).all() and (vector.sum(axis=1) > 0).all()
assert checksum(train_y)
assert checksum(test_y)
"Data looks good!"

'Data looks good!'

In [9]:
import sklearn.metrics
import sklearn.ensemble
import sklearn.linear_model

def train_evaluate_model(base_classifier):
    print("Training the model...")
    # Hack because I want progress bars
    estimators = []
    for i in tqdm(range(train_y.shape[1]), desc="Training classifiers for each label"):
        classifier = sklearn.base.clone(base_classifier) 
        estimators.append(classifier.fit(train_x, train_y[:, i]))
        
    print("Predicting probabilities for AUROC calculation...")
    test_pred = np.zeros_like(test_y, dtype=float)
    for i in tqdm(range(test_y.shape[1]), desc="Predicting probabilities"):
        test_pred[:, i] = estimators[i].predict_proba(test_x)[:,1]
    
    # Calculate AUROC for micro and macro averaging
    auc_micro = sklearn.metrics.roc_auc_score(test_y, test_pred, average='micro')
    auc_macro = sklearn.metrics.roc_auc_score(test_y, test_pred, average='macro')

    print("Micro-averaged AUROC:", auc_micro)
    print("Macro-averaged AUROC:", auc_macro)
    
print("Logistic Regression")
train_evaluate_model(sklearn.linear_model.LogisticRegression(max_iter=1000))

Logistic Regression
Training the model...


Training classifiers for each label:   0%|          | 0/74 [00:00<?, ?it/s]

Predicting probabilities for AUROC calculation...


Predicting probabilities:   0%|          | 0/74 [00:00<?, ?it/s]

Micro-averaged AUROC: 0.9099210985253112
Macro-averaged AUROC: 0.7390302686437905


In [None]:
import sklearn.ensemble

print("Random Forest")
train_evaluate_model(sklearn.ensemble.RandomForestClassifier())

Random Forest
Training the model...


Training classifiers for each label:   0%|          | 0/74 [00:00<?, ?it/s]

In [None]:
import sklearn.svm
import sklearn.multioutput

print("SVM")
train_evaluate_model(sklearn.svm.SVC(probability=True))

### Using summation of MFPs

In [None]:
train_x, train_y = make_notes_vectors(split_data["train"],split_data["covered_notes"],should_concat=False)
test_x, test_y = make_notes_vectors(split_data["test"],split_data["covered_notes"],should_concat=False)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)
assert checksum(train_y)
assert checksum(test_y)
"Data looks good!"

In [None]:
print("Logistic Regression")
train_evaluate_model(sklearn.linear_model.LinearRegression())
print()
print("Random Forest")
train_evaluate_model(sklearn.ensemble.RandomForestClassifier())
print()
print("SVM")
train_evaluate_model(sklearn.svm.SVC(probability=True))