In [1]:
import os
os.chdir("..")
os.getcwd()

'/Users/laurasisson/odor-pair'

In [117]:
import json
with open("dataset/full.json") as f:
    data = json.load(f)
len(data), data[0]

(166814,
 {'mol1': 'CCCCC/C=C/C(=O)OC',
  'mol1_notes': ['violet',
   'sweet',
   'oily',
   'melon',
   'pear',
   'hairy',
   'costus',
   'fruity',
   'violet leaf',
   'waxy',
   'fresh',
   'green'],
  'mol2': 'CCCCCOC(=O)CCC',
  'mol2_notes': ['cherry',
   'sweet',
   'pineapple',
   'fruity',
   'banana',
   'tropical'],
  'blend_notes': ['animal', 'fruity', 'waxy']})

In [12]:
import graph.utils
import single.utils

all_blend_notes = set()
all_single_notes = set()
for d in data:
    all_blend_notes.update(d["blend_notes"])
    all_single_notes.update(d["mol1_notes"])
    all_single_notes.update(d["mol2_notes"])

f"Before Canonicalization: |Blend Notes| = {len(all_blend_notes)}. |Single Notes| = {len(all_single_notes)}."

'Before Canonicalization: |Blend Notes| = 109. |Single Notes| = 496'

In [48]:
all_blend_notes = list(graph.utils.canonize(all_blend_notes))
all_single_notes = list(single.utils.canonize(all_single_notes))
    
f"After Canonicalization: |Blend Notes| = {len(all_blend_notes)}. |Single Notes| = {len(all_single_notes)}."

'After Canonicalization: |Blend Notes| = 104. |Single Notes| = 398.'

In [47]:
f"The following notes appear only in blends: {set(all_blend_notes).difference(set(all_single_notes))}."

"The following notes appear only in blends: {'anise', 'minty'}."

In [53]:
all_notes = set(all_blend_notes).union(set(all_single_notes))
f"Found a total of {len(all_notes)} notes."

'Found a total of 400 notes.'

In [101]:
common_notes = set(all_blend_notes).intersection(set(all_single_notes))
f"Found a total of {len(common_notes)} notes in common."

'Found a total of 102 notes in common.'

In [27]:
unions = []
intersections = []
blends = []
for d in tqdm(data):
    blnd = set(graph.utils.canonize(d["blend_notes"]))
    if not blnd:
        continue

    n1 = set(single.utils.canonize(d["mol1_notes"]))
    n2 = set(single.utils.canonize(d["mol2_notes"]))
    
    unions.append(n1.union(n2))
    intersections.append(n1.intersection(n2))
    blends.append(blnd)

  0%|          | 0/166814 [00:00<?, ?it/s]

In [59]:
from sklearn.metrics import f1_score
import numpy as np

# Initialize empty lists to hold all vectors
all_union_vectors = []
all_intersection_vectors = []
all_blend_vectors = []

for union, intersection, blend in tqdm(zip(unions, intersections, blends), total=len(blends)):
    # Convert sets to binary vectors over the universe of notes (all_notes)
    union_vector = [int(note in union) for note in all_notes]
    intersection_vector = [int(note in intersection) for note in all_notes]
    blend_vector = [int(note in blend) for note in all_notes]
    
    # Append the vectors for bulk calculation
    all_union_vectors.extend(union_vector)
    all_intersection_vectors.extend(intersection_vector)
    all_blend_vectors.extend(blend_vector)

# Convert the lists to numpy arrays for bulk F1 score calculation
all_union_vectors = np.array(all_union_vectors)
all_intersection_vectors = np.array(all_intersection_vectors)
all_blend_vectors = np.array(all_blend_vectors)

# Calculate F1 scores in bulk
f1_union_avg = f1_score(all_blend_vectors, all_union_vectors)
f1_intersection_avg = f1_score(all_blend_vectors, all_intersection_vectors)

print("Average F1 Score (Union):", f1_union_avg)
print("Average F1 Score (Intersection):", f1_intersection_avg)

  0%|          | 0/166542 [00:00<?, ?it/s]

Average F1 Score (Union): 0.20076464784185133
Average F1 Score (Intersection): 0.34769725783076694


In [69]:
emergences = []
suppressions = []

for intersection, blend in zip(intersections,blends):
    emergences.append(blend.difference(intersection))
    suppressions.append(intersection.difference(blend))
    
f"Example: Single notes = {intersections[0]}. Pair notes = {blends[0]}. Emergent notes = {emergences[0]}. Suppressed note = {suppressions[0]}."

"Example: Single notes = {'fruity', 'sweet'}. Pair notes = {'fruity', 'animal', 'waxy'}. Emergent notes = {'animal', 'waxy'}. Suppressed note = {'sweet'}."

In [70]:
    f"For the average blend, we get = {np.mean([len(x) for x in emergences]):.2f} new notes, and lose {np.mean([len(x) for x in suppressions]):.2f} notes."

'For the average blend, we get = 0.89 new notes, and lose 1.17 notes.'

In [103]:
from collections import Counter

# Step 1: Calculate baseline frequency of notes in individual molecules
baseline_individual_freq = Counter(note for intersection in intersections for note in intersection)

# Step 2: Calculate baseline frequency of notes in blends
baseline_blend_freq = Counter(note for blend in blends for note in blend)

# Step 3: Calculate raw frequency of emergent and suppressed notes
emergence_freq = Counter(note for emerg in emergences for note in emerg if note in common_notes)
suppression_freq = Counter(note for supp in suppressions for note in supp if note in common_notes)

# Step 4: Normalize emergences by blend frequencies
normalized_emergence = Counter({note: (emergence_freq[note] / baseline_blend_freq[note])
                               for note in emergence_freq if baseline_blend_freq[note] > 0})

# Step 5: Normalize suppressions by individual frequencies
normalized_suppression = Counter({note: (suppression_freq[note] / baseline_individual_freq[note])
                                 for note in suppression_freq if baseline_individual_freq[note] > 0})


# Step 6: Get the top 10 most likely to emerge and be suppressed
top_10_emergent_notes = normalized_emergence.most_common(25)
top_10_suppressed_notes = normalized_suppression.most_common(25)

# Step 7: Print results
print("Most Likely to Emerge:")
for note, score in top_10_emergent_notes:
    print(f"{note}: {score:.4f}")
print()
print("Most Likely to be Suppressed:")
for note, score in top_10_suppressed_notes:
    print(f"{note}: {score:.4f}")


Most Likely to Emerge:
aromatic: 1.0000
mushroom: 1.0000
juicy: 1.0000
potato: 1.0000
malty: 1.0000
eggy: 1.0000
fresh: 1.0000
cabbage: 1.0000
celery: 1.0000
acetic: 1.0000
salty: 1.0000
pine: 1.0000
hay: 1.0000
medicinal: 0.9942
mossy: 0.9912
moldy: 0.9905
chemical: 0.9900
burnt: 0.9789
solvent: 0.9785
marine: 0.9732
fungal: 0.9711
estery: 0.9702
dairy: 0.9688
toasted: 0.9679
mustard: 0.9649

Most Likely to be Suppressed:
mushroom: 1.0000
fresh: 1.0000
pine: 1.0000
juicy: 1.0000
potato: 1.0000
cabbage: 1.0000
celery: 1.0000
aromatic: 1.0000
hay: 1.0000
eggy: 1.0000
malty: 1.0000
acetic: 1.0000
sweet: 0.9994
peach: 0.9883
powdery: 0.9859
clean: 0.9773
cherry: 0.9764
dairy: 0.9679
oily: 0.9628
pungent: 0.9605
burnt: 0.9540
rooty: 0.9483
garlic: 0.9474
dusty: 0.9429
medicinal: 0.9398


In [98]:
top_10_emergent_notes = reversed(normalized_emergence.most_common()[-10:])
top_10_suppressed_notes = reversed(normalized_suppression.most_common()[-10:])

print("Least Likely to Emerge:")
for note, score in top_10_emergent_notes:
    print(f"{note}: {score:.4f}")
print()
print("Least Likely to be Suppressed:")
for note, score in top_10_suppressed_notes:
    print(f"{note}: {score:.4f}")


Least Likely to Emerge:
peach: 0.1000
garlic: 0.3506
meaty: 0.3735
fruity: 0.3923
green: 0.4337
lactonic: 0.5000
woody: 0.5358
musk: 0.5469
onion: 0.5503
coconut: 0.6190

Least Likely to be Suppressed:
alliaceous: 0.3212
floral: 0.3407
fruity: 0.4041
musk: 0.4687
vanilla: 0.4840
aldehydic: 0.4886
caramellic: 0.4891
sulfurous: 0.4920
tarragon: 0.5000
tonka: 0.5305


In [118]:
import pandas as pd

csv_data = []
for note in common_notes:
    csv_data.append({
        'Note': note,
        'Frequency': baseline_blend_freq.get(note, 0),
        'Normalized Emergence': normalized_emergence.get(note, 0),
        'Normalized Suppression': normalized_suppression.get(note, 0),
    })

# Step 3: Create a DataFrame and sort by `Baseline Blend Frequency`
df = pd.DataFrame(csv_data)
df = df.sort_values(by='Frequency', ascending=False)
df = df.reset_index(drop=True)

# Step 4: Save to CSV
df.to_csv('charts/csv/notes_emergence_suppression.csv', index=False)

df.head(10)

Unnamed: 0,Note,Frequency,Normalized Emergence,Normalized Suppression
0,fruity,50409,0.392271,0.404081
1,floral,47627,0.619187,0.340712
2,green,29375,0.433668,0.621858
3,waxy,11092,0.701316,0.641567
4,herbal,9157,0.680245,0.632207
5,woody,8707,0.535776,0.62529
6,balsamic,7047,0.838655,0.585339
7,citrus,5544,0.795635,0.702468
8,spicy,5291,0.736912,0.634166
9,fatty,4679,0.785211,0.765843


In [161]:
xs = []
ys = []
empty = 0
selfloops = 0

common_notes_list = list(common_notes)

for d in tqdm(data):
    blnd = graph.utils.canonize(d["blend_notes"])
    if not blnd:
        empty += 1
        continue

    n1 = set(single.utils.canonize(d["mol1_notes"]))
    n2 = set(single.utils.canonize(d["mol2_notes"]))
    
    x = graph.utils.multi_hot(n1,all_single_notes)+graph.utils.multi_hot(n2,all_single_notes)
    y = graph.utils.multi_hot(blnd,all_blend_notes)
    
    if x.sum() == 0 or y.sum() == 0:
        empty += 1
        continue

    xs.append(x)
    ys.append(y)

print(f"Found {empty} empty blends and {selfloops} self loops.")

  0%|          | 0/166814 [00:00<?, ?it/s]

Found 272 empty blends and 0 self loops.


In [162]:
for y in ys:
    assert y.sum() > 0 and y.sum() < len(y)
    
for x in xs:
    assert x.sum() > 0 and x.sum() < len(x)

In [166]:
# Convert the data to NumPy arrays (if not already)
import numpy as np
X = np.array(xs)  # Intersected notes as features
Y = np.array(ys)  # Multi-hot vector blend notes as target labels

X.shape, Y.shape

((166542, 398), (166542, 104))

In [168]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from tqdm import tqdm
import numpy as np

# Step 1: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Step 2: Initialize the Random Forest Classifier
base_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Step 3: Fit the OneVsRestClassifier with verbosity
clf = OneVsRestClassifier(base_clf)

# Adding progress tracking with tqdm for the training process
print("Training the model...")
for i in tqdm(range(Y.shape[1]), desc="Training classifiers for each label"):
    clf.estimators_ = [base_clf.fit(X_train, y_train[:, i]) for i in range(Y.shape[1])]

print("Training complete!")

# Step 4: Predict on the test set with verbosity
y_pred = np.zeros_like(y_test)
print("Predicting the test set...")
for i in tqdm(range(Y.shape[1]), desc="Predicting labels"):
    y_pred[:, i] = clf.estimators_[i].predict(X_test)

# Step 5: Calculate F1 score for multi-label classification
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')

# Step 6: Predict probabilities (needed for AUROC) with verbosity
y_pred_prob = np.zeros_like(y_test, dtype=float)
print("Predicting probabilities for AUROC calculation...")
for i in tqdm(range(Y.shape[1]), desc="Predicting probabilities"):
    y_pred_prob[:, i] = clf.estimators_[i].predict_proba(X_test)[:, 1]

# Step 7: Calculate AUROC for micro and macro averaging
auc_micro = roc_auc_score(y_test, y_pred_prob, average='micro')
auc_macro = roc_auc_score(y_test, y_pred_prob, average='macro')

# Step 8: Print the results
print("Micro-averaged F1 Score:", f1_micro)
print("Macro-averaged F1 Score:", f1_macro)
print("Micro-averaged AUROC:", auc_micro)
print("Macro-averaged AUROC:", auc_macro)

Training the model...


Training classifiers for each label:   1%| | 1/104 [35:03<60:10:43, 2103.33s/it]


KeyboardInterrupt: 

In [155]:

# Step 5: Calculate F1 score for multi-label classification
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')

print("Micro-averaged F1 Score:", f1_micro)
print("Macro-averaged F1 Score:", f1_macro)

# Step 6: Predict probabilities (needed for AUROC)
# This is done per label (one classifier per label)
y_pred_prob = clf.predict_proba(X_test)

# Step 7: Calculate AUROC for micro and macro averaging
auc_micro = roc_auc_score(y_test, y_pred_prob, average='micro')
auc_macro = roc_auc_score(y_test, y_pred_prob, average='macro')

# Step 8: Print the results

print("Micro-averaged AUROC:", auc_micro)
print("Macro-averaged AUROC:", auc_macro)


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


Micro-averaged F1 Score: 0.7727141492882048
Macro-averaged F1 Score: 0.6848094010220757


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [172]:
import numpy as np

# Assuming y_test is a NumPy array (num_samples x num_labels)

# Step 1: Find indices where only one class is present across instances for each label
single_class_indices = []

# Loop over each label (column) in y_test
for label_idx in range(y_test.shape[1]):
    unique_classes = np.unique(y_test[:, label_idx])
    
    # If the label has only one unique value (either all 0's or all 1's), record the indices
    if len(unique_classes) == 1:
        # Store all instances (row indices) for this label where this happens
        for instance_idx in range(y_test.shape[0]):
            single_class_indices.append((instance_idx, label_idx))

        
# Step 2: Output the indices with single class in y_test
# print("Instance and label indices where only one class is present in y_test:")
# for idx in single_class_indices:
#     print(f"Instance {idx[0]}, Label {idx[1]} has only class {y_test[idx[0], idx[1]]} present.")
#     break

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

