In [1]:
import os
os.chdir("..")
os.getcwd()

'/Users/laurasisson/odor-pair'

In [2]:
import json
with open("dataset/full.json") as f:
    data = json.load(f)
len(data), data[0]

(166814,
 {'mol1': 'CCCCC/C=C/C(=O)OC',
  'mol1_notes': ['violet',
   'sweet',
   'oily',
   'melon',
   'pear',
   'hairy',
   'costus',
   'fruity',
   'violet leaf',
   'waxy',
   'fresh',
   'green'],
  'mol2': 'CCCCCOC(=O)CCC',
  'mol2_notes': ['cherry',
   'sweet',
   'pineapple',
   'fruity',
   'banana',
   'tropical'],
  'blend_notes': ['animal', 'fruity', 'waxy']})

In [3]:
import graph.utils
import single.utils

all_blend_notes = set()
all_single_notes = set()
for d in data:
    all_blend_notes.update(d["blend_notes"])
    all_single_notes.update(d["mol1_notes"])
    all_single_notes.update(d["mol2_notes"])

f"Before Canonicalization: |Blend Notes| = {len(all_blend_notes)}. |Single Notes| = {len(all_single_notes)}."

'Before Canonicalization: |Blend Notes| = 109. |Single Notes| = 496.'

In [4]:
all_blend_notes = list(graph.utils.canonize(all_blend_notes))
all_single_notes = list(single.utils.canonize(all_single_notes))
    
f"After Canonicalization: |Blend Notes| = {len(all_blend_notes)}. |Single Notes| = {len(all_single_notes)}."

'After Canonicalization: |Blend Notes| = 104. |Single Notes| = 398.'

In [5]:
f"The following notes appear only in blends: {set(all_blend_notes).difference(set(all_single_notes))}."

"The following notes appear only in blends: {'minty', 'anise'}."

In [6]:
all_notes = set(all_blend_notes).union(set(all_single_notes))
f"Found a total of {len(all_notes)} notes."

'Found a total of 400 notes.'

In [7]:
common_notes = set(all_blend_notes).intersection(set(all_single_notes))
f"Found a total of {len(common_notes)} notes in common."

'Found a total of 102 notes in common.'

In [74]:
from tqdm.notebook import tqdm
unions = []
intersections = []
blends = []
for d in tqdm(data):
    blnd = set(graph.utils.canonize(d["blend_notes"]))
    if not blnd:
        continue

    n1 = set(single.utils.canonize(d["mol1_notes"]))
    n2 = set(single.utils.canonize(d["mol2_notes"]))
    
    unions.append(n1.union(n2))
    intersections.append(n1.intersection(n2))
    blends.append(blnd)
assert len(unions) == len(intersections) and len(intersections) == len(blends)

  0%|          | 0/166814 [00:00<?, ?it/s]

In [75]:
import numpy as np
def jaccard(s1,s2):
    return len(s1.intersection(s2)) / len(s1.union(s2))
union_jaccard = np.mean([jaccard(union,blend) for union, blend in zip(unions,blends)])
intersection_jaccard = np.mean([jaccard(intersection,blend) for intersection, blend in zip(intersections,blends)])
f"Union <-> Blend Jaccard = {union_jaccard:.2f}. Intersection <-> Blend Jaccard = {intersection_jaccard:.2f}."

'Union <-> Blend Jaccard = 0.12. Intersection <-> Blend Jaccard = 0.24.'

In [84]:
emergences = []
suppressions = []
suppression_intersections = []

for union, blend in zip(unions,blends):
    emergences.append(blend.difference(union))
    suppressions.append(union.difference(blend))
    suppression_intersections.append(union.intersection(blend))
    
f"Example: Single notes = {intersections[0]}. Pair notes = {blends[0]}. Emergent notes = {emergences[0]}. Suppressed note = {suppressions[0]}."

"Example: Single notes = {'fruity', 'sweet'}. Pair notes = {'animal', 'fruity', 'waxy'}. Emergent notes = {'animal'}. Suppressed note = {'banana', 'cherry', 'leafy', 'pineapple', 'tropical', 'pear', 'hairy', 'sweet', 'fresh', 'green', 'violet', 'oily', 'melon', 'costus'}."

In [86]:
f"For the average blend, we get = {np.mean([len(x) for x in emergences]):.2f} new notes, and lose {np.mean([len(x) for x in suppression_intersections]):.2f} notes."

'For the average blend, we get = 0.18 new notes, and lose 1.26 notes.'

In [79]:
from collections import Counter

# Calculate baseline frequency of notes in individual molecules
baseline_individual_freq = Counter(note for union in unions for note in union)

# Calculate baseline frequency of notes in blends
baseline_blend_freq = Counter(note for blend in blends for note in blend)

# Calculate raw frequency of emergent and suppressed notes
emergence_freq = Counter(note for emerg in emergences for note in emerg if note in common_notes)
suppression_freq = Counter(note for supp in suppressions for note in supp if note in common_notes)

In [80]:
for note in common_notes:
    # A note can appear in the blend if:
    #  * It appeared in the individual molecules and wasn't suppressed
    #  * It emerged
    appearing_freq = (baseline_individual_freq[note] - suppression_freq[note])
    assert appearing_freq + emergence_freq[note] == baseline_blend_freq[note] 

In [82]:
suppressed_ratio = dict()
emerge_ratio = dict()
for note in common_notes:
    # Ratio of # note is suppressed to # note isn't suppressed (appears in blend and wasn't an emergence)
    not_suppressed = (baseline_blend_freq[note] - emergence_freq[note])
    suppressed_ratio[note] = suppression_freq[note] / not_suppressed if not_suppressed > 0 else float('inf')
        
    # Ratio of # note emergences to # note does not emerge (appears in single molecules and wasn't suppressed)
    not_emergent = (baseline_individual_freq[note] - suppression_freq[note])
    emerge_ratio[note] = emergence_freq[note] / not_emergent if not_emergent > 0 else float('inf')


In [83]:
import pandas as pd

csv_data = []
for note in common_notes:
    csv_data.append({
        'Note': note,
        'Frequency': baseline_blend_freq[note],
        'Emergence Odds-Ratio': emerge_ratio[note],
        'Suppression Odds-Ratio': suppressed_ratio[note]
    })

# Step 3: Create a DataFrame and sort by `Baseline Blend Frequency`
df = pd.DataFrame(csv_data)
df = df.sort_values(by='Frequency', ascending=False)
df = df.reset_index(drop=True)

# Step 4: Save to CSV
df.to_csv('charts/csv/notes_emergence_suppression.csv', index=False)

df

Unnamed: 0,Note,Frequency,Emergence Odds-Ratio,Suppression Odds-Ratio
0,fruity,50409,0.043125,1.173451
1,floral,47627,0.132924,0.963344
2,green,29375,0.038500,2.617302
3,waxy,11092,0.155657,4.086164
4,herbal,9157,0.205503,4.438125
...,...,...,...,...
97,pine,13,0.000000,419.846154
98,peach,10,0.000000,1069.400000
99,ammoniacal,9,0.000000,7.777778
100,hay,2,inf,inf
