In [12]:
import json
def load_json(data_path):
    with open(data_path, "r", encoding="utf-8") as file_1:
        data = json.load(file_1)
    return data

In [13]:
data = load_json("FoodOn/train/foodon_train_pairs.json")

In [14]:
len(data), data[:5]

(53020,
 [{'ID': 'TR_148e834b', 'parent': 'steroid', 'child': 'steroid hormone'},
  {'ID': 'TR_b89bbe1d',
   'parent': 'molecular entity',
   'child': 'elemental molecular entity'},
  {'ID': 'TR_a99a497b',
   'parent': 'florida pompano fillet (skinless)',
   'child': 'florida pompano fillet (skinless, raw)'},
  {'ID': 'TR_9edca995',
   'parent': 'food mix product',
   'child': 'chicken tetrazzini mix'},
  {'ID': 'TR_2e4152bf', 'parent': 'carboxylic acid', 'child': 'amino acid'}])

In [15]:
from collections import defaultdict

class UnionFind:
    def __init__(self):
        self.parent = {}

    def find(self, x):
        # Path compression
        if x != self.parent.setdefault(x, x):
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, x, y):
        self.parent[self.find(x)] = self.find(y)

def build_clusters(relationships):
    uf = UnionFind()
    
    for rel in relationships:
        parent = rel["parent"]
        child = rel["child"]
        uf.union(parent, child)

    clusters = defaultdict(set)
    for term in set([item for rel in relationships for item in (rel["parent"], rel["child"])]):
        root = uf.find(term)
        clusters[root].add(term)

    # Add isolated nodes (no parent/child relationship)
    all_terms = set()
    for rel in relationships:
        all_terms.add(rel["parent"])
        all_terms.add(rel["child"])
    for term in all_terms:
        if term not in uf.parent:
            clusters[term].add(term)

    return list(clusters.values())


In [16]:
clusters = build_clusters(data)

In [17]:
len(clusters)

1086

In [18]:
clusters

[{'Valerianella',
  'wombat berry plant',
  'bone meal wafer',
  'butchery cut of european anchovy',
  'Ditammari',
  'raspberry spread',
  'piece of dark animal meat',
  'veal sub-primal cut',
  'Coleoidea',
  'chocolate chip ice cream',
  '10000105 - baby/infant - specialised beverages (shelf stable) (gs1 gpc)',
  'egg simulated product',
  'State of Vermont',
  'pink shrimp (cooked)',
  'Paeonol',
  'icicle, no sugar added except lactose',
  'South Asian',
  'light purple',
  'lemon savory plant',
  'swartzia plant',
  'Enteroctopus dofleini',
  'turkey eggshell',
  'hemp seed (dehulled, raw)',
  'huckleberry',
  'sponge crab family',
  'red cayenne pepper paste',
  '21870 - cooked bratwurst-type sausage (efsa foodex2)',
  'piece of animal back',
  'rainbow smelt (dressed)',
  'Trachurus japonicus',
  'cape hope squid',
  'blackberry pie',
  'oat flakes',
  '38080 - mixed supplements/formulations (efsa foodex2)',
  'kidney bean (canned)',
  'shrimp and fish frankfurter',
  'hemp see

In [19]:
# Load a text file in Python
final_content = []
with open("FoodOn/train/foodon_train_types.txt", "r", encoding="utf-8") as f:
    contents = f.readlines()
    for content in contents:
        final_content.append(content.strip('\n'))


print(contents[:500])  # Print first 500 characters


['25000 - edible crab (efsa foodex2)\n', 'Pseudupeneus prayensis\n', 'connective tissue\n', 'japanese huchen\n', 'tetrapyrrole\n', 'SugarBee apple tree\n', 'Chordata\n', 'Ocimeae\n', 'brine\n', 'Salvia fruticosa\n', 'sweetened condensed milk\n', 'whole wheat pastry flour\n', 'lamb sub-primal cut (raw)\n', 'oregano leaf\n', '39340 - pizza and similar with meat, and vegetables (efsa foodex2)\n', 'veal material\n', 'illipe butter\n', 'remoulade\n', 'mince pie filling (frozen)\n', 'barracuda family\n', '10006193 - red currants (gs1 gpc)\n', '42630 - fennel flavour (efsa foodex2)\n', 'pate\n', 'winter wheat kernel\n', 'piece(s) of chicken meat (with skin)\n', 'sausage with a starch content of more that 6%\n', 'pasta (fresh)\n', '15390 - musky strawberries (efsa foodex2)\n', 'piece of sheep (with skin)\n', 'turkey thigh (skinless, with bone)\n', 'swiss chard leaf\n', 'pigeon pea (mature)\n', 'field pea (canned)\n', 'raspberry preserve or jam\n', 'Atheresthes evermanni\n', 'Rheum\n', 'sweet c

In [20]:
len(contents), contents[0]

(31076, '25000 - edible crab (efsa foodex2)\n')

In [21]:
len(final_content), final_content[0]

(31076, '25000 - edible crab (efsa foodex2)')

In [None]:
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
import copy

# --- Sample Input (Replace with your data) ---


# --- Assignment Function ---
def assign_term_to_cluster(term, clusters):
    new_words = term.lower().split()
    cluster_scores = []

    for cluster in clusters:
        word_count = Counter()
        for cluster_term in cluster:
            for word in cluster_term.lower().split():
                word_count[word] += 1
        score = sum(word_count[word] for word in new_words)
        cluster_scores.append(score)

    best_cluster_index = max(range(len(cluster_scores)), key=lambda i: cluster_scores[i])
    max_score = cluster_scores[best_cluster_index]

    if max_score == 0:
        return None, term
    return best_cluster_index, term

# --- Parallel Execution with tqdm ---
def parallel_assign_terms(terms, clusters):
    results = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(assign_term_to_cluster, term, clusters): term for term in terms}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Assigning terms"):
            results.append(future.result())
    return results

# --- Cluster Assignment ---
final_clusters = [set(cluster) for cluster in copy.deepcopy(clusters)]
assignments = parallel_assign_terms(final_content, final_clusters)

# --- Update Final Clusters ---
for cluster_idx, term in assignments:
    if cluster_idx is not None:
        final_clusters[cluster_idx].add(term)
    else:
        final_clusters.append(set([term]))

# --- Output ---
final_clusters_list = [sorted(list(cluster)) for cluster in final_clusters]

for i, cluster in enumerate(final_clusters_list, 1):
    print(f"Cluster {i}: {cluster}")


In [None]:
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
import copy

# Sample clusters and final_content (replace with your data)
# clusters = [
#     {'steroid', 'steroid hormone'},
#     {'molecular entity', 'elemental molecular entity'},
#     {'carboxylic acid', 'amino acid'},
# ]

# final_content = [
#     "acid molecule", "entity class", "steroid precursor",
#     "amino acid derivative", "biological steroid", "unknown compound"
# ]

# Step 3: Assign function
def assign_term_to_cluster(args):
    new_term, clusters = args
    new_words = new_term.lower().split()
    cluster_scores = []

    for cluster in clusters:
        word_count = Counter()
        for term in cluster:
            for word in term.lower().split():
                word_count[word] += 1
        score = sum(word_count[word] for word in new_words)
        cluster_scores.append(score)
    
    best_cluster_index = max(range(len(cluster_scores)), key=lambda i: cluster_scores[i])
    max_score = cluster_scores[best_cluster_index]

    if max_score == 0:
        return None, new_term  # No match
    return best_cluster_index, new_term

# Step 4: Parallel assignment
def parallel_assign(final_content, clusters):
    results = []
    with ThreadPoolExecutor() as executor:
        futures = list(tqdm(executor.map(assign_term_to_cluster, [(term, clusters) for term in final_content]), total=len(final_content)))
        results.extend(futures)
    return results

# Final clusters copy
final_clusters = [set(cluster) for cluster in copy.deepcopy(clusters)]

# Assign
results = parallel_assign(final_content, final_clusters)

# Step 5: Update clusters
for cluster_idx, term in results:
    if cluster_idx is not None:
        final_clusters[cluster_idx].add(term)
    else:
        final_clusters.append(set([term]))

# Step 6: Convert to list of lists
final_clusters_list = [sorted(list(cluster)) for cluster in final_clusters]

# Display final clusters
for i, cluster in enumerate(final_clusters_list, 1):
    print(f"Cluster {i}: {cluster}")


In [11]:
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
import copy

# --- Sample Input (Replace with your data) ---
clusters = [
    {'steroid', 'steroid hormone'},
    {'molecular entity', 'elemental molecular entity'},
    {'carboxylic acid', 'amino acid'},
]

final_content = [
    "acid molecule", "entity class", "steroid precursor",
    "amino acid derivative", "biological steroid", "unknown compound"
]

# --- Assignment Function ---
def assign_term_to_cluster(term, clusters):
    new_words = term.lower().split()
    cluster_scores = []

    for cluster in clusters:
        word_count = Counter()
        for cluster_term in cluster:
            for word in cluster_term.lower().split():
                word_count[word] += 1
        score = sum(word_count[word] for word in new_words)
        cluster_scores.append(score)

    best_cluster_index = max(range(len(cluster_scores)), key=lambda i: cluster_scores[i])
    max_score = cluster_scores[best_cluster_index]

    if max_score == 0:
        return None, term
    return best_cluster_index, term

# --- Parallel Execution with tqdm ---
def parallel_assign_terms(terms, clusters):
    results = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(assign_term_to_cluster, term, clusters): term for term in terms}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Assigning terms"):
            results.append(future.result())
    return results

# --- Cluster Assignment ---
final_clusters = [set(cluster) for cluster in copy.deepcopy(clusters)]
assignments = parallel_assign_terms(final_content, final_clusters)

# --- Update Final Clusters ---
for cluster_idx, term in assignments:
    if cluster_idx is not None:
        final_clusters[cluster_idx].add(term)
    else:
        final_clusters.append(set([term]))

# --- Output ---
final_clusters_list = [sorted(list(cluster)) for cluster in final_clusters]

for i, cluster in enumerate(final_clusters_list, 1):
    print(f"Cluster {i}: {cluster}")


Assigning terms:   0%|          | 0/6 [00:00<?, ?it/s]

Cluster 1: ['biological steroid', 'steroid', 'steroid hormone', 'steroid precursor']
Cluster 2: ['elemental molecular entity', 'entity class', 'molecular entity']
Cluster 3: ['acid molecule', 'amino acid', 'amino acid derivative', 'carboxylic acid']
Cluster 4: ['unknown compound']


In [22]:
# Step 3: Assign function
from collections import Counter
from tqdm.notebook import tqdm
def assign_term_to_cluster(new_term, clusters):
    new_words = new_term.lower().split()
    cluster_scores = []

    for cluster in clusters:
        word_count = Counter()
        for term in cluster:
            for word in term.lower().split():
                word_count[word] += 1
        score = sum(word_count[word] for word in new_words)
        cluster_scores.append(score)
    
    best_cluster_index = max(range(len(cluster_scores)), key=lambda i: cluster_scores[i])
    max_score = cluster_scores[best_cluster_index]
    
    if max_score == 0:
        return None  # No match
    return best_cluster_index

# Step 4: Assign all new terms
final_clusters = [set(cluster) for cluster in clusters]  # copy original clusters

for term in tqdm(final_content):
    assigned_idx = assign_term_to_cluster(term, final_clusters)
    if assigned_idx is not None:
        final_clusters[assigned_idx].add(term)
    else:
        # Create new cluster if no match found
        final_clusters.append(set([term]))

# Step 5: Convert to list of lists and return
final_clusters_list = [sorted(list(cluster)) for cluster in final_clusters]

# Display final clusters
for i, cluster in enumerate(final_clusters_list, 1):
    print(f"Cluster {i}: {cluster}")

  0%|          | 0/31076 [00:00<?, ?it/s]

Cluster 1: ['!Kung', "'hyacinth bean (dried)'", "(+)-catechin-3'-methyl ether", '(+)-exo-5-hydroxycamphor', '(+)-nootkatone', '(+)-pinoresinol', '(+)-taxifolin', '(-)-Variabilin', '(-)-alpha-thujone', '(-)-lariciresinol', '(-)-matairesinol', '(-)-menthol', '(-)-secoisolariciresinol', '(1->4)-beta-D-glucan', '(11Z)-icos-11-enoic acid', '(1S,4R)-fenchone', '(20S)-ginsenoside Rg3', '(2E,6E)-farnesyl monophosphate', '(2S)-poncirin', '(6R)-5,10-methenyltetrahydrofolate', '(6Z,9Z,12Z,15Z,18Z,21Z)-tetracosahexaenoic acid', '(E)-sinapaldehyde', '(E)-trans-miyabenol C', '(R)-camphor', '(R)-propane-1,2-diol', '(R)-rosmarinic acid', '(R,R,R)-alpha-tocopherol', '(R,R,S)-alpha-tocopherol', '(R,S,S)-alpha-tocopherol', '(S)-(-)-perillyl alcohol', '(S)-naringenin', '(S,R,S)-alpha-tocopherol', '(S,S,R)-alpha-tocopherol', '(S,S,S)-alpha-tocopherol', '(percent) fat free claim', '00010 - grains and grain-based products (efsa foodex2)', '00020 - cereals and cereal primary derivatives (efsa foodex2)', '0003