In [1]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from openai import OpenAI
from typing import List
from sklearn.metrics import silhouette_score
from tqdm import tqdm
import argparse
import json
import os
import random
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import concurrent.futures
import pickle

In [11]:
def construct_args():
    """Build the notebook's configuration as an ``argparse.Namespace``.

    The parser is run against an empty argument list, so every option takes
    its declared default (this keeps the function usable inside a notebook,
    where ``sys.argv`` belongs to the kernel). ``log_dir`` is derived from
    ``output_dir`` and ``dataset`` after parsing.

    Returns:
        argparse.Namespace with ``output_dir``, ``data_dir``, ``dataset``,
        ``dimensions``, ``num_threads``, ``max_entities`` and ``log_dir``.
    """
    # (flag, keyword arguments) pairs, registered in this order.
    option_specs = (
        ('--output_dir', dict(type=str, default="/data/pj20/lamake_data")),
        ('--data_dir', dict(type=str, default="/home/pj20/server-03/lamake/data")),
        ('--dataset', dict(type=str, default="FB15K-237", help='Path to the dataset file containing the list of entities to cluster.')),
        ('--dimensions', dict(type=int, default=1024, help='Dimensionality of the embeddings. Default: 1024.')),
        ('--num_threads', dict(type=int, default=10, help='Number of threads to use for multi-threaded processes. Default: 10.')),
        ('--max_entities', dict(type=int, default=100, help='Maximum number of entities to include in an LLM request. Default: 100.')),
    )

    parser = argparse.ArgumentParser(description='Cluster entities using hierarchical clustering and refine the clusters using LLM.')
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    # Parse an explicit empty list rather than the kernel's own argv.
    namespace = parser.parse_args(args=[])
    namespace.log_dir = f"{namespace.output_dir}/{namespace.dataset}/logs"

    return namespace

In [12]:
args = construct_args()

In [23]:
from cluster import  read_entities, create_entity_info_emb_dict, generate_embeddings, build_hierarchy

In [2]:
import pickle

# Load a previously pickled clustering object from disk.
# NOTE(review): presumably a fitted sklearn AgglomerativeClustering (later cells
# read n_clusters_, labels_ and children_) -- confirm against the producer script.
# pickle.load can execute arbitrary code; only load files produced by trusted runs.
clustering_file = "/data/pj20/lamake_data/FB15K-237/clustering/clustering_0.52.pkl"
with open(clustering_file, "rb") as f:
    clustering = pickle.load(f)

In [15]:
# Read the entity list and the per-entity info/embedding dictionaries via the
# project helpers (their exact return schema is defined in `cluster`, not here).
entities = read_entities('/home/pj20/server-03/lamake/data/FB15K-237/entities.dict')
entity_info, entity_embeddings = create_entity_info_emb_dict(args, entities)
# Collect labels and descriptions in the same order as `entities`, so that
# position i in these lists corresponds to entities[i].
entities_text, original_descriptions = [], []
for entity in entities:
    entities_text.append(entity_info[entity]["text_label"])
    original_descriptions.append(entity_info[entity]["original_description"])
    
print("Start Generating Embeddings...")
# generate_embeddings returns the embedding matrix plus refreshed info/embedding
# dicts; both dicts are rebound here, replacing the ones loaded above.
embeddings, entity_info, entity_embeddings = generate_embeddings(args, entity_info=entity_info, entity_embeddings=entity_embeddings, dim=args.dimensions)

Start Generating Embeddings...
Loading existing entity embeddings from /data/pj20/lamake_data/FB15K-237/entity_embeddings.json...
Done.
All entities have valid embeddings. Skipping embedding generation.
Loading existing entity info from /data/pj20/lamake_data/FB15K-237/entity_info.json...
Done.


In [150]:
# Group entity labels by flat cluster label.
# Keys are 1-based ("Cluster_1" holds label 0, and so on); values are the
# text labels of the entities whose clustering.labels_ entry equals i.
clusters = {}
for i in range(clustering.n_clusters_):
    # Positions in labels_ are used to index entities_text, so both must be
    # in the same entity order.
    cluster_indices = np.where(clustering.labels_ == i)[0]
    cluster_entities = [entities_text[idx] for idx in cluster_indices]
    clusters[f"Cluster_{i+1}"] = cluster_entities

In [54]:
clusters_ = {int(i): entities for i, entities in enumerate(clusters.values())}

In [21]:
len(entities)

14541

In [18]:
clustering.children_

array([[ 1422, 12516],
       [ 1275,  4592],
       [12324, 13070],
       ...,
       [29074, 29076],
       [ 5241, 29077],
       [29078, 29079]])

In [151]:
initial_hier = build_hierarchy(clustering.children_, len(entities), entity_labels=entities_text, clustering=clustering)

In [152]:
# Imported here because this cell needs defaultdict and the notebook's only
# other import of it sits in a later cell; without this line the cell raises
# NameError on a fresh kernel (Restart & Run All).
from collections import defaultdict

# Re-key the flat clusters by their 0-based position: {0: [labels...], 1: [...]}.
clusters_ = {int(i): entities for i, entities in enumerate(clusters.values())}

# Reverse index: entity text label -> 0-based cluster id.
# If the same label occurs in more than one cluster, the last cluster wins.
entity2clusterid = {}
for i, cluster in enumerate(clusters_.values()):
    for entity in cluster:
        entity2clusterid[entity] = i

# Per-cluster running counter consumed by label_ (how many leaves of each
# cluster have been seen so far).
clusterid2count = defaultdict(int)

In [153]:
entity2clusterid

{'comedy-drama': 0,
 'romance film': 0,
 'sketch comedy': 0,
 'television comedy': 0,
 'comedian': 0,
 'romantic comedy': 0,
 'black comedy': 0,
 'comedy film': 0,
 'Comedy': 0,
 'comedy of manners': 0,
 'stand-up comedy': 0,
 'sex comedy': 0,
 'screwball comedy film': 0,
 'situation comedy': 0,
 'Paddington': 1,
 'Hammersmith': 1,
 'Kensington': 1,
 'Wandsworth': 1,
 'Chiswick': 1,
 'Hampstead': 1,
 'Marylebone': 1,
 'Ealing': 1,
 'Kannada': 2,
 'Mangalore': 2,
 'Mysore': 2,
 'Karnataka': 2,
 'Metropolis Records': 3,
 'Tooth & Nail Records': 3,
 'Southern Lord Records': 3,
 'Revelation Records': 3,
 'Roadrunner Records': 3,
 'Century Media Records': 3,
 'Nuclear Blast': 3,
 'Equal Vision Records': 3,
 'The End Records': 3,
 'Relapse Records': 3,
 'Cleopatra Records': 3,
 'Chrysalis Records': 3,
 'Victory Records': 3,
 'Frontiers Records': 3,
 'Candlelight Records': 3,
 'Napalm Records': 3,
 'Metal Blade Records': 3,
 'Season of Mist': 3,
 'Earache Records': 3,
 'Spinefarm Records': 3,

In [28]:
# Persist the hierarchy built by build_hierarchy as pretty-printed JSON
# in the notebook's working directory.
with open('./initial_hier.json', 'w') as f:
    json.dump(initial_hier, f, indent=4)

In [113]:
# Rebuild the reverse index: entity text label -> 0-based cluster id.
# Duplicate labels across clusters are overwritten by the later cluster.
entity2clusterid = {}

for i, cluster in enumerate(clusters_.values()):
    for entity in cluster:
        entity2clusterid[entity] = i

In [154]:
from collections import defaultdict

clusterid2count = defaultdict(int)


def label_(d, leaf_keys=None, leaf_values=None):
    """Replace every leaf of the nested dict ``d`` with ``[cluster_id, rank]``, in place.

    A leaf is any value that is not a dict. Its cluster id is looked up in the
    module-level ``entity2clusterid``; ``rank`` is the number of leaves of that
    cluster already visited, tracked in the module-level ``clusterid2count``
    (which is therefore advanced by every call).

    Args:
        d: Nested dictionary whose non-dict values are keys of ``entity2clusterid``.
        leaf_keys: Accepted and forwarded through the recursion; not otherwise used.
        leaf_values: Accepted and forwarded through the recursion; not otherwise used.

    Returns:
        The same ``d`` object, after modification.

    Raises:
        KeyError: If a leaf value is missing from ``entity2clusterid``.
    """
    leaf_keys = [] if leaf_keys is None else leaf_keys
    leaf_values = [] if leaf_values is None else leaf_values

    for name in d:
        node = d[name]
        if isinstance(node, dict):
            # Internal node: descend, sharing the same accumulators.
            label_(node, leaf_keys, leaf_values)
            continue

        # Leaf: record the cluster and how many of its members came before.
        cid = entity2clusterid[node]
        seen_before = clusterid2count[cid]
        d[name] = [cid, seen_before]
        clusterid2count[cid] = seen_before + 1

    return d


In [155]:
hierarchy = label_(initial_hier)

In [132]:
# Persist the hierarchy after label_ has replaced its leaves with
# [cluster_id, rank] pairs.
with open('./initial_hier_numeric.json', 'w') as f:
    json.dump(hierarchy, f, indent=4)

In [135]:
def refine_1(d, clusters_, leaf_keys=None, leaf_values=None):
    """Collapse ``[cluster_id, rank]`` leaves into one member list per cluster, in place.

    For every non-dict value ``v`` found in ``d`` (recursively):
      * ``v[1] > 0``  -> the entry is removed;
      * otherwise      -> the entry is replaced by ``clusters_[v[0]]``.

    Args:
        d: Nested dictionary whose leaves are indexable pairs.
        clusters_: Mapping from cluster id to the replacement value.
        leaf_keys: Accepted and forwarded through the recursion; not otherwise used.
        leaf_values: Accepted and forwarded through the recursion; not otherwise used.

    Returns:
        The same ``d`` object, after modification.
    """
    leaf_keys = [] if leaf_keys is None else leaf_keys
    leaf_values = [] if leaf_values is None else leaf_values

    drop = object()   # sentinel: "remove this key"
    decisions = {}    # leaf key -> replacement value, or the drop sentinel

    # Phase 1: walk a snapshot of the items, recursing into sub-dicts and
    # deciding the fate of each leaf without touching `d` yet.
    for name, node in tuple(d.items()):
        if isinstance(node, dict):
            refine_1(node, clusters_, leaf_keys, leaf_values)
        elif node[1] > 0:
            decisions[name] = drop
        else:
            decisions[name] = clusters_[node[0]]

    # Phase 2: apply the decisions to this level.
    for name, outcome in decisions.items():
        if outcome is drop:
            del d[name]
        else:
            d[name] = outcome

    return d


In [143]:
# Collapse [cluster_id, rank] leaves into per-cluster member lists and save.
# NOTE(review): refine_1 modifies `hierarchy` in place and expects pair-shaped
# leaves, so this cell is presumably not safe to re-run on its own output.
hierarchy = refine_1(hierarchy, clusters_)
with open('./refined_hier.json', 'w') as f:
    json.dump(hierarchy, f, indent=4)

In [145]:
def refine_2(d):
    """Return a simplified copy of the nested dict ``d``.

    Two simplifications are applied bottom-up to every sub-dictionary:
      * a sub-dict that is empty, or becomes empty once its own children have
        been simplified, is dropped;
      * a sub-dict that ends up with exactly one entry whose value is a list
        is replaced by that list (chains of single-child nodes collapse).

    The top-level dictionary itself is never collapsed into a list. Non-dict
    values are carried over unchanged (the same objects, not copies).

    Unlike a shallow ``d.copy()``, the result is built from fresh dictionaries
    at every level, so neither ``d`` nor any nested dict inside it is modified.

    Args:
        d: Nested dictionary (dict values are sub-trees, anything else is a leaf).

    Returns:
        A new dictionary with the simplifications applied.
    """
    def process_dict(sub_dict):
        refined = {}
        for key, value in sub_dict.items():
            if not isinstance(value, dict):
                refined[key] = value
                continue

            result = process_dict(value)
            if not result:
                # Empty to begin with, or emptied by pruning below: drop it so
                # that no second clean-up pass is needed.
                continue

            if len(result) == 1:
                (only_child,) = result.values()
                if isinstance(only_child, list):
                    # Single list child: lift the list into the parent slot.
                    refined[key] = only_child
                    continue

            refined[key] = result
        return refined

    return process_dict(d)

In [146]:
# Collapse single-list sub-dicts and drop empty ones, then overwrite the
# refined hierarchy file with the result.
hierarchy = refine_2(hierarchy)
with open('./refined_hier.json', 'w') as f:
    json.dump(hierarchy, f, indent=4)

In [148]:
def refine_3(d):
    """Return a copy of the nested dict ``d`` with all empty sub-dicts removed.

    Pruning is bottom-up: a sub-dict whose children are all pruned away is
    itself removed. Non-dict values are kept as they are (same objects), and
    the input is not modified.

    Args:
        d: Nested dictionary.

    Returns:
        A new dictionary containing only non-dict values and non-empty sub-dicts.
    """
    def prune(node):
        kept = {}
        for name in list(node):
            child = node[name]
            if not isinstance(child, dict):
                kept[name] = child
                continue
            pruned_child = prune(child)
            if pruned_child:
                kept[name] = pruned_child
        return kept

    return prune(d)

In [149]:
# Remove any remaining empty sub-dicts and overwrite the refined hierarchy file.
hierarchy = refine_3(hierarchy)
with open('./refined_hier.json', 'w') as f:
    json.dump(hierarchy, f, indent=4)

In [1]:
from utils import refine_4
import json

# Load the seed clusters produced earlier in the pipeline.
with open('/data/pj20/lamake_data/FB15K-237/seed_clusters.json', 'r') as f:
    seed_clusters = json.load(f)
    
# refine_4 lives in utils; its behaviour is not visible from this notebook.
hierarchy = refine_4(seed_clusters)

# Overwrites the same ./refined_hier.json written by the earlier refine steps.
with open('./refined_hier.json', 'w') as f:
    json.dump(hierarchy, f, indent=4)

In [5]:
from openai import OpenAI

# Read the API key from a local file (so it never appears in the notebook)
# and build the module-level client used by gpt_chat_return_response.
with open('./openai_api.key', 'r') as f:
    api_key = f.read().strip()
client = OpenAI(api_key=api_key)

def gpt_chat_return_response(model, prompt, seed=44):
    """Send ``prompt`` as a single user message and return the raw response.

    The request goes through the module-level ``client`` with a 200-token
    completion cap, temperature 0, the given ``seed`` and log-probabilities
    enabled.

    Args:
        model: Model name forwarded to the chat-completions endpoint.
        prompt: Text sent as the content of the only (user) message.
        seed: Sampling seed forwarded with the request.

    Returns:
        Whatever ``client.chat.completions.create`` returns, unmodified.
    """
    request = dict(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=200,
        temperature=0,
        seed=seed,
        logprobs=True,
    )
    return client.chat.completions.create(**request)

In [36]:
def find_lca(root, node1, node2):
    """Find the lowest sub-tree of ``root`` that covers ``node1`` and ``node2``.

    The tree is a nested dict whose leaves are lists of node values.

    Returns:
        * the leaf list itself, if it contains either node;
        * ``root`` itself, if it compares equal to either node;
        * the dict in which two different children each lead to a match;
        * the single match passed up unchanged, if only one child leads to one;
        * ``None`` if nothing matches (or ``root`` is ``None``).
    """
    if root is None:
        return None

    targets = (node1, node2)

    # A leaf list that holds either target is its own answer.
    if isinstance(root, list) and any(target in root for target in targets):
        return root

    if any(root == target for target in targets):
        return root

    if not isinstance(root, dict):
        return None

    first_hit = None
    for subtree in root.values():
        hit = find_lca(subtree, node1, node2)
        if hit is None:
            continue
        if first_hit is not None:
            # A second child matched: this dict is where the paths split.
            return root
        first_hit = hit

    return first_hit

def find_distance_from_root_to_node(root, node, distance=0):
    """Return the depth at which ``node`` is first found below ``root``.

    A match is either a leaf list containing ``node`` or a value equal to
    ``node``; the depth reported is that of the matching list/value, counted
    in dict levels from ``root`` and offset by ``distance``. Children are
    searched depth-first in dict order.

    Returns:
        The depth of the first match, or ``-1`` if there is none.
    """
    if root is None:
        return -1

    found_here = (isinstance(root, list) and node in root) or root == node
    if found_here:
        return distance

    if isinstance(root, dict):
        for subtree in root.values():
            below = find_distance_from_root_to_node(subtree, node, distance + 1)
            if below != -1:
                return below

    return -1

def distance_between_nodes(root, node1, node2):
    """Return the tree distance between ``node1`` and ``node2`` under ``root``.

    The distance is the sum of each node's depth below their lowest common
    ancestor (as located by ``find_lca``). Two nodes in the same leaf list
    therefore have distance 0.

    Returns:
        The summed depth, or ``-1`` if no common ancestor exists or either
        node cannot be reached from it.
    """
    lca = find_lca(root, node1, node2)
    if lca is None:
        return -1

    legs = [find_distance_from_root_to_node(lca, target, 0) for target in (node1, node2)]
    if -1 in legs:
        return -1
    return sum(legs)

# Example tree structure: nested dicts keyed by cluster name, with lists of
# entity labels as leaves.
tree = {
    "Cluster_29078": {
        "Cluster_29074": {
            "Cluster_29030": {
                "Cluster_27322": {
                    "Cluster_1143": ["American Kennel Club"],
                    "Cluster_25030": {
                        "Cluster_22602": ["dog", "bulldog"],
                        "Cluster_23900": {
                            "Cluster_20210": ["Golden Retriever", "Labrador Retriever", "German Shepherd dog"],
                            "Cluster_20718": ["Chihuahua", "Yorkshire Terrier"]
                        }
                    }
                }
            }
        }
    }
}

# Recalculate the distance with the revised functions
# (labels that share a leaf list are at distance 0 from each other).
distance = distance_between_nodes(tree, "Golden Retriever", "American Kennel Club")
print(f"Distance between Golden Retriever and American Kennel Club: {distance}")

distance = distance_between_nodes(tree, "Golden Retriever", "bulldog")
print(f"Distance between Golden Retriever and bulldog: {distance}")

distance = distance_between_nodes(tree, "Golden Retriever", "Labrador Retriever")
print(f"Distance between Golden Retriever and Labrador Retriever: {distance}")

distance = distance_between_nodes(tree, "Golden Retriever", "Chihuahua")
print(f"Distance between Golden Retriever and Chihuahua: {distance}")

Distance between Golden Retriever and American Kennel Club: 4
Distance between Golden Retriever and bulldog: 3
Distance between Golden Retriever and Labrador Retriever: 0
Distance between Golden Retriever and Chihuahua: 2


In [17]:
def compute_tree_depth(root):
    """Return the number of levels in the nested dict/list tree ``root``.

    Lists and strings are leaves and count as one level; a dict counts as one
    level plus the depth of its deepest child (so an empty dict has depth 1).
    ``None`` and any other type contribute depth 0.
    """
    if root is None:
        return 0

    if isinstance(root, (list, str)):
        return 1

    if not isinstance(root, dict):
        return 0

    # One level for this dict, plus the deepest branch beneath it.
    return 1 + max(map(compute_tree_depth, root.values()), default=0)

In [66]:
with open('/data/pj20/lamake_data/FB15K-237/seed_hierarchy.json', 'r') as f:
    tree = json.load(f)

compute_tree_depth(tree)

47

In [5]:
len(entity_embeddings), len(entity_info)

(14541, 14541)

In [None]:
import json
import numpy as np

# Load entity embeddings
with open('/data/pj20/lamake_data/FB15K-237/entity_embeddings.json', 'r') as file:
    entity_embeddings = json.load(file)
    
# load entity info
with open('/data/pj20/lamake_data/FB15K-237/entity_info.json', 'r') as file:
    entity_info = json.load(file)
    
# Reverse lookup: text label -> entity id.
# Labels are not guaranteed to be unique; when two entities share a label,
# the entity that comes later in entity_info overwrites the earlier one.
label2entity = {entity_info[entity]['text_label']: entity for entity in entity_info.keys()}


In [19]:
def compute_cluster_embedding(cluster, embeddings_dict, cluster_id):
    """Recursively compute and record the embedding of ``cluster``.

    * A list is a leaf cluster of entity labels: its embedding is the
      element-wise mean of its members' embeddings. Labels are resolved via
      the module-level ``label2entity`` and ``entity_embeddings``.
    * A dict is an internal cluster: its embedding is the element-wise mean of
      its direct sub-clusters' embeddings (each sub-cluster counts once,
      regardless of how many entities it holds).

    Every computed embedding is stored in ``embeddings_dict`` under its
    cluster id, and a progress message is printed for each cluster.

    Args:
        cluster: List of entity labels, or dict of sub-cluster id -> sub-cluster.
        embeddings_dict: Output mapping, updated in place.
        cluster_id: Key under which this cluster's embedding is stored.

    Returns:
        The embedding computed for ``cluster``.

    Raises:
        KeyError: If a label has no entry in ``label2entity`` or no embedding.
        TypeError: If ``cluster`` is neither a list nor a dict.
    """
    if isinstance(cluster, list):
        # Base case: cluster is a list of entities
        embeddings = []
        for entity in cluster:
            vector = entity_embeddings.get(label2entity.get(entity))
            if vector is None:
                # Fail with the offending label instead of letting numpy choke
                # on a None inside the list.
                raise KeyError(f"No embedding found for entity label {entity!r} in {cluster_id}")
            embeddings.append(vector)
        cluster_embedding = np.mean(embeddings, axis=0)
        print(f"Computed embedding for {cluster_id} with entities: {cluster}")
        print(f"Cluster embedding: {cluster_embedding}")
    elif isinstance(cluster, dict):
        # Recursive case: cluster has sub-clusters
        sub_embeddings = [
            compute_cluster_embedding(sub_cluster, embeddings_dict, sub_cluster_id)
            for sub_cluster_id, sub_cluster in cluster.items()
        ]
        cluster_embedding = np.mean(sub_embeddings, axis=0)
        print(f"Computed embedding for {cluster_id} with sub-clusters: {list(cluster.keys())}")
        print(f"Parent cluster embeddings: {cluster_embedding}")
    else:
        # Without this branch the assignment below would hit an unbound local.
        raise TypeError(
            f"Cluster {cluster_id!r} must be a list or a dict, got {type(cluster).__name__}"
        )

    embeddings_dict[cluster_id] = cluster_embedding
    return cluster_embedding


# Initialize dictionary to store embeddings (cluster id -> embedding)
cluster_embeddings = {}
# Trigger the recursive computation for every top-level cluster; nested
# clusters are filled in by the recursion.
for cluster_id, cluster_data in tree.items():
    compute_cluster_embedding(cluster_data, cluster_embeddings, cluster_id)



Computed embedding for Cluster_1143 with entities: ['American Kennel Club']
Cluster embedding: [-0.00423184  0.04183974 -0.02882874 ... -0.02072082  0.02445128
  0.00687305]
Computed embedding for Cluster_22602 with entities: ['dog', 'bulldog']
Cluster embedding: [-0.053087   -0.0045369  -0.00478628 ...  0.01311163 -0.00561521
 -0.00221055]
Computed embedding for Cluster_20210 with entities: ['Golden Retriever', 'Labrador Retriever', 'German Shepherd dog']
Cluster embedding: [-0.0291969   0.02037954 -0.01243354 ... -0.01191179 -0.01795114
  0.01735777]
Computed embedding for Cluster_20718 with entities: ['Chihuahua', 'Yorkshire Terrier']
Cluster embedding: [-0.04773875  0.0299663  -0.0071249  ...  0.03034772 -0.0281039
 -0.00916671]
Computed embedding for Cluster_23900 with sub-clusters: ['Cluster_20210', 'Cluster_20718']
Parent cluster embeddings: [-0.03846782  0.02517292 -0.00977922 ...  0.00921796 -0.02302752
  0.00409553]
Computed embedding for Cluster_25030 with sub-clusters: ['Cl

In [31]:
from utils import *

entity2clusterid = {}
leaf_keys, leaf_values = find_leaves(tree)

In [32]:
# Map each entity id to the key of the leaf cluster that lists its label.
# leaf_keys[i] and leaf_values[i] are used as parallel entries; a label that
# appears in several leaves (or shares an id via label2entity) keeps the last one.
for i in range(len(leaf_values)):
    for entity_label in leaf_values[i]:
        entity2clusterid[label2entity[entity_label]] = leaf_keys[i]

In [33]:
entity2clusterid

{'/m/0ljl8': 'Cluster_1143',
 '/m/0bt9lr': 'Cluster_22602',
 '/m/01k3tq': 'Cluster_22602',
 '/m/01t032': 'Cluster_20210',
 '/m/0km3f': 'Cluster_20210',
 '/m/0km5c': 'Cluster_20210',
 '/m/0183z2': 'Cluster_20718',
 '/m/01_gx_': 'Cluster_20718',
 '/m/0g4pl7z': 'Cluster_5906',
 '/m/0hsph': 'Cluster_6601',
 '/m/03k3r': 'Cluster_5562',
 '/m/0g68zt': 'Cluster_8804',
 '/m/065z3_x': 'Cluster_4708',
 '/m/0298n7': 'Cluster_14069',
 '/m/02y74': 'Cluster_23662',
 '/m/018w8': 'Cluster_23662',
 '/m/0crlz': 'Cluster_23662',
 '/m/01gqfm': 'Cluster_23662',
 '/m/03krj': 'Cluster_23662',
 '/m/03tmr': 'Cluster_23662',
 '/m/039yzs': 'Cluster_23662',
 '/m/07_53': 'Cluster_23662',
 '/m/04lgq': 'Cluster_23662',
 '/m/07bs0': 'Cluster_18777',
 '/m/07jbh': 'Cluster_18777',
 '/m/0194d': 'Cluster_18777',
 '/m/09xp_': 'Cluster_22340',
 '/m/037hz': 'Cluster_22340',
 '/m/018jz': 'Cluster_22340',
 '/m/06f3l': 'Cluster_15934',
 '/m/06br8': 'Cluster_15934',
 '/m/0z74': 'Cluster_21961',
 '/m/047vnfs': 'Cluster_21961',
 '

In [38]:
with open('/data/pj20/lamake_data/FB15K-237/entity_info.json', 'r') as f:
    entity_info = json.load(f)

In [37]:
entity_info['/m/08k05y']

{'text_label': 'SV Ried',
 'original_description': 'association football club in Austria',
 'llm_description': 'SV Ried is a professional football club based in Ried im Innkreis, Austria.',
 'a': 1}

In [39]:
map_child_to_parent(tree)

{'Cluster_29074': 'Cluster_29078',
 'Cluster_29030': 'Cluster_29074',
 'Cluster_27322': 'Cluster_29030',
 'Cluster_1143': 'Cluster_27322',
 'Cluster_25030': 'Cluster_27322',
 'Cluster_22602': 'Cluster_25030',
 'Cluster_23900': 'Cluster_25030',
 'Cluster_20210': 'Cluster_23900',
 'Cluster_20718': 'Cluster_23900',
 'Cluster_28881': 'Cluster_29030',
 'Cluster_5906': 'Cluster_28881',
 'Cluster_28716': 'Cluster_28881',
 'Cluster_6601': 'Cluster_28716',
 'Cluster_27949': 'Cluster_28716',
 'Cluster_23927': 'Cluster_27949',
 'Cluster_5562': 'Cluster_23927',
 'Cluster_8804': 'Cluster_23927',
 'Cluster_25064': 'Cluster_27949',
 'Cluster_4708': 'Cluster_25064',
 'Cluster_14069': 'Cluster_25064',
 'Cluster_29073': 'Cluster_29074',
 'Cluster_29046': 'Cluster_29073',
 'Cluster_28944': 'Cluster_29046',
 'Cluster_28546': 'Cluster_28944',
 'Cluster_27396': 'Cluster_28546',
 'Cluster_24689': 'Cluster_27396',
 'Cluster_23662': 'Cluster_24689',
 'Cluster_24264': 'Cluster_24689',
 'Cluster_18777': 'Cluster

In [64]:
def find_lca_key(root, key1, key2):
    """Find the lowest container under ``root`` that covers ``key1`` and ``key2``.

    A container "holds" a key when the key is one of its dict keys (or, for a
    leaf list/tuple/set, one of its elements). The search stops at the first
    container that holds either key and does not look beneath it.

    Returns:
        * the container that directly holds either key;
        * otherwise the dict in which two different children each lead to one;
        * otherwise the single match passed up from the only matching child;
        * ``None`` if nothing matches or ``root`` is ``None``.

    Scalar leaves (numbers, strings, ...) never match: membership is only
    tested on real containers, so a string leaf is not substring-matched and
    a numeric leaf does not raise ``TypeError``.
    """
    if root is None:
        return None

    # If the current root directly contains one of the keys, it is the answer
    # for this branch.
    if isinstance(root, (dict, list, tuple, set, frozenset)) and (key1 in root or key2 in root):
        return root

    if not isinstance(root, dict):
        return None

    first_hit = None
    for child in root.values():
        hit = find_lca_key(child, key1, key2)
        if hit is None:
            continue
        if first_hit is not None:
            return root  # Both keys found in different subtrees
        first_hit = hit

    return first_hit

def find_distance_from_root_to_key(root, key, distance=0):
    """Return the depth of the first container under ``root`` that holds ``key``.

    A container holds ``key`` when it is one of its dict keys (or an element
    of a leaf list/tuple/set). The depth is counted in dict levels from
    ``root`` and offset by ``distance``; a key held directly by ``root``
    yields ``distance`` itself. Children are searched depth-first in dict order.

    Scalar leaves (numbers, strings, ...) are skipped rather than tested for
    membership, so they neither raise ``TypeError`` nor substring-match.

    Returns:
        The depth of the first matching container, or ``-1`` if there is none.
    """
    if root is None:
        return -1

    # Check if the key is held directly by the current container
    if isinstance(root, (dict, list, tuple, set, frozenset)) and key in root:
        return distance

    if isinstance(root, dict):
        for child in root.values():
            dist = find_distance_from_root_to_key(child, key, distance + 1)
            if dist != -1:
                return dist

    return -1

def distance_between_keys(root, key1, key2):
    """Return the tree distance between two keys of the nested dict ``root``.

    The distance is the sum of each key's depth (as reported by
    ``find_distance_from_root_to_key``) below the container returned by
    ``find_lca_key``.

    Returns:
        The summed depth, or ``-1`` if no common container exists or either
        key cannot be reached from it.
    """
    lca = find_lca_key(root, key1, key2)
    if lca is None:
        return -1

    legs = [find_distance_from_root_to_key(lca, target, 0) for target in (key1, key2)]
    if -1 in legs:
        return -1
    return sum(legs)


# Small hand-written hierarchy used to sanity-check the key-based distance.
tree = {
    "Cluster_29078": {
        "Cluster_29074": {
            "Cluster_29030": {
                "Cluster_27322": {
                    "Cluster_1143": ["American Kennel Club"],
                    "Cluster_25030": {
                        "Cluster_22602": ["dog", "bulldog"],
                        "Cluster_23900": {
                            "Cluster_20210": ["Golden Retriever", "Labrador Retriever", "German Shepherd dog"],
                            "Cluster_20718": ["Chihuahua", "Yorkshire Terrier"]
                        }
                    }
                }
            }
        }
    }
}

# Example usage with keys from the tree:
key1 = "Cluster_29078"
key2 = "Cluster_1143"

# Calculate the distance between two keys
distance = distance_between_keys(tree, key1, key2)
print(f"Distance between {key1} and {key2}: {distance}")


Distance between Cluster_29078 and Cluster_1143: 4


In [69]:
all_keys = get_all_keys(tree)

In [78]:
def find_distance(parent_map, key, root):
    """Count the parent links followed to get from ``key`` up to ``root``.

    Args:
        parent_map: Mapping of child key -> parent key.
        key: Starting key.
        root: Ancestor at which to stop.

    Returns:
        Number of hops (0 when ``key == root``).

    Raises:
        KeyError: If the climb runs out of parents before reaching ``root``.
    """
    hops = 0
    node = key
    while node != root:
        node = parent_map[node]
        hops += 1
    return hops


def find_nearest_keys_lca_based(tree, input_key, n, parent_map):
    """Return up to ``n`` keys of ``parent_map`` closest to ``input_key``.

    Closeness is measured with ``distance_between_keys(parent_map, ...)``;
    keys for which it reports ``-1`` are ignored. Only keys that appear as
    children in ``parent_map`` are candidates, so the root is never returned.

    Candidates are visited in ``parent_map`` insertion order and sorted with a
    stable sort, so keys at equal distance always come back in the same order
    (iterating a ``set`` of strings would make ties vary between runs).

    Args:
        tree: Unused; kept for signature compatibility.
        input_key: Key whose neighbours are wanted.
        n: Maximum number of keys to return.
        parent_map: Mapping of child key -> parent key.

    Returns:
        List of at most ``n`` keys, nearest first.
    """
    candidates = []

    for key in parent_map:
        if key == input_key:
            continue
        dist = distance_between_keys(parent_map, input_key, key)
        if dist != -1:  # Only consider valid distances
            candidates.append((key, dist))

    # Stable sort by distance; ties keep parent_map insertion order.
    candidates.sort(key=lambda pair: pair[1])
    return [key for key, _ in candidates[:n]]

def find_lca(parent_map, key1, key2):
    """Return the lowest common ancestor of two keys in a child->parent map.

    A key counts as an ancestor of itself, so ``find_lca(m, child, parent)``
    returns ``parent``. The root (which has no entry of its own in
    ``parent_map``) is included in the ancestor chain, so keys that only meet
    at the root resolve to the root rather than to ``None``.

    Args:
        parent_map: Mapping of child key -> parent key.
        key1: First key.
        key2: Second key.

    Returns:
        The common ancestor key, or ``None`` if the two chains never meet.
        Note that the map cannot tell an unknown key from a root, so two
        identical unknown keys resolve to that key.
    """
    # Climb from key1 to the top, collecting key1 and all of its ancestors.
    # The last node visited is the root, which is not itself a key of the map.
    ancestors = set()
    node = key1
    while node is not None and node not in ancestors:
        ancestors.add(node)
        node = parent_map.get(node)

    # Climb from key2 until we meet that chain.
    visited = set()
    node = key2
    while node is not None and node not in visited:
        if node in ancestors:
            return node
        visited.add(node)  # guards against a malformed, cyclic map
        node = parent_map.get(node)

    return None

def distance_between_keys(parent_map, key1, key2):
    """Return the number of parent links separating two keys.

    Both keys must have a (truthy) grandparent in ``parent_map``; otherwise
    the pair is rejected. For accepted pairs the distance is the sum of each
    key's hop count up to their lowest common ancestor.

    Args:
        parent_map: Mapping of child key -> parent key.
        key1: First key.
        key2: Second key.

    Returns:
        The hop count, or ``-1`` if either grandparent is missing or no
        common ancestor is found.
    """
    def two_levels_up(key):
        # Parent's parent; a missing/falsy parent is passed through as-is.
        parent = parent_map.get(key)
        return parent_map.get(parent) if parent else parent

    grandparents = [two_levels_up(key) for key in (key1, key2)]
    if not all(grandparents):
        return -1  # Return -1 if no valid LCA is found

    lca = find_lca(parent_map, key1, key2)
    if not lca:
        return -1

    hops_from_first = find_distance(parent_map, key1, lca)
    hops_from_second = find_distance(parent_map, key2, lca)
    return hops_from_first + hops_from_second

# Building the parent map
parent_map = map_child_to_parent(tree)

# Example usage: the requested count is kept in one variable so the printed
# message always matches the number of neighbours actually asked for.
input_key = "Cluster_23662"
num_nearest = 5
nearest_keys = find_nearest_keys_lca_based(tree, input_key, num_nearest, parent_map)
print(f"The nearest {num_nearest} keys to {input_key} are: {nearest_keys}")



The nearest 3 keys to Cluster_23662 are: ['Cluster_24689', 'Cluster_27396', 'Cluster_24264', 'Cluster_25080', 'Cluster_18777']


In [55]:
from utils import *
def node2parentpath(d, source_cluster):
    """List the ancestors of ``source_cluster`` with their distance to it.

    Ancestors are collected from the direct parent upwards. The climb stops
    at the first ancestor that has no parent of its own in the child->parent
    map, and that topmost node is not included.

    Args:
        d: Hierarchy passed to ``map_child_to_parent`` and ``distance_between_keys``.
        source_cluster: Key whose ancestor path is wanted.

    Returns:
        ``(parent_path, parent_distances)``: parallel lists, nearest ancestor first.

    Raises:
        KeyError: If ``source_cluster`` has no parent in the map.
    """
    child_parent = map_child_to_parent(d)
    parent_path, parent_distances = [], []

    ancestor = child_parent[source_cluster]
    while ancestor in child_parent:
        parent_path.append(ancestor)
        parent_distances.append(distance_between_keys(d, ancestor, source_cluster))
        ancestor = child_parent[ancestor]

    return parent_path, parent_distances


In [56]:
node2parentpath(tree, 'Cluster_1143')

(['Cluster_27322', 'Cluster_29030', 'Cluster_29074'], [1, 2, 3])

In [59]:
len(tree)

2

In [62]:
with open('/data/pj20/lamake_data/FB15K-237/seed_hierarchy.json', 'r') as f:
    tree = json.load(f)

# Give the hierarchy a single root by wrapping it under "Cluster_top".
# The file is rewritten in place, so the wrap is skipped when it has already
# been applied; otherwise every re-run of this cell would nest one more
# "Cluster_top" level into the file.
if list(tree.keys()) != ["Cluster_top"]:
    tree = {
        "Cluster_top": tree
    }

    with open('/data/pj20/lamake_data/FB15K-237/seed_hierarchy.json', 'w') as f:
        json.dump(tree, f, indent=4)

In [63]:
def rename_clusters_to_ints(original_dict, start_index=0, key_map=None):
    """Rename every key of a nested dictionary to a consecutive integer.

    Keys are numbered in depth-first, pre-order sequence starting at
    ``start_index``: a key receives its number before any key nested beneath
    it. Non-dict values are carried over unchanged.

    Args:
        original_dict: Nested dictionary to rename (not modified).
        start_index: Integer assigned to the first key.
        key_map: Optional existing original-key -> integer mapping to extend;
            it is updated in place. If the same original key occurs more than
            once, the last occurrence wins in ``key_map``.

    Returns:
        A 4-tuple ``(new_dict, next_index, key_map, key_map_inv)`` where
        ``next_index`` is the first unused integer and ``key_map_inv`` maps
        integers back to original keys.
    """
    if key_map is None:
        key_map = {}

    def _rename(node, index):
        # Returns (renamed copy of node, first unused index).
        renamed = {}
        for key, value in node.items():
            own_index = index
            key_map[key] = own_index
            if isinstance(value, dict):
                # Children are numbered right after their parent.
                renamed[own_index], index = _rename(value, own_index + 1)
            else:
                renamed[own_index] = value
                index = own_index + 1
        return renamed, index

    new_dict, index = _rename(original_dict, start_index)

    # Built once, after the whole tree has been numbered.
    key_map_inv = {v: k for k, v in key_map.items()}
    return new_dict, index, key_map, key_map_inv

# Reload the seed hierarchy and replace its cluster-name keys with integers.
with open('/data/pj20/lamake_data/FB15K-237/seed_hierarchy.json', 'r') as f:
    tree = json.load(f)
    
# key_map: original key -> int; key_map_inv: int -> original key.
# The second return value (next free index) is not needed here.
tree, _, key_map, key_map_inv = rename_clusters_to_ints(tree)


{0: {1: {2: {}, 3: {}}, 4: {}}, 5: {}}


In [4]:
import itertools

def rename_unique_keys(d, prefix="Cluster"):
    """Return a copy of the nested dict ``d`` with every key renamed uniquely.

    New keys have the form ``"<prefix>_<n>"`` where ``n`` counts up from 1 in
    depth-first, pre-order sequence (a key is numbered before the keys nested
    under it). Non-dict values are carried over unchanged.

    Args:
        d: Nested dictionary to rename (not modified).
        prefix: Text placed before the running number.

    Returns:
        A new nested dictionary with the renamed keys.
    """
    issued = 0  # how many names have been handed out across the whole tree

    def relabel(node):
        """Rename the keys of one level, descending into sub-dictionaries."""
        nonlocal issued
        relabelled = {}
        for value in node.values():
            issued += 1
            label = f"{prefix}_{issued}"
            # Sub-dictionaries are renamed too; anything else is kept as-is.
            relabelled[label] = relabel(value) if isinstance(value, dict) else value
        return relabelled

    return relabel(d)

# Original nested dictionary (sample input: cluster-name keys, with lists of
# entity labels as leaves)
nested_dict = {
    "Cluster_llm_root": {
        "Cluster_5241": [
            "Vidyasagar"
        ],
        "Cluster_29077": {
            "Cluster_29033": {
                "Cluster_25423": {
                    "Cluster_6928": [
                        "old age"
                    ],
                    "Cluster_21691": [
                        "adolescence",
                        "young adult",
                        "coming of age"
                    ]
                },
                "Cluster_28201": {
                    "Cluster_16051": [
                        "20th century",
                        "19th century"
                    ],
                    "Cluster_26871": {
                        "Cluster_20840": [
                            "modern architecture",
                            "modernism"
                        ],
                        "Cluster_24018": {
                            "Cluster_1631": [
                                "Surrealism"
                            ],
                            "Cluster_2924": [
                                "New Romanticism"
                            ]
                        }
                    }
                }
            }
        }
    }
}

# Renaming all keys in the dictionary with unique names
# (numbered from 1 in depth-first order; the leaf lists are left unchanged).
renamed_dict = rename_unique_keys(nested_dict, "Cluster")
print(renamed_dict)


{'Cluster_1': {'Cluster_2': ['Vidyasagar'], 'Cluster_3': {'Cluster_4': {'Cluster_5': {'Cluster_6': ['old age'], 'Cluster_7': ['adolescence', 'young adult', 'coming of age']}, 'Cluster_8': {'Cluster_9': ['20th century', '19th century'], 'Cluster_10': {'Cluster_11': ['modern architecture', 'modernism'], 'Cluster_12': {'Cluster_13': ['Surrealism'], 'Cluster_14': ['New Romanticism']}}}}}}}


In [4]:
from trialmind.llm import call_llm # for chat models
from trialmind.llm import function_call_llm # for function  call

# Smoke test of the trialmind chat helper: fills {item} in the template from
# `inputs` and sends the prompt to the named model.
# NOTE(review): unrelated to the clustering work above -- presumably a scratch cell.
outputs = call_llm(
        prompt_template="tell me a joke about {item}",
        inputs = {"item": "dog"},
        llm="gpt-4"
    )

In [5]:
outputs

"Why did the dog sit in the shade?\n\nBecause he didn't want to be a hot dog!"

In [2]:
import json

# Switch to the WN18RR dataset: from here on entity_info / entity_embeddings
# refer to WN18RR, replacing the FB15K-237 objects loaded earlier.
entity_info_file = "/shared/pj20/lamake_data/WN18RR/entity_info.json"
entity_embedding_file = "/shared/pj20/lamake_data/WN18RR/entity_init_embeddings.json"

with open(entity_info_file, "r") as f:
    entity_info = json.load(f)
    
with open(entity_embedding_file, "r") as f:
    entity_embeddings = json.load(f)

In [5]:
entity_info['06611376'], entity_info['13555775']

({'text_label': 'shit',
  'original_description': 'obscene words for unacceptable behavior; "I put up with a lot of bullshit from that jerk"; "what he said was mostly bull"',
  'llm_description': 'shit is a vulgar term used to refer to feces or excrement. It is also commonly used as a slang term to express frustration, anger, or disappointment.'},
 {'text_label': 'shit',
  'original_description': 'a coarse term for defecation; "he took a shit"',
  'llm_description': 'shit is a coarse term for defecation.'})

In [34]:
list(entity_info.keys()), list(entity_embeddings.keys())

(['14854262',
  '00590383',
  '08769179',
  '02338145',
  '01990168',
  '11812094',
  '07491708',
  '02106662',
  '01667449',
  '08838556',
  '01973759',
  '08349548',
  '03848348',
  '10770059',
  '13547513',
  '01036804',
  '07123710',
  '09799607',
  '07528212',
  '03646916',
  '10261041',
  '12906334',
  '13625237',
  '01219893',
  '12046251',
  '09166902',
  '01119950',
  '01479937',
  '01218766',
  '01487743',
  '02426634',
  '08207672',
  '02076535',
  '06743230',
  '05376844',
  '11377851',
  '03731695',
  '04197235',
  '04792357',
  '08168117',
  '04066476',
  '10109662',
  '10738515',
  '11659500',
  '02772868',
  '01545314',
  '12523475',
  '11883137',
  '05779712',
  '08988609',
  '02125223',
  '01534034',
  '09166534',
  '00284409',
  '05064722',
  '01364587',
  '02169497',
  '15203565',
  '10368113',
  '03912664',
  '05998356',
  '05303232',
  '08564307',
  '01455754',
  '06439712',
  '14170772',
  '01591490',
  '01986538',
  '00558963',
  '00853487',
  '09194357',
  '011

In [35]:
# Write entities.dict for WN18RR: one "<index>\t<entity id>" line per entity,
# indexed by the entity's position in entity_info (0-based).
entities_dict = ""
entity_ids = list(entity_info.keys())
for i in range(len(entity_ids)):
    entities_dict += f"{i}\t{entity_ids[i]}\n"
    
# Overwrites any existing file at this (notebook-relative) path.
with open("../../data/WN18RR/entities.dict", "w") as f:
    f.write(entities_dict)

In [32]:
#compute cosine similarity between two entities
from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity(entity1, entity2, entity_info, entity_embeddings):
    """Return the cosine similarity between two entities' embeddings.

    Args:
        entity1: Id of the first entity (key of ``entity_embeddings``).
        entity2: Id of the second entity (key of ``entity_embeddings``).
        entity_info: Unused; kept for signature compatibility.
        entity_embeddings: Mapping of entity id -> embedding vector.

    Returns:
        The single similarity value taken from the 1x1 result of
        ``cosine_similarity``.

    Raises:
        KeyError: If either entity id has no embedding.
    """
    vec_a, vec_b = entity_embeddings[entity1], entity_embeddings[entity2]
    # cosine_similarity works on batches, so wrap each vector as a 1-row
    # matrix and read the only cell of the result.
    similarity_matrix = cosine_similarity([vec_a], [vec_b])
    return similarity_matrix[0][0]

compute_similarity('14252320', '02722458', entity_info, entity_embeddings)

0.4319915038286003

In [1]:
import csv
from collections import defaultdict, deque

# Function to read the entity index to entity ID mapping
def read_entity_mapping(filename):
    """Load a tab-separated ``<index>\\t<entity id>`` file into a dict.

    Args:
        filename: Path of the mapping file; column 0 is an integer index and
            column 1 the entity id.

    Returns:
        Dict of entity id -> integer index. If an entity id is listed more
        than once, the last line wins.
    """
    mapping = {}
    with open(filename, 'r') as handle:
        for fields in csv.reader(handle, delimiter='\t'):
            position = int(fields[0])
            mapping[fields[1]] = position
    return mapping

# Function to read the knowledge graph triples
def read_knowledge_graph(filename):
    """Load tab-separated ``head\\trelation\\ttail`` triples as an adjacency map.

    Every triple adds an edge in both directions; the relation column is read
    but not stored. Repeated triples produce repeated neighbour entries.

    Args:
        filename: Path of the triples file.

    Returns:
        ``defaultdict(list)`` mapping each entity to the list of its neighbours.
    """
    adjacency = defaultdict(list)
    with open(filename, 'r') as handle:
        for fields in csv.reader(handle, delimiter='\t'):
            head, _relation, tail = fields[0], fields[1], fields[2]
            adjacency[head].append(tail)
            # Assuming the graph is undirected; if directed, remove this line
            adjacency[tail].append(head)
    return adjacency

# Function to perform BFS and find k-hop neighbors
def find_k_hop_neighbors(graph, entity_to_index, k):
    """For every entity in ``graph``, list the entities exactly ``k`` hops away.

    "Exactly k hops" means the shortest path has length ``k``; closer entities
    are not included. With ``k == 0`` each entity maps to itself, and a
    negative ``k`` yields empty lists.

    The search expands one level at a time and marks entities as visited when
    they are first discovered, so no entity is queued more than once.
    Neighbours are looked up with ``graph.get``, which works for plain dicts
    and never inserts keys into a ``defaultdict`` while it is being iterated
    (relevant for directed graphs, where an edge target may not be a key).

    Args:
        graph: Mapping of entity id -> iterable of neighbouring entity ids.
        entity_to_index: Mapping of entity id -> index; neighbours missing
            from it are dropped from the result.
        k: Number of hops.

    Returns:
        Dict of entity id -> list of neighbour indices, in discovery order.
    """
    k_hop_neighbors = {}

    for entity_id in list(graph):
        visited = {entity_id}
        # Dicts are used as insertion-ordered sets so the output order is
        # reproducible from run to run.
        frontier = {entity_id: None} if k >= 0 else {}

        for _ in range(k):
            next_frontier = {}
            for current in frontier:
                for neighbor in graph.get(current, ()):
                    if neighbor not in visited:
                        visited.add(neighbor)
                        next_frontier[neighbor] = None
            frontier = next_frontier
            if not frontier:
                break  # nothing further is reachable

        k_hop_neighbors[entity_id] = [
            entity_to_index[neighbor] for neighbor in frontier if neighbor in entity_to_index
        ]

    return k_hop_neighbors

# Main function
def main(entity_mapping_file, kg_file, k):
    """Compute k-hop neighbor index lists from an entity mapping file and a triple file.

    Args:
        entity_mapping_file: Path passed to ``read_entity_mapping``.
        kg_file: Path passed to ``read_knowledge_graph``.
        k: Hop distance passed to ``find_k_hop_neighbors``.

    Returns:
        Whatever ``find_k_hop_neighbors`` returns for the loaded graph and mapping.
    """
    # The mapping is read before the triples, same as the call order below implies.
    index_by_entity = read_entity_mapping(entity_mapping_file)
    adjacency = read_knowledge_graph(kg_file)
    return find_k_hop_neighbors(adjacency, index_by_entity, k)

# Example usage
# Inputs for WN18RR: the entity mapping file and the training triples.
# NOTE(review): these are absolute, machine-specific paths.
entity_mapping_file = '/home/pj20/server-03/lamake/data/WN18RR/entities.dict'
kg_file = '/home/pj20/server-03/lamake/data/WN18RR/train.txt'
k = 2  # Hop distance handed to main(); replace with the desired value

# Build the per-entity neighbor index lists; later cells read k_hop_neighbors.
k_hop_neighbors = main(entity_mapping_file, kg_file, k)

In [2]:
# Display the whole mapping.
# NOTE(review): this renders every entity's list; inspecting a small slice would keep the output readable.
k_hop_neighbors

{'00260881': [21412,
  13151,
  19504,
  39669,
  12823,
  13379,
  25618,
  40085,
  18937,
  13202,
  29375,
  1311,
  19031,
  36974,
  28977,
  14716,
  18752,
  13800,
  11489,
  16450,
  38072,
  7513,
  12814],
 '00260622': [18016,
  23119,
  8062,
  29493,
  35706,
  14310,
  15934,
  3562,
  32404,
  1320,
  39776,
  40242,
  40634,
  28964,
  16612,
  28243,
  9964],
 '01332730': [17944,
  3699,
  14025,
  14444,
  1418,
  38693,
  2104,
  20290,
  22803,
  4884,
  36378,
  26785,
  34359,
  14392,
  27917,
  26765,
  1092,
  35536,
  12827,
  22924,
  20522,
  40879,
  16849,
  22937,
  3174,
  25966,
  27939,
  34058,
  22850,
  15168,
  6851,
  23970,
  11640,
  39919,
  22679,
  3989,
  27865,
  27357,
  15892,
  3011,
  1205,
  6519,
  14019,
  22927,
  12241,
  32632,
  936,
  40765,
  22653,
  22678,
  2519,
  17733,
  12405,
  7743,
  23294,
  11707,
  18118,
  27130,
  17309,
  14480,
  22917,
  21957,
  14973,
  8954,
  2415,
  10833,
  23029,
  22065,
  34082,
  47

In [3]:
import json

# JSON file read into entity_info below.
# NOTE(review): absolute, machine-specific path.
entity_info_file = "/shared/pj20/lamake_data/WN18RR/entity_info_seed_hier.json"

# Parse the whole file; this rebinds the module-level name entity_info.
with open(entity_info_file, "r") as f:
    entity_info = json.load(f)

In [5]:
# Number of entries stored under 'k_hop_neighbors' for entity '00260881'.
len(entity_info['00260881']['k_hop_neighbors'])

3392

In [1]:
import numpy as np

def cosine_similarity(a, b):
    """Cosine similarity between two 1-D vectors.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero norm the
        angle is undefined; 0.0 is returned instead of dividing by zero, so a
        zero vector never yields NaN (which would sort unpredictably when the
        scores are ranked).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

In [2]:
from openai import OpenAI

# Read the API key from a local file instead of embedding it in the notebook.
with open('./openai_api.key', 'r') as f:
    api_key = f.read().strip()  # strip() removes surrounding whitespace such as a trailing newline
    

# Module-level client used by the embedding requests in later cells.
client = OpenAI(api_key=api_key)

In [3]:
# Turn the relation name into plain words by replacing underscores with spaces
# (the leading underscore becomes a leading space).
relation = "_member_of_domain_usage".replace("_", " ")
# Request a 1024-dimensional embedding of that text and keep the vector of the first result.
relation_emb = client.embeddings.create(
                input=relation,
                model="text-embedding-3-large",
                dimensions=1024,
            ).data[0].embedding

In [None]:
# Convert relation_emb to a NumPy array.
# NOTE(review): this cell has no execution count, so it may not have been run in the recorded session.
relation_emb = np.array(relation_emb)

In [4]:
# Saved entity embedding array for WN18RR.
# NOTE(review): absolute, machine-specific path.
ent_emb_file = "/shared/pj20/lamake_data/WN18RR/entity_init_embeddings.npy"

# Rows are indexed by integer position in later cells.
entity_embs = np.load(ent_emb_file)

In [5]:
# Keep only the first 1024 components of two entity rows, matching the
# 1024-dimensional relation embedding requested above.
e1_emb = entity_embs[34590][:1024]
e2_emb = entity_embs[31909][:1024]

# Similarity between (e1 + relation) and e2.
cosine_similarity(e1_emb+relation_emb, e2_emb)

In [30]:
import numpy as np

def cosine_similarity(a, b):
    """Cosine similarity between two 1-D vectors.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero norm the
        angle is undefined; 0.0 is returned instead of dividing by zero, so a
        zero vector never yields NaN (NaN scores would land at the front of a
        reversed ``argsort`` ranking).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

# Load entity embeddings
entity_embs = np.load("/shared/pj20/lamake_data/WN18RR/entity_init_embeddings.npy")
# Trained relation embeddings; only needed by the commented-out alternative below.
relation_embs= np.load("/shared/pj20/lamake_data/WN18RR/checkpoints/pRotatE_seed_batch_512_hidden_512_dist_cosine/relation_embedding.npy")

# Row indices of the head (e1) and tail (e2) entities. They are named once so
# the embedding lookup and the rank check below cannot drift apart.
e1_index = 18015
e2_index = 19725

# Entity embeddings
e1_emb = entity_embs[e1_index]
e2_emb = entity_embs[e2_index]

# Compute relation embedding: underscores become spaces, then a 1024-d text embedding is requested
relation = "_derivationally_related_form".replace("_", " ")
relation_emb = client.embeddings.create(
                input=relation,
                model="text-embedding-3-large",
                dimensions=1024,
            ).data[0].embedding

# Repeat the 1024-d vector twice so it can be added to an entity row.
# NOTE(review): presumably entity rows are 2048 wide -- confirm against the saved array.
relation_emb = np.concatenate([relation_emb, relation_emb])
# relation_emb = relation_embs[0]

# Compute target embedding
target_emb = e1_emb + relation_emb

# Compute cosine similarities with all entity embeddings (e1's own row is included)
similarities = np.array([cosine_similarity(target_emb, emb) for emb in entity_embs])

# Sort entities by similarity, highest first
sorted_indices = np.argsort(similarities)[::-1]

# Find rank of e2_emb
e2_rank = np.where(sorted_indices == e2_index)[0][0] + 1  # +1 for 1-based rank

print(f"Rank of e2_emb: {e2_rank}")

Rank of e2_emb: 122


0.20138492839785052