In [16]:
import json
import numpy as np
import requests
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from collections import defaultdict
import os

CACHE_FILE = "conceptnet_embeddings.json"

def load_cache():
    """ Load cached embeddings if available. """
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, "r") as f:
            return json.load(f)
    return {}

def save_cache(cache):
    with open(CACHE_FILE, "w") as f:
        json.dump(cache, f)

def get_conceptnet_embedding(word):
    """ Fetch word embedding from ConceptNet. """
    url = f"http://api.conceptnet.io/c/en/{word}"
    response = requests.get(url).json()
    print("Fetching embedding for word:", word)
    print(response)
    for edge in response.get("edges", []):
        if 'embedding' in edge['start']:
            return edge['start']['embedding']['vector']
        elif 'embedding' in edge['end']:
            return edge['end']['embedding']['vector']
    
    return None  # Return None if embedding is not found

def get_embeddings(words):
    """ Get embeddings for a list of words, using a cache to reduce API calls. """
    cache = load_cache()
    embeddings = {}

    for word in words:
        word_cleaned = word.lower().replace(" ", "_")

        if word_cleaned in cache:
            embeddings[word] = np.array(cache[word_cleaned])
        else:
            embedding = get_conceptnet_embedding(word_cleaned)
            if embedding is not None:
                embeddings[word] = np.array(embedding)
                cache[word_cleaned] = embedding  # Store in cache

    save_cache(cache)  # Save cache after fetching missing embeddings
    return embeddings

def cluster_keywords(embeddings, n_clusters=3):
    words = list(embeddings.keys())
    vectors = np.array(list(embeddings.values()))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(vectors)
    
    clustered_words = defaultdict(list)
    for word, label in zip(words, labels):
        clustered_words[label].append(word)
    
    return clustered_words

def plot_clusters(embeddings, clustered_words):
    words = list(embeddings.keys())
    vectors = np.array(list(embeddings.values()))
    
    pca = PCA(n_components=2)
    reduced_vectors = pca.fit_transform(vectors)
    
    plt.figure(figsize=(10, 6))
    
    for label, words in clustered_words.items():
        indices = [words.index(word) for word in words if word in embeddings]
        plt.scatter(reduced_vectors[indices, 0], reduced_vectors[indices, 1], label=f"Cluster {label}")
        for i in indices:
            plt.annotate(words[i], (reduced_vectors[i, 0], reduced_vectors[i, 1]))
    
    plt.legend()
    plt.show()

def main():
    with open("./all_signals.json", "r") as f:
        data = json.load(f)
    
    sensory_keywords = set()
    emotional_keywords = set()
    associative_keywords = set()
    
    for entry in data:
        sensory = (entry["sensory"]["keywords"][0].split(","))
        emotional =(entry["emotional"]["keywords"][0].split(","))
        associative = (entry["associative"]["keywords"][0].split(","))
        for word in sensory:
            sensory_keywords.add(word.strip())
        for word in emotional:
            emotional_keywords.add(word.strip())
        for word in associative:
            associative_keywords.add(word.strip())
    
    all_keywords = list(sensory_keywords)
    
    embeddings = get_embeddings(all_keywords)
    clustered_words = cluster_keywords(embeddings, n_clusters=5)
    plot_clusters(embeddings, clustered_words)
    
    print("Clustered Keywords:")
    for cluster, words in clustered_words.items():
        print(f"Cluster {cluster}: {', '.join(words)}")
    
if __name__ == "__main__":
    main()


Fetching embedding for word: cutting_wood
{'@context': ['http://api.conceptnet.io/ld/conceptnet5.7/context.ld.json'], '@id': '/c/en/cutting_wood', 'edges': [{'@id': '/a/[/r/UsedFor/,/c/en/saw/,/c/en/cutting_wood/]', '@type': 'Edge', 'dataset': '/d/conceptnet/4/en', 'end': {'@id': '/c/en/cutting_wood', '@type': 'Node', 'label': 'cutting wood', 'language': 'en', 'term': '/c/en/cutting_wood'}, 'license': 'cc:by/4.0', 'rel': {'@id': '/r/UsedFor', '@type': 'Relation', 'label': 'UsedFor'}, 'sources': [{'@id': '/and/[/s/activity/omcs/commons_manual_entry/,/s/contributor/omcs/bami25/]', '@type': 'Source', 'activity': '/s/activity/omcs/commons_manual_entry', 'contributor': '/s/contributor/omcs/bami25'}, {'@id': '/and/[/s/activity/omcs/commons_manual_entry/,/s/contributor/omcs/digba121/]', '@type': 'Source', 'activity': '/s/activity/omcs/commons_manual_entry', 'contributor': '/s/contributor/omcs/digba121'}, {'@id': '/and/[/s/activity/omcs/omcs1_possibly_free_text/,/s/contributor/omcs/bluerock/]'

KeyboardInterrupt: 