In [60]:
import json
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, cut_tree, fcluster
import pickle as pkl
from collections import defaultdict
import openai
import time
from dotenv import dotenv_values

In [61]:
config = dotenv_values(".env")  
openai.api_key = config['OPENAI_API_KEY']

In [62]:
with open("data/og/embeddings.json") as f:
    embeddings = json.load(f)

reverse_embeddings = {}
for i in range(len(embeddings.keys())):
    reverse_embeddings[tuple(list(embeddings.values())[i])] = list(embeddings.keys())[i]

with open("data/og/openai_summaries_2", "rb") as fp:
    summaries = pkl.load(fp)

with open('data/og/LinkageLabels2.pkl', 'rb') as f:
    linkage_labels = pkl.load(f)

with open('data/pca5_normalized.json') as f:
    pca5_normalized = json.load(f)

with open("data/real_linkage_labels.pkl", "rb") as fp:
    real_linkage_labels = pkl.load(fp)

# Get cluster average values and add summary to each row

In [63]:
# Added weights to prompt depending on cluster size
def get_summary(topics, weight=None, model_engine="gpt-3.5-turbo", prompt="In a minimum of 1 word and a maximum of 3 words find the most specific commonality between the following topics"):
    if weight != None:
        prompt += f"by the respective weight of {weight} (a weight (0-1, 0 preferences topic1, 1 preferences topic2): "
    else:
        prompt += ": "
    main_text = "{} {}".format(prompt, " ".join(topics))
    prompt = {"role":"system", "content": main_text}
    try:
        response = openai.ChatCompletion.create(model = model_engine, messages = [prompt])["choices"][0]["message"]["content"]
    except:
        print("Error with rate limit waiting 60 seconds")
        time.sleep(60)
        response = openai.ChatCompletion.create(model = model_engine, messages = [prompt])["choices"][0]["message"]["content"]
    return response

# https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/
#  Creates value of links, such as color, by blending together leaf values
def create_link_values(linkage_matrix, leaf_values,
                    merge_function = lambda cluster1_val, cluster2_val, weight: cluster1_val * (1 - weight) + cluster2_val * weight,
                    weight_function = lambda cluster1_size, cluster2_size: cluster2_size / (cluster1_size + cluster2_size)):
    # Merge function takes val1, val2, and a weight (0-1, 0 preferences val1, 1 preferences val2) and returns a merged value

    num_leaves = len(leaf_values)
    
    new_values = leaf_values.copy()
    for i, row in enumerate(linkage_matrix):
        cluster1_id = int(row[0])
        cluster2_id = int(row[1])

        if cluster1_id < num_leaves: # it is a sample/leaaf
            cluster1_size = 1
        else:
            cluster1_size = linkage_matrix[num_leaves - cluster1_id][-1]
        
        if cluster2_id < num_leaves: # it is a sample/leaf
            cluster2_size = 1
        else:
            cluster2_size = linkage_matrix[num_leaves - cluster2_id][-1]

        # If a cluster is huge and is merged with a leaf, the weight should preference the cluster
        # Can try different weighting here
        weight = weight_function(cluster1_size, cluster2_size)

        # get summary of the two clusters based on the weight
        summary_label = get_summary([new_values[cluster1_id][-1], new_values[cluster2_id][-1]], weight=weight)
        print(str(cluster1_id), str(cluster2_id) + ": "+ summary_label)

        # merge the values based on the weight
        merged_cluster_value = [merge_function(new_values[cluster1_id][i], new_values[cluster2_id][i], weight) for i in range(5)]
        merged_cluster_value.append(summary_label)

        # try:
        #   merged_cluster_value = [merge_function(new_values[cluster1_id][i], new_values[cluster2_id][i], weight) for i in range(5)]
        #     # find same cluster in real linkage values based on cluster ids and index 0 and 1 in linkage matrix
        #     summary = ""
        #     for j in range(len(real_linkage_labels)):
        #         if real_linkage_labels[j][0] == cluster1_id and real_linkage_labels[j][1] == cluster2_id:
        #             summary = real_linkage_labels[j][4]
        #             break
                    
        #     merged_cluster_value.append(summary)

        # except:
        #     print("INDEX OUT OF RANGE: " + cluster1_id, cluster2_id, weight, cluster1_size, cluster2_size)

        new_values.append(merged_cluster_value)

    return new_values

In [64]:
# Linkage matrix
Z = linkage(list(pca5_normalized.values()))

# base files
leaf_values = [list(map(float,i)) for i in list(pca5_normalized.values())]
leaf_labels = list(pca5_normalized.keys())
leaf_values = [leaf_values[i] + [leaf_labels[i]] for i in range(len(leaf_values))]

all_values = create_link_values(Z, leaf_values)

249 471: Transformation.
133 137: Electronics components
406 466: Molière's plays
240 458: Philosophy/Puzzles (0.5/0.5)
405 412: Machine Learning
170 373: Philosophical works
360 599: None
391 430: Shakespeare's Plays
266 468: Animal kingdom
47 465: Finance policies.
159 322: Dystopian Society
332 474: Deception/Deceit.
459 583: Self-improvement.
307 419: Measurement systems
169 254: Music history.
227 233: 18th Century France
283 350: Parallel Universe
204 228: Complete Theatre.
346 352: Achievement-oriented.
117 592: Abstraction skills
274 320: Animal kingdom
90 196: Learning/Knowledge
48 278: Entrepreneurship
210 220: Animal kingdom.
109 161: Educational methods.
356 437: Books/stories.
206 403: Mindfulness. (weight: 0.5)
384 426: Entrepreneurship visionary
236 410: Theater play.
270 289: French literature
212 603: Success principles.
198 217: Chaos Theory
398 597: Probability Puzzles
80 103: Writing techniques
35 470: Biography writing.
452 594: None.
325 347: Intelligence augmenta

# Add layer number to each row

In [92]:
new_cluster_values = all_values[-len(Z):].copy()
max_layers = 12
cluster_assignments = cut_tree(Z, n_clusters=max_layers)

complete_clusters = [row + [cluster[0]] for row, cluster in zip(new_cluster_values, cluster_assignments)]
complete_clusters

[[-0.7884902891906875,
  0.14546930363833793,
  122.04197831070857,
  121.16801483201219,
  126.53006796468324,
  'Transformation.',
  0],
 [0.3467322875018421,
  -0.4780907925666585,
  78.70960755409203,
  146.57128098689162,
  101.96685105205927,
  'Electronics components',
  0],
 [-0.7508190660619655,
  0.04846988588954755,
  134.97682131823046,
  143.32721358014186,
  123.22428875847427,
  "Molière's plays",
  0],
 [-0.5485880990765386,
  0.13342405658554427,
  130.64417269589097,
  100.47712263273344,
  118.33596176715277,
  'Philosophy/Puzzles (0.5/0.5)',
  0],
 [-0.40430760925493625,
  0.0315766360349769,
  102.6442227207043,
  89.57924174519468,
  118.39165803480196,
  'Machine Learning',
  0],
 [0.11742743564964914,
  0.032497583858571225,
  128.60484045507457,
  118.06225628600114,
  139.093517665656,
  'Philosophical works',
  0],
 [0.11604985483317898,
  0.03251958444805507,
  128.60394914729292,
  118.05871969747079,
  139.09525759139183,
  'None',
  0],
 [-0.6076535318744

# GEOJSON TIME

In [93]:
def format_geojson_layer(rows):
    features = []
    
    for row in rows:
        # Extract x, y location and RGB values from coords_rgb
        x, y = row[:2]
        rgb = row[2:5]
        label = row[5]
        
        # Create a GeoJSON feature for each label
        feature = {
            "type": "Feature",
            "properties": {
                "label": label,
                "color": f"rgb({rgb[0]}, {rgb[1]}, {rgb[2]})"
            },
            "geometry": {
                "type": "Point",
                "coordinates": [x, y]
            }
        }
        
        features.append(feature)
    
    # Create the GeoJSON object
    geojson = {
        "type": "FeatureCollection",
        "features": features
    }
    
    return json.dumps(geojson, indent=4)

In [97]:
file_layer = format_geojson_layer(leaf_values)
with open("data/layers/complete/final_layer.geojson", "w") as f:
    f.write(file_layer)

# split complete_clusters by cluster (index 6)
cluster_layers = []
for i in range(max_layers):
    cluster_layers.append(format_geojson_layer([row for row in complete_clusters if row[6] == i]))

for i, layer in enumerate(cluster_layers):
    with open(f"data/layers/complete/layer{i}.geojson", "w") as f:
        f.write(layer)

Questions for Kaanan:

Are summaries in order? (i.e. is the first row of the summary the first row of the linkage matrix?)

DUPLICATE?!?!
[399.0, 594.0, 0.42786448395325166, 3.0, 'Alchemy - France.', ['Book - Philosophy - Alchemy', 'France- (or French)'], array([0])]
[204.0, 595.0, 0.15685175833221093, 3.0, 'Drama.', ['Category: Theater.', 'Literature/classic/play.'], array([0])]
[303.0, 596.0, 0.1652177944881287, 4.0, 'Literary analysis.', ['Shakespearean play analysis.', 'Drama.'], array([0])]
[303.0, 596.0, 0.1652177944881287, 4.0, 'Literary analysis.', ['Shakespearean play analysis.', 'Drama.'], array([0])]
[277.0, 379.0, 0.1695001531610153, 2.0, 'Russian fiction.', ['Les Frères Karamazov I - Dostoïevski.md', 'Les Frères Karamazov Ii - Dostoïevski.md'], array([0])]
[288.0, 599.0, 0.4305627017428426, 3.0, 'Design principles.', ['Engineering book.', 'Machines'], array([0])]

So the logic is in these rows, [0] lines up with [5][0] and [1] lines up with [5][1] and so [4] lines up with the obersvation in all_values that shares the [0] and [1] clusters. However, I have 69 that are left over with no summary label. I did notice on duplicate row in linkage_labels which is kinda sus. Maybe if statement, line 18 below 'summary to filename' in processing.ipynb was messing me up... Maybe some of the summaries where there are three leaves are notes too?? IDK man    

There are some duplicates idk why man. I have all spaces filled now but idk why some of the .md files are all over the place......   

Do the summaries do better when the original file names are summarized? Probably should used summarized ones...