In [41]:
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.cluster.hierarchy import dendrogram, linkage, cut_tree, is_valid_linkage
from scipy.spatial import Voronoi, voronoi_plot_2d
import matplotlib.pyplot as plt
import pickle as pkl
import geojson

# Load Embeddings → PCA-5


In [42]:
# Load the raw embeddings; the JSON maps each filename to its embedding vector.
with open("data/og/embeddings.json") as f:
    embeddings = json.load(f)

vectors = list(embeddings.values())
embeddings_array = np.array(vectors)

# Standardize every embedding dimension before projecting.
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(embeddings_array)

n_components = 5
# random_state pins the result when scikit-learn picks its randomized SVD
# solver, so a Restart & Run All reproduces the same projection (the t-SNE
# cell below is seeded the same way).
pca = PCA(n_components=n_components, random_state=42)
reduced_embeddings = pca.fit_transform(scaled_embeddings)

# Re-attach filenames: dict order and row order both follow the loaded JSON.
reduced_dict = {
    filename: vector.tolist()
    for filename, vector in zip(embeddings.keys(), reduced_embeddings)
}

with open('data/pca5_embeddings.json', 'w') as f:
    json.dump(reduced_dict, f)

In [43]:
# Read the raw embeddings again so this cell can run on its own.
with open("data/og/embeddings.json") as f:
    embeddings = json.load(f)

vectors = list(embeddings.values())
embeddings_array = np.array(vectors)

# Standardize every embedding dimension before the t-SNE projection.
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(embeddings_array)

# Five output dimensions, seeded for repeatability.
# NOTE(review): method='exact' is presumably required because the default
# Barnes-Hut solver does not support this many components — confirm in the docs.
tsne = TSNE(n_components=5, method='exact', random_state=42)
reduced_embeddings = tsne.fit_transform(scaled_embeddings)

# Pair each filename with its reduced row (both follow the JSON order).
reduced_dict = dict(zip(embeddings.keys(), reduced_embeddings.tolist()))

with open('data/tsne5_embeddings.json', 'w') as f:
    json.dump(reduced_dict, f)

In [44]:
# Number of flat clusters requested when cutting the dendrogram.
max_layers = 12

# Hierarchical clustering on the raw (unscaled) embedding vectors, using
# scipy's default linkage method and metric.
embeddings_values = [vector for vector in embeddings.values()]
Z = linkage(embeddings_values)

# One flat cluster assignment per observation.
clusters = cut_tree(Z, n_clusters=max_layers)

# Load Linkage Labels

In [45]:
# Load the labelled linkage rows and append one entry of `clusters` to each.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('data/og/LinkageLabels2.pkl', 'rb') as f:
    linkage_labels = pkl.load(f)

# Row layout after this cell:
# [0] - cluster1, [1] - cluster2, [2] - distance, [3] - cluster size,
# [4] - cluster label, [5] - [cluster1 label, cluster2 label],
# [6] - the `clusters` entry zipped onto this row (called "layer number" in the original notes)
labelled_rows = []
for row, cluster in zip(linkage_labels, clusters):
    labelled_rows.append(row + [cluster])
linkage_labels = labelled_rows

linkage_labels[0]

[399.0,
 594.0,
 0.42786448395325166,
 3.0,
 'Alchemy - France.',
 ['Book - Philosophy - Alchemy', 'France- (or French)'],
 array([0])]

In [46]:
# Find the row whose cluster size (index 3) is the largest, i.e. the root of
# the dendrogram. On ties the first such row is kept.
max_children = 0
root = 0
for i, row in enumerate(linkage_labels):
    size = int(row[3])
    # Track the running maximum; the previous `== 2` test selected the last
    # two-leaf cluster instead of the biggest one.
    if size > max_children:
        max_children = size
        root = i

print(linkage_labels[root])



In [47]:
# Print every linkage row that mentions this file, either as the cluster
# label (index 4) or as one of the two child labels (index 5).
target_label = "Create a programming language.md"
for row in linkage_labels:
    if target_label == row[4] or target_label in row[5]:
        print(row)

# Layer (.geojson) Generation for MapBox

### Formatting final layer with all files

In [48]:
def normalize_embeddings(embeddings_dict):
    """Rescale 5-component embeddings into map coordinates plus colour channels.

    Components 0 and 1 are divided by their largest absolute value, giving
    values in [-1, 1]. Components 2, 3 and 4 are scaled the same way and then
    mapped with ``value * 128 + 128``, so the most negative possible value
    lands on 0, zero lands on 128 and the most positive on 256.

    The input is not modified: every vector is copied first, so the cell can
    be re-run without normalizing already-normalized data.

    Args:
        embeddings_dict: Mapping of label -> sequence with at least five
            numeric components. Components past the fifth are kept unchanged.

    Returns:
        A new dict with the same keys (same order) and freshly built lists.

    Raises:
        IndexError: If a vector has fewer than five components.
    """
    position_dims = (0, 1)
    color_dims = (2, 3, 4)

    # Copy each vector so the caller's data is left untouched.
    vectors = [list(vector) for vector in embeddings_dict.values()]
    if not vectors:
        return {}

    # Largest magnitude per component; a component that is zero everywhere
    # falls back to a scale of 1.0 so the division cannot fail.
    scales = {}
    for dim in position_dims + color_dims:
        largest = max(abs(vector[dim]) for vector in vectors)
        scales[dim] = largest if largest else 1.0

    for vector in vectors:
        for dim in position_dims:
            vector[dim] = vector[dim] / scales[dim]
        for dim in color_dims:
            vector[dim] = (vector[dim] / scales[dim]) * 128 + 128

    return dict(zip(embeddings_dict.keys(), vectors))


# Read both 5-component reductions written by the earlier cells.
with open('data/tsne5_embeddings.json') as f:
    tsne5 = json.load(f)

with open('data/pca5_embeddings.json') as f:
    pca5 = json.load(f)

# Rescale each reduction into map coordinates plus colour channels.
tsne5_normalized = normalize_embeddings(tsne5)
pca5_normalized = normalize_embeddings(pca5)

# Only the PCA variant is persisted here.
with open('data/pca5_normalized.json', 'w') as f:
    json.dump(pca5_normalized, f, indent=4)

In [49]:
def create_final_layer_geojson(reduced_embeddings):
    """Serialize labelled 5-component vectors as a GeoJSON FeatureCollection.

    Each entry becomes a Point feature: the first two components are the
    point coordinates and the next three are written into an ``rgb(...)``
    string stored in the ``color`` property.

    Args:
        reduced_embeddings: Mapping of label -> sequence of at least five values.

    Returns:
        The FeatureCollection as a JSON string indented by four spaces.
    """

    def build_feature(name, values):
        # Positions 0-1 locate the point; positions 2-4 feed the colour string.
        x_pos, y_pos = values[:2]
        channels = values[2:]
        red, green, blue = channels[0], channels[1], channels[2]
        return {
            "type": "Feature",
            "properties": {
                "label": name,
                "color": f"rgb({red}, {green}, {blue})"
            },
            "geometry": {
                "type": "Point",
                "coordinates": [x_pos, y_pos]
            }
        }

    collection = {
        "type": "FeatureCollection",
        "features": [
            build_feature(name, values)
            for name, values in reduced_embeddings.items()
        ]
    }

    return json.dumps(collection, indent=4)


# Write one point layer per reduction, t-SNE first and PCA second.
for normalized, layer_path in (
    (tsne5_normalized, 'data/layers/tsne5_final_layer.geojson'),
    (pca5_normalized, 'data/layers/pca5_final_layer.geojson'),
):
    result = create_final_layer_geojson(normalized)
    with open(layer_path, 'w') as f:
        f.write(result)

## Voronoi GeoJSON

In [50]:
def calculate_polygon_area(points):
    """Return the unsigned area of a simple polygon using the shoelace formula.

    Args:
        points: Sequence or array of (x, y) vertices in order around the
            polygon. The ring may be open or already closed (first vertex
            repeated at the end).

    Returns:
        The polygon's area as a non-negative float.

    Raises:
        ValueError: If fewer than 3 points are given.
    """
    # Accept plain lists as well as arrays.
    pts = np.asarray(points, dtype=float)
    if len(pts) < 3:
        raise ValueError("Polygon must have at least 3 points.")

    # Ensure the polygon is closed (first and last points are the same)
    if not np.array_equal(pts[0], pts[-1]):
        pts = np.vstack([pts, pts[0]])

    # Sum the cross products over consecutive vertex pairs of the closed
    # ring. The pair count comes from the closed ring itself, so a ring that
    # was already closed no longer indexes past the end.
    xs, ys = pts[:, 0], pts[:, 1]
    twice_area = np.sum(xs[:-1] * ys[1:] - xs[1:] * ys[:-1])

    return float(abs(twice_area / 2.0))

def make_voronoi_geojson(polygons, min_zoom, max_zoom):
    """Wrap polygons in a GeoJSON FeatureCollection of Polygon features.

    GeoJSON requires each polygon ring to be closed (first and last positions
    identical). A ring that arrives open gets its first vertex appended;
    a ring that is already closed is passed through as-is. The input polygons
    are copied, never modified.

    Args:
        polygons: Iterable of polygons, each an ordered sequence of [x, y]
            vertices.
        min_zoom: Value stored in every feature's ``min_zoom`` property.
        max_zoom: Value stored in every feature's ``max_zoom`` property.

    Returns:
        A dict with ``"type": "FeatureCollection"`` and one feature per polygon.
    """
    features = []
    for polygon in polygons:
        # Copy so closing the ring never touches the caller's data.
        ring = [list(vertex) for vertex in polygon]
        if ring and ring[0] != ring[-1]:
            ring.append(list(ring[0]))

        feature = {
            "type": "Feature",
            "properties": {
                "min_zoom": min_zoom,
                "max_zoom": max_zoom
            },
            "geometry": {
                "type": "Polygon",
                "coordinates": [ring]
            }
        }
        features.append(feature)

    geojson = {
        "type": "FeatureCollection",
        "features": features
    }
    return geojson

# Use the first two normalized PCA components as the 2-D site positions.
points = [coords[:2] for coords in pca5_normalized.values()]

# Compute Voronoi diagram
vor = Voronoi(points)

# Keep only non-empty regions that do not reference vertex index -1, and
# turn each one into a list of [x, y] pairs taken from vor.vertices.
polygons = [
    [[vor.vertices[idx][0], vor.vertices[idx][1]] for idx in region]
    for region in vor.regions
    if len(region) > 0 and -1 not in region
]

geojson = make_voronoi_geojson(polygons, 5, 22)
with open('data/layers/pca5_voronoi.geojson', 'w') as f:
    json.dump(geojson, f, indent=4)

# Split Dendrogram

## Summary to filename

In [51]:
# Load the filename -> summary mapping.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("data/og/openai_summaries_2", "rb") as fp:
    label_to_summary = pkl.load(fp)

file_labels = list(label_to_summary.keys())
values = list(label_to_summary.values())

# {summary: position of the file that owns it}. If two files share the same
# summary text, the later file wins.
summary_to_index = {value: i for i, value in enumerate(values)}

# Copy every row AND its child-label list (index 5). A plain list.copy() is
# shallow, so the replacements below would also rewrite linkage_labels.
real_linkage_labels = [
    row[:5] + [list(row[5])] + row[6:]
    for row in linkage_labels
]

# Replace each child label that is a known summary with the filename it
# belongs to; labels that are not summaries are left unchanged.
for row in real_linkage_labels:
    row_children = row[5]
    for j, current_summary in enumerate(row_children):
        if current_summary in summary_to_index:
            row_children[j] = file_labels[summary_to_index[current_summary]]


# display every row after the replacement
for row in real_linkage_labels:
    print(row)

with open("data/real_linkage_labels.pkl", "wb") as fp:
    pkl.dump(real_linkage_labels, fp)

[399.0, 594.0, 0.42786448395325166, 3.0, 'Alchemy - France.', ["Les Admirables Secrets D'albert Le Grand - Albert Le Grand.md", 'France- (or French)'], array([0])]
[204.0, 595.0, 0.15685175833221093, 3.0, 'Drama.', ['Théâtre Complet Tom 2 - William Shakespeare.md', 'Literature/classic/play.'], array([0])]
[303.0, 596.0, 0.1652177944881287, 4.0, 'Literary analysis.', ['Théâtre Complet Tom 3 - William Shakespeare.md', 'Drama.'], array([0])]
[303.0, 596.0, 0.1652177944881287, 4.0, 'Literary analysis.', ['Théâtre Complet Tom 3 - William Shakespeare.md', 'Drama.'], array([0])]
[277.0, 379.0, 0.1695001531610153, 2.0, 'Russian fiction.', ['Les Frères Karamazov I - Dostoïevski.md', 'Les Frères Karamazov Ii - Dostoïevski.md'], array([0])]
[288.0, 599.0, 0.4305627017428426, 3.0, 'Design principles.', ['Cours De Machines : Première Partie  - Haton De La Goupilliere .md', 'Machines'], array([0])]
[216.0, 407.0, 0.19415253415551106, 2.0, 'Future predictions.', ['Les Oracles De Nostradamus : Tome 2 

## Getting average coordinates for each cluster

In [52]:
# For every cluster whose size (index 3) is not 2, print the child labels
# (index 5) that are still keys of summary_to_index.
for row in real_linkage_labels:
    if row[3] == 2:
        continue
    for child_label in row[5]:
        if child_label in summary_to_index:
            print(child_label)

In [53]:


# Collect every label that appears in the linkage table (both children, then
# the cluster label of each row), de-duplicated in first-seen order.
# dict.fromkeys keeps insertion order and avoids quadratic `not in list` scans.
all_labels = list(dict.fromkeys(
    label
    for row in real_linkage_labels
    for label in (row[5][0], row[5][1], row[4])
))

# Give each label its own empty placeholder; `[[]] * n` would make every
# value the same shared list object.
all_labels_to_coordinates = {label: [] for label in all_labels}

# Cluster label -> its child labels. Rows sharing a cluster label collapse
# to the last one.
clusters_to_children = {row[4]: row[5] for row in real_linkage_labels}

# Files already have coordinates: copy them in (adds files absent from the table too).
all_labels_to_coordinates.update(pca5_normalized)
    
def getAverageValues(label, all_labels_to_coordinates, clusters_to_children, _visiting=None):
    """Return coordinates for ``label``, averaging its two children if needed.

    If ``label`` already has non-empty coordinates they are returned (as a
    copy). Otherwise its two children are resolved recursively and averaged
    component by component. This is an unweighted mean of the two children,
    not a size-weighted centroid.

    Args:
        label: Cluster or file label to resolve.
        all_labels_to_coordinates: Mapping of label -> coordinate list; an
            empty list means "not known yet". Not modified.
        clusters_to_children: Mapping of cluster label -> [child1, child2].
        _visiting: Internal set of labels on the current recursion path, used
            to detect cycles. Callers should leave it unset.

    Returns:
        A new list of averaged coordinates.

    Raises:
        KeyError: If ``label`` has neither coordinates nor recorded children.
        ValueError: If the child links loop back to a label being resolved.
    """
    # Base case: the label already has coordinates.
    known = all_labels_to_coordinates.get(label)
    if known is not None and len(known) > 0:
        return list(known)

    if label not in clusters_to_children:
        raise KeyError(f"no coordinates and no children recorded for label: {label!r}")

    # Guard against label collisions that would otherwise recurse forever.
    visiting = set() if _visiting is None else _visiting
    if label in visiting:
        raise ValueError(f"cycle detected while averaging label: {label!r}")

    visiting.add(label)
    try:
        child1 = clusters_to_children[label][0]
        child2 = clusters_to_children[label][1]
        child1_coords = getAverageValues(child1, all_labels_to_coordinates, clusters_to_children, visiting)
        child2_coords = getAverageValues(child2, all_labels_to_coordinates, clusters_to_children, visiting)
    finally:
        visiting.discard(label)

    return [(a + b) / 2 for a, b in zip(child1_coords, child2_coords)]

# Resolve every label that still holds the empty placeholder, in dict order.
pending_labels = [
    label
    for label, coords in all_labels_to_coordinates.items()
    if coords == []
]
for label in pending_labels:
    all_labels_to_coordinates[label] = getAverageValues(label, all_labels_to_coordinates, clusters_to_children)

KeyError: 'France- (or French)'

In [None]:
# Inspect the label -> coordinates mapping; entries still equal to [] have not been assigned coordinates.
all_labels_to_coordinates

{'Book - Philosophy - Alchemy': [],
 'France- (or French)': [],
 'Alchemy - France.': [],
 'Category: Theater.': [],
 'Literature/classic/play.': [],
 'Drama.': [],
 'Shakespearean play analysis.': [],
 'Literary analysis.': [],
 'Les Frères Karamazov I - Dostoïevski.md': [-0.7746388062005809,
  0.18571989026353591,
  124.83467332690195,
  115.28556987920143,
  116.3325814432415],
 'Les Frères Karamazov Ii - Dostoïevski.md': [-0.7594982802152129,
  0.188423337354402,
  126.90910450745427,
  120.93562122876179,
  112.40057788436417],
 'Russian fiction.': [],
 'Engineering book.': [],
 'Machines': [],
 'Design principles.': [],
 'Les Oracles De Nostradamus : Tome 2 - Nostradamus.md': [-0.6759788867938,
  0.14562113692899423,
  116.10063265263483,
  125.27052936081118,
  115.17525488151499],
 'Les Oracles De Nostradamus : Tome 1 - Nostradamus.md': [-0.678213664259028,
  0.13234877702394826,
  113.08509266887468,
  122.45559986606457,
  110.64028206199856],
 'Future predictions.': [],
 'Bo

In [None]:
# # get average coordinate for rows with 2 leaves
# def getCoordinateFromLabel(label):
#     return pca5_normalized[label][0], pca5_normalized[label][1]

# def findRow(label, real_linkage_labels):
#     for i in range(len(real_linkage_labels)):
#         if label == real_linkage_labels[i][4]:
#             return real_linkage_labels[i]
#     return f"label not found: {label}"


# def getAverageCoordinate(row, real_linkage_labels):
#     if row[3] == 2:
#         x1, y1 = getCoordinateFromLabel(row[5][0])
#         x2, y2 = getCoordinateFromLabel(row[5][1])
#         return (x1 + x2) / 2, (y1 + y2) / 2
#     elif len(row) <= 7:
#         x3, y3 = getAverageCoordinate(findRow(row[5][0], real_linkage_labels), real_linkage_labels)
#         x4, y4 = getAverageCoordinate(findRow(row[5][1], real_linkage_labels), real_linkage_labels)
#         return (x3 + x4) / 2, (y3 + y4) / 2
#     elif len(row) == 8:
#         x5, y5 = getAverageCoordinate(findRow(row[7][0], real_linkage_labels), real_linkage_labels)
#         x6, y6 = getAverageCoordinate(findRow(row[7][1], real_linkage_labels), real_linkage_labels)
#         return (x5 + x6) / 2, (y5 + y6) / 2 
        

# for i in range(len(real_linkage_labels)):
#     real_linkage_labels[i].append(getAverageCoordinate(real_linkage_labels[i], real_linkage_labels))

TypeError: cannot unpack non-iterable NoneType object

In [None]:
# Inspect the linkage rows as they currently stand in the kernel (this displays the whole list).
real_linkage_labels

[[399.0,
  594.0,
  0.42786448395325166,
  3.0,
  'Alchemy - France.',
  ['Book - Philosophy - Alchemy', 'France- (or French)'],
  array([0]),
  []],
 [204.0,
  595.0,
  0.15685175833221093,
  3.0,
  'Drama.',
  ['Category: Theater.', 'Literature/classic/play.'],
  array([0]),
  []],
 [303.0,
  596.0,
  0.1652177944881287,
  4.0,
  'Literary analysis.',
  ['Shakespearean play analysis.', 'Drama.'],
  array([0]),
  []],
 [303.0,
  596.0,
  0.1652177944881287,
  4.0,
  'Literary analysis.',
  ['Shakespearean play analysis.', 'Drama.'],
  array([0]),
  []],
 [277.0,
  379.0,
  0.1695001531610153,
  2.0,
  'Russian fiction.',
  ['Les Frères Karamazov I - Dostoïevski.md',
   'Les Frères Karamazov Ii - Dostoïevski.md'],
  array([0]),
  array([-0.76706856,  0.18707413]),
  [-0.7670685607778208, 0.1870741323337599],
  [-0.7670685607778208, 0.1870741323337599]],
 [288.0,
  599.0,
  0.4305627017428426,
  3.0,
  'Design principles.',
  ['Engineering book.', 'Machines'],
  array([0]),
  []],
 [216

In [None]:
# For every row in real_linkage_labels, add a new column with the averaged x, y coordinates of its two leaves. Rows with exactly two leaves take their averaged locations directly from the leaves, since those are the file names.




def getLinkageLocations(linkage_labels):
    for i in range(len(linkage_labels)):
        if linkage_labels[i][3] == 2:
            x1, y1 = getCoordinateFromLabel(linkage_labels[i][0])
            x2, y2 = getCoordinateFromLabel(linkage_labels[i][1])
            linkage_labels[i].append([(x1 + x2) / 2, (y1 + y2) / 2])
        else:
            if len(linkage_labels[i]) <= 7:

            
            
        


SyntaxError: incomplete input (2598502300.py, line 17)