In [152]:
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.cluster.hierarchy import dendrogram, linkage, cut_tree, is_valid_linkage
from scipy.spatial import Voronoi, voronoi_plot_2d
import matplotlib.pyplot as plt
import pickle as pkl
import geojson

# Load Embeddings → PCA-5


In [153]:
# Load the source embeddings: a JSON object whose keys are filenames and whose
# values are the embedding vectors.
with open("data/og/embeddings.json") as f:
    embeddings = json.load(f)

vectors = list(embeddings.values())
embeddings_array = np.array(vectors)

# Standardise the features before PCA so no single dimension dominates.
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(embeddings_array)

# Project onto 5 principal components. random_state is pinned (same seed as
# the t-SNE reduction in this notebook) so the output file is reproducible
# even when scikit-learn selects a randomized SVD solver.
n_components = 5
pca = PCA(n_components=n_components, random_state=42)
reduced_embeddings = pca.fit_transform(scaled_embeddings)

# dicts keep insertion order, so keys() lines up with the rows built from
# values() above.
reduced_dict = {
    filename: vector.tolist()
    for filename, vector in zip(embeddings.keys(), reduced_embeddings)
}

with open('data/pca5_embeddings.json', 'w') as f:
    json.dump(reduced_dict, f)

In [154]:
# Read the source embeddings again so this cell starts from the file contents.
with open("data/og/embeddings.json") as f:
    embeddings = json.load(f)

vectors = [*embeddings.values()]
embeddings_array = np.asarray(vectors)

# Fit the scaler first, then apply it (same result as a single fit_transform).
scaler = StandardScaler().fit(embeddings_array)
scaled_embeddings = scaler.transform(embeddings_array)

# 5-D t-SNE with a fixed seed, using the exact (non Barnes-Hut) gradient method.
tsne = TSNE(n_components=5, method='exact', random_state=42)
reduced_embeddings = tsne.fit_transform(scaled_embeddings)

# Pair each filename with its reduced vector, converted to plain Python lists.
reduced_dict = dict(zip(embeddings.keys(), reduced_embeddings.tolist()))

with open('data/tsne5_embeddings.json', 'w') as f:
    json.dump(reduced_dict, f)

In [155]:
# Hierarchical clustering over the embedding vectors as loaded from JSON
# (not the standardised copy); linkage() runs with its default settings.
embeddings_values = [vector for vector in embeddings.values()]
Z = linkage(embeddings_values)

# Cut the resulting tree into a fixed number of flat clusters.
max_layers = 12
clusters = cut_tree(Z, n_clusters=max_layers)

# Load Linkage Labels

In [156]:
# Load the pre-computed linkage rows and attach a cluster id to each one.
# NOTE(review): pickle.load can execute arbitrary code; only use a trusted file.
with open('data/og/LinkageLabels2.pkl', 'rb') as f:
    linkage_labels = pkl.load(f)

# zip() pairs rows and cluster ids by position and stops at the shorter input.
# NOTE(review): confirm both sequences are meant to have the same length/order.
linkage_labels = [
    labels_row + [cluster_id]
    for labels_row, cluster_id in zip(linkage_labels, clusters)
]
linkage_labels[0]

[399.0,
 594.0,
 0.42786448395325166,
 3.0,
 'Alchemy - France.',
 ['Book - Philosophy - Alchemy', 'France- (or French)'],
 array([0])]

In [157]:
# Find the row with the highest child count (the root of the tree).
# Index 3 of each row holds that count; it looks like the observation-count
# column of a SciPy linkage row -- TODO confirm against how the pickle was built.
max_children = 0
root = 0
for i, row in enumerate(linkage_labels):
    n_children = int(row[3])
    # Strict '>' keeps the first row that reaches the maximum. The previous
    # '== 2' test selected the last two-member cluster instead of the largest.
    if n_children > max_children:
        max_children = n_children
        root = i

print(linkage_labels[root])



In [158]:
# Print every row whose label (index 4) equals the target filename, or whose
# label collection (index 5) contains it.
target = "Create a programming language.md"
for row in linkage_labels:
    if target == row[4] or target in row[5]:
        print(row)

# Layer (.geojson) Generation for MapBox

### Formatting final layer with all files

In [159]:
def normalize_embeddings(embeddings_dict):
    """Rescale 5-D embeddings into two coordinates plus three colour channels.

    Each of the first five dimensions is divided by the largest absolute value
    found in that dimension across all vectors:

    * dimensions 0 and 1 end up in [-1, 1];
    * dimensions 2, 3 and 4 are then mapped with ``value * 128 + 128`` so that
      -max -> 0, 0 -> 128 and +max -> 256.
      NOTE(review): 256 is one above the largest 8-bit channel value.

    Any elements past index 4 are carried over unchanged.

    Args:
        embeddings_dict: mapping of label -> sequence of at least 5 numbers.

    Returns:
        A new dict with the same keys (same order) and normalised list values.
        The input mapping and its vectors are not modified.

    Raises:
        IndexError: if a vector has fewer than 5 elements.
    """
    # Work on copies so the caller's vectors are not rewritten in place.
    rows = [list(vector) for vector in embeddings_dict.values()]

    # Largest absolute value seen in each of the five dimensions.
    max_abs = [0, 0, 0, 0, 0]
    for row in rows:
        for dim in range(5):
            magnitude = abs(row[dim])
            if magnitude > max_abs[dim]:
                max_abs[dim] = magnitude

    # A dimension that is zero everywhere would divide by zero; dividing by 1
    # instead leaves those zeros as 0.0 (and 128.0 for the colour channels).
    scales = [value if value != 0 else 1 for value in max_abs]

    for row in rows:
        row[0] = row[0] / scales[0]
        row[1] = row[1] / scales[1]
        for dim in (2, 3, 4):
            row[dim] = (row[dim] / scales[dim]) * 128 + 128

    return dict(zip(embeddings_dict.keys(), rows))


def _read_reduced_embeddings(path):
    """Return the parsed JSON content of a reduced-embeddings file."""
    with open(path) as handle:
        return json.load(handle)


# t-SNE reduction: load, then normalise.
tsne5 = _read_reduced_embeddings('data/tsne5_embeddings.json')
tsne5_normalized = normalize_embeddings(tsne5)

# PCA reduction: load, then normalise.
pca5 = _read_reduced_embeddings('data/pca5_embeddings.json')
pca5_normalized = normalize_embeddings(pca5)

In [160]:
def create_final_layer_geojson(reduced_embeddings):
    """Serialise labelled 5-value vectors as a GeoJSON FeatureCollection string.

    Args:
        reduced_embeddings: mapping of label -> sequence whose first two items
            are the point coordinates and whose next three items are written
            into an ``rgb(...)`` colour string.

    Returns:
        The FeatureCollection as JSON text indented by 4 spaces, with one
        Point feature per label.
    """

    def _as_feature(label, values):
        # Leading pair is the location; what follows supplies the colour.
        x, y = values[:2]
        channels = values[2:]
        return {
            "type": "Feature",
            "properties": {
                "label": label,
                "color": f"rgb({channels[0]}, {channels[1]}, {channels[2]})",
            },
            "geometry": {
                "type": "Point",
                "coordinates": [x, y],
            },
        }

    collection = {
        "type": "FeatureCollection",
        "features": [
            _as_feature(label, values)
            for label, values in reduced_embeddings.items()
        ],
    }
    return json.dumps(collection, indent=4)


# Write one point layer per reduction; t-SNE first, then PCA.
for layer_embeddings, layer_path in (
    (tsne5_normalized, 'data/layers/tsne5_final_layer.geojson'),
    (pca5_normalized, 'data/layers/pca5_final_layer.geojson'),
):
    result = create_final_layer_geojson(layer_embeddings)
    with open(layer_path, 'w') as f:
        f.write(result)

## Voronoi GeoJSON

In [201]:
def calculate_polygon_area(points):
    """Return the unsigned area of a simple 2-D polygon (shoelace formula).

    Args:
        points: sequence of ``[x, y]`` vertices in boundary order, given as a
            list of pairs or an ``(n, 2)`` array. The ring may be open or
            already closed (first vertex repeated at the end).

    Returns:
        The area as a non-negative float, independent of winding direction.

    Raises:
        ValueError: if fewer than 3 points are supplied.
    """
    # Convert up front so both lists of pairs and arrays support the slicing
    # below, whether or not the ring still needs closing.
    ring = np.asarray(points, dtype=float)
    if len(ring) < 3:
        raise ValueError("Polygon must have at least 3 points.")

    # Ensure the polygon is closed (first and last points are the same).
    if not np.array_equal(ring[0], ring[-1]):
        ring = np.vstack([ring, ring[0]])

    # Sum the cross products over every edge of the *closed* ring; taking the
    # edge count after closing avoids indexing past the end when the input
    # was already closed.
    xs, ys = ring[:, 0], ring[:, 1]
    twice_area = np.sum(xs[:-1] * ys[1:] - xs[1:] * ys[:-1])

    return abs(float(twice_area) / 2.0)

def make_voronoi_geojson(polygons, min_zoom, max_zoom):
    """Wrap polygons in a GeoJSON FeatureCollection dict.

    Args:
        polygons: iterable of polygons, each a sequence of ``[x, y]``
            positions forming the exterior ring.
        min_zoom: value stored in every feature's ``min_zoom`` property.
        max_zoom: value stored in every feature's ``max_zoom`` property.

    Returns:
        A dict with one Polygon feature per input polygon. Each ring is
        closed (first position repeated at the end) if it was not already.
        The input polygons are not modified.
    """
    features = []
    for polygon in polygons:
        # Copy the positions so closing the ring never touches the caller's data.
        ring = [list(position) for position in polygon]
        # GeoJSON linear rings must start and end on the same position.
        if ring and ring[0] != ring[-1]:
            ring.append(list(ring[0]))

        features.append({
            "type": "Feature",
            "properties": {
                "min_zoom": min_zoom,
                "max_zoom": max_zoom
            },
            "geometry": {
                "type": "Polygon",
                "coordinates": [ring]
            }
        })

    return {
        "type": "FeatureCollection",
        "features": features
    }

# Voronoi sites: the first two normalised components of every PCA embedding.
points = [x[:2] for x in list(pca5_normalized.values())]

# Compute Voronoi diagram
vor = Voronoi(points)

# Keep only non-empty regions with no -1 index; SciPy uses -1 for a vertex
# that lies outside the diagram, i.e. an unbounded region.
polygons = []
for region in vor.regions:
    if -1 not in region and len(region) > 0:
        polygons.append([vor.vertices[i].tolist() for i in region])

# Named voronoi_geojson rather than geojson so the imported `geojson` module
# stays usable after this cell has run.
voronoi_geojson = make_voronoi_geojson(polygons, 5, 22)
with open('data/layers/pca5_voronoi.geojson', 'w') as f:
    json.dump(voronoi_geojson, f, indent=4)

# split polygons into big and small by an area percentile (the disabled code below thresholds at the 80th percentile, not the 25th)
# areas = []
# for polygon in polygons:
#     areas.append(calculate_polygon_area(polygon))

# median = np.percentile(areas, 80)
# big_polygons = []
# small_polygons = []
# for i in range(len(polygons)):
#     if areas[i] > median:
#         big_polygons.append(polygons[i])
#     else:
#         small_polygons.append(polygons[i])



# geojson = make_voronoi_geojson(big_polygons, 5, 22)
# with open('data/layers/pca5_voronoi_big.geojson', 'w') as f:
#     f.write(json.dumps(geojson, indent=4))

# geojson = make_voronoi_geojson(small_polygons, 9, 22)
# with open('data/layers/pca5_voronoi_small.geojson', 'w') as f:
#     f.write(json.dumps(geojson, indent=4))