In [125]:
import json
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, cut_tree, fcluster
import pickle as pkl
from collections import defaultdict

In [126]:
# Load the embeddings mapping (JSON object: key -> sequence of numbers).
# JSON text is UTF-8, so do not rely on the platform-default encoding.
with open("data/og/embeddings.json", encoding="utf-8") as f:
    embeddings = json.load(f)

# Reverse lookup: embedding vector (as a hashable tuple) -> its key.
# Built in a single pass over the items instead of re-materialising the
# key/value lists on every iteration. If two keys share an identical vector,
# the later key wins (same as the previous loop).
reverse_embeddings = {tuple(vector): key for key, vector in embeddings.items()}

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open("data/og/openai_summaries_2", "rb") as fp:
    summaries = pkl.load(fp)

In [127]:
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('data/og/LinkageLabels2.pkl', 'rb') as f:
    linkage_labels = pkl.load(f)

# JSON text is UTF-8, so do not rely on the platform-default encoding.
with open('data/pca5_normalized.json', encoding='utf-8') as f:
    pca5_normalized = json.load(f)

In [128]:
# Load the labelled linkage rows; create_link_values reads this global to look up
# a summary (row[4]) for each merge by its two cluster ids (row[0], row[1]).
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open("data/real_linkage_labels.pkl", "rb") as fp:
    real_linkage_labels = pkl.load(fp)

# Get cluster average values

In [129]:
# https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/
#  Creates value of links, such as color, by blending together leaf values
def create_link_values(linkage_matrix, leaf_values,
                    merge_function = lambda cluster1_val, cluster2_val, weight: cluster1_val * (1 - weight) + cluster2_val * weight,
                    weight_function = lambda cluster1_size, cluster2_size: cluster2_size / (cluster1_size + cluster2_size),
                    num_dims = 5,
                    link_labels = None):
    """Compute a blended value (plus summary label) for every merge in a linkage matrix.

    Args:
        linkage_matrix: Rows of ``[cluster1_id, cluster2_id, distance, size]`` in
            merge order. Ids below ``len(leaf_values)`` are leaves; id
            ``len(leaf_values) + k`` is the cluster created by row ``k``.
        leaf_values: One entry per leaf; the first ``num_dims`` items of each
            entry are the numeric values to blend. The input list is not mutated.
        merge_function: ``(val1, val2, weight) -> merged`` where weight 0 prefers
            ``val1`` and weight 1 prefers ``val2``.
        weight_function: ``(cluster1_size, cluster2_size) -> weight``.
        num_dims: Number of leading numeric components to merge per entry.
        link_labels: Rows whose items 0 and 1 are the merged cluster ids and whose
            item 4 is the summary. Defaults to the notebook-level
            ``real_linkage_labels``.

    Returns:
        A new list: the original leaf entries followed by one entry per linkage
        row, each being ``num_dims`` merged values plus the matching summary
        (``""`` when no label row matches the pair of ids).

    Raises:
        ValueError: If the values for a merge cannot be combined (e.g. an entry
            has fewer than ``num_dims`` numeric components).
    """
    if link_labels is None:
        link_labels = real_linkage_labels

    num_leaves = len(leaf_values)

    # Index label rows by their (id1, id2) pair once; setdefault keeps the FIRST
    # matching row, mirroring a linear search that stops at the first hit.
    first_label_row = {}
    for label_row in link_labels:
        first_label_row.setdefault((label_row[0], label_row[1]), label_row)

    def cluster_size(cluster_id):
        # Leaves count as a single sample.
        if cluster_id < num_leaves:
            return 1
        # Cluster `num_leaves + k` was formed by row k, whose last column is its size.
        return linkage_matrix[cluster_id - num_leaves][-1]

    new_values = list(leaf_values)
    for row in linkage_matrix:
        cluster1_id = int(row[0])
        cluster2_id = int(row[1])

        # If a cluster is huge and is merged with a leaf, the weight should preference the cluster
        # Can try different weighting here
        weight = weight_function(cluster_size(cluster1_id), cluster_size(cluster2_id))

        try:
            merged_cluster_value = [
                merge_function(new_values[cluster1_id][dim], new_values[cluster2_id][dim], weight)
                for dim in range(num_dims)
            ]
        except (IndexError, TypeError) as exc:
            # Fail loudly with context instead of appending a stale/undefined value.
            raise ValueError(
                f"cannot merge clusters {cluster1_id} and {cluster2_id} "
                f"(weight={weight}, num_dims={num_dims})"
            ) from exc

        # Attach the summary of the label row describing this exact merge, if any.
        label_row = first_label_row.get((cluster1_id, cluster2_id))
        merged_cluster_value.append(label_row[4] if label_row is not None else "")

        new_values.append(merged_cluster_value)

    return new_values

In [130]:
# Hierarchical clustering over the raw embedding vectors (scipy defaults).
Z = linkage(list(embeddings.values()))

# One entry per leaf: the PCA components as floats, followed by the leaf's key.
# NOTE(review): this assumes `pca5_normalized` and `embeddings` list their items
# in the same order, so that leaf i of Z matches entry i here - confirm.
leaf_labels = list(pca5_normalized)
leaf_values = [
    [float(component) for component in components] + [label]
    for label, components in pca5_normalized.items()
]

# Leaves first, then one blended entry per merge; keep the merge entries separately.
all_values = create_link_values(Z, leaf_values)
new_cluster_values = all_values[-len(Z):]

# labels_dict = defaultdict(str)
# for row in real_linkage_labels:
#     index1 = row[0]
#     index2 = row[1]
    
#     summary1 = row[5][0]
#     summary2 = row[5][1]

#     labels_dict[index1] = summary1
#     labels_dict[index2] = summary2

# for i in labels_dict.keys():
#     i = int(i)
#     all_values[i].append(labels_dict[i])

In [138]:
# len(real_linkage_labels), len(new_cluster_values)
# Preview only the first few rows instead of dumping the entire list into the output.
linkage_labels[:10]

[[399.0,
  594.0,
  0.42786448395325166,
  3.0,
  'Alchemy - France.',
  ['Book - Philosophy - Alchemy', 'France- (or French)']],
 [204.0,
  595.0,
  0.15685175833221093,
  3.0,
  'Drama.',
  ['Category: Theater.', 'Literature/classic/play.']],
 [303.0,
  596.0,
  0.1652177944881287,
  4.0,
  'Literary analysis.',
  ['Shakespearean play analysis.', 'Drama.']],
 [303.0,
  596.0,
  0.1652177944881287,
  4.0,
  'Literary analysis.',
  ['Shakespearean play analysis.', 'Drama.']],
 [277.0,
  379.0,
  0.1695001531610153,
  2.0,
  'Russian fiction.',
  ['Literature, Russian, Free.', 'Book, Fiction, Russian-Language.']],
 [288.0,
  599.0,
  0.4305627017428426,
  3.0,
  'Design principles.',
  ['Engineering book.', 'Machines']],
 [216.0,
  407.0,
  0.19415253415551106,
  2.0,
  'Future predictions.',
  ['Prophecy book.', 'Prophecies.']],
 [243.0,
  601.0,
  0.4949923818620001,
  3.0,
  'Information source.',
  ['Book - Bernard Werber - Encyclopedia', 'Reference material.']],
 [320.0,
  602.0,
 

In [134]:
# leaf_values == all_values[:len(leaf_values)]
# Sanity check: everything after the leaves is exactly the merged-cluster entries.
# (Previously a bare comparison whose result was silently discarded.)
assert all_values[len(leaf_values):] == new_cluster_values

# find all rows that have an empty string for last value
missing_summary_indices = [idx for idx, entry in enumerate(all_values) if entry[-1] == ""]
for idx in missing_summary_indices:
    print(idx)

count = len(missing_summary_indices)
count


594
595
599
601
602
604
605
606
608
609
612
616
632
635
641
643
647
649
651
652
653
658
660
661
663
664
665
666
670
679
683
686
687
688
694
702
706
709
711
712
717
732
740
741
750
752
777
799
844
860
861
884
891
896
897
900
904
916
948
969
1009
1010
1013
1046
1052
1064
1086
1093
1133


69

# Add respective label and layer number to each row

Questions for Kaanan:

Are the summaries in order? (i.e., does the first row of the summaries correspond to the first row of the linkage matrix?)