In [188]:
import json
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, cut_tree, fcluster
import pickle as pkl
from collections import defaultdict

In [189]:
with open("data/og/embeddings.json") as f:
    embeddings = json.load(f)

reverse_embeddings = {}
for i in range(len(embeddings.keys())):
    reverse_embeddings[tuple(list(embeddings.values())[i])] = list(embeddings.keys())[i]

with open("data/og/openai_summaries_2", "rb") as fp:
    summaries = pkl.load(fp)

In [190]:
with open('data/og/LinkageLabels2.pkl', 'rb') as f:
    linkage_labels = pkl.load(f)

with open('data/pca5_normalized.json') as f:
    pca5_normalized = json.load(f)

In [191]:
with open("data/real_linkage_labels.pkl", "rb") as fp:
    real_linkage_labels = pkl.load(fp)

# Get cluster average values

In [192]:
# https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/
#  Creates value of links, such as color, by blending together leaf values
def create_link_values(linkage_matrix, leaf_values,
                    merge_function = lambda cluster1_val, cluster2_val, weight: cluster1_val * (1 - weight) + cluster2_val * weight,
                    weight_function = lambda cluster1_size, cluster2_size: cluster2_size / (cluster1_size + cluster2_size)):
    # Merge function takes val1, val2, and a weight (0-1, 0 preferences val1, 1 preferences val2) and returns a merged value

    num_leaves = len(leaf_values)
    
    new_values = leaf_values.copy()
    for i, row in enumerate(linkage_matrix):
        cluster1_id = int(row[0])
        cluster2_id = int(row[1])

        if cluster1_id < num_leaves: # it is a sample/leaaf
            cluster1_size = 1
        else:
            cluster1_size = linkage_matrix[num_leaves - cluster1_id][-1]
        
        if cluster2_id < num_leaves: # it is a sample/leaf
            cluster2_size = 1
        else:
            cluster2_size = linkage_matrix[num_leaves - cluster2_id][-1]

        # If a cluster is huge and is merged with a leaf, the weight should preference the cluster
        # Can try different weighting here
        weight = weight_function(cluster1_size, cluster2_size)

        # merge the values for each index, 5 indexes
        try:
            merged_cluster_value = [merge_function(new_values[cluster1_id][i], new_values[cluster2_id][i], weight) for i in range(5)]

            # find same cluster in real linkage values based on cluster ids and index 0 and 1 in linkage matrix
            summary = ""
            for j in range(len(real_linkage_labels)):
                if real_linkage_labels[j][0] == cluster1_id and real_linkage_labels[j][1] == cluster2_id:
                    summary = real_linkage_labels[j][4]
                    break
                    
            merged_cluster_value.append(summary)

        except:
            print("INDEX OUT OF RANGE: " + cluster1_id, cluster2_id, weight, cluster1_size, cluster2_size)

        new_values.append(merged_cluster_value)

    return new_values

In [193]:
Z = linkage(list(embeddings.values()))
leaf_values = [list(map(float,i)) for i in list(pca5_normalized.values())]
leaf_labels = list(pca5_normalized.keys())

leaf_values = [leaf_values[i] + [leaf_labels[i]] for i in range(len(leaf_values))]

all_values = create_link_values(Z, leaf_values)
# new_cluster_values = all_values[-len(Z):]

labels_dict = defaultdict(str)
for row in real_linkage_labels:
    index1 = row[0]
    index2 = row[1]
    
    summary1 = row[5][0]
    summary2 = row[5][1]

    labels_dict[index1] = summary1
    labels_dict[index2] = summary2

for i in labels_dict.keys():
    i = int(i)
    all_values[i].append(labels_dict[i])

new_cluster_values = all_values[-len(Z):]

In [194]:
# len(real_linkage_labels), len(new_cluster_values)
all_values

[[0.6508792887267072,
  -0.03368594147500735,
  81.74438917579135,
  104.47192267625915,
  114.39322563698086,
  'Create a programming language.md'],
 [0.6902856538865418,
  0.02820255252055549,
  107.63253043586033,
  105.26765582031973,
  112.84345105146453,
  'Programming languages.md'],
 [0.5528910118855592,
  0.6891953120204469,
  173.59784843884685,
  168.52286367786002,
  51.067562102288804,
  'Inspectional reading.md',
  'Inspectional reading.md'],
 [0.3469958851072213,
  0.6046609218673841,
  169.40545807440242,
  145.05457689115653,
  55.18920763183186,
  'Reading philosophy.md',
  'Reading philosophy.md'],
 [0.4916103343386223,
  0.7147807594959095,
  168.81026376127676,
  171.37260700619444,
  53.254813380252386,
  'Reading imaginary.md'],
 [0.6160317399114056,
  0.8613115848166475,
  168.27107587668314,
  150.75966307655696,
  81.60966644171326,
  'Reading technics should differ depending on book type.md'],
 [0.5660596718782442,
  0.46308264228446033,
  178.90110065039192,

In [198]:
# leaf_values == all_values[:len(leaf_values)]
print(all_values[len(leaf_values):] == new_cluster_values)

# find all rows that have an empty string for last value
count = 0
for i in range(len(all_values)):
    if all_values[i][-1] == "":
        print(i)
        count += 1

count


True


0

# Add respective label and layer number to each row

In [202]:
# if len(row) == 7, write row[6], else row[5]
ssss = [row[6] if len(row) == 7 else row[5] for row in all_values]
# write sss to a file, make it readable
with open("data/TESTTESTMAYBEIDIDIT.txt", "w") as f:
    for i, row in enumerate(ssss):
        f.write(str(i) + " " + row + "\n")

In [203]:
new_cluster_values

[[-0.7302799771581137,
  0.13579780068902975,
  130.7809495592481,
  121.14126199958531,
  137.30626116744372,
  '',
  'France- (or French)'],
 [-0.6076535318744656,
  0.037439022634607175,
  128.72823876522367,
  119.23074775059784,
  104.26600831903758,
  '',
  'Literature/classic/play.'],
 [-0.6076402867189018,
  0.037444856375648705,
  128.73414332955548,
  119.23139998629911,
  104.27215371064247,
  'Drama.',
  'Drama.'],
 [-0.6076522656175559,
  0.037430071322404786,
  128.73544062462776,
  119.23547629579005,
  104.2769436680397,
  'Literary analysis.',
  '202106131132 Litterature notes helps comparing different authors ideas.md'],
 [-0.7670685484953554,
  0.18707210249746992,
  125.85974787339259,
  118.11243109412555,
  114.3866999595293,
  'Russian fiction.',
  'Russian fiction.'],
 [-0.6826158047740333,
  -0.004981441004507382,
  108.41604008951347,
  97.02865969600929,
  110.66573572975248,
  '',
  'Machines'],
 [-0.6770962705926772,
  0.1389823309034109,
  114.5820333996,


Questions for Kaanan:

Are summaries in order? (i.e. is the first row of the summary the first row of the linkage matrix?)

DUPLICATE?!?!
[399.0, 594.0, 0.42786448395325166, 3.0, 'Alchemy - France.', ['Book - Philosophy - Alchemy', 'France- (or French)'], array([0])]
[204.0, 595.0, 0.15685175833221093, 3.0, 'Drama.', ['Category: Theater.', 'Literature/classic/play.'], array([0])]
[303.0, 596.0, 0.1652177944881287, 4.0, 'Literary analysis.', ['Shakespearean play analysis.', 'Drama.'], array([0])]
[303.0, 596.0, 0.1652177944881287, 4.0, 'Literary analysis.', ['Shakespearean play analysis.', 'Drama.'], array([0])]
[277.0, 379.0, 0.1695001531610153, 2.0, 'Russian fiction.', ['Les Frères Karamazov I - Dostoïevski.md', 'Les Frères Karamazov Ii - Dostoïevski.md'], array([0])]
[288.0, 599.0, 0.4305627017428426, 3.0, 'Design principles.', ['Engineering book.', 'Machines'], array([0])]

Yo so the logic is in these rows, [0] lines up with [5][0] and [1] lines up with [5][1] and so [4] lines up with the obersvation in all_values that shares the [0] and [1] clusters. However, I have 69 that are left over with no summary label. I did notice on duplicate row in linkage_labels which is kinda sus. Maybe if statement, line 18 below 'summary to filename' in processing.ipynb was messing me up... Maybe some of the summaries where there are three leaves are notes too?? IDK man    

There are some duplicates idk why man. I have all spaces filled now but idk why some of the .md files are all over the place......