In [1]:
# Reloads the hierarchical_clustering module and runs a test on the sample embeddings CSV file
import importlib
import hierarchical_clustering
importlib.reload(hierarchical_clustering)
# Import the necessary functions from the hierarchical_clustering module
from hierarchical_clustering import build_tree, get_all_centroids, calculate_balance
from scipy.spatial.distance import euclidean


# Test the build_tree() function with a sample embeddings CSV file.
# The path is kept in a variable because it is passed again to
# get_all_centroids() later in this cell.
embedding_file = "./sample_embeddings.csv"
tree = build_tree(embedding_file)
# The return value is not used here.
# NOTE(review): presumably this fills in the `balance` attribute that
# print_tree_structure() reads from every node -- confirm in hierarchical_clustering.
calculate_balance(tree)

# Check the structure of the tree (e.g., print the number of files in each node)
def print_tree_structure(node, depth=0):
    """Print one summary line per node of a binary tree, in pre-order.

    Each node is printed before its left subtree, which is printed in full
    before its right subtree. A line shows the node's depth, the value returned
    by ``node.get_children()`` (reported as the node's file count) and
    ``node.balance``.

    The traversal uses an explicit stack rather than recursion, so a degenerate
    chain-shaped tree that is deeper than the interpreter's recursion limit is
    printed completely instead of raising ``RecursionError``.

    Args:
        node: Root of the (sub)tree to print. It must expose ``left``,
            ``right``, ``balance`` and ``get_children()``. ``None`` prints
            nothing.
        depth: Depth label to use for ``node``; children get ``depth + 1``.

    Returns:
        None. Output goes to stdout.
    """
    pending = [(node, depth)]
    while pending:
        current, level = pending.pop()
        if current is None:
            # Missing child (or an empty tree): nothing to report.
            continue
        print(f"Depth {level}: Node has {current.get_children()} files with balance {current.balance}")
        # Push the right child first so the left child is popped, and therefore
        # printed, first -- this keeps the original left-before-right order.
        pending.append((current.right, level + 1))
        pending.append((current.left, level + 1))

# Dump the whole tree, starting from the root at depth 0.
print_tree_structure(tree)

# Get the centroids for each level. print_level_stats() iterates the result
# as a sequence with one indexable collection of centroids per level.
levels = get_all_centroids(tree, embedding_file)

# Function to print statistics for each level
def print_level_stats(levels):
    """Print centroid count and pairwise-distance statistics for every level.

    For each entry of ``levels`` (numbered from 0) the Euclidean distance
    between every unordered pair of its centroids is computed. The number of
    centroids is printed, followed by the minimum, maximum and mean of those
    distances to two decimals, or a notice when the level has fewer than two
    centroids and therefore no pairs. A blank line ends each level.

    Args:
        levels: Iterable of per-level centroid collections. Each collection
            must support ``len()`` and integer indexing, and its items must be
            accepted by ``scipy.spatial.distance.euclidean``.

    Returns:
        None. Output goes to stdout.
    """
    for level, points in enumerate(levels):
        num_centroids = len(points)

        # Visit every unordered pair (first < second) exactly once.
        pairwise = []
        for first in range(num_centroids):
            for second in range(first + 1, num_centroids):
                pairwise.append(euclidean(points[first], points[second]))

        print(f"Level {level}:")
        print(f"  Number of centroids: {num_centroids}")
        if not pairwise:
            print("  No distances to calculate")
        else:
            min_distance = min(pairwise)
            max_distance = max(pairwise)
            avg_distance = sum(pairwise) / len(pairwise)
            print(f"  Min distance: {min_distance:.2f}")
            print(f"  Max distance: {max_distance:.2f}")
            print(f"  Avg distance: {avg_distance:.2f}")
        print()

# Print, for each level, the centroid count and the min/max/average
# pairwise Euclidean distance between that level's centroids.
print_level_stats(levels)



Depth 0: Node has 594 files with balance 222
Depth 1: Node has 1 files with balance 0
Depth 1: Node has 593 files with balance 221
Depth 2: Node has 1 files with balance 0
Depth 2: Node has 592 files with balance 220
Depth 3: Node has 1 files with balance 0
Depth 3: Node has 591 files with balance 219
Depth 4: Node has 1 files with balance 0
Depth 4: Node has 590 files with balance 218
Depth 5: Node has 1 files with balance 0
Depth 5: Node has 589 files with balance 217
Depth 6: Node has 1 files with balance 0
Depth 6: Node has 588 files with balance 216
Depth 7: Node has 1 files with balance 0
Depth 7: Node has 587 files with balance 215
Depth 8: Node has 1 files with balance 0
Depth 8: Node has 586 files with balance 214
Depth 9: Node has 1 files with balance 0
Depth 9: Node has 585 files with balance 211
Depth 10: Node has 3 files with balance 1
Depth 11: Node has 1 files with balance 0
Depth 11: Node has 2 files with balance 0
Depth 12: Node has 1 files with balance 0
Depth 12: Nod

  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid

Level 0:
  Number of centroids: 1
  No distances to calculate

Level 1:
  Number of centroids: 2
  Min distance: 0.59
  Max distance: 0.59
  Avg distance: 0.59

Level 2:
  Number of centroids: 2
  Min distance: 0.59
  Max distance: 0.59
  Avg distance: 0.59

Level 3:
  Number of centroids: 2
  Min distance: 0.57
  Max distance: 0.57
  Avg distance: 0.57

Level 4:
  Number of centroids: 2
  Min distance: 0.57
  Max distance: 0.57
  Avg distance: 0.57

Level 5:
  Number of centroids: 2
  Min distance: 0.57
  Max distance: 0.57
  Avg distance: 0.57

Level 6:
  Number of centroids: 2
  Min distance: 0.58
  Max distance: 0.58
  Avg distance: 0.58

Level 7:
  Number of centroids: 2
  Min distance: 0.59
  Max distance: 0.59
  Avg distance: 0.59

Level 8:
  Number of centroids: 2
  Min distance: 0.56
  Max distance: 0.56
  Avg distance: 0.56

Level 9:
  Number of centroids: 2
  Min distance: 0.56
  Max distance: 0.56
  Avg distance: 0.56

Level 10:
  Number of centroids: 2
  Min distance: 0.45

  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid = df.loc[df['filename'].isin(files)].mean()
  centroid

In [10]:
# Scratch check: `+` on lists concatenates them; the nested [1, 2] is kept
# as a single element rather than being flattened.
[1] + [1,[1,2]]

[1, 1, [1, 2]]