In [6]:
%load_ext autoreload
%autoreload 2
import os
from huggingface_hub import login
from collections import defaultdict
from projects.wiki_experts.src.evolution.utils import get_svd_embedding

from mttl.models.modifiers.expert_containers.expert_library import LocalExpertLibrary,  HFExpertLibrary
from mttl.models.modifiers.expert_containers.library_transforms import SVDEmbeddingTransform, SVDEmbeddingTransformConfig
from huggingface_hub import login, HfApi
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Create clsuters

In [19]:
hf_api_key = os.environ["HF_TOKEN"]
login(token=hf_api_key)
user = HfApi(token=hf_api_key).whoami()
# hf_repo_id = "sordonia/library-phi_2-v3"
hf_repo_id = "sordonia/library-phi_2-v3-2epc"
local_lib_location = f"/tmp/{hf_repo_id}"
if not os.path.exists(local_lib_location):
    os.makedirs(local_lib_location)
    expert_lib: LocalExpertLibrary = LocalExpertLibrary.create_from_remote(
        HFExpertLibrary(hf_repo_id), local_lib_location
    )
else:
    expert_lib: LocalExpertLibrary = LocalExpertLibrary(local_lib_location)

experts_to_remove = [
    "bool_q_1_0_0",
    "ai2_arc_ARC_Easy_1_0_0",
    "openbookqa_0_1_0",
    "ai2_arc_ARC_Challenge_1_0_0",
    "hellaswag_1_1_0",
    "piqa_1_0_0",
    "winogrande_1_1_0",
]
for expert_name in experts_to_remove:
    if expert_name in expert_lib:
        expert_lib.remove_expert(expert_name)
assert len(expert_lib) == 256

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/v-oostapenko/.cache/huggingface/token
Login successful


In [20]:
def create_embeddings():
    svd_embedder = SVDEmbeddingTransform(
        SVDEmbeddingTransformConfig(sparsity_threshold=0.5)
    )
    embeddings, svd = svd_embedder.transform(expert_lib, upload_to_hf=True)
    del svd_embedder
    return embeddings, svd


embeds = expert_lib.get_auxiliary_data("embeddings")
if len(embeds) == 0:
    print("creating embeddings")
    _, svd = create_embeddings()

# module to embedding
module2embed = {}
for n, m in expert_lib.items():
    module2embed[n] = get_svd_embedding(expert_lib, n)

In [25]:
# Extract the embeddings as a numpy array
embeddings = np.array(list(module2embed.values()))
cosine_sim_matrix = cosine_similarity(embeddings, embeddings)
K = 25
kmeans = KMeans(n_clusters=K, init="k-means++", n_init=10, random_state=42)
kmeans.fit(cosine_sim_matrix)
cluster_labels = kmeans.labels_

In [32]:
clusters = defaultdict(list)
# Print the cluster labels for each embedding
for key, label in zip(module2embed.keys(), cluster_labels_direct):
    clusters[label].append(key)

for c, l in clusters.items():
    print(f"Cluster {c} has {len(l)} elements")
    print(f"c{c}o{K}_2e = {l}")

Cluster 7 has 37 elements
c7o10_2e = ['glue_sst2_2_0_0', 'super_glue_rte_1_0_2', 'wiki_qa_found_on_google', 'app_reviews_categorize_rating_using_review', 'super_glue_wic_1_0_2', 'yelp_polarity_reviews_0_2_0', 'anli_r3_0_1_0', 'super_glue_cb_1_0_2', 'glue_cola_2_0_0', 'paws_wiki_1_1_0', 'super_glue_wsc_fixed_1_0_2', 'cot_creak', 'wiki_qa_Is_This_True_', 'super_glue_multirc_1_0_2', 'snli_1_1_0', 'cot_strategyqa', 'glue_qqp_2_0_0', 'imdb_reviews_plain_text_1_0_0', 'wiki_qa_exercise', 'wiki_qa_automatic_system', 'cot_creak_ii', 'anli_r2_0_1_0', 'qasc_is_correct_1', 'anli_r1_0_1_0', 'glue_stsb_2_0_0', 'glue_qnli_2_0_0', 'cot_sensemaking_ii', 'glue_mnli_2_0_0', 'super_glue_copa_1_0_2', 'social_i_qa_Check_if_a_random_answer_is_valid_or_not', 'qasc_is_correct_2', 'cosmos_qa_1_0_0', 'wiki_qa_Decide_good_answer', 'definite_pronoun_resolution_1_1_0', 'glue_wnli_2_0_0', 'glue_mrpc_2_0_0', 'cot_strategyqa_ii']
Cluster 3 has 29 elements
c3o10_2e = ['dream_read_the_following_conversation_and_answer_t

### Rand score

In [14]:
from sklearn.metrics.cluster import adjusted_rand_score

In [29]:
adjusted_rand_score(cluster_labels, cluster_labels_direct)

0.47611902409439244

# Save svd file

In [None]:
hf_api_key = os.environ["HF_TOKEN"]
login(token=hf_api_key)
user = HfApi(token=hf_api_key).whoami()
hf_repo_id = "ostapeno/library-phi_2-v3-10-flan-clusters"
local_lib_location = f"/tmp/{hf_repo_id}"
if not os.path.exists(local_lib_location):
    os.makedirs(local_lib_location)
    expert_lib: LocalExpertLibrary = LocalExpertLibrary.create_from_remote(
        HFExpertLibrary(hf_repo_id), local_lib_location
    )
else:
    expert_lib: LocalExpertLibrary = LocalExpertLibrary(local_lib_location)

In [None]:
_, svd = create_embeddings()

In [None]:
# upload embeddings
remote_lib = HFExpertLibrary.from_local(
    expert_lib,
    hf_repo_id,
    force=True,
    upload_aux_data=True,
)

In [None]:
import pickle
# To save
pickle.dump(svd, open("svd.pkl", "wb"))