In [1]:
!pip install geoopt
!pip install datasets
!pip install mwparserfromhell



In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

In [None]:
from huggingface_hub import login

#Enter your token here
HF_TOKEN_NAMAN = ''
login(HF_TOKEN_NAMAN)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

baichuan_path = 'baichuan-inc/Baichuan-7B'
baichuan_model = AutoModelForCausalLM.from_pretrained(baichuan_path, trust_remote_code=True,
                                                   output_hidden_states=True,
                                                  )
baichuan_model = baichuan_model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
tokenizer = AutoTokenizer.from_pretrained(baichuan_path, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [6]:
from datasets import get_dataset_config_names

configs = get_dataset_config_names("wikipedia")
print(configs)


['20220301.aa', '20220301.ab', '20220301.ace', '20220301.ady', '20220301.af', '20220301.ak', '20220301.als', '20220301.am', '20220301.an', '20220301.ang', '20220301.ar', '20220301.arc', '20220301.arz', '20220301.as', '20220301.ast', '20220301.atj', '20220301.av', '20220301.ay', '20220301.az', '20220301.azb', '20220301.ba', '20220301.bar', '20220301.bat-smg', '20220301.bcl', '20220301.be', '20220301.be-x-old', '20220301.bg', '20220301.bh', '20220301.bi', '20220301.bjn', '20220301.bm', '20220301.bn', '20220301.bo', '20220301.bpy', '20220301.br', '20220301.bs', '20220301.bug', '20220301.bxr', '20220301.ca', '20220301.cbk-zam', '20220301.cdo', '20220301.ce', '20220301.ceb', '20220301.ch', '20220301.cho', '20220301.chr', '20220301.chy', '20220301.ckb', '20220301.co', '20220301.cr', '20220301.crh', '20220301.cs', '20220301.csb', '20220301.cu', '20220301.cv', '20220301.cy', '20220301.da', '20220301.de', '20220301.din', '20220301.diq', '20220301.dsb', '20220301.dty', '20220301.dv', '20220301.d

In [7]:
dataset = load_dataset("wikipedia", "20220301.simple", split="train[:50]", trust_remote_code=True)
texts = dataset["text"]

dataset = load_dataset("wikipedia", "20220301.zh-classical", split="train[:50]", trust_remote_code=True)
texts.extend(dataset["text"])

In [8]:
texts[0]

'April is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of four months to have 30 days.\n\nApril always begins on the same day of week as July, and additionally, January in leap years. April always ends on the same day of the week as December.\n\nApril\'s flowers are the Sweet Pea and Daisy. Its birthstone is the diamond. The meaning of the diamond is innocence.\n\nThe Month \n\nApril comes between March and May, making it the fourth month of the year. It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.\n\nApril begins on the same day of the week as July every year and on the same day of the week as January in leap years. April ends on the same day of the week as December every year, as each other\'s last days are exactly 35 weeks (245 days) apart.\n\nIn common years, April starts on the same day of the week as October of the previous year, and i

In [9]:
def process_text_in_chunks(text, tokenizer, model, chunk_size=512, max_length=512):
    """
    Process a single text input in smaller chunks to avoid memory overflow.
    Returns accumulated activations for each layer.
    """
    # Tokenize text
    tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    input_ids = tokens["input_ids"][0]  # Get the first sequence

    # Get the number of layers from the model's configuration
    num_layers = model.config.num_hidden_layers

    # Initialize accumulation list on CPU
    accumulated_hidden_states = [torch.zeros(0, model.config.hidden_size).to("cpu") for _ in range(num_layers)]

    # Process input in chunks
    for i in range(0, len(input_ids), chunk_size):
        chunk = input_ids[i : i + chunk_size].unsqueeze(0).to("cuda")  # Move to GPU

        with torch.no_grad():
            outputs = model(input_ids=chunk, output_hidden_states=True)
            hidden_states = outputs.hidden_states  # List of tensors, one per layer
            #print("Length of hidden states", len(hidden_states))
        # Offload activations for each layer to CPU and accumulate
        for layer_idx, layer_activation in enumerate(hidden_states[1:]):
            #print(layer_idx)
            accumulated_hidden_states[layer_idx] = torch.cat(
                [accumulated_hidden_states[layer_idx], layer_activation.squeeze(0).to("cpu")], dim=0
            )

        # Free GPU memory
        del chunk, hidden_states
        torch.cuda.empty_cache()

    return accumulated_hidden_states

# Step 4: Process all samples efficiently
def process_all_samples(texts, tokenizer, model, chunk_size=512):
    """
    Process all texts and compute accumulated hidden states for each layer.
    """
    # Get the number of layers from the model's configuration
    num_layers = model.config.num_hidden_layers

    # Initialize global accumulation list for all layers
    global_accumulated_hidden_states = [torch.zeros(0, model.config.hidden_size).to("cpu") for _ in range(num_layers)]

    for text in texts:
        sample_hidden_states = process_text_in_chunks(text, tokenizer, model, chunk_size)

        # Accumulate results across all samples for each layer
        for layer_idx, layer_activation in enumerate(sample_hidden_states):
            global_accumulated_hidden_states[layer_idx] = torch.cat(
                [global_accumulated_hidden_states[layer_idx], layer_activation], dim=0
            )
        # Free CPU memory for intermediate sample_hidden_states
        del sample_hidden_states

    return global_accumulated_hidden_states


def compute_layer_averages(global_accumulated_hidden_states, num_samples):
    """
    Compute average hidden states for each layer, collapsing both tokens and samples.
    """
    return [layer_hidden_states.mean(dim=0, keepdim=True).mean(dim=0, keepdim=True) for layer_hidden_states in global_accumulated_hidden_states]

# Execute the pipeline
chunk_size = 128
global_hidden_states = process_all_samples(texts, tokenizer, baichuan_model, chunk_size)
averaged_hidden_states = compute_layer_averages(global_hidden_states, len(texts))

In [None]:
# import pickle
# with open('averaged_hidden_states.pkl', 'wb') as f:
#     pickle.dump(averaged_hidden_states, f)

In [11]:
# import pickle
# with open('averaged_hidden_states.pkl', 'rb') as f:
#     averaged_hidden_states = pickle.load(f)

In [12]:
import math

import torch
from torch import Tensor

def exp_map0(x: Tensor, curv: float | Tensor = 1.0, eps: float = 1e-8) -> Tensor:
    """
    Map points from the tangent space at the vertex of hyperboloid, on to the
    hyperboloid. This mapping is done using the exponential map of Lorentz model.

    Args:
        x: Tensor of shape `(B, D)` giving batch of Euclidean vectors to project
            onto the hyperboloid. These vectors are interpreted as velocity
            vectors in the tangent space at the hyperboloid vertex.
        curv: Positive scalar denoting negative hyperboloid curvature.
        eps: Small float number to avoid division by zero.

    Returns:
        Tensor of same shape as `x`, giving space components of the mapped
        vectors on the hyperboloid.
    """

    rc_xnorm = curv**0.5 * torch.norm(x, dim=-1, keepdim=True)

    # Ensure numerical stability in sinh by clamping input.
    sinh_input = torch.clamp(rc_xnorm, min=eps, max=math.asinh(2**15))
    _output = torch.sinh(sinh_input) * x / torch.clamp(rc_xnorm, min=eps)
    return _output

def pairwise_inner(x: Tensor, y: Tensor, curv: float | Tensor = 1.0):
    """
    Compute pairwise Lorentzian inner product between input vectors.

    Args:
        x: Tensor of shape `(B1, D)` giving a space components of a batch
            of vectors on the hyperboloid.
        y: Tensor of shape `(B2, D)` giving a space components of another
            batch of points on the hyperboloid.
        curv: Positive scalar denoting negative hyperboloid curvature.
        eps: Small float number to avoid numerical instability.

    Returns:
        Tensor of shape `(B1, B2)` giving pairwise Lorentzian inner product
        between input vectors.
    """

    x_time = torch.sqrt(1 / curv + torch.sum(x**2, dim=-1, keepdim=True))
    y_time = torch.sqrt(1 / curv + torch.sum(y**2, dim=-1, keepdim=True))
    xyl = x @ y.T - x_time @ y_time.T
    return xyl


def pairwise_dist(
    x: Tensor, y: Tensor, curv: float | Tensor = 1.0, eps: float = 1e-8
) -> Tensor:
    """
    Compute the pairwise geodesic distance between two batches of points on
    the hyperboloid.

    Args:
        x: Tensor of shape `(B1, D)` giving a space components of a batch
            of point on the hyperboloid.
        y: Tensor of shape `(B2, D)` giving a space components of another
            batch of points on the hyperboloid.
        curv: Positive scalar denoting negative hyperboloid curvature.
        eps: Small float number to avoid numerical instability.

    Returns:
        Tensor of shape `(B1, B2)` giving pairwise distance along the geodesics
        connecting the input points.
    """

    c_xyl = -curv * pairwise_inner(x, y, curv)
    _distance = torch.acosh(torch.clamp(c_xyl, min=1 + eps))
    return _distance / curv**0.5



In [14]:
import torch

# Step 1: Project each layer's hidden state onto the hyperboloid
def project_layers_to_hyperboloid(averaged_hidden_states, curv=1.0):
    """
    Projects each layer's averaged hidden state onto the Lorentz hyperboloid.
    Args:
        averaged_hidden_states (list of torch.Tensor): List of tensors, each of shape [1, hidden_size].
        curv (float): Curvature of the hyperboloid.
    Returns:
        torch.Tensor: Projected embeddings on the hyperboloid of shape [num_layers, hidden_size].
    """
    # Apply exp_map0 to each tensor in the list
    projected_embeddings = torch.cat([exp_map0(state, curv=curv) for state in averaged_hidden_states], dim=0)
    return projected_embeddings

proj_embeddings = project_layers_to_hyperboloid(averaged_hidden_states, curv=1.0)

print("proj_embeddings:")
print(proj_embeddings)


proj_embeddings:
tensor([[-1.4257e+00, -2.7491e+00, -1.4808e+00,  ..., -1.7245e+00,
          3.9758e+00,  1.1430e+00],
        [-1.7000e+01, -1.2098e+01,  1.5052e+01,  ...,  7.9979e+00,
          6.0188e+01,  6.1079e+01],
        [-3.6613e+01, -4.5963e+01,  5.0800e+01,  ..., -8.6555e+00,
          9.7909e+01,  1.1145e+02],
        ...,
        [ 2.7265e+02, -1.8691e+02, -1.4212e+01,  ...,  8.9775e+01,
         -1.3123e+02,  3.0310e+00],
        [ 6.6495e+02, -4.8696e+02, -5.5979e+02,  ...,  4.1126e+02,
         -5.1421e+02,  7.0104e+02],
        [ 5.1030e+02, -1.2849e+02,  2.0052e+02,  ...,  7.1778e+02,
         -5.0017e+02,  1.3480e+03]])


In [15]:
proj_embeddings.shape

torch.Size([32, 4096])

In [None]:
import numpy as np
def hyperbolic_spectral_clustering(projected_embeddings, n_clusters=5):
    """
    Perform spectral clustering using hyperbolic inner product.
    """
    def hyperbolic_inner_product(x, y, curv=1.0):
        x_time = torch.sqrt(1 / curv + torch.sum(x**2, dim=-1, keepdim=True))
        y_time = torch.sqrt(1 / curv + torch.sum(y**2, dim=-1, keepdim=True))
        return x @ y.T - x_time @ y_time.T

    flat_embeddings = projected_embeddings.squeeze(1)

    affinity_matrix = hyperbolic_inner_product(flat_embeddings, flat_embeddings).cpu().numpy()

    affinity_matrix = (affinity_matrix - affinity_matrix.min()) / (affinity_matrix.max() - affinity_matrix.min())

    from sklearn.cluster import SpectralClustering
    clustering = SpectralClustering(
        n_clusters=n_clusters,
        affinity='precomputed',
        random_state=21
    ).fit(affinity_matrix)

    cluster_labels = clustering.labels_

    clusters = {}
    for cluster in range(n_clusters):
        cluster_indices = np.where(cluster_labels == cluster)[0]
        clusters[cluster] = {
            'layers': cluster_indices.tolist()
        }

    return clusters, cluster_labels

def compute_cluster_similarities(clusters, similarity_matrix):
    """
    Compute intra-cluster similarities and identify top clusters.
    """
    # Convert similarity matrix to numpy if it's a torch tensor
    if torch.is_tensor(similarity_matrix):
        similarity_matrix = similarity_matrix.numpy()

    # Store cluster similarities
    cluster_similarities = {}

    # Compute similarities for each cluster
    for label, cluster_info in clusters.items():
        cluster_indices = cluster_info['layers']

        # Extract intra-cluster similarity matrix
        cluster_similarities_matrix = similarity_matrix[cluster_indices][:, cluster_indices]

        # Compute average similarity within the cluster
        avg_similarity = np.mean(cluster_similarities_matrix)
        cluster_similarities[label] = avg_similarity

    # Sort clusters by average similarity
    sorted_clusters = sorted(cluster_similarities.items(), key=lambda x: x[1], reverse=True)

    # Get top clusters
    top_clusters = {label: cluster_similarities[label] for label, _ in sorted_clusters}

    return clusters, cluster_similarities, top_clusters

# Usage
clusters, labels = hyperbolic_spectral_clustering(proj_embeddings, 6)
clusters_dict, similarities, top_clusters = compute_cluster_similarities(clusters, proj_embeddings)

# Sort clusters by their maximum similarity in descending order
sorted_clusters = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

# Print results
print("Clusters sorted by maximum similarity:")
for label, sim in sorted_clusters:
    layers = clusters_dict[label]['layers']  # Get layers associated with the cluster
    print(f"Cluster {label}: Layers {layers}, Max Similarity = {sim:.4f}")

print("\nTop Clusters:")
for label, sim in top_clusters.items():
    layers = clusters_dict[label]['layers']
    print(f"Cluster {label}: Layers {layers}, Average Similarity = {sim:.4f}")




Clusters sorted by maximum similarity:
Cluster 4: Layers [20, 21, 22], Max Similarity = 211.8329
Cluster 0: Layers [23, 24, 25, 26, 27, 28, 29], Max Similarity = 83.4615
Cluster 3: Layers [0, 8, 9, 10, 11, 12, 13, 14, 15, 16], Max Similarity = 14.7306
Cluster 2: Layers [1, 2, 3, 4, 5, 6, 7], Max Similarity = -23.4180
Cluster 5: Layers [17, 18, 19], Max Similarity = -141.4587
Cluster 1: Layers [30, 31], Max Similarity = -539.7925

Top Clusters:
Cluster 4: Layers [20, 21, 22], Average Similarity = 211.8329
Cluster 0: Layers [23, 24, 25, 26, 27, 28, 29], Average Similarity = 83.4615
Cluster 3: Layers [0, 8, 9, 10, 11, 12, 13, 14, 15, 16], Average Similarity = 14.7306
Cluster 2: Layers [1, 2, 3, 4, 5, 6, 7], Average Similarity = -23.4180
Cluster 5: Layers [17, 18, 19], Average Similarity = -141.4587
Cluster 1: Layers [30, 31], Average Similarity = -539.7925


In [17]:
def merge_layers_in_place_baichuan(model, merge_base_lay, merge_layer_num):
    # Determine the number of layers to merge, ensuring we don't exceed model limits
    merge_layer_num = min(merge_layer_num, len(model.model.layers) - merge_base_lay - 1)

    for diff_lay in range(merge_base_lay + 1, merge_base_lay + 1 + merge_layer_num):
        print(f"Merging layer {diff_lay} into layer {merge_base_lay}")

        # Retrieve the base and merging layers
        base_layer = model.model.layers[merge_base_lay]
        merging_layer = model.model.layers[diff_lay]

        # Update MLP weights: gate_proj, down_proj, up_proj
        base_layer.mlp.gate_proj.weight.data.add_(
            merging_layer.mlp.gate_proj.weight.data - base_layer.mlp.gate_proj.weight.data
        )
        base_layer.mlp.down_proj.weight.data.add_(
            merging_layer.mlp.down_proj.weight.data - base_layer.mlp.down_proj.weight.data
        )
        base_layer.mlp.up_proj.weight.data.add_(
            merging_layer.mlp.up_proj.weight.data - base_layer.mlp.up_proj.weight.data
        )

        # Update self-attention weights: W_pack, o_proj
        base_layer.self_attn.W_pack.weight.data.add_(
            merging_layer.self_attn.W_pack.weight.data - base_layer.self_attn.W_pack.weight.data
        )
        base_layer.self_attn.o_proj.weight.data.add_(
            merging_layer.self_attn.o_proj.weight.data - base_layer.self_attn.o_proj.weight.data
        )

    # Remove merged layers in reverse order to avoid shifting indices
    layers_to_delete = list(range(merge_base_lay + merge_layer_num, merge_base_lay, -1))
    for diff_lay in layers_to_delete:
        print(f"Deleting layer {diff_lay}")
        del model.model.layers[diff_lay]

    return model


In [18]:
def merge_clusters(model, clusters, cluster_similarities, threshold, merge_layers_fn):
    """
    Merge model layers based on clusters and their average similarities.

    Args:
    - model: The model to merge layers in.
    - clusters: Dictionary of clusters with layers as lists.
    - cluster_similarities: Dictionary with cluster IDs as keys and average similarities as values.
    - threshold: Similarity threshold to decide whether to merge a cluster.
    - merge_layers_fn: Function to handle merging of layers (e.g., merge_layers_return_model).

    Returns:
    - model: The updated model with merged layers.
    """
    # Sort clusters by similarity in descending order
    sorted_clusters = sorted(cluster_similarities.items(), key=lambda x: x[1], reverse=True)

    for cluster_id, avg_similarity in sorted_clusters:
        if avg_similarity > threshold:
            layers_to_merge = clusters[cluster_id]['layers']
            if len(layers_to_merge) > 1:
                print(f"Merging Cluster {cluster_id} with layers: {layers_to_merge} and avg similarity: {avg_similarity}")
                model = merge_layers_fn(model, min(layers_to_merge), len(layers_to_merge)-1)
            else:
                print(f"Skipping Cluster {cluster_id} with only one layer: {layers_to_merge}")
        else:
            print(f"Skipping Cluster {cluster_id}: Avg similarity {avg_similarity} below threshold.")

    return model



In [19]:
# Define a similarity threshold
SIMILARITY_THRESHOLD = 80

# Merge clusters
merged_model = merge_clusters(
    model=baichuan_model,
    clusters=clusters,
    cluster_similarities=similarities,
    threshold=SIMILARITY_THRESHOLD,
    merge_layers_fn=merge_layers_in_place_baichuan
)


Merging Cluster 4 with layers: [20, 21, 22] and avg similarity: 211.83290100097656
Merging layer 21 into layer 20
Merging layer 22 into layer 20
Deleting layer 22
Deleting layer 21
Merging Cluster 0 with layers: [23, 24, 25, 26, 27, 28, 29] and avg similarity: 83.46148681640625
Merging layer 24 into layer 23
Merging layer 25 into layer 23
Merging layer 26 into layer 23
Merging layer 27 into layer 23
Merging layer 28 into layer 23
Merging layer 29 into layer 23
Deleting layer 29
Deleting layer 28
Deleting layer 27
Deleting layer 26
Deleting layer 25
Deleting layer 24
Skipping Cluster 3: Avg similarity 14.73055648803711 below threshold.
Skipping Cluster 2: Avg similarity -23.417970657348633 below threshold.
Skipping Cluster 5: Avg similarity -141.458740234375 below threshold.
Skipping Cluster 1: Avg similarity -539.79248046875 below threshold.


In [20]:
merged_model.config.num_hidden_layers = len(merged_model.model.layers)
merged_model

BaiChuanForCausalLM(
  (model): Model(
    (embed_tokens): Embedding(64000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-23): 24 x DecoderLayer(
        (self_attn): Attention(
          (W_pack): Linear(in_features=4096, out_features=12288, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): RotaryEmbedding()
        )
        (mlp): MLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): RMSNorm()
        (post_attention_layernorm): RMSNorm()
      )
    )
    (norm): RMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=64000, bias=False)
)

In [21]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def generate_text(model, tokenizer, prompt, max_length=150, num_return_sequences=1):
    """
    Generates text from the given model and prompt.
    Args:
        model: HuggingFace transformer model (e.g., LLaMA).
        tokenizer: Corresponding tokenizer for the model.
        prompt (str): Input text prompt for generation.
        max_length (int): Maximum length of generated text.
        num_return_sequences (int): Number of text sequences to generate.
    Returns:
        List[str]: List of generated text sequences.
    """
    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate text
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,  # Prevent repetitive text
        early_stopping=True
    )

    # Decode and return the generated sequences
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]


# # Generate text with the unpruned model
prompt = "The weather looks nice today"
# unpruned_text = generate_text(llama_model, tokenizer, prompt)
# print("Unpruned Model Output:")
# print(unpruned_text)

# Generate text with the pruned model
pruned_text = generate_text(merged_model, tokenizer, prompt)
print("\nPruned Model Output:")
print(pruned_text)





Pruned Model Output:
['The weather looks nice today!!..but.\nToday we went to the beach. We went swimming too! I swim swimming swimming!\nI got lots of beach sand too. I got sand sand s.S.s.!']


In [22]:
merged_model.save_pretrained("/content/pruned_model")
tokenizer.save_pretrained("/content/pruned_model")

('/content/pruned_model/tokenizer_config.json',
 '/content/pruned_model/special_tokens_map.json',
 '/content/pruned_model/tokenizer.model',
 '/content/pruned_model/added_tokens.json')

In [23]:
with open("/content/pruned_model/README.md", "w") as f:
    f.write("# Merged Baichuan Model\n\nThis is a merged version of the Baichuan-7b model based on hyperboloid method where similarity was calculated based on hyperbolic distance. 10 layers have been merged.")

In [24]:
from huggingface_hub import HfApi

api = HfApi()
api.create_repo(repo_id="namannn/baichuan2-7b-hyperbolic-cluster-pruned", exist_ok=True)
api.upload_folder(
    folder_path="/content/pruned_model",
    repo_id="namannn/baichuan2-7b-hyperbolic-cluster-pruned",
    repo_type="model",
)

- empty or missing yaml metadata in repo card
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/namannn/baichuan2-7b-hyperbolic-cluster-pruned/commit/59d2f672553c9307a29e696895854886e2df29b2', commit_message='Upload folder using huggingface_hub', commit_description='', oid='59d2f672553c9307a29e696895854886e2df29b2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/namannn/baichuan2-7b-hyperbolic-cluster-pruned', endpoint='https://huggingface.co', repo_type='model', repo_id='namannn/baichuan2-7b-hyperbolic-cluster-pruned'), pr_revision=None, pr_num=None)