In [2]:
!pip install geoopt
!pip install datasets


Collecting geoopt
  Downloading geoopt-0.5.0-py3-none-any.whl.metadata (6.7 kB)
Downloading geoopt-0.5.0-py3-none-any.whl (90 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/90.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.1/90.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: geoopt
Successfully installed geoopt-0.5.0
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

In [None]:
from huggingface_hub import login

#Enter your token here
HF_TOKEN_NAMAN = ''
login(HF_TOKEN_NAMAN)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

llama_path = 'meta-llama/Llama-2-13b-hf'
llama_model = AutoModelForCausalLM.from_pretrained(
    llama_path,
    trust_remote_code=True,
    output_hidden_states=True,
    torch_dtype=torch.float16,
    device_map="auto"
)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained(llama_path, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [8]:
dataset = load_dataset("wikipedia", "20220301.simple", split="train[:100]", trust_remote_code=True)
texts = dataset["text"]

README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

wikipedia.py:   0%|          | 0.00/36.7k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/134M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/205328 [00:00<?, ? examples/s]

In [9]:
def process_text_in_chunks(text, tokenizer, model, chunk_size=512, max_length=512):
    """
    Process a single text input in smaller chunks to avoid memory overflow.
    Returns accumulated activations for each layer.
    """
    # Tokenize text
    tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    input_ids = tokens["input_ids"][0]  # Get the first sequence

    # Get the number of layers from the model's configuration
    num_layers = model.config.num_hidden_layers

    # Initialize accumulation list on CPU
    accumulated_hidden_states = [torch.zeros(0, model.config.hidden_size).to("cpu") for _ in range(num_layers)]

    # Process input in chunks
    for i in range(0, len(input_ids), chunk_size):
        chunk = input_ids[i : i + chunk_size].unsqueeze(0).to(device)  # Move to GPU

        with torch.no_grad():
            outputs = model(input_ids=chunk, output_hidden_states=True)
            hidden_states = outputs.hidden_states  # List of tensors, one per layer
            #print("Length of hidden states", len(hidden_states))
        # Offload activations for each layer to CPU and accumulate
        for layer_idx, layer_activation in enumerate(hidden_states[1:]):
            #print(layer_idx)
            accumulated_hidden_states[layer_idx] = torch.cat(
                [accumulated_hidden_states[layer_idx], layer_activation.squeeze(0).to("cpu")], dim=0
            )

        # Free GPU memory
        del chunk, hidden_states
        torch.cuda.empty_cache()

    return accumulated_hidden_states

# Step 4: Process all samples efficiently
def process_all_samples(texts, tokenizer, model, chunk_size=512):
    """
    Process all texts and compute accumulated hidden states for each layer.
    """
    # Get the number of layers from the model's configuration
    num_layers = model.config.num_hidden_layers

    # Initialize global accumulation list for all layers
    global_accumulated_hidden_states = [torch.zeros(0, model.config.hidden_size).to("cpu") for _ in range(num_layers)]

    for text in texts:
        sample_hidden_states = process_text_in_chunks(text, tokenizer, model, chunk_size)

        # Accumulate results across all samples for each layer
        for layer_idx, layer_activation in enumerate(sample_hidden_states):
            global_accumulated_hidden_states[layer_idx] = torch.cat(
                [global_accumulated_hidden_states[layer_idx], layer_activation], dim=0
            )
        # Free CPU memory for intermediate sample_hidden_states
        del sample_hidden_states

    return global_accumulated_hidden_states


def compute_layer_averages(global_accumulated_hidden_states, num_samples):
    """
    Compute average hidden states for each layer, collapsing both tokens and samples.
    """
    return [layer_hidden_states.mean(dim=0, keepdim=True).mean(dim=0, keepdim=True) for layer_hidden_states in global_accumulated_hidden_states]

# Execute the pipeline
chunk_size = 128
global_hidden_states = process_all_samples(texts, tokenizer, llama_model, chunk_size)
averaged_hidden_states = compute_layer_averages(global_hidden_states, len(texts))

In [10]:
# import pickle
# with open('averaged_hidden_states.pkl', 'wb') as f:
#     pickle.dump(averaged_hidden_states, f)

In [None]:
# import pickle
# with open('averaged_hidden_states.pkl', 'rb') as f:
#     averaged_hidden_states = pickle.load(f)

In [11]:
import math

import torch
from torch import Tensor

def exp_map0(x: Tensor, curv: float | Tensor = 1.0, eps: float = 1e-8) -> Tensor:
    """
    Map points from the tangent space at the vertex of hyperboloid, on to the
    hyperboloid. This mapping is done using the exponential map of Lorentz model.

    Args:
        x: Tensor of shape `(B, D)` giving batch of Euclidean vectors to project
            onto the hyperboloid. These vectors are interpreted as velocity
            vectors in the tangent space at the hyperboloid vertex.
        curv: Positive scalar denoting negative hyperboloid curvature.
        eps: Small float number to avoid division by zero.

    Returns:
        Tensor of same shape as `x`, giving space components of the mapped
        vectors on the hyperboloid.
    """

    rc_xnorm = curv**0.5 * torch.norm(x, dim=-1, keepdim=True)

    # Ensure numerical stability in sinh by clamping input.
    sinh_input = torch.clamp(rc_xnorm, min=eps, max=math.asinh(2**15))
    _output = torch.sinh(sinh_input) * x / torch.clamp(rc_xnorm, min=eps)
    return _output

def pairwise_inner(x: Tensor, y: Tensor, curv: float | Tensor = 1.0):
    """
    Compute pairwise Lorentzian inner product between input vectors.

    Args:
        x: Tensor of shape `(B1, D)` giving a space components of a batch
            of vectors on the hyperboloid.
        y: Tensor of shape `(B2, D)` giving a space components of another
            batch of points on the hyperboloid.
        curv: Positive scalar denoting negative hyperboloid curvature.
        eps: Small float number to avoid numerical instability.

    Returns:
        Tensor of shape `(B1, B2)` giving pairwise Lorentzian inner product
        between input vectors.
    """

    x_time = torch.sqrt(1 / curv + torch.sum(x**2, dim=-1, keepdim=True))
    y_time = torch.sqrt(1 / curv + torch.sum(y**2, dim=-1, keepdim=True))
    xyl = x @ y.T - x_time @ y_time.T
    return xyl


def pairwise_dist(
    x: Tensor, y: Tensor, curv: float | Tensor = 1.0, eps: float = 1e-8
) -> Tensor:
    """
    Compute the pairwise geodesic distance between two batches of points on
    the hyperboloid.

    Args:
        x: Tensor of shape `(B1, D)` giving a space components of a batch
            of point on the hyperboloid.
        y: Tensor of shape `(B2, D)` giving a space components of another
            batch of points on the hyperboloid.
        curv: Positive scalar denoting negative hyperboloid curvature.
        eps: Small float number to avoid numerical instability.

    Returns:
        Tensor of shape `(B1, B2)` giving pairwise distance along the geodesics
        connecting the input points.
    """

    c_xyl = -curv * pairwise_inner(x, y, curv)
    _distance = torch.acosh(torch.clamp(c_xyl, min=1 + eps))
    return _distance / curv**0.5



In [12]:
import torch

# Step 1: Project each layer's hidden state onto the hyperboloid
def project_layers_to_hyperboloid(averaged_hidden_states, curv=1.0):
    """
    Projects each layer's averaged hidden state onto the Lorentz hyperboloid.
    Args:
        averaged_hidden_states (list of torch.Tensor): List of tensors, each of shape [1, hidden_size].
        curv (float): Curvature of the hyperboloid.
    Returns:
        torch.Tensor: Projected embeddings on the hyperboloid of shape [num_layers, hidden_size].
    """
    # Apply exp_map0 to each tensor in the list
    projected_embeddings = torch.cat([exp_map0(state, curv=curv) for state in averaged_hidden_states], dim=0)
    return projected_embeddings

proj_embeddings = project_layers_to_hyperboloid(averaged_hidden_states, curv=1.0)

print("proj_embeddings:")
print(proj_embeddings)


proj_embeddings:
tensor([[ 2.9490e-03, -4.2312e-03,  4.8084e-03,  ...,  3.8492e-04,
         -6.1793e-03,  1.6702e-04],
        [ 2.6674e-02,  7.1886e-03, -2.8933e-02,  ..., -4.2680e-03,
         -1.8673e-02, -8.3376e-03],
        [ 4.6756e-02, -7.5611e-03, -5.5619e-02,  ...,  4.0249e-03,
         -3.2666e-02, -1.6010e-03],
        ...,
        [ 2.6331e+02,  9.3405e+01, -6.3939e+01,  ...,  6.2968e+01,
          7.7193e+01,  5.6427e+01],
        [ 2.3228e+02,  9.5317e+01, -1.9507e+02,  ...,  2.3256e+01,
          1.3176e+02, -7.0199e+01],
        [-2.2032e+02,  1.2323e+02, -7.4716e+01,  ..., -7.0108e+01,
          8.9844e+01, -1.5619e+02]])


In [13]:
proj_embeddings.shape

torch.Size([40, 5120])

In [None]:
import numpy as np
def hyperbolic_spectral_clustering(projected_embeddings, n_clusters=5):
    """
    Perform spectral clustering using hyperbolic inner product.
    """
    def hyperbolic_inner_product(x, y, curv=1.0):
        x_time = torch.sqrt(1 / curv + torch.sum(x**2, dim=-1, keepdim=True))
        y_time = torch.sqrt(1 / curv + torch.sum(y**2, dim=-1, keepdim=True))
        return x @ y.T - x_time @ y_time.T

    flat_embeddings = projected_embeddings.squeeze(1)

    affinity_matrix = hyperbolic_inner_product(flat_embeddings, flat_embeddings).cpu().numpy()

    affinity_matrix = (affinity_matrix - affinity_matrix.min()) / (affinity_matrix.max() - affinity_matrix.min())

    from sklearn.cluster import SpectralClustering
    clustering = SpectralClustering(
        n_clusters=n_clusters,
        affinity='precomputed',
        random_state=21
    ).fit(affinity_matrix)

    cluster_labels = clustering.labels_

    clusters = {}
    for cluster in range(n_clusters):
        cluster_indices = np.where(cluster_labels == cluster)[0]
        clusters[cluster] = {
            'layers': cluster_indices.tolist()
        }

    return clusters, cluster_labels

def compute_cluster_similarities(clusters, similarity_matrix):
    """
    Compute intra-cluster similarities and identify top clusters.
    """
    # Convert similarity matrix to numpy if it's a torch tensor
    if torch.is_tensor(similarity_matrix):
        similarity_matrix = similarity_matrix.numpy()

    # Store cluster similarities
    cluster_similarities = {}

    # Compute similarities for each cluster
    for label, cluster_info in clusters.items():
        cluster_indices = cluster_info['layers']

        # Extract intra-cluster similarity matrix
        cluster_similarities_matrix = similarity_matrix[cluster_indices][:, cluster_indices]

        # Compute average similarity within the cluster
        avg_similarity = np.mean(cluster_similarities_matrix)
        cluster_similarities[label] = avg_similarity

    # Sort clusters by average similarity
    sorted_clusters = sorted(cluster_similarities.items(), key=lambda x: x[1], reverse=True)

    # Get top clusters
    top_clusters = {label: cluster_similarities[label] for label, _ in sorted_clusters}

    return clusters, cluster_similarities, top_clusters

# Usage
clusters, labels = hyperbolic_spectral_clustering(proj_embeddings, 8)
clusters_dict, similarities, top_clusters = compute_cluster_similarities(clusters, proj_embeddings)

# Sort clusters by their maximum similarity in descending order
sorted_clusters = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

# Print results
print("Clusters sorted by maximum similarity:")
for label, sim in sorted_clusters:
    layers = clusters_dict[label]['layers']  # Get layers associated with the cluster
    print(f"Cluster {label}: Layers {layers}, Max Similarity = {sim:.4f}")

print("\nTop Clusters:")
for label, sim in top_clusters.items():
    layers = clusters_dict[label]['layers']
    print(f"Cluster {label}: Layers {layers}, Average Similarity = {sim:.4f}")




Clusters sorted by maximum similarity:
Cluster 4: Layers [24, 25, 26, 27, 28, 29, 30, 31], Max Similarity = 697.9032
Cluster 0: Layers [21, 22, 23], Max Similarity = 579.8470
Cluster 2: Layers [39], Max Similarity = 146.9625
Cluster 3: Layers [32, 33, 34, 35, 36, 37, 38], Max Similarity = 4.2052
Cluster 7: Layers [1], Max Similarity = 0.0072
Cluster 5: Layers [2], Max Similarity = -0.0556
Cluster 6: Layers [12, 13, 14, 15, 16, 17, 18, 19, 20], Max Similarity = -14.5947
Cluster 1: Layers [0, 3, 4, 5, 6, 7, 8, 9, 10, 11], Max Similarity = -15.8452

Top Clusters:
Cluster 4: Layers [24, 25, 26, 27, 28, 29, 30, 31], Average Similarity = 697.9032
Cluster 0: Layers [21, 22, 23], Average Similarity = 579.8470
Cluster 2: Layers [39], Average Similarity = 146.9625
Cluster 3: Layers [32, 33, 34, 35, 36, 37, 38], Average Similarity = 4.2052
Cluster 7: Layers [1], Average Similarity = 0.0072
Cluster 5: Layers [2], Average Similarity = -0.0556
Cluster 6: Layers [12, 13, 14, 15, 16, 17, 18, 19, 20], 

In [21]:
def merge_layers_in_place(model, merge_base_lay, merge_layer_num):
    merge_layer_num = min(merge_layer_num, len(model.model.layers) - merge_base_lay - 1)

    for diff_lay in range(merge_base_lay + 1, merge_base_lay + 1 + merge_layer_num):
        print(f"Merging layer {diff_lay} into layer {merge_base_lay}")
        base_layer = model.model.layers[merge_base_lay]
        merging_layer = model.model.layers[diff_lay]

        # gate_proj
        base_layer.mlp.gate_proj.weight.data.add_(
            merging_layer.mlp.gate_proj.weight.data - base_layer.mlp.gate_proj.weight.data
        )
        # down_proj
        base_layer.mlp.down_proj.weight.data.add_(
            merging_layer.mlp.down_proj.weight.data - base_layer.mlp.down_proj.weight.data
        )
        # up_proj
        base_layer.mlp.up_proj.weight.data.add_(
            merging_layer.mlp.up_proj.weight.data - base_layer.mlp.up_proj.weight.data
        )

        # q_proj
        base_layer.self_attn.q_proj.weight.data.add_(
            merging_layer.self_attn.q_proj.weight.data - base_layer.self_attn.q_proj.weight.data
        )
        # k_proj
        base_layer.self_attn.k_proj.weight.data.add_(
            merging_layer.self_attn.k_proj.weight.data - base_layer.self_attn.k_proj.weight.data
        )
        # v_proj
        base_layer.self_attn.v_proj.weight.data.add_(
            merging_layer.self_attn.v_proj.weight.data - base_layer.self_attn.v_proj.weight.data
        )
        # o_proj
        base_layer.self_attn.o_proj.weight.data.add_(
            merging_layer.self_attn.o_proj.weight.data - base_layer.self_attn.o_proj.weight.data
        )

    # Remove merged layers in reverse order to avoid shifting indices
    layers_to_delete = list(range(merge_base_lay + merge_layer_num, merge_base_lay, -1))
    for diff_lay in layers_to_delete:
        print(f"Deleting layer {diff_lay}")
        del model.model.layers[diff_lay]

    return model


In [22]:
def merge_clusters(model, clusters, cluster_similarities, threshold, merge_layers_fn):
    """
    Merge model layers based on clusters and their average similarities.

    Args:
    - model: The model to merge layers in.
    - clusters: Dictionary of clusters with layers as lists.
    - cluster_similarities: Dictionary with cluster IDs as keys and average similarities as values.
    - threshold: Similarity threshold to decide whether to merge a cluster.
    - merge_layers_fn: Function to handle merging of layers (e.g., merge_layers_return_model).

    Returns:
    - model: The updated model with merged layers.
    """
    # Sort clusters by similarity in descending order
    sorted_clusters = sorted(cluster_similarities.items(), key=lambda x: x[1], reverse=True)

    for cluster_id, avg_similarity in sorted_clusters:
        if avg_similarity > threshold:
            layers_to_merge = clusters[cluster_id]['layers']
            if len(layers_to_merge) > 1:
                print(f"Merging Cluster {cluster_id} with layers: {layers_to_merge} and avg similarity: {avg_similarity}")
                model = merge_layers_fn(model, min(layers_to_merge), len(layers_to_merge)-1)
            else:
                print(f"Skipping Cluster {cluster_id} with only one layer: {layers_to_merge}")
        else:
            print(f"Skipping Cluster {cluster_id}: Avg similarity {avg_similarity} below threshold.")

    return model



In [23]:
# Define a similarity threshold
SIMILARITY_THRESHOLD = 500

# Merge clusters
merged_model = merge_clusters(
    model=llama_model,
    clusters=clusters,
    cluster_similarities=similarities,
    threshold=SIMILARITY_THRESHOLD,
    merge_layers_fn=merge_layers_in_place
)


Merging Cluster 4 with layers: [24, 25, 26, 27, 28, 29, 30, 31] and avg similarity: 697.9031982421875
Merging layer 25 into layer 24
Merging layer 26 into layer 24
Merging layer 27 into layer 24
Merging layer 28 into layer 24
Merging layer 29 into layer 24
Merging layer 30 into layer 24
Merging layer 31 into layer 24
Deleting layer 31
Deleting layer 30
Deleting layer 29
Deleting layer 28
Deleting layer 27
Deleting layer 26
Deleting layer 25
Merging Cluster 0 with layers: [21, 22, 23] and avg similarity: 579.8469848632812
Merging layer 22 into layer 21
Merging layer 23 into layer 21
Deleting layer 23
Deleting layer 22
Skipping Cluster 2: Avg similarity 146.96250915527344 below threshold.
Skipping Cluster 3: Avg similarity 4.205190181732178 below threshold.
Skipping Cluster 7: Avg similarity 0.007188588846474886 below threshold.
Skipping Cluster 5: Avg similarity -0.055619463324546814 below threshold.
Skipping Cluster 6: Avg similarity -14.594661712646484 below threshold.
Skipping Cluste

In [24]:
merged_model.config.num_hidden_layers = len(merged_model.model.layers)
merged_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120)
    (layers): ModuleList(
      (0-30): 31 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((5120,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((5120,), eps=1e-05)
      )
    )
    (no

In [27]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def generate_text(model, tokenizer, prompt, max_length=64, num_return_sequences=1):
    """
    Generates text from the given model and prompt.
    Args:
        model: HuggingFace transformer model (e.g., LLaMA).
        tokenizer: Corresponding tokenizer for the model.
        prompt (str): Input text prompt for generation.
        max_length (int): Maximum length of generated text.
        num_return_sequences (int): Number of text sequences to generate.
    Returns:
        List[str]: List of generated text sequences.
    """
    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate text
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,  # Prevent repetitive text
        early_stopping=True
    )

    # Decode and return the generated sequences
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]


# # Generate text with the unpruned model
prompt = "Today is a beautiful day"
# unpruned_text = generate_text(llama_model, tokenizer, prompt)
# print("Unpruned Model Output:")
# print(unpruned_text)

# Generate text with the pruned model
pruned_text = generate_text(merged_model, tokenizer, prompt)
print("\nPruned Model Output:")
print(pruned_text)



Pruned Model Output:
['Today is a beautiful day, and I have some more good news for you.\nWe’ve been nominated for the 2017 Best of the Web Awards by the American Web International. We’re the only company in our area to be nominated. And we’d appreciate your help. If you']


In [28]:
merged_model.save_pretrained("/content/pruned_model")
tokenizer.save_pretrained("/content/pruned_model")

('/content/pruned_model/tokenizer_config.json',
 '/content/pruned_model/special_tokens_map.json',
 '/content/pruned_model/tokenizer.model',
 '/content/pruned_model/added_tokens.json',
 '/content/pruned_model/tokenizer.json')

In [29]:
with open("/content/pruned_model/README.md", "w") as f:
    f.write("# Merged LLaMA Model\n\nThis is a merged version of the LLaMA2-13b model based on hyperboloid method where similarity was calculated based on hyperbolic distance. 10 layers have been merged.")

In [30]:
from huggingface_hub import HfApi

api = HfApi()
api.create_repo(repo_id="namannn/llama2-13b-hyperbolic-cluster-pruned", exist_ok=True)
api.upload_folder(
    folder_path="/content/pruned_model",
    repo_id="namannn/llama2-13b-hyperbolic-cluster-pruned",
    repo_type="model",
)

- empty or missing yaml metadata in repo card


model-00002-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/469M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/namannn/llama2-13b-hyperbolic-cluster-pruned/commit/9bb3a3191278eff77cf3ca9bf5e8694f3254a93b', commit_message='Upload folder using huggingface_hub', commit_description='', oid='9bb3a3191278eff77cf3ca9bf5e8694f3254a93b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/namannn/llama2-13b-hyperbolic-cluster-pruned', endpoint='https://huggingface.co', repo_type='model', repo_id='namannn/llama2-13b-hyperbolic-cluster-pruned'), pr_revision=None, pr_num=None)