<a href="https://colab.research.google.com/github/peremartra/LLMOptCost/blob/main/pruning_structured_pairsGLU_llama3.2-1b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install torch
!pip install sentencepiece  # Required for LLaMA tokenizer



In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch import nn
import os

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Load the base checkpoint with float16 weights and move it to the selected device.
model_name = 'meta-llama/Llama-3.2-1B'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
)
model = model.to(device)

# The tokenizer reuses its end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [None]:
def get_output(prompt, model=model, tokenizer=tokenizer, max_length=50):
    """Generate a continuation of ``prompt`` and return it as decoded text.

    Args:
        prompt: Text fed to the model.
        model: Causal language model used for generation. Defaults to the
            notebook-level ``model`` bound when this function was defined.
        tokenizer: Tokenizer paired with ``model``. Defaults to the
            notebook-level ``tokenizer`` bound when this function was defined.
        max_length: Value forwarded to ``generate`` as ``max_length``.
            Defaults to 50, the value previously hard-coded here.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    # Move the inputs to the device of the model that was actually passed in,
    # not the notebook-global `device`, so any model/device pairing works.
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id
    )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated

# Test the original model
# Baseline generation before pruning; `prompt` is reused by the later cells
# that test the pruned and the downloaded models.
prompt = "Paris is the capital of"
generated = get_output(prompt)
print(f"Generated text: {generated}")

Generated text: Paris is the capital of France and the most populated city in the country. It is located in the northwestern part of the country, on the river Seine. The city is the most important financial center in Europe and the largest in France.


In [None]:
def count_parameters(model):
    """Return the total number of elements across all of ``model``'s parameters."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

# Record the parameter count before pruning; the savings report further down
# compares against this value.
original_param_count = count_parameters(model)
print(f"Original model parameters: {original_param_count}")

Original model parameters: 1235814400


In [None]:
# Function to compute importance scores (L1 norm)
def compute_importance_scores(weight):
    """Return the L1 norm of every row of ``weight``.

    The input is converted to float32 first, then absolute values are summed
    along dimension 1, giving one score per row.
    """
    as_float32 = weight.float()
    magnitudes = as_float32.abs()
    return magnitudes.sum(dim=1)

In [None]:
def prune_neurons(mlp, prune_percent):
    """Choose which intermediate neurons of ``mlp`` to keep and build smaller layers.

    Each intermediate neuron (one row of ``gate_proj`` / ``up_proj``) is scored
    as the sum of ``compute_importance_scores`` over its gate row and its up
    row. The lowest-scoring ``prune_percent`` fraction is dropped; at least one
    neuron is always kept when the layer is non-empty.

    The returned layers are freshly initialised: this function does NOT copy
    any weights into them.

    Args:
        mlp: Module exposing ``gate_proj``, ``up_proj`` and ``down_proj``
            linear layers.
        prune_percent: Fraction of intermediate neurons to remove. Values that
            would remove fewer than zero or all neurons are clamped.

    Returns:
        Tuple ``(new_gate_proj, new_up_proj, new_down_proj, k, indices_to_keep)``
        where ``k`` is the number of neurons kept and ``indices_to_keep`` is a
        sorted tensor of the kept row indices.

    Raises:
        ValueError: If no neuron would be kept (empty intermediate dimension).
    """
    # Get the weights of the gate_proj and up_proj layers as float32 copies
    gate_weight = mlp.gate_proj.weight.data.float()  # Shape: [output_features, input_features]
    up_weight = mlp.up_proj.weight.data.float()      # Shape: [output_features, input_features]

    print(f"gate_weight.shape: {gate_weight.shape}")
    print(f"up_weight.shape: {up_weight.shape}")

    # Score the gate and up halves of each neuron separately and add them, so a
    # neuron is judged by both projections that feed the same intermediate unit.
    importance_scores = (
        compute_importance_scores(gate_weight) + compute_importance_scores(up_weight)
    )

    # Check for NaNs or Infs
    if torch.isnan(importance_scores).any():
        print("Warning: importance_scores contains NaNs")
    if torch.isinf(importance_scores).any():
        print("Warning: importance_scores contains Infs")

    # Determine the number of neurons to prune
    original_intermediate_size = gate_weight.size(0)  # This is output_features
    num_neurons_to_prune = int(prune_percent * original_intermediate_size)

    # Clamp so that between 0 and (size - 1) neurons are removed
    num_neurons_to_prune = max(0, min(num_neurons_to_prune, original_intermediate_size - 1))
    k = original_intermediate_size - num_neurons_to_prune

    print(f"Original intermediate size: {original_intermediate_size}")
    print(f"Number of neurons to prune: {num_neurons_to_prune}")
    print(f"Number of neurons to keep (k): {k}")

    if k <= 0:
        raise ValueError(f"Invalid number of neurons to keep: {k}. Adjust the prune_percent or check the layer sizes.")

    # The scores already live on the weights' device, so no transfer through a
    # notebook-global device is needed before selecting the top-k neurons.
    _, indices_to_keep = torch.topk(importance_scores, k)

    # Sort indices to maintain the original neuron order
    indices_to_keep, _ = torch.sort(indices_to_keep)

    # Create the reduced layers on the device/dtype of the layers they replace
    new_gate_proj = _new_linear_like(mlp.gate_proj, mlp.gate_proj.in_features, k)
    new_up_proj = _new_linear_like(mlp.up_proj, mlp.up_proj.in_features, k)
    new_down_proj = _new_linear_like(mlp.down_proj, k, mlp.down_proj.out_features)

    return new_gate_proj, new_up_proj, new_down_proj, k, indices_to_keep


def _new_linear_like(reference, in_features, out_features):
    """Build a bias-free ``nn.Linear`` on the same device and dtype as ``reference``'s weight."""
    ref_weight = reference.weight
    return nn.Linear(in_features, out_features, bias=False).to(
        device=ref_weight.device, dtype=ref_weight.dtype
    )


In [None]:
# Function to copy weights and biases to new pruned layers
def copy_weights_and_biases(mlp, new_gate_proj, new_up_proj, new_down_proj, indices_to_keep):
    """Fill the pruned projections with the kept slices of ``mlp``'s weights.

    ``gate_proj`` and ``up_proj`` keep the rows listed in ``indices_to_keep``;
    ``down_proj`` keeps the matching columns. Despite the name, only weights
    are copied here; no bias tensors are touched.
    """
    # Row selection: the input features of gate/up are unchanged.
    row_pruned_pairs = (
        (new_gate_proj, mlp.gate_proj),
        (new_up_proj, mlp.up_proj),
    )
    for pruned_layer, source_layer in row_pruned_pairs:
        pruned_layer.weight.data = source_layer.weight.data[indices_to_keep, :]

    # Column selection: the output features of down are unchanged.
    down_source = mlp.down_proj.weight.data
    new_down_proj.weight.data = down_source[:, indices_to_keep]

# Function to update the model
def update_model(model, prune_percent):
    """Prune the MLP of every decoder layer in ``model`` and return the model.

    For each entry of ``model.model.layers`` the MLP's ``gate_proj``,
    ``up_proj`` and ``down_proj`` are replaced by the smaller layers from
    ``prune_neurons``, filled via ``copy_weights_and_biases``. The model is
    modified in place. ``model.config.intermediate_size`` is then set to the
    size kept in the first layer.
    """
    first_kept_size = None

    for layer in model.model.layers:
        mlp = layer.mlp

        # Build the reduced projections and learn which neurons survive.
        gate, up, down, kept_size, kept_indices = prune_neurons(mlp, prune_percent)

        # Carry the surviving weights over from the old projections.
        copy_weights_and_biases(mlp, gate, up, down, kept_indices)

        # Swap the pruned projections into the MLP.
        mlp.gate_proj, mlp.up_proj, mlp.down_proj = gate, up, down

        # Only the first layer's size is remembered for the config.
        if first_kept_size is None:
            first_kept_size = kept_size

    # Keep the configuration in sync with the new layer width.
    model.config.intermediate_size = first_kept_size

    return model


In [None]:
# Fraction of each MLP's intermediate neurons to remove.
prune_percent = 0.2  # Prune 20% of neurons
# update_model replaces the MLP projections of `model` in place and returns the
# same object, so re-running this cell prunes the already-pruned model again.
model = update_model(model, prune_percent)

gate_weight.shape: torch.Size([8192, 2048])
up_weight.shape: torch.Size([8192, 2048])
Original intermediate size: 8192
Number of neurons to prune: 1638
Number of neurons to keep (k): 6554
gate_weight.shape: torch.Size([8192, 2048])
up_weight.shape: torch.Size([8192, 2048])
Original intermediate size: 8192
Number of neurons to prune: 1638
Number of neurons to keep (k): 6554
gate_weight.shape: torch.Size([8192, 2048])
up_weight.shape: torch.Size([8192, 2048])
Original intermediate size: 8192
Number of neurons to prune: 1638
Number of neurons to keep (k): 6554
gate_weight.shape: torch.Size([8192, 2048])
up_weight.shape: torch.Size([8192, 2048])
Original intermediate size: 8192
Number of neurons to prune: 1638
Number of neurons to keep (k): 6554
gate_weight.shape: torch.Size([8192, 2048])
up_weight.shape: torch.Size([8192, 2048])
Original intermediate size: 8192
Number of neurons to prune: 1638
Number of neurons to keep (k): 6554
gate_weight.shape: torch.Size([8192, 2048])
up_weight.shape:

In [None]:
# Recalculate the number of parameters
# Only the MLP projections (gate_proj, up_proj, down_proj) were replaced, so
# the saving is measured against the parameter count of the whole model.
pruned_param_count = count_parameters(model)
reduction_in_params = original_param_count - pruned_param_count
percentage_savings = (reduction_in_params / original_param_count) * 100

print(f"Pruned model parameters: {pruned_param_count}")
print(f"Reduction in parameters: {reduction_in_params}")
print(f"Percentage of weight savings: {percentage_savings:.2f}%")


Pruned model parameters: 1074792448
Reduction in parameters: 161021952
Percentage of weight savings: 13.03%


In [None]:
# Test the pruned model
# Same prompt as the baseline cell, so the two generations can be compared directly.
generated = get_output(prompt, model, tokenizer)
print(f"Generated text after pruning: {generated}")

Generated text after pruning: Paris is the capital of of the the most most the the most the the the the the the the the the
The is the the the the
Paris is the the
Is the
Is the
Is the
Is the
Is the


In [None]:
# Write the pruned model and its tokenizer into the same local directory.
output_dir = './pruned_llama_1b'
# exist_ok=True makes the cell safe to re-run and avoids a check-then-create race.
os.makedirs(output_dir, exist_ok=True)

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Pruned model saved to {output_dir}")

Pruned model saved to ./pruned_llama_1b


In [None]:
# Push the model to your Hugging Face repository
# NOTE(review): presumably requires being logged in to the Hugging Face Hub
# beforehand; no login step is visible in this notebook, so confirm.
model.push_to_hub('pruned-llama-1b')

model.safetensors:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/oopere/pruned-llama-1b/commit/18ed40f201d1be7c85118cbadfb4f801eb1de63c', commit_message='Upload LlamaForCausalLM', commit_description='', oid='18ed40f201d1be7c85118cbadfb4f801eb1de63c', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the tokenizer under the same repository name used for the model.
tokenizer.push_to_hub('pruned-llama-1b')

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/oopere/pruned-llama-1b/commit/383b3b2cf8bec7bb7df853261150ee448cc67757', commit_message='Upload tokenizer', commit_description='', oid='383b3b2cf8bec7bb7df853261150ee448cc67757', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Download the pruned model
# Loads the published checkpoint with float16 weights onto `device`, under
# separate names so the in-memory pruned `model` and `tokenizer` stay untouched.
pruned_model_name = 'oopere/pruned-llama-1b'
pruned_model = AutoModelForCausalLM.from_pretrained(pruned_model_name, torch_dtype=torch.float16).to(device)
pruned_tokenizer = AutoTokenizer.from_pretrained(pruned_model_name)


config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/180 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/335 [00:00<?, ?B/s]

In [None]:
# Test the downloaded pruned model
# Passes the reloaded model and tokenizer explicitly so get_output does not
# fall back to its default (notebook-level) model and tokenizer.
generated = get_output(prompt, pruned_model, pruned_tokenizer)
print(f"Generated text from downloaded pruned model: {generated}")

Generated text from downloaded pruned model: Paris is the capital of of France, which is a a a
1) Paris is the the main of the
2) The is the a
3. It is the
4. It is the
Post your reply to this special
