<a href="https://colab.research.google.com/github/peremartra/LLMOptCost/blob/main/pruning_structured_pairsGLU_llama3.2-1b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install torch
!pip install sentencepiece  # Required for LLaMA tokenizer



In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch import nn
import os

# Select the compute device: CUDA when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [56]:
# Checkpoint to prune. Weights are loaded as float16 and then moved to `device`.
model_name = 'meta-llama/Llama-3.2-1B'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
)
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [71]:
def get_output(prompt, model=model, tokenizer=tokenizer, max_length=50):
    """Generate a deterministic beam-search continuation of ``prompt``.

    Args:
        prompt: Text to continue.
        model: Causal language model used for generation. The default is the
            ``model`` object that existed when this cell was executed, so
            re-run this cell after rebinding ``model`` to a different object.
        tokenizer: Tokenizer paired with ``model`` (same default-binding caveat).
        max_length: Forwarded unchanged to ``model.generate`` as ``max_length``.

    Returns:
        str: The first returned sequence, decoded with special tokens removed.
    """
    # Place the inputs on the device of the model actually being used, rather
    # than on the notebook-wide `device`, so any model passed in works.
    target_device = getattr(model, 'device', device)
    inputs = tokenizer(prompt, return_tensors='pt').to(target_device)

    # Some tokenizers ship without a pad token; fall back to EOS in that case.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=pad_token_id,
        temperature=None,         # Sampling knobs cleared: sampling is disabled
        top_p=None,
        do_sample=False,          # Disable sampling
        num_beams=5,              # Use beam search
        early_stopping=True,      # Stop beam search once num_beams finished candidates exist
        no_repeat_ngram_size=2    # Prevent repetition of 2-grams
    )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated

# Baseline: generate from the model before pruning. `prompt` is reused by the
# later cells that test the pruned and the re-downloaded models.
prompt = "Paris is the capital of"
generated = get_output(prompt)
print(f"Generated text: {generated}")

Generated text: Paris is the capital of France. It is also one of the most beautiful cities in the world. There are many reasons for that, but the main one is that it is a city full of beautiful buildings. Some of them are so beautiful that


In [66]:
# Exploratory: print the docstring of `generate` to look up its parameters.
print(model.generate.__doc__)



        Generates sequences of token ids for models with a language modeling head.


        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
        model's default generation configuration. You can override any `generation_config` by passing the corresponding
        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.

        For an overview of generation strategies and code examples, check out the [following
        guide](../generation_strategies).

        </Tip>

        Parameters:
            inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
                The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
                method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
                should be in the format of `input_ids`. For encoder-decoder models *inputs* 

In [58]:
def count_parameters(model):
    """Return the total number of elements over all of ``model.parameters()``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

# Record the parameter count before pruning; the savings report further down
# compares against this value.
original_param_count = count_parameters(model)
print(f"Original model parameters: {original_param_count}")

Original model parameters: 1235814400


In [59]:
def compute_neuron_pair_importance(gate_weight, up_weight):
    """Score each neuron pair by the product of its gate and up L1 norms.

    Since the GLU multiplies the outputs of gate_proj and up_proj, the product
    of their per-neuron weight norms is used to represent the importance of the
    pair. Note that this notebook defines this function several times; whichever
    definition was executed last is the one in effect.

    Args:
        gate_weight: 2-D weight tensor of ``gate_proj``; one neuron per row.
        up_weight: 2-D weight tensor of ``up_proj`` with the same number of rows.

    Returns:
        1-D tensor with one score per row (higher means more important).
    """
    # torch.norm is deprecated; torch.linalg.vector_norm is its replacement.
    gate_norms = torch.linalg.vector_norm(gate_weight, ord=1, dim=1)
    up_norms = torch.linalg.vector_norm(up_weight, ord=1, dim=1)
    importance_scores = gate_norms * up_norms
    return importance_scores
#sample response: Generated text after pruning: Paris is the capital of of of of the of the the the the to to to from to from from from to to from to
#France France France France France France France France France France France
#France France France France France
#All All
#All

In [40]:
def compute_neuron_pair_importance(gate_weight, up_weight):
    """Score each neuron pair by the summed variance of its weights.

    Variance-of-weights heuristic: neurons with higher weight variance may
    contribute more to the model's output.

    Args:
        gate_weight: 2-D weight tensor of ``gate_proj``; one neuron per row.
        up_weight: 2-D weight tensor of ``up_proj`` with the same number of rows.

    Returns:
        1-D tensor holding, per row, gate variance plus up variance.
    """
    row_variances = [weight.var(dim=1) for weight in (gate_weight, up_weight)]
    return row_variances[0] + row_variances[1]
#sample response: Paris is the capital of the French Republic. It is also a...
#Paris is the capital of the French Republic. It is also a
#Germany is the German Republic. It is also a
#of the Austrian Republic. It is also a

In [60]:
def compute_neuron_pair_importance(gate_weight, up_weight):
    """Score each neuron pair by its largest absolute gate and up weights.

    Maximum-absolute-weight heuristic: the maximum absolute weight in a neuron
    might indicate its significance. This is the last of the alternative
    definitions in the notebook, so it is the one in effect after a
    top-to-bottom run.

    Args:
        gate_weight: 2-D weight tensor of ``gate_proj``; one neuron per row.
        up_weight: 2-D weight tensor of ``up_proj`` with the same number of rows.

    Returns:
        1-D tensor holding, per row, max |gate| plus max |up|.
    """
    gate_peak = gate_weight.abs().max(dim=1).values
    up_peak = up_weight.abs().max(dim=1).values
    return gate_peak + up_peak

#response: Paris is the capital of France, and the City of Lights is a city that’s famous for its architecture, museums, and art galleries. But it’s also a city that’s famous for its cuisine, and for its cuisine, the French do



In [61]:
def _slice_linear(layer, indices_to_keep, dim):
    """Build a new ``nn.Linear`` keeping only ``indices_to_keep`` along ``dim`` of the weight.

    ``dim=0`` selects output neurons (rows); ``dim=1`` selects input features
    (columns). A bias, when the source layer has one, is carried over.
    """
    if dim == 0:
        kept_weight = layer.weight.data[indices_to_keep, :]
    else:
        kept_weight = layer.weight.data[:, indices_to_keep]

    out_features, in_features = kept_weight.shape
    source_bias = getattr(layer, 'bias', None)
    has_bias = source_bias is not None

    # Create the module on the same device as the weights it will hold instead
    # of relying on a notebook-level global device.
    sliced = nn.Linear(in_features, out_features, bias=has_bias).to(kept_weight.device)
    sliced.weight.data = kept_weight
    if has_bias:
        # Removing output neurons removes their bias entries; removing input
        # features leaves the bias untouched.
        sliced.bias.data = source_bias.data[indices_to_keep] if dim == 0 else source_bias.data.clone()
    return sliced


def prune_neuron_pairs(mlp, prune_percent):
    """Drop the least important gate/up neuron pairs from a GLU MLP block.

    Each row of ``gate_proj``/``up_proj`` and the matching column of
    ``down_proj`` form one prunable unit. Units are ranked with
    ``compute_neuron_pair_importance`` and the top-scoring ones are kept, in
    their original order.

    Args:
        mlp: Module exposing ``gate_proj``, ``up_proj`` and ``down_proj``
            linear layers.
        prune_percent: Fraction of neuron pairs to remove (e.g. ``0.2`` for
            20%). The resulting count is clamped so that at least one pair
            survives and never goes below zero.

    Returns:
        tuple: ``(new_gate_proj, new_up_proj, new_down_proj, k)`` where ``k`` is
        the number of neuron pairs kept (the new intermediate size).

    Raises:
        ValueError: If no neuron pair would be kept.
    """
    # Importance is scored on float32 copies; the stored weights keep their dtype.
    gate_weight = mlp.gate_proj.weight.data.float()
    up_weight = mlp.up_proj.weight.data.float()

    importance_scores = compute_neuron_pair_importance(gate_weight, up_weight)

    original_intermediate_size = gate_weight.size(0)
    num_neuron_pairs_to_prune = int(prune_percent * original_intermediate_size)
    num_neuron_pairs_to_prune = max(0, min(num_neuron_pairs_to_prune, original_intermediate_size - 1))
    k = original_intermediate_size - num_neuron_pairs_to_prune

    if k <= 0:
        raise ValueError(f"Invalid number of neuron pairs to keep: {k}. Adjust the prune_percent.")

    _, indices_to_keep = torch.topk(importance_scores, k, largest=True, sorted=True)
    # Restore ascending index order so surviving neurons keep their relative positions.
    indices_to_keep = indices_to_keep.sort().values

    new_gate_proj = _slice_linear(mlp.gate_proj, indices_to_keep, dim=0)
    new_up_proj = _slice_linear(mlp.up_proj, indices_to_keep, dim=0)
    new_down_proj = _slice_linear(mlp.down_proj, indices_to_keep, dim=1)

    return new_gate_proj, new_up_proj, new_down_proj, k


In [62]:
def update_model(model, prune_percent):
    """Prune the MLP of every layer in ``model.model.layers`` in place.

    Each layer's ``gate_proj``, ``up_proj`` and ``down_proj`` are replaced by
    the pruned layers returned from ``prune_neuron_pairs``, and
    ``model.config.intermediate_size`` is updated to the size produced for the
    first layer.

    Args:
        model: Model exposing ``model.layers`` (each with an ``mlp``) and a
            ``config`` with ``intermediate_size``.
        prune_percent: Fraction of neuron pairs to remove from every layer.

    Returns:
        The same ``model`` object, modified in place. Calling this again prunes
        the already-pruned model further.
    """
    new_intermediate_size = None

    for layer in model.model.layers:
        mlp = layer.mlp

        new_gate_proj, new_up_proj, new_down_proj, new_size = prune_neuron_pairs(mlp, prune_percent)

        mlp.gate_proj = new_gate_proj
        mlp.up_proj = new_up_proj
        mlp.down_proj = new_down_proj

        if new_intermediate_size is None:
            new_intermediate_size = new_size

    # Leave the config untouched when nothing was pruned (no layers), rather
    # than overwriting intermediate_size with None.
    if new_intermediate_size is not None:
        model.config.intermediate_size = new_intermediate_size

    return model


In [63]:
# NOTE: update_model mutates `model` in place, so re-running this cell prunes
# the already-pruned model again. Reload the model before re-running.
prune_percent = 0.2  # Fraction of neuron pairs to prune (0.2 = 20%)
model = update_model(model, prune_percent)

In [64]:
# Compare the parameter count after pruning with the baseline recorded earlier.
pruned_param_count = count_parameters(model)
reduction_in_params = original_param_count - pruned_param_count
percentage_savings = (reduction_in_params / original_param_count) * 100

print(
    f"Pruned model parameters: {pruned_param_count}",
    f"Reduction in parameters: {reduction_in_params}",
    f"Percentage of weight savings: {percentage_savings:.2f}%",
    sep="\n",
)


Pruned model parameters: 1074792448
Reduction in parameters: 161021952
Percentage of weight savings: 13.03%


In [65]:
# Generate from the in-memory pruned model with the same prompt as the
# baseline, so the two outputs can be compared directly.
generated = get_output(prompt, model, tokenizer)
print(f"Generated text after pruning: {generated}")

Generated text after pruning: Paris is the capital of France, a city with more than 2 million inhabitants. It is also the seat of the European Capital of Culture, the European Capital of Culture of Design, and the European Capital of Culture of Architecture. It is also


In [None]:
# Save the pruned model and its tokenizer into the same local directory.
output_dir = './pruned_llama_1b'
# exist_ok=True keeps the cell re-runnable and avoids a check-then-create race.
os.makedirs(output_dir, exist_ok=True)

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Pruned model saved to {output_dir}")

Pruned model saved to ./pruned_llama_1b


In [None]:
# Push the model to your Hugging Face repository.
# Uploads whatever `model` currently holds in this kernel session.
model.push_to_hub('pruned-llama-1b')

model.safetensors:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/oopere/pruned-llama-1b/commit/18ed40f201d1be7c85118cbadfb4f801eb1de63c', commit_message='Upload LlamaForCausalLM', commit_description='', oid='18ed40f201d1be7c85118cbadfb4f801eb1de63c', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Push the tokenizer to the same repository name used for the model.
tokenizer.push_to_hub('pruned-llama-1b')

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/oopere/pruned-llama-1b/commit/383b3b2cf8bec7bb7df853261150ee448cc67757', commit_message='Upload tokenizer', commit_description='', oid='383b3b2cf8bec7bb7df853261150ee448cc67757', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Download the pruned model and tokenizer back from the Hub (float16, on `device`).
pruned_model_name = 'oopere/pruned-llama-1b'
pruned_model = AutoModelForCausalLM.from_pretrained(pruned_model_name, torch_dtype=torch.float16).to(device)
pruned_tokenizer = AutoTokenizer.from_pretrained(pruned_model_name)
# Match the setup of the original tokenizer: make sure a pad token exists,
# falling back to EOS when the downloaded tokenizer does not define one.
if pruned_tokenizer.pad_token is None:
    pruned_tokenizer.pad_token = pruned_tokenizer.eos_token


config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/180 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/335 [00:00<?, ?B/s]

In [None]:
# Generate from the re-downloaded pruned model with the same prompt, to check
# that the uploaded checkpoint behaves like the in-memory pruned model.
generated = get_output(prompt, pruned_model, pruned_tokenizer)
print(f"Generated text from downloaded pruned model: {generated}")

Generated text from downloaded pruned model: Paris is the capital of of France, which is a a a
1) Paris is the the main of the
2) The is the a
3. It is the
4. It is the
Post your reply to this special
