In [1]:
!pip install transformers torch accelerate huggingface_hub

Collecting transformers
  Downloading transformers-5.1.0-py3-none-any.whl.metadata (31 kB)
Collecting torch
  Downloading torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl.metadata (31 kB)
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-1.4.1-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-2.4.2-cp312-cp312-macosx_14_0_arm64.whl.metadata (6.6 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.4 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2026.1.15-cp312-cp312-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.2-cp39-abi3-macosx_11_0_arm64.whl.metadata (7.3 kB)
Collecting typer-slim (from transformers)
  Downloading typer_slim-0.23.1-py3-none-any.whl.metadata (4.2 kB)
Colle

In [None]:
import os

from huggingface_hub import login

# Never paste the access token into the notebook. Read it from the HF_TOKEN
# environment variable instead; when the variable is unset, token=None lets
# login() ask for the token interactively.
HF_TOKEN = os.environ.get("HF_TOKEN")
login(token=HF_TOKEN)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import torch
import torch.nn.utils.prune as prune
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- Experiment configuration -------------------------------------------------
MODEL_NAME = "meta-llama/Llama-3.2-1B"
TARGET_LAYER_NAME_STR = "model.layers.0.mlp.gate_proj"
PRUNING_AMOUNT = 0.5

# Run on the Apple-Silicon GPU backend (MPS) when PyTorch reports it as
# available; otherwise stay on the CPU.
device = torch.device("cpu")
if torch.backends.mps.is_available():
    device = torch.device("mps")

# The CPU path uses FP32. On MPS, BF16 is preferred: allocating a one-element
# BF16 tensor acts as a support probe, and FP16 is used if that probe fails.
model_dtype = torch.float32
if device.type == "mps":
    try:
        torch.zeros(1, dtype=torch.bfloat16, device="mps")
    except Exception:
        model_dtype = torch.float16
    else:
        model_dtype = torch.bfloat16

print(f"Using device: {device}, dtype: {model_dtype}")


Using device: mps, dtype: torch.bfloat16


In [6]:
def get_module_by_name_str(model, module_name_str):
    """Resolve a dotted path (e.g. ``model.layers.0.mlp.gate_proj``) to the object it names.

    Each dot-separated part is first looked up as an attribute of the current
    object. If no such attribute exists, the part is parsed as an integer and
    used to index the current object (e.g. ``layers.0`` on a plain list).

    Args:
        model: Root object the lookup starts from.
        module_name_str: Dot-separated path to follow.

    Returns:
        The object reached after following every part of the path.

    Raises:
        ValueError: If a part is neither an attribute of the current object
            nor a usable index into it.
    """
    current_module = model
    for name_part in module_name_str.split("."):
        if hasattr(current_module, name_part):
            current_module = getattr(current_module, name_part)
            continue
        try:
            # Fallback for containers addressed by position (e.g. layers.0).
            current_module = current_module[int(name_part)]
        except (ValueError, IndexError, KeyError, TypeError) as exc:
            # TypeError covers objects that are not subscriptable at all, so
            # every failed lookup surfaces as the same ValueError.
            raise ValueError(f"Module '{name_part}' not found in the model.") from exc
    return current_module

def calculate_sparsity(module, param_name='weight'):
    """Return the percentage of exactly-zero entries in a named tensor of a module.

    Args:
        module: Object expected to expose the tensor as an attribute.
        param_name: Name of the attribute to inspect (defaults to ``'weight'``).

    Returns:
        A float in ``[0.0, 100.0]``. ``0.0`` is returned when the attribute is
        missing, is ``None``, or holds a tensor with no elements.
    """
    param = getattr(module, param_name, None)
    if param is None:
        return 0.0

    total = param.numel()
    if total == 0:
        # An empty tensor has no entries to be sparse; avoid dividing by zero.
        return 0.0

    return 100.0 * float(torch.sum(param == 0)) / total

In [7]:
print(f"---Loading Model: {MODEL_NAME}---")
# The installed transformers release warns that `torch_dtype` is deprecated
# ("Use `dtype` instead"), so the precision is passed through `dtype`.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, dtype=model_dtype, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the EOS token as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"---Pruning Layer: {TARGET_LAYER_NAME_STR} with amount: {PRUNING_AMOUNT}---")

---Loading Model: meta-llama/Llama-3.2-1B---


`torch_dtype` is deprecated! Use `dtype` instead!
Loading weights: 100%|██████████| 146/146 [00:01<00:00, 123.84it/s, Materializing param=model.norm.weight]                              


---Pruning Layer: model.layers.0.mlp.gate_proj with amount: 0.5---


In [8]:
PROMPT_TEXT_DEMO = "The capital of France is"
print(f"----Quick Generaton PRE-PRUNING---")
# Send the inputs to wherever the model actually lives. The model is placed by
# device_map="auto", so model.device (not the separately chosen `device`) is
# the reliable target; the post-pruning generation cell does the same.
inputs = tokenizer(PROMPT_TEXT_DEMO, return_tensors="pt").to(model.device)
# Inference only: no gradients are needed for generation.
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=10, pad_token_id=tokenizer.eos_token_id)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Prompt: {PROMPT_TEXT_DEMO}")
print(f"Generated Text: {generated_text}")

----Quick Generaton PRE-PRUNING---
Prompt: The capital of France is
Generated Text: The capital of France is Paris, with a population of 2.2


In [10]:
print(f"\n---Accessing Target Layer--- {TARGET_LAYER_NAME_STR}---")
# Resolve the dotted layer path to the actual sub-module object.
target_module = get_module_by_name_str(
    model=model,
    module_name_str=TARGET_LAYER_NAME_STR,
)
print(f"Suuccessfully accessed target module: {target_module}")

# Baseline: percentage of exactly-zero weights in the target module.
sparsity_before = calculate_sparsity(target_module, 'weight')
print("Sparsity of target module before pruning: {:.2f}%".format(sparsity_before))


---Accessing Target Layer--- model.layers.0.mlp.gate_proj---
Suuccessfully accessed target module: Linear(in_features=2048, out_features=8192, bias=False)
Sparsity of target module before pruning: 0.00%


In [11]:
print(f"---Applying L1 unstructure pruning (amount={PRUNING_AMOUNT}) to weight parameter of target module---")
# Re-running this cell while a pruning hook is still attached would stack a
# second mask on top of the first and prune more than PRUNING_AMOUNT, so the
# hook is only attached when the module does not already carry one.
if prune.is_pruned(target_module):
    print("Pruning hook is already attached; skipping to avoid compounding the mask")
else:
    prune.l1_unstructured(target_module, name='weight', amount=PRUNING_AMOUNT)
    print("Pruning hook has been applied")

print("The layer now has a weight_mask and weight_orig parameter. The original weight parameter is now a pruned version of the original weights.")

---Applying L1 unstructure pruning (amount=0.5) to weight parameter of target module---
Pruning hook has been applied
The layer now has a weight_mask and weight_orig parameter. The original weight parameter is now a pruned version of the original weights.


In [12]:
print("\n===Making pruning permanent for '{}.weight'---".format(TARGET_LAYER_NAME_STR))
# Drop the pruning re-parametrization so that only the pruned `weight` remains.
prune.remove(target_module, name='weight')
print(
    "Pruning has been made permanent. The weight parameter is now the pruned version "
    "and the mask and orig parameters have been removed."
)


===Making pruning permanent for 'model.layers.0.mlp.gate_proj.weight'---
Pruning has been made permanent. The weight parameter is now the pruned version and the mask and orig parameters have been removed.


In [13]:
# Measure the percentage of exactly-zero weights in the target module.
sparsity_after = calculate_sparsity(target_module, 'weight')
print("Sparsity of target module after pruning: {:.2f}%".format(sparsity_after))

Sparsity of target module after pruning: 50.00%


In [14]:
print("Quick Generaton POST-PRUNING---")
# Tokenize the demo prompt, then move the encoded batch onto the model's device.
inputs = tokenizer(PROMPT_TEXT_DEMO, return_tensors="pt")
inputs = inputs.to(model.device)
# Inference only: generate a short continuation without tracking gradients.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=10,
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode the first (only) sequence and show it next to the prompt.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Prompt: {PROMPT_TEXT_DEMO}", f"Generated Text: {generated_text}", sep="\n")

Quick Generaton POST-PRUNING---
Prompt: The capital of France is
Generated Text: The capital of France is Paris. It is a city of 2.
