**Clone the GitHub repo if using Colab**

In [None]:
# !git clone https://github.com/jlopetegui98/PruningOfExperts.git

In [None]:
# !cp -r PruningOfExperts/data/ ./

In [None]:
# !cp -r PruningOfExperts/method/ ./

In [None]:
# !cp -r PruningOfExperts/model/ ./

**Preparing the environment**

- Hugging Face authentication is required to download the Mixtral-8x7B-Instruct model
- Install the dependencies if using Colab

In [None]:
# Log in to Hugging Face before starting (authentication is required to
# download the model).
from huggingface_hub import login

# Do not paste a token into the notebook: a hardcoded token leaks as soon as
# the notebook is shared or committed, and an empty string cannot authenticate.
# Calling login() without a token prompts for it interactively instead.
login()

In [None]:
# !pip install -q -U transformers bitsandbytes accelerate

In [None]:
# !pip install datasets

**Imports**

In [None]:
import torch

from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM, set_seed, default_data_collator

from data import CacheDataset, build_calib_loader_mixtral
from model import PrunableMixtralSparseMoeBlockWrapper
from method import progressive_pruning_mixtral

In [15]:
# Fix the random seed through transformers' set_seed helper so that runs are
# repeatable.
set_seed(42)

**Loading the model (4-bit quantization)**

In [16]:
# Hugging Face Hub identifier of the checkpoint that is loaded and pruned below.
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"

In [None]:
# Quantization settings: 4-bit weights using the "nf4" quantization type with
# double quantization enabled, and float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# Fetch the tokenizer for the chosen checkpoint from the Hub.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fetch the causal LM itself, passing the quantization settings built earlier
# and requesting placement on the CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="cuda",
)

In [None]:
# Display the loaded model (the bare expression renders its repr as the cell
# output).
model

In [None]:
# Inspect the shape of a router (gate) weight matrix, here for layer index 3.
# This is important for understanding how the 4-bit quantization works and
# the modifications we had to make to the pruning pipeline.
model.model.layers[3].block_sparse_moe.gate.weight.shape

**Load the calibration data**

In [None]:
# Mount Google Drive at /content/drive.
# NOTE: `google.colab` is only importable inside a Colab runtime, so this cell
# is Colab-specific. The mount point is the root of the Drive save path used
# in the last cell of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Build the calibration data loader from the "c4" dataset using the model's
# tokenizer.
# NOTE(review): the remaining positional arguments (2048, 128, 4, 8, 42) are
# not named here. Presumably they are the block/sequence length, the number of
# calibration blocks, the batch size, the number of workers and the seed;
# confirm against `build_calib_loader_mixtral` in the `data` module.
calib_loader = build_calib_loader_mixtral("c4", tokenizer, 2048, 128, 4, 8, 42)

**Run the pruning method**

In [None]:
# Choose the value of r (number of experts to keep)
r = 6

In [None]:
# Run the progressive pruning with gradient tracking disabled (presumably the
# method only needs forward passes over the calibration data; TODO confirm).
# The call returns the pruned model, which replaces `model`, together with an
# `info` object produced by the method.
with torch.no_grad():
    model, info = progressive_pruning_mixtral(model, calib_loader, r=r)

In [None]:
# Display the model again, now that the pruning call has replaced it.
model

**Save the pruned model to the hub**

In [None]:
# Target repository on the Hugging Face Hub for the pruned model; replace it
# with a repository you have write access to.
repository_id = "JavierLopetegui/Mixtral8x7B-4bit-pruned"
model.push_to_hub(repository_id)

**Alternatively, save it to Google Drive**

In [40]:
import os

# Destination directory on the mounted Google Drive; adjust the path as needed.
drive_save_path = "/content/drive/MyDrive/LLMs_MVA/Final_Project/models"

# Create the directory if it doesn't exist.
os.makedirs(drive_save_path, exist_ok=True)

# Save the model and the tokenizer to Google Drive. Both are written to the
# same directory so that the tokenizer files sit next to the pruned weights.
model.save_pretrained(drive_save_path)
tokenizer.save_pretrained(drive_save_path)