# DeepSeek MoE 16B Base Model Pruning (from 64 to 16 experts per layer)

## Mount Google Drive (if using Google Colab)

In [None]:
# Mount Google Drive only when running inside Google Colab. Outside Colab the
# `google.colab` package is not installed, so skip the mount instead of
# aborting the notebook with an import error.
try:
    from google.colab import drive
except ImportError:
    # Keep the name defined so later cells can check whether Drive is mounted.
    drive = None
    print("google.colab not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

## Install necessary packages

In [None]:
!pip install datasets transformers accelerate bitsandbytes

## Import libraries

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed
import bitsandbytes as bnb

from data import CacheDataset, build_calib_loader_deepseek
from model import PrunableDeepseekMoEWrapper
from method import progressive_pruning_deepseek

import os

# Set TOKENIZERS_PARALLELISM to "false" in this process's environment.
# NOTE(review): presumably meant to silence the Hugging Face tokenizers
# fork/parallelism warning when the calibration loader spawns workers - confirm.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

## Set base directory for saving the model (e.g. your current working directory)

In [None]:
# Parent directory under which the pruned model and tokenizer are saved.
# The default points inside the Google Drive mount ('/content/drive');
# change it to any writable directory when not running on Colab.
base_dir = "/content/drive/MyDrive/MVA/LLM"

## Main execution with 4-bit quantization of DeepSeek MoE 16B Base model

In [None]:
if __name__ == "__main__":
    # Fix the random seed through transformers' helper for reproducibility.
    set_seed(42)

    model_name = "deepseek-ai/deepseek-moe-16b-base"

    # The checkpoint ships custom modeling code, hence trust_remote_code=True.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # 4-bit NF4 weights with double quantization; compute is done in float16.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    # Load the quantized base model onto the CUDA device.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="cuda",
        trust_remote_code=True,
    )

    # NOTE(review): the numeric arguments are passed positionally; presumably
    # they are block size, number of calibration blocks, batch size, loader
    # workers and seed - confirm against data.build_calib_loader_deepseek.
    calib_loader = build_calib_loader_deepseek(
        "c4",
        tokenizer,
        2048,
        64,
        4,
        8,
        42,
    )

    # Pruning needs no gradients. NOTE(review): r=16 is presumably the number
    # of experts kept per layer (64 -> 16 per the notebook title) - confirm.
    with torch.no_grad():
        model, info = progressive_pruning_deepseek(model, calib_loader, r=16)

    # Write the pruned model first, then the tokenizer, to the same directory.
    output_dir = os.path.join(base_dir, "deepseek-moe-16b-pruned")
    for artifact in (model, tokenizer):
        artifact.save_pretrained(output_dir)