# Install deps

In [1]:
pip install -U "transformers>=4.43" "accelerate>=0.30" "peft>=0.11.1" safetensors huggingface_hub bitsandbytes

Note: you may need to restart the kernel to use updated packages.


# 1) Imports & basic config

In [2]:
import os, torch, shutil
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from huggingface_hub import login, HfApi

# ---------------------------------------------------------------------------
# User-editable settings. Everything a reader may want to tune lives here.
# ---------------------------------------------------------------------------

# Hub repositories: the base checkpoint and the LoRA adapter applied on top of it.
BASE_ID = "deepseek-ai/deepseek-r1-distill-qwen-32b"
ADAPTER_ID = "peers-ai/deepseek-32b-my-lora1-with-stops"

# Local directory that receives the merged checkpoint.
OUT_DIR = "./merged-deepseek-32b-with-stops"

# Target Hub repo for the merged model, e.g. "yourname/merged-deepseek-32b-with-stops".
# An empty string disables the push step.
PUSH_TO = ""

# Precision used when loading the base model.
# Accepted values: "auto", "bfloat16", "float16", "float32".
DTYPE = "bfloat16"

# Placement of the weights: "auto" tries to use the GPU, "cpu" forces a
# CPU-only merge (which needs a lot of RAM).
DEVICE_MAP = "auto"

# True loads the base model in 8-bit: saves VRAM, is slower, and requires bitsandbytes.
USE_8BIT_BASE = False

# Passed to the tokenizer and model loaders. DeepSeek/Qwen stacks often need it.
# NOTE(review): presumably this lets the Hub repo run its own Python code — only
# keep it enabled for repositories you trust.
TRUST_REMOTE_CODE = True

print("Basic config set")

Basic config set


# 2) Log in to Hugging Face

In [4]:
# Hub authentication. Only needed if the base/adapter repos are gated/private
# or if the merged model will be pushed to the Hub.
#
# The token is read exclusively from the HF_TOKEN environment variable. Never
# paste a token into the notebook: it is stored in the file and leaks to anyone
# the notebook is shared with.
# NOTE: an access token used to be hard-coded here as the fallback value. Treat
# that token as compromised and revoke it in the Hugging Face account settings.
HF_TOKEN = os.getenv("HF_TOKEN", "").strip()  # "" when the variable is unset
if HF_TOKEN:
    login(token=HF_TOKEN)
    print("Login success")
else:
    print("HF_TOKEN not set—continuing without Hub login.")


Login success


# 3) Load tokenizer

In [5]:
# The tokenizer is loaded from the base checkpoint; use_fast=True requests the
# fast tokenizer implementation.
# NOTE(review): the adapter repo name ends in "with-stops" — if it ships its own
# tokenizer files (e.g. extra stop tokens), confirm the base tokenizer is the
# right one to save alongside the merged weights.
tokenizer_options = {
    "use_fast": True,
    "trust_remote_code": TRUST_REMOTE_CODE,
}
tok = AutoTokenizer.from_pretrained(BASE_ID, **tokenizer_options)
print("Loaded tokenizer from:", BASE_ID)


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Loaded tokenizer from: deepseek-ai/deepseek-r1-distill-qwen-32b


# 4) Load base model (choose VRAM strategy)

In [6]:
# Load the base model that the LoRA adapter will be merged into.
# Inputs (from the config cell): BASE_ID, DTYPE, DEVICE_MAP, USE_8BIT_BASE,
# TRUST_REMOTE_CODE. Output: `base`, the loaded causal-LM model.

# Keyword arguments shared by both loading strategies.
load_kwargs = dict(
    trust_remote_code=TRUST_REMOTE_CODE,
    device_map=DEVICE_MAP,          # "auto" -> try GPU; "cpu" -> CPU-only
    low_cpu_mem_usage=True,
)

if USE_8BIT_BASE:
    # Passing load_in_8bit=True straight to from_pretrained is deprecated;
    # quantization settings go through a BitsAndBytesConfig object instead.
    from transformers import BitsAndBytesConfig

    load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
    # NOTE(review): merging a LoRA into 8-bit weights presumably bakes the
    # quantization error into the merged checkpoint — prefer a 16-bit load
    # when memory allows; confirm against the PEFT docs.
    print("Loading base in 8-bit mode.")
else:
    # Respect DTYPE if not using 8-bit.
    dtype_by_name = {
        "auto": "auto",
        "bfloat16": torch.bfloat16,
        "float16": torch.float16,
        "float32": torch.float32,
    }
    if DTYPE not in dtype_by_name:
        raise ValueError(
            f"Unsupported DTYPE {DTYPE!r}; expected one of {sorted(dtype_by_name)}."
        )
    torch_dtype = dtype_by_name[DTYPE]
    # "auto" is forwarded unchanged so the dtype stored in the checkpoint is
    # used. The previous code turned it into None, which does not request that.
    load_kwargs["torch_dtype"] = torch_dtype
    print(f"Loading base with dtype={DTYPE}.")

base = AutoModelForCausalLM.from_pretrained(BASE_ID, **load_kwargs)
print("Loaded base model:", BASE_ID)

Loading base with dtype=bfloat16.


config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-000008.safetensors:   0%|          | 0.00/8.79G [00:00<?, ?B/s]

model-00002-of-000008.safetensors:   0%|          | 0.00/8.78G [00:00<?, ?B/s]

model-00003-of-000008.safetensors:   0%|          | 0.00/8.78G [00:00<?, ?B/s]

model-00004-of-000008.safetensors:   0%|          | 0.00/8.78G [00:00<?, ?B/s]

  [2m2025-08-28T21:10:44.356186Z[0m [33m WARN[0m  [33mStatus Code: 500. Retrying..., [1;33mrequest_id[0m[33m: ""[0m
    [2;3mat[0m /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220

  [2m2025-08-28T21:10:44.357170Z[0m [33m WARN[0m  [33mRetry attempt #0. Sleeping 1.489672063s before the next attempt[0m
    [2;3mat[0m /root/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/reqwest-retry-0.7.0/src/middleware.rs:171



model-00005-of-000008.safetensors:   0%|          | 0.00/8.78G [00:00<?, ?B/s]

model-00006-of-000008.safetensors:   0%|          | 0.00/8.78G [00:00<?, ?B/s]

model-00007-of-000008.safetensors:   0%|          | 0.00/8.78G [00:00<?, ?B/s]

model-00008-of-000008.safetensors:   0%|          | 0.00/4.07G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Loaded base model: deepseek-ai/deepseek-r1-distill-qwen-32b


# 5) Load LoRA adapter on top of base

In [7]:
# Wrap the base model with the LoRA weights from ADAPTER_ID.
# The adapter repo is expected to provide adapter_config.json and
# adapter_model.safetensors.
peft_model = PeftModel.from_pretrained(model=base, model_id=ADAPTER_ID)
print("Loaded LoRA adapter:", ADAPTER_ID)

adapter_config.json:   0%|          | 0.00/879 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

Loaded LoRA adapter: peers-ai/deepseek-32b-my-lora1-with-stops


# 6) Merge LoRA → base weights

In [8]:
# Fold the LoRA deltas into the base weights and drop the PEFT wrapper.
print("Merging LoRA into base (can take a while)…")
merged = peft_model.merge_and_unload()

# Optional but helpful: move the merged weights to the CPU and release cached
# GPU memory before the model is written to disk.
merged = merged.to("cpu")
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("Merge complete.")


Merging LoRA into base (can take a while)…
Merge complete.


# 7) Save the merged model locally

In [None]:
# Write the merged weights (safetensors format) and the tokenizer files into OUT_DIR.
os.makedirs(OUT_DIR, exist_ok=True)
merged.save_pretrained(save_directory=OUT_DIR, safe_serialization=True)
tok.save_pretrained(save_directory=OUT_DIR)

print("Saved merged model to:", OUT_DIR)
# After this, OUT_DIR holds both the merged model and its tokenizer, so it can
# be reloaded on its own with from_pretrained(OUT_DIR).

# 8) (Optional) Push the merged model to your Hugging Face repo

In [None]:
# Upload the contents of OUT_DIR to the Hub repo named by PUSH_TO.
# Nothing is uploaded when PUSH_TO is empty.
if not PUSH_TO:
    print("Skipping push to Hub (PUSH_TO not set).")
else:
    # Pushing requires the token set up in the login cell.
    if not HF_TOKEN:
        raise ValueError("Set HF_TOKEN (env or cell 2) to push to the Hub.")

    api = HfApi()

    # Best effort: a create_repo failure is only reported, not raised.
    # Note that the repo is created as public (private=False).
    try:
        api.create_repo(repo_id=PUSH_TO, private=False, exist_ok=True)
        print(f"Ensured HF repo exists: {PUSH_TO}")
    except Exception as e:
        print("(Info) create_repo:", e)

    upload_args = {
        "folder_path": OUT_DIR,
        "repo_id": PUSH_TO,
        "commit_message": "Add merged LoRA model",
    }
    api.upload_folder(**upload_args)
    print(f"Pushed merged model to: https://huggingface.co/{PUSH_TO}")

# 9) Quick sanity test (local load)

In [None]:
# Smoke test: load tokenizer and model back from OUT_DIR only, to check that
# the saved directory works as a standalone model, then generate a short reply.
test_tok = AutoTokenizer.from_pretrained(
    OUT_DIR,
    use_fast=True,
    trust_remote_code=TRUST_REMOTE_CODE,
)

# bfloat16 is requested explicitly; every other DTYPE setting falls back to "auto".
if DTYPE == "bfloat16":
    reload_dtype = torch.bfloat16
else:
    reload_dtype = "auto"

test_model = AutoModelForCausalLM.from_pretrained(
    OUT_DIR,
    torch_dtype=reload_dtype,
    device_map="auto",
    trust_remote_code=TRUST_REMOTE_CODE,
)

prompt = "You are a helpful assistant. Briefly introduce yourself."

# Put the encoded prompt on the device of the model's first parameter.
first_param_device = next(test_model.parameters()).device
inputs = test_tok(prompt, return_tensors="pt").to(first_param_device)

with torch.no_grad():
    out = test_model.generate(**inputs, max_new_tokens=64)

reply_text = test_tok.decode(out[0], skip_special_tokens=True)
print(reply_text)