<a href="https://colab.research.google.com/github/morgoth22a/AppFlowy/blob/main/KONTYNUACJA_DeepSeekCoder_QLoRA_Parrot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# DeepSeek-Coder 6.7B (Instruct) — QLoRA finetune on **AIDC-AI/Parrot-dataset** (Colab)

This notebook fine-tunes **deepseek-ai/deepseek-coder-6.7b-instruct** with **QLoRA** using the dataset:
`AIDC-AI/Parrot-dataset`.

**Features**:
- 4-bit quantization (bitsandbytes) + PEFT LoRA
- TRL `SFTTrainer` pipeline
- Packing for longer sequences (optional)
- Periodic checkpoints & final adapter
- Merge LoRA → full HF model (optional)
- Simple inference cell after training

> **Colab tip:** Use a T4/L4/A100 GPU runtime. Menu: Runtime → Change runtime type → GPU.


In [1]:
# %%capture
!pip -q install -U "transformers>=4.42.0" "trl>=0.9.6" "accelerate>=0.33.0" "peft>=0.11.1" "datasets>=2.20.0" bitsandbytes einops wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m544.8/544.8 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:

import os, json, math, random
from dataclasses import dataclass
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import TrainingArguments
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, AutoPeftModelForCausalLM
from datetime import datetime

# Report the PyTorch build, then pick the accelerator if one is visible.
print("Torch:", torch.__version__)
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print("Device:", device)


Torch: 2.8.0+cu126
Device: cuda


In [3]:
# ==== CONFIG ====
# Central knobs for the run: model id, output/cache paths, training
# hyperparameters, LoRA settings and the 4-bit quantization config.
MODEL_NAME = "deepseek-ai/deepseek-coder-6.7b-instruct"
OUTPUT_DIR = "/content/deepseek_coder_qlora_parrot"
HF_CACHE = "/content/.cache/huggingface"
# NOTE(review): these variables are set after `transformers`/`datasets` were
# imported in the import cell. Hugging Face libraries may read them at import
# time, in which case the assignments below have no effect — confirm, and if
# so move them above the imports. HF_HUB_ENABLE_HF_TRANSFER also presumably
# needs the `hf_transfer` package, which the install cell does not list.
os.environ["HF_HOME"] = HF_CACHE
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # faster downloads

# Training hyperparams (safe defaults for 24GB VRAM Colab)
MAX_SEQ_LEN = 2048
BATCH_SIZE = 1
GRAD_ACCUM = 16  # effective batch = BATCH_SIZE * GRAD_ACCUM sequences per optimizer step
EPOCHS = 1
LR = 2e-4
WARMUP_RATIO = 0.03
SAVE_STEPS = 1000
LOG_STEPS = 10
PACKING = True  # set False if you want one-sample-per-batch without packing

# LoRA settings
LORA_R = 64
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

# BitsAndBytes 4-bit config
# NF4 quantization with double quantization; compute dtype is bfloat16 when a
# CUDA device is visible and float32 otherwise.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)

print("Config ready.")

Config ready.


In [4]:
from google.colab import userdata

# Read the Hugging Face token from Colab's secret store. The value is bound to
# a name instead of being left as the cell's last expression, so the notebook
# does not render it as cell output (output is saved with the .ipynb and would
# leak the credential when the notebook is shared or committed).
# Pass HF_TOKEN explicitly wherever authentication is required.
HF_TOKEN = userdata.get('secretName')

'<REDACTED — a Hugging Face access token was exposed in this saved output; revoke it and create a new one>'

In [None]:

# Tokenizer: reuse EOS as the padding token when the checkpoint defines none.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Base model, quantized at load time with the 4-bit settings from `bnb_config`.
# The load options are collected first so they can be read at a glance.
model_load_kwargs = {
    "quantization_config": bnb_config,
    "torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_kwargs)

# The KV cache is switched off for gradient-checkpointing compatibility.
model.config.use_cache = False
print("Model loaded.")

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from google.colab import userdata

# Read the Hugging Face token from Colab's secret store. The value is bound to
# a name instead of being left as the cell's last expression, so the notebook
# does not render it as cell output (output is saved with the .ipynb and would
# leak the credential when the notebook is shared or committed).
# Pass HF_TOKEN explicitly wherever authentication is required.
HF_TOKEN = userdata.get('secretName')

# Data preparation

In [None]:
# Load dataset
# No split is requested, so the result is indexed by split name below
# (e.g. dataset["validation"]); printing it shows the available splits/columns.
dataset = load_dataset("AIDC-AI/Parrot-dataset")
print(dataset)

# Try to infer fields; Parrot-dataset often has fields akin to instruction-following.
# We'll map records into a single 'text' field using a simple prompt template.
# The template is geared for code generation (system + user → assistant).
# NOTE(review): the actual column names are not verified anywhere in this
# notebook — check the printed schema against the keys probed by build_prompt.
SYSTEM_PROMPT = "You are a helpful, expert coding assistant. Provide clear, correct, well-structured code with brief explanations when needed."

def build_prompt(example):
    """Render one dataset record as a single chat-style training string.

    The user turn is the first non-empty value among the keys
    instruction/prompt/question/input/query/user; if none is found, the first
    non-None value among "Human" and "text" is used, else an empty string.
    The assistant turn is resolved the same way from
    output/response/answer/completion/assistant, then "Assistant"/"target".

    Args:
        example: Mapping of column name to value for one record.

    Returns:
        dict: ``{"text": prompt}``. The assistant section is omitted when no
        non-empty answer is found.
    """
    user_keys = ("instruction", "prompt", "question", "input", "query", "user")
    assistant_keys = ("output", "response", "answer", "completion", "assistant")

    # Flexible mapping across possible keys: first key with a truthy value wins.
    user = next((example[k] for k in user_keys if example.get(k)), None)
    assistant = next((example[k] for k in assistant_keys if example.get(k)), None)

    # Fallbacks: if some datasets use 'Human'/'Assistant' pair.
    # A column that exists but holds None is treated as missing, so it neither
    # blocks the next fallback nor ends up formatted as the literal "None".
    if user is None:
        user = next(
            (example[k] for k in ("Human", "text") if example.get(k) is not None), ""
        )
    if assistant is None:
        assistant = next(
            (example[k] for k in ("Assistant", "target") if example.get(k) is not None), ""
        )

    # Final prompt format (chat-style → single text for SFT)
    prompt = f"<|system|>\n{SYSTEM_PROMPT}\n</|system|>\n"
    prompt += f"<|user|>\n{user}\n</|user|>\n"
    if assistant:
        prompt += f"<|assistant|>\n{assistant}\n</|assistant|>\n"
    return {"text": prompt}

# Map dataset to 'text'
# Use the 'validation' split as there is no 'train' split
# The validation split's column list is passed as `remove_columns` for the
# whole `dataset.map` call, so only the generated 'text' column is kept.
# NOTE(review): this assumes every split has the same columns as 'validation'
# — confirm against the printed dataset summary.
column_names = dataset["validation"].column_names
processed = dataset.map(build_prompt, remove_columns=column_names)
print(processed)

In [None]:
# Model / data / paths
# These values repeat the ones in the earlier CONFIG cell and add DATASET_NAME,
# so this part of the notebook can be run without it.
MODEL_NAME   = "deepseek-ai/deepseek-coder-6.7b-instruct"
OUTPUT_DIR   = "/content/deepseek_coder_qlora_parrot"
DATASET_NAME = "AIDC-AI/Parrot-dataset"

# Training (A100 40GB/80GB — safe parameters; you can raise EPOCHS)
MAX_SEQ_LEN  = 2048
BATCH_SIZE   = 1
GRAD_ACCUM   = 16
EPOCHS       = 1
LR           = 2e-4
WARMUP_RATIO = 0.03
LOG_STEPS    = 10
SAVE_STEPS   = 1000
PACKING      = True   # efficient sequence packing on the TRL side

# LoRA (tested on DeepSeek-Coder)
LORA_R         = 64
LORA_ALPHA     = 16
LORA_DROPOUT   = 0.05
TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

# BitsAndBytes — 4-bit NF4
# Compute dtype is bfloat16 when a CUDA device is visible, float32 otherwise.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)


In [None]:
# Tokenizer: reuse EOS as the padding token when the checkpoint defines none.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Base model, quantized at load time with the 4-bit settings from `bnb_config`.
model_load_kwargs = {
    "quantization_config": bnb_config,
    "torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_kwargs)

# For compatibility with gradient checkpointing
model.config.use_cache = False

print("Model & tokenizer ready.")


In [None]:
from datasets import DatasetDict, load_dataset

# Load the dataset into raw_ds before using it
# No split is requested; the result is iterated by split name further down
# (raw_ds.keys()), and printing it shows the available splits and columns.
raw_ds = load_dataset(DATASET_NAME)
print(raw_ds)

SYSTEM_PROMPT = (
    "You are a helpful, expert coding assistant. Provide clear, correct, "
    "well-structured code with brief explanations when needed."
)

def to_text_record(example):
    """Render one dataset record as a single chat-style training string.

    The user turn is the first non-empty value among the keys
    instruction/prompt/question/input/query/user; if none is found, the first
    non-None value among "Human" and "text" is used, else an empty string.
    The assistant turn is resolved the same way from
    output/response/answer/completion/assistant, then "Assistant"/"target".

    Args:
        example: Mapping of column name to value for one record.

    Returns:
        dict: ``{"text": prompt}``. The assistant section is omitted when no
        non-empty answer is found.
    """
    user_keys = ("instruction", "prompt", "question", "input", "query", "user")
    assistant_keys = ("output", "response", "answer", "completion", "assistant")

    # First key holding a truthy value wins.
    user = next((example[k] for k in user_keys if example.get(k)), None)
    assistant = next((example[k] for k in assistant_keys if example.get(k)), None)

    # Fallback columns. A column that exists but holds None is treated as
    # missing, so it neither blocks the next fallback nor ends up formatted as
    # the literal string "None" inside the prompt.
    if user is None:
        user = next(
            (example[k] for k in ("Human", "text") if example.get(k) is not None), ""
        )
    if assistant is None:
        assistant = next(
            (example[k] for k in ("Assistant", "target") if example.get(k) is not None), ""
        )

    prompt = (
        f"<|system|>\n{SYSTEM_PROMPT}\n</|system|>\n"
        f"<|user|>\n{user}\n</|user|>\n"
    )
    if assistant:
        prompt += f"<|assistant|>\n{assistant}\n</|assistant|>\n"
    return {"text": prompt}

# Map every split to a single 'text' column (dropping that split's own
# original columns) and collect the results directly in a DatasetDict.
processed = DatasetDict(
    {
        split_name: split_ds.map(to_text_record, remove_columns=split_ds.column_names)
        for split_name, split_ds in raw_ds.items()
    }
)
print(processed)

# Hard truncation to MAX_SEQ_LEN tokens (simple and stable)
def truncate_long_examples(example):
    """Cut an example's "text" down to at most MAX_SEQ_LEN tokens.

    The text is tokenized without adding special tokens; if it is longer than
    MAX_SEQ_LEN tokens, the first MAX_SEQ_LEN ids are decoded back to a string
    (special tokens skipped) and stored in place of the original text.
    Shorter examples are returned unchanged.

    Args:
        example: Record with a "text" field.

    Returns:
        The same record, with "text" possibly shortened.
    """
    enc = tokenizer(
        example["text"],
        add_special_tokens=False,
        return_attention_mask=False,
    )
    ids = enc["input_ids"]
    if len(ids) > MAX_SEQ_LEN:
        ids = ids[:MAX_SEQ_LEN]
        example["text"] = tokenizer.decode(ids, skip_special_tokens=True)
    return example

processed = processed.map(truncate_long_examples)
print("Preprocessing done.")

# Use the 'validation' split from the processed DatasetDict as the training dataset.
# It is selected only after truncation, so `train_ds` refers to the truncated
# examples rather than to the pre-truncation data.
train_ds = processed["validation"]
print("Training on split: validation | examples:", len(train_ds))

In [None]:
# ==== KONFIG + TRAINER + TENSORBOARD LOGGING ====
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig
from datasets import DatasetDict
import torch, os

# ---- LoRA ----
# Adapter hyperparameters come from the config cell; no bias terms are trained.
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=TARGET_MODULES,
    bias="none",
    task_type="CAUSAL_LM",
)

# ---- Split: we use 'validation' as train ----
# Fail early if the preprocessing cell has not produced the expected DatasetDict.
assert isinstance(processed, DatasetDict) and "validation" in processed, "Brak splitu 'validation' w processed"
train_ds = processed["validation"]
print("Training on split: validation | examples:", len(train_ds))

# ---- TensorBoard logdir ----
LOG_DIR = f"{OUTPUT_DIR}/runs"
os.makedirs(LOG_DIR, exist_ok=True)

# ---- SFTConfig (without max_seq_length, without tokenizer) ----
# NOTE(review): MAX_SEQ_LEN is not forwarded here, so the trainer uses whatever
# sequence-length default the installed TRL version applies — confirm this is
# intended given MAX_SEQ_LEN = 2048 in the config cell.
sft_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_ratio=WARMUP_RATIO,
    logging_steps=LOG_STEPS,
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    bf16=torch.cuda.is_available(),
    fp16=False,
    optim="paged_adamw_32bit",
    gradient_checkpointing=True,
    packing=PACKING,          # OK with your TRL version
    dataset_num_proc=2,
    report_to="tensorboard",  # <- enable TensorBoard
    logging_dir=LOG_DIR,      # <- event-file directory
)

# ---- formatting_func instead of dataset_text_field ----
def formatting_func(examples):
    """Hand back the pre-rendered prompt text stored under the "text" key.

    The value is returned untouched: a list of strings when called on a batch.
    """
    rendered = examples["text"]
    return rendered

# ---- Trainer ----
# The quantized base model is passed together with `peft_config`; no tokenizer
# is passed to the trainer here.
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    peft_config=peft_config,
    train_dataset=train_ds,
    formatting_func=formatting_func,
)

print("Trainer initialized.")

# ---- Training + saving the adapters ----
# The tokenizer is written next to the trainer output so OUTPUT_DIR is
# self-contained.
train_result = trainer.train()
trainer.save_model()
tokenizer.save_pretrained(OUTPUT_DIR)
print("Training done. Saved to:", OUTPUT_DIR)

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login for the notebook session.
# NOTE(review): no later cell in this notebook visibly uploads to the Hub —
# confirm this login is still needed at this point.
notebook_login()

In [None]:
# === Quick inference (PEFT adapter loaded in trainer.model) ===
from transformers import pipeline
import torch

# The model comes straight from the training loop; switch it to evaluation
# mode so dropout (including LoRA dropout) is disabled while generating.
trainer.model.eval()

pipe = pipeline(
    task="text-generation",
    model=trainer.model,           # PEFT model with the trained adapter
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    max_new_tokens=256,
    do_sample=True,
    temperature=0.2,
    top_p=0.95,
)

# Same tag layout as the training texts, ending on an open assistant turn so
# the model continues with the answer.
test_prompt = (
    "<|system|>\nYou are a helpful coding assistant.\n</|system|>\n"
    "<|user|>\nCreate a minimal Python Flask app with one /hello route returning 'Hello, world!'\n</|user|>\n"
    "<|assistant|>\n"
)

out = pipe(test_prompt)[0]["generated_text"]
print(out)


In [None]:

# (Optional) Merge LoRA into base weights to get a standalone HF model (fp16/bf16)
# Warning: this requires more VRAM/RAM; skip if you only need the adapter.
MERGE = False  # set True to merge
if MERGE:
    merged_dir = OUTPUT_DIR + "-merged"
    os.makedirs(merged_dir, exist_ok=True)
    # AutoPeftModelForCausalLM loads the base checkpoint named in the adapter
    # config under OUTPUT_DIR and attaches the adapter itself, so a separate
    # AutoModelForCausalLM load of MODEL_NAME is not needed; it only held a
    # second, unused copy of the weights in memory.
    # The dtype is passed explicitly so the merged weights are saved in
    # bf16 (on GPU) as the cell header promises, not in the loader's default.
    peft_model = AutoPeftModelForCausalLM.from_pretrained(
        OUTPUT_DIR,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
    )
    merged = peft_model.merge_and_unload()
    merged.save_pretrained(merged_dir, safe_serialization=True)
    tokenizer.save_pretrained(merged_dir)
    print("Merged model saved to:", merged_dir)
else:
    print("Skipping merge; LoRA adapters saved in:", OUTPUT_DIR)


In [None]:

# Quick inference test (uses the PEFT adapter)
from transformers import pipeline

# The model comes straight from the training loop; switch it to evaluation
# mode so dropout (including LoRA dropout) is disabled while generating.
trainer.model.eval()

pipe = pipeline(
    "text-generation",
    model=trainer.model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    max_new_tokens=300,
    do_sample=True,
    temperature=0.2,
    top_p=0.95,
)

# Same tag layout as the training texts, ending on an open assistant turn.
prompt = "<|system|>\nYou are a helpful coding assistant.\n</|system|>\n<|user|>\nWrite a Python Flask app with one endpoint /hello returning 'Hello, world!'.\n</|user|>\n<|assistant|>\n"
out = pipe(prompt)[0]["generated_text"]
print(out)


In [None]:

# Export artifacts for download (adapter + tokenizer)
# Lists the first 50 lines of the OUTPUT_DIR directory listing; nothing is
# copied or archived by this cell.
print("Saved files under:", OUTPUT_DIR)
!ls -lah $OUTPUT_DIR | head -n 50


In [None]:
# NOTE(review): bitsandbytes is already installed (with -U) by the first cell
# of this notebook; this trailing cell looks like a leftover — confirm it can
# be dropped, or move it to the top if a separate upgrade is really required.
!pip install -U bitsandbytes