In [1]:
!pip install -q transformers
!pip install -u datasets
!pip install -q torch


Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: -u


In [2]:
import torch
import transformers
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

GPU: Tesla T4
Memory: 15.8 GB


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType
import gc

In [4]:
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [6]:
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    torch_dtype = torch.float16,
    device_map="auto",
    load_in_8bit=True,
    trust_remote_code=True,)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

In [8]:
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

In [9]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 80,740,352 || all params: 7,696,356,864 || trainable%: 1.0491


In [10]:
print(f"Model loaded: {model.num_parameters() / 1e6:.1f}M parameters")

Model loaded: 7696.4M parameters


In [11]:
from datasets import load_dataset

In [12]:
dataset = load_dataset("microsoft/orca-math-word-problems-200k", split="train[:3000]")

In [13]:
print(f"Dataset size: {len(dataset)}")
print("Sample:", dataset[0])

Dataset size: 3000
Sample: {'question': 'Jungkook is the 5th place. Find the number of people who crossed the finish line faster than Jungkook.', 'answer': 'If Jungkook is in 5th place, then 4 people crossed the finish line faster than him.'}


In [14]:
def format_math_qa(example):
    question = example["question"]
    answer = example["answer"]

    prompt = f"<|user|>\n{question}\n<|assistant|>\n{answer}<|end|>"

    return {"text": prompt}

In [15]:
formatted_dataset = dataset.map(format_math_qa, remove_columns=dataset.column_names)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [16]:
train_val_split = formatted_dataset.train_test_split(seed=42,test_size=0.2)

In [17]:
train_dataset =  train_val_split['train']
eval_dataset =  train_val_split['test']

In [18]:
def tokenize_function(examples):
    result = tokenizer(
        examples["text"],
        truncation=True,
        padding=False,
        max_length=1024,
        return_overflowing_tokens=False,
    )
    result["labels"] = result["input_ids"].copy()
    return result

In [19]:
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=eval_dataset.column_names)

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [20]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

In [21]:
training_args = TrainingArguments(
    output_dir="./deepseek-r1-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    learning_rate=2e-4,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    logging_steps=25,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    remove_unused_columns=False,
    report_to=None,
    bf16=True,
    seed=42,
)

In [22]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [23]:
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=train_dataset,
                  eval_dataset=eval_dataset,
                  data_collator=data_collator,
                  tokenizer=tokenizer)

  trainer = Trainer(model=model,
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
