In [None]:
from google.colab import drive

# Attach Google Drive to this runtime; later cells use paths under this mount.
_DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(_DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install datasets

In [None]:
import os
import math
import torch
import time
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from peft import (
    get_peft_model,
    PromptTuningConfig,
    LoraConfig,
    TaskType
)

In [None]:
# --- Run configuration -------------------------------------------------------
# Set the WANDB_DISABLED flag before any Trainer is built (presumably to keep
# Weights & Biases logging off -- confirm against the transformers docs).
os.environ.update(WANDB_DISABLED="true")

# Hugging Face Hub id of the base checkpoint used throughout the notebook.
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"

# NOTE: despite the "_CSV" suffix, this points at a JSON file on the mounted Drive.
_DATASET_DIR = "/content/drive/MyDrive/datasetfiles"
UNLABELLED_CSV = f"{_DATASET_DIR}/unlabelled_development_set.json"

# Scratch directory on the runtime's local disk.
OUTPUT_DIR = "/dapt_checkpoints"

In [None]:
# Load the unlabelled corpus and keep only its "prompt" column.
raw = load_dataset("json", data_files=UNLABELLED_CSV)["train"]
non_prompt_columns = [name for name in raw.column_names if name != "prompt"]
raw = raw.remove_columns(non_prompt_columns)

# Hold out 10% of the rows; the fixed seed keeps the partition repeatable.
splits = raw.train_test_split(test_size=0.1, seed=55)

# Re-key the held-out "test" part as "validation".
ds = DatasetDict(
    {
        "train": splits["train"],
        "validation": splits["test"],
    }
)

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
# `trust_remote_code` is deliberately not set: the model for the same
# checkpoint is loaded without it later in this notebook, so no Hub-hosted
# custom code is needed, and leaving the flag off avoids executing any.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    # Fall back to the EOS token when the tokenizer ships without a pad token.
    tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})

def tokenize_fn(examples):
    """Tokenize the "prompt" entry of a batch with the module-level tokenizer.

    Args:
        examples: Mapping that holds the text(s) to encode under "prompt".

    Returns:
        The tokenizer's output for those prompts, produced with truncation
        enabled and ``max_length=512``.
    """
    encode_options = {"truncation": True, "max_length": 512}
    prompts = examples["prompt"]
    return tokenizer(prompts, **encode_options)

def _has_multiple_tokens(example):
    """Return True when the tokenized example holds more than one token."""
    return len(example["input_ids"]) > 1


# Tokenize every split, dropping the raw text column, then discard examples
# with at most one token (presumably because a next-token loss needs at least
# two tokens -- confirm). DatasetDict.filter applies the predicate to every
# split, so the rule is written once and `ds_tok` is assigned exactly once.
ds_tok = ds.map(
    tokenize_fn,
    batched=True,
    remove_columns=["prompt"]
).filter(_has_multiple_tokens)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Map:   0%|          | 0/715 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Filter:   0%|          | 0/715 [00:00<?, ? examples/s]

Filter:   0%|          | 0/80 [00:00<?, ? examples/s]

In [None]:
# Batch collator for language modelling; mlm=False selects the non-masked
# (causal) objective instead of masked-token prediction.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# NOTE: disabled perplexity-evaluation helper, kept for reference only (never executed).
# def compute_perplexity(model_path, dataset, split_name):
#     model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
#     model.resize_token_embeddings(len(tokenizer))
#     trainer = Trainer(
#         model=model,
#         args=TrainingArguments(
#             output_dir=os.path.join(OUTPUT_DIR, "eval_tmp"),
#             per_device_eval_batch_size=16,
#             report_to=[]
#         ),
#         data_collator=collator,
#         eval_dataset=dataset
#     )
#     metrics = trainer.evaluate()
#     loss = metrics["eval_loss"]
#     ppl  = math.exp(loss)
#     print(f"[{split_name}] Model @ {model_path} → eval_loss={loss:.4f}, perplexity={ppl:.2f}")
#     return loss, ppl

In [None]:
# Where the adapted model and tokenizer are persisted on Drive. The path is
# absolute (like the dataset path and the Drive mount point) so it does not
# depend on the kernel's current working directory.
dapt_dir = "/content/drive/MyDrive/model/dapt1"
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

dapt_args = TrainingArguments(
    output_dir=dapt_dir,
    per_device_train_batch_size=2,
    # Keep evaluation batches as small as training batches to limit memory use.
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    learning_rate=5e-6,
    save_steps=500,
    logging_steps=100,
    report_to=[]
)
trainer = Trainer(
    model=model,
    args=dapt_args,
    train_dataset=ds_tok["train"],
    # Held-out split created at load time; scored once after training below.
    eval_dataset=ds_tok["validation"],
    data_collator=collator
)
print("\n== Start DAPT ==")
trainer.train()

# Save first so the trained weights are on Drive even if evaluation fails.
model.save_pretrained(dapt_dir)
tokenizer.save_pretrained(dapt_dir)

# Report held-out loss and perplexity; training loss alone does not show how
# the adapted model does on unseen text.
eval_loss = trainer.evaluate()["eval_loss"]
print(f"[validation] eval_loss={eval_loss:.4f}, perplexity={math.exp(eval_loss):.2f}")

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]


== Start DAPT ==


Step,Training Loss
100,3.1887
200,3.1469
300,2.9986
400,2.9218
500,2.5918
600,2.5127
700,2.591


('./drive/MyDrive/model/dapt1/tokenizer_config.json',
 './drive/MyDrive/model/dapt1/special_tokens_map.json',
 './drive/MyDrive/model/dapt1/vocab.json',
 './drive/MyDrive/model/dapt1/merges.txt',
 './drive/MyDrive/model/dapt1/added_tokens.json',
 './drive/MyDrive/model/dapt1/tokenizer.json')