In [None]:
!pip install datasets trl bitsandbytes
!pip install flash-attn --no-build-isolation

## dataset

In [None]:
from datasets import load_dataset

# Download (or load from cache) the "train" split of the Inshorts English dataset
# from the Hugging Face Hub.
ds = load_dataset("shivam9980/Inshorts-english", split="train")

In [None]:
# Display the loaded dataset object as the cell output.
ds

In [None]:
# Hold out 2.5% of the rows as a test split; the fixed seed keeps the split repeatable.
# NOTE(review): this reassigns `ds`, so re-running the cell would call
# `train_test_split` on the already-split object — reload `ds` before re-running.
ds = ds.train_test_split(test_size = 0.025, seed = 3407)

In [None]:
# Display the result of the train/test split as the cell output.
ds

In [None]:
# Peek at the first training row: the article body and its reference headline.
ds["train"][0]["Content"], ds["train"][0]["Headline"]

In [None]:
user_prompt = '''Generate a concise news headline based on the following news content. The headline should clearly and accurately summarize the key point of the article. Avoid exaggeration or misleading phrasing.

News Content: {content}'''

input_prompt = '''<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Generate a concise news headline based on the following news content. The headline should clearly and accurately summarize the key point of the article. Avoid exaggeration or misleading phrasing.

News Content: {content}<|im_end|>
<|im_start|>assistant
'''

In [None]:
def map_func(datapoint):
  """Add a "text" field holding the full chat-formatted training sample.

  The sample is the shared `input_prompt` template (system turn, user turn with the
  article, opened assistant turn) followed by the reference headline and the
  end-of-turn token.

  Args:
    datapoint: a dataset row providing "Content" (article body) and "Headline".

  Returns:
    The same row object, with "text" added (the row is modified in place).
  """
  # Reuse the module-level `input_prompt` instead of a hand-copied template, so the
  # training text cannot drift from the prompt defined in the earlier cell.
  # `str.format` only parses the template, so braces inside the article are safe.
  prompt = input_prompt.format(content=datapoint["Content"])
  datapoint["text"] = f'{prompt}{datapoint["Headline"]}<|im_end|>'
  return datapoint
# Apply `map_func` across `ds` so every row gains the "text" field.
ds = ds.map(map_func)

In [None]:
# Print (rather than display) one formatted sample so newlines render literally.
print(ds["train"][0]["text"])

In [None]:
# Shuffle the training split with a fixed seed.
# Disabled alternative kept by the author: also cap the run to the first 110000 rows.
# train_dataset = ds["train"].shuffle(seed=3407).select(range(110000))
train_dataset = ds["train"].shuffle(seed=3407)


## model


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Base checkpoint to fine-tune.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# NOTE(review): the weights are loaded in float16 and the SFTConfig in view sets no
# fp16/bf16 flag, so the unfrozen layers would presumably be updated directly in
# half precision — confirm this is intended (float32 or bfloat16 is the usual choice).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="cuda"
)
# Tokenizer that matches the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)


## main

In [None]:
def freeze_all_param(model):
  """Disable gradient tracking on every parameter of `model` (in place)."""
  for weight in model.parameters():
    weight.requires_grad = False
def unfreeze_last_n_layers(model, n: int, exclude_modules = ("layernorm",)):
  """Make the parameters of the last `n` decoder layers trainable.

  Inside the selected layers, a parameter whose name contains any of the
  `exclude_modules` substrings is explicitly frozen; every other parameter is
  unfrozen. Parameters outside the selected layers are left untouched.

  Args:
    model: object exposing `config.num_hidden_layers` and `model.layers`.
    n: number of trailing layers to unfreeze; 0 is a no-op.
    exclude_modules: substrings of parameter names to keep frozen. A single string
      is treated as one substring; `None` means "exclude nothing".

  Raises:
    ValueError: if `n` is negative or larger than the number of layers. Without this
      check an oversized `n` makes the slice start negative and silently selects
      the wrong layers.
  """
  num_layers = model.config.num_hidden_layers
  if not 0 <= n <= num_layers:
    raise ValueError(f"n must be between 0 and {num_layers}, got {n}")

  # Normalise the exclusion list: a bare string would otherwise be iterated
  # character by character.
  if exclude_modules is None:
    excluded = ()
  elif isinstance(exclude_modules, str):
    excluded = (exclude_modules,)
  else:
    excluded = tuple(exclude_modules)

  for module in model.model.layers[num_layers - n:num_layers]:
    for name, param in module.named_parameters():
      param.requires_grad = not any(pattern in name for pattern in excluded)
def check_which_param_are_trainable(model):
  """Print the name of each parameter of `model` that requires gradients.

  The listing is framed by a "_-" banner line above and below, each name is wrapped
  between two "**" lines, and a blank line is printed at the end.
  """
  banner = "_-" * 5
  marker = "*" * 2
  print(banner)
  trainable_names = (
    name for name, param in model.named_parameters() if param.requires_grad
  )
  for name in trainable_names:
    print(marker, name, marker, sep="\n")
  print(banner)
  print()


In [None]:
# Freeze the whole model, re-enable training for the last 5 decoder layers
# (the helper's default exclusion keeps their "layernorm" parameters frozen),
# then print what ended up trainable as a sanity check.
freeze_all_param(model)
unfreeze_last_n_layers(model, 5)
check_which_param_are_trainable(model)

In [None]:
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
# Collator keyed on the chat-turn markers used in the "text" field. Per TRL's
# documentation it restricts the loss to the assistant completion.
collator = DataCollatorForCompletionOnlyLM(
    instruction_template="<|im_start|>user\n",
    response_template="<|im_start|>assistant\n",
    tokenizer=tokenizer,
)

per_device_batch = 8

training_args = SFTConfig(
    output_dir="qwen2.5-0.5B-Instruct-Inshort",
    seed=3407,
    report_to="none",
    # --- batching / sequence length ---
    per_device_train_batch_size=per_device_batch,
    gradient_accumulation_steps=1,
    max_seq_length=312,
    # --- optimisation schedule ---
    optim="adamw_8bit",
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    warmup_steps=50,
    max_steps=38412,  # author's note: 1 and a 1/2 epoch
    # --- memory ---
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # --- logging / checkpoints ---
    logging_steps=1500,
    # len(train_dataset) // 8: with batch size 8 and no accumulation this is about
    # one checkpoint per pass over the data (assumes a single device — TODO confirm).
    save_steps=len(train_dataset) // per_device_batch,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=collator,
    processing_class=tokenizer,
)

In [None]:
import numpy as np
def count_trainable_parameters(model):
    """Return how many scalar weights of `model` currently require gradients.

    Uses `Tensor.numel()` so the result is always a plain Python `int`. The earlier
    `np.prod(p.size())` approach yielded a numpy scalar, and a float for 0-dim
    parameters.

    Args:
        model: any object exposing a PyTorch-style `parameters()` iterator.

    Returns:
        int: total element count over parameters with `requires_grad=True`
        (0 when nothing is trainable).
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [None]:
# Show how many weights will actually be updated after the freeze/unfreeze step.
count_trainable_parameters(model)

In [None]:
# Run the fine-tuning loop with the configuration built above (long-running).
trainer.train()

In [None]:
# Upload the fine-tuned weights and the tokenizer to the Hugging Face Hub.
# NOTE(review): the repository id is an empty string here (presumably redacted) —
# fill in "<user>/<repo>" before running; both calls should target the same repo.
model.push_to_hub("")
tokenizer.push_to_hub("")