In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_from_disk

In [None]:
# Load the raw training CSV.
df = pd.read_csv("/content/train.csv")

# Draw a small subset; the fixed random_state makes every run pick the
# same 200 rows.
sample = df.sample(n=200, random_state=1)

# Convert to a Hugging Face Dataset. After .sample() the frame carries a
# shuffled, non-contiguous index; preserve_index=False keeps it from being
# stored as an extra "__index_level_0__" column in the dataset.
stack = Dataset.from_pandas(sample, preserve_index=False)
print(stack[:5])

{'Id': [46425260, 40276958, 46167365, 46333205, 51061863], 'Title': ['LinkedIn in-app browser forcing video to fullscreen on iPhone', 'How to use "HTML form target self" ?', 'Getting through in Machine Learning', 'Using ConstraintLayout with custom Android Dialog', 'how to set a button as it plays an audio clip after 5 or 10 seconds in swift?'], 'Body': ["<p>We are using LinkedIn to share a link to an HTML5 interactive video. When the link is shared, by default it opens in LinkedIn's browser inside the app. The problem is that when the user starts playing the video, the browser automatically switches to fullscreen, hiding our custom controls. iOS allows inline video playback nowadays with <em>playsinline</em> attribute on the video element, but LinkedIn browser doesn't support the attribute. On iPad the video does play inline though and does not switch to fullscreen. We have tested this bug on iOS versions 10 and 11. On native Safari browser or Google Chrome there's no problem, the vid

In [None]:
# Persist the sampled dataset so the training cell can reload it from disk.
# NOTE(review): the directory is called "sample_150" although the sample
# holds 200 rows; the name is kept because the reload uses the same path.
sample_dir = "data/sample_150"
stack.save_to_disk(sample_dir)

Saving the dataset (0/1 shards):   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Base checkpoint used for LoRA fine-tuning
model_id = "Qwen/Qwen2-0.5B"

# LoRA settings for a causal-LM task: rank 8, alpha 32, dropout 0.1, bias="none"
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Tokenizer that belongs to the checkpoint (text -> token ids)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the pre-trained weights and wrap them with the LoRA adapters via peft
model = get_peft_model(
    AutoModelForCausalLM.from_pretrained(model_id),
    peft_config,
)

#Loading dataset
stack = load_from_disk("data/sample_150")

#Defining tokenization
def tokenize(example):
  """Tokenize one dataset row as "<Title> <Body>".

  Args:
    example: a single row with "Title" and "Body" fields. Either field may
      be None for rows with missing values.

  Returns:
    The tokenizer output for the joined text, truncated and padded to a
    fixed length of 128 tokens.
  """
  # A missing field would make the string concatenation raise TypeError;
  # treat it as empty text instead so one bad row cannot abort the map.
  title = example["Title"] or ""
  body = example["Body"] or ""
  text = title + " " + body
  return tokenizer(text, truncation=True, padding="max_length", max_length=128)

#Apply tokenization function on dataset
tokenized = stack.map(tokenize)

# Training configuration
args = TrainingArguments(
    output_dir="./output",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    # The language-modeling collator used below supplies the targets under
    # the "labels" key, so that is the label column the Trainer should know
    # about (not "input_ids").
    label_names=["labels"],
    save_total_limit=1,
    logging_dir='./logs',
    logging_steps=10,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its
    # replacement and takes the same tokenizer object.
    processing_class=tokenizer,
    # mlm=False -> causal-LM batches (no masked-token objective).
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Run fine-tuning, then save the fine-tuned LoRA adapter
trainer.train()
model.save_pretrained("lora-qwen2-adapter")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
10,2.6097
20,2.6263
