In [None]:
# 1. Install required libraries
!pip install -U datasets fsspec gcsfs
!pip install -U transformers peft accelerate bitsandbytes


In [None]:
# 2. Import libraries
import json
import shutil

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model
from google.colab import drive


In [None]:
# 3. Mount Google Drive
# The dataset files copied in a later cell live under this mount point.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)


In [None]:
# 4. Define function to load JSONL files
def load_jsonl(path: str) -> list[dict]:
    """Read a JSON Lines file and return its records in file order.

    Each non-blank line is stripped of surrounding whitespace and parsed with
    ``json.loads``; blank or whitespace-only lines are skipped.

    Args:
        path: Location of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list with one parsed object per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            payload = raw_line.strip()
            if not payload:
                continue
            records.append(json.loads(payload))
    return records


In [None]:
# 5. Copy file (Drive → Colab local)
# shutil.copy raises if a source file is missing. A failed `!cp` only prints an error,
# which would let later cells run against absent or stale files in /content.
for drive_path, local_path in (
    ("/content/drive/MyDrive/Noa/first_emotion_dataset_train.jsonl", "/content/train.jsonl"),
    ("/content/drive/MyDrive/Noa/first_emotion_dataset_valid.jsonl", "/content/valid.jsonl"),
):
    shutil.copy(drive_path, local_path)


In [None]:
# 6. Load and transform dataset
# For each split: read the JSONL records into a list, then wrap the list as a Dataset.
train_data = load_jsonl("/content/train.jsonl")
train_dataset = Dataset.from_list(train_data)

valid_data = load_jsonl("/content/valid.jsonl")
valid_dataset = Dataset.from_list(valid_data)


In [None]:
# 7. Configure 4-bit quantization
# Options: 4-bit loading, "nf4" quantization type, double quantization enabled,
# and float16 as the compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.float16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)


In [None]:
# 8. Load tokenizer and model (quantized + auto device mapping)
BASE_MODEL_ID = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# The model is loaded with the 4-bit config defined earlier; device placement is
# left to device_map="auto".
model_load_options = {
    "device_map": "auto",
    "quantization_config": bnb_config,
    "torch_dtype": torch.float16,
}
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **model_load_options)

In [None]:
# 9. Set up and apply LoRA
# Adapters are attached only to the modules named "q_proj" and "v_proj".
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],  # Restructure later
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)
# Rebinds `model` to the PEFT-wrapped model; later cells train and save this wrapper.
model = get_peft_model(model, lora_config)

In [None]:
# 10. Define tokenization function
def tokenize(example: dict, max_length: int = 512, pad_to_max_length: bool = True) -> dict:
    """Convert one instruction/input/output record into a causal-LM training example.

    A causal LM computes its loss by comparing ``labels`` position-by-position with
    the logits for ``input_ids``, so both must have the same length and the target
    text must be part of ``input_ids``. This function therefore builds a single
    sequence ``prompt + output + EOS`` and sets the label to ``-100`` (the index
    ignored by the loss) on prompt and padding positions, so only the response
    tokens are learned.

    Args:
        example: Record with the string fields ``instruction``, ``input`` and ``output``.
        max_length: Maximum (and, when padding, exact) sequence length in tokens.
        pad_to_max_length: When True, right-pad every example to ``max_length`` so
            batches can be stacked without a padding-aware data collator. Set to
            False if a collator that pads ``input_ids`` and ``labels`` is used.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels`` lists of equal length.

    Raises:
        ValueError: If ``max_length`` is smaller than 2 (no room for a token plus EOS).
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2")

    prompt = f"{example['instruction']}\n{example['input']}"
    eos_id = tokenizer.eos_token_id

    # Tokenize prompt and response separately so the boundary between them is exact.
    # The response gets priority in the length budget, since it carries the supervised
    # tokens; EOS is appended so the model learns where the answer ends.
    response_ids = tokenizer(
        example["output"],
        add_special_tokens=False,
        truncation=True,
        max_length=max_length - 1,
    )["input_ids"] + [eos_id]

    prompt_budget = max_length - len(response_ids)
    prompt_ids = tokenizer(
        prompt,
        add_special_tokens=False,
        truncation=True,
        max_length=max_length,
    )["input_ids"][:prompt_budget]

    input_ids = prompt_ids + response_ids
    attention_mask = [1] * len(input_ids)
    labels = [-100] * len(prompt_ids) + response_ids

    if pad_to_max_length:
        pad_count = max_length - len(input_ids)
        # Fall back to EOS when the tokenizer defines no pad token; padded positions
        # are masked out of both attention and loss, so the id itself is not learned.
        pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id
        input_ids = input_ids + [pad_id] * pad_count
        attention_mask = attention_mask + [0] * pad_count
        labels = labels + [-100] * pad_count

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

In [None]:
# 11. Apply tokenization
# Run `tokenize` one example at a time over the train split, then the validation split.
tokenized_train, tokenized_valid = (
    split.map(tokenize, batched=False)
    for split in (train_dataset, valid_dataset)
)

In [None]:
# 12. Set training arguments (optimized for Colab environment)
# `eval_strategy` is the current name of this option. The older `evaluation_strategy`
# keyword is rejected by recent transformers releases, which is what the unpinned
# `pip install -U transformers` in the install cell pulls in.
training_args = TrainingArguments(
    output_dir="./phi2_emotion_lora",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_strategy="steps",
    logging_steps=50,
    save_steps=200,   # checkpoint every 200 steps ...
    eval_steps=200,   # ... and evaluate at the same interval
    num_train_epochs=2,
    learning_rate=2e-4,
    save_total_limit=2,
    fp16=True,
    report_to="none"
)

In [None]:
# 13. Define Trainer and start training
# The Trainer gets the LoRA-wrapped model, the training arguments and both tokenized splits.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_valid,
}
trainer = Trainer(**trainer_kwargs)

# Left as the final expression so the cell displays the value returned by train().
trainer.train()

In [None]:
# 14. Save the trained model
# The model and the tokenizer are both written to the same directory.
SAVE_DIR = "./phi2_emotion_lora"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)