In [12]:
import os
import dotenv

# Read key/value pairs from a local .env file (when one is found) into os.environ.
# Later cells look up HF_TOKEN via os.environ — presumably it is defined in that file.
dotenv.load_dotenv()

True

In [None]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.1

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GemmaTokenizer

# Hub identifier of the base checkpoint used for both the tokenizer and the model.
model_id = "google/gemma-7b"

# Load the weights in 4-bit NF4 form and run the compute in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=os.environ['HF_TOKEN'],
)

# device_map={"": 0} maps the whole model onto device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=os.environ['HF_TOKEN'],
    device_map={"": 0},
    quantization_config=bnb_config,
)

In [None]:
# Baseline check: prompt the model (before the training cell runs) with a single clue
# and print up to 20 newly generated tokens.
text = (
    "Given the following cryptic clue for a cryptic crossword, what is the answer?\n\n"
    "Clue:\nPointed shoe parts with circular fitments.\n\n"
    "Answer:\n"
)
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(max_new_tokens=20, **inputs)
print(
    tokenizer.decode(
        outputs[0],
        skip_special_tokens=True,
    )
)

In [None]:
# Turn off Weights & Biases reporting; set before the trainer is constructed further down.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
from peft import LoraConfig

# LoRA adapter settings: rank 8, applied to the listed projection modules,
# for a causal language-modelling task.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

In [9]:
# from datasets import load_dataset

# data = load_dataset("Abirate/english_quotes")
# data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Prompt layout shared by training examples: {title} is the clue, {content} the answer.
TEMPLATE = "Given the following cryptic clue for a cryptic crossword, what is the answer?\n\nClue:\n{title}\n\nAnswer:\n{content}"


class CrypticDataset(Dataset):
    """Map-style dataset of tokenised clue/answer prompts read from a header-less CSV.

    Every CSV row is rendered through ``TEMPLATE`` (column 1 is the clue,
    column 2 the answer) and tokenised once in ``__init__``.

    Attributes:
        tokenizer: The callable used to tokenise each prompt.
        data: The raw CSV contents as a DataFrame of strings.
        ds_prompts: List of rendered prompt strings, one per CSV row.
        ds: List of ``{"input_ids": [...], "attention_mask": [...]}`` dicts,
            aligned with ``ds_prompts``.
    """

    def __init__(self, csv_file, tokenizer=None):
        """Load ``csv_file``, render the prompts and tokenise them.

        Args:
            csv_file: Path (or file-like object) of a CSV without a header row.
            tokenizer: Optional callable taking a string and returning a mapping
                with ``input_ids`` and ``attention_mask``. When omitted, the
                ``google/gemma-7b`` tokenizer is loaded using ``HF_TOKEN`` from
                the environment.

        Raises:
            KeyError: If no tokenizer is given and ``HF_TOKEN`` is not set.
        """
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained('google/gemma-7b', token=os.environ['HF_TOKEN'])
        self.tokenizer = tokenizer

        # Read every field as plain text: with pandas' defaults, answers such as
        # "NULL" or "NA" would be parsed as missing values and rendered as "nan".
        self.data = pd.read_csv(csv_file, header=None, dtype=str, keep_default_na=False)

        # Render first, then tokenise, keeping both so examples can be inspected.
        self.ds_prompts = [self.prompt(row) for _, row in self.data.iterrows()]
        self.ds = [self._encode(text) for text in self.ds_prompts]

    def __len__(self):
        """Return the number of examples (one per CSV row)."""
        return len(self.ds)

    def __getitem__(self, idx):
        """Return the tokenised example at ``idx`` as a dict of unpadded id lists."""
        return self.ds[idx]

    def prompt(self, col):
        """Render one CSV row (indexable by column number) into the prompt text."""
        return TEMPLATE.format(title=col[1], content=col[2])

    def _encode(self, text):
        """Tokenise ``text`` and keep only the fields needed for training."""
        # No return_tensors here: plain per-example lists are left for the
        # data collator to pad and batch.
        encoded = self.tokenizer(text)
        return {
            "input_ids": list(encoded["input_ids"]),
            "attention_mask": list(encoded["attention_mask"]),
        }



Given the following cryptic clue for a cryptic crossword, what is the answer?

Clue:
Acquisitive chap, as we see it (8)

Answer:
COVETOUS


In [None]:
import transformers
from trl import SFTTrainer

# Build the training set from the local CSV of clues.
data = CrypticDataset("uk_cryptics.csv")

trainer = SFTTrainer(
    model=model,
    # CrypticDataset is a single map-style torch dataset, not a DatasetDict,
    # so it has no "train" split to index — pass it as-is.
    train_dataset=data,
    # Reuse the tokenizer loaded alongside the model instead of letting the
    # trainer fetch a second copy.
    tokenizer=tokenizer,
    # NOTE(review): with packing disabled, trl 0.7.x appears to require either
    # dataset_text_field or formatting_func; torch datasets look to be passed
    # through untouched, so this name should never be read — confirm against
    # the pinned trl version.
    dataset_text_field="text",
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    peft_config=lora_config,
)
trainer.train()

In [None]:
# Ask the same clue again now that the training cell has run, printing up to
# 20 newly generated tokens.
text = "\n\n".join(
    [
        "Given the following cryptic clue for a cryptic crossword, what is the answer?",
        "Clue:\nPointed shoe parts with circular fitments.",
        "Answer:\n",
    ]
)
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(max_new_tokens=20, **inputs)
print(
    tokenizer.decode(
        outputs[0],
        skip_special_tokens=True,
    )
)