In [None]:
import os
import dotenv

# Load environment variables via python-dotenv; the cells below read
# HF_TOKEN from os.environ when downloading the tokenizer/model.
dotenv.load_dotenv()

In [None]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.1

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GemmaTokenizer

model_id = "google/gemma-7b"

# Quantization settings passed to from_pretrained: 4-bit weights using the
# "nf4" quant type, with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'])
# `model` has to be created here: the generation cells and the trainer cell
# all reference it, so leaving this line commented out makes a fresh-kernel
# "Run All" fail with NameError. device_map={"": 0} places the whole model on
# device 0, matching the "cuda:0" device the generation cells move inputs to.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0}, token=os.environ['HF_TOKEN'])

In [None]:
# Sample generation that runs before the training cell: one hand-written
# clue prompt is tokenized, completed by the model and printed.
device = "cuda:0"
text = "Given the following cryptic clue for a cryptic crossword, what is the answer?\n\nClue:\nPointed shoe parts with circular fitments.\n\nAnswer:\n"

# Tokenize to PyTorch tensors and move them to the generation device.
inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate at most 20 new tokens, then decode the first returned sequence
# with special tokens stripped.
outputs = model.generate(max_new_tokens=20, **inputs)
print(
    tokenizer.decode(
        outputs[0],
        skip_special_tokens=True,
    )
)

In [None]:
# Set WANDB_DISABLED in the process environment before the trainer is built,
# so the Hugging Face Trainer integration skips Weights & Biases reporting.
os.environ.update(WANDB_DISABLED="true")

In [None]:
from peft import LoraConfig

# LoRA adapter configuration handed to the trainer as `peft_config`:
# rank 8, causal-LM task type, adapters attached to the listed projection
# modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

In [None]:
# from datasets import load_dataset

# data = load_dataset("Abirate/english_quotes")
# data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Prompt layout for one training example. The wording and the blank line
# before "Clue:" match the prompt used by the generation cells, so the model
# is fine-tuned on the same layout it is queried with.
TEMPLATE = "Given the following cryptic clue for a cryptic crossword, what is the answer?\n\nClue:\n{title}\n\nAnswer:\n{content}"


class CrypticDataset(Dataset):
    """Cryptic-crossword clues rendered with TEMPLATE and tokenized.

    The CSV is read without a header row; column 1 is substituted as the clue
    and column 2 as the answer.

    Args:
        csv_file: Path (or anything ``pandas.read_csv`` accepts) of the clue CSV.
        tokenizer: Optional callable mapping a prompt string to an encoding.
            When omitted, the 'google/gemma-7b' tokenizer is loaded using the
            HF_TOKEN environment variable, as before.

    Attributes:
        tokenizer: The tokenizer used to encode prompts.
        data: Raw DataFrame as read from the CSV.
        ds: Series aligned with ``data`` holding one encoding per row.
    """

    def __init__(self, csv_file, tokenizer=None):
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained('google/gemma-7b', token=os.environ['HF_TOKEN'])
        self.tokenizer = tokenizer

        self.data = pd.read_csv(csv_file, header=None)

        prompts = self.data.apply(self.prompt, axis=1)
        # No return_tensors here: each example keeps whatever un-batched
        # encoding the tokenizer returns by default, so a padding collator can
        # batch examples of different lengths. Per-row return_tensors="pt"
        # would give every example an extra leading batch dimension.
        self.ds = prompts.apply(self.tokenizer)

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized prompt at position ``idx``.

        ``idx`` may be a torch tensor (e.g. when the indices handed to
        ``Subset`` are a tensor); it is converted to plain Python ints first.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # Serve the prepared encoding rather than the raw CSV row, which a
        # trainer/collator cannot consume.
        return self.ds.iloc[idx]

    def prompt(self, col):
        """Render one CSV row: ``col[1]`` is the clue, ``col[2]`` the answer."""
        return TEMPLATE.format(title=col[1], content=col[2])



In [None]:
import transformers
from trl import SFTTrainer
from torch.utils.data import Dataset, random_split, Subset

# Seed torch's global RNG (used by randperm below) for reproducibility
torch.manual_seed(42)

# Dedicated generator, seeded separately, used only for random_split
generator = torch.Generator()
generator.manual_seed(42)

dataset = CrypticDataset("uk_cryptics.csv")

# Shuffle the indices of the dataset
indices = torch.randperm(len(dataset))

# Select the first 1500 shuffled indices. They are converted to plain Python
# ints because Subset forwards each index unchanged to dataset[...], and a
# 0-dim tensor is not a reliable positional index for the underlying
# pandas lookup.
subset_indices = indices[:1500].tolist()

# Create a Subset of the dataset using the selected indices
subset_dataset = Subset(dataset, subset_indices)

# 60 / 20 / 20 train / validation / test split of the sampled subset; the
# test split takes whatever remains after integer truncation.
train_size = int(0.6 * len(subset_dataset))
val_size = int(0.2 * len(subset_dataset))
test_size = len(subset_dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(subset_dataset, [train_size, val_size, test_size], generator=generator)

print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Note: this shows entry 151 of the full dataset's `ds`, not an item drawn
# from train_dataset.
print("Example training sample:", dataset.ds[151])


In [None]:
# Show the tokenized entry stored under index label 152 of the full dataset.
print("Example training sample:", dataset.ds.loc[152])

In [None]:
# Short LoRA fine-tuning run: 10 optimizer steps, batch size 1 with 4-step
# gradient accumulation.
# NOTE(review): no `dataset_text_field` / `formatting_func` is passed and
# `train_dataset` is a torch Dataset; trl 0.7.x is believed to require one of
# them when packing is off and to leave torch Datasets un-tokenized — confirm
# against the pinned trl version.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        save_steps=200,  # larger than max_steps, so no intermediate checkpoint is written
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    peft_config=lora_config,
)
trainer.train()
# Save through the trainer's model: with `peft_config` set, SFTTrainer wraps
# the base model in a PEFT model, and saving that wrapper writes the trained
# LoRA adapter. Calling save_pretrained on the bare `model` variable would
# serialize the base model object instead.
trainer.model.save_pretrained("./weights")

In [None]:
# Sample generation that runs after the training cell, using the same
# hand-written clue prompt: tokenize, complete with the model, print.
device = "cuda:0"
text = "Given the following cryptic clue for a cryptic crossword, what is the answer?\n\nClue:\nPointed shoe parts with circular fitments.\n\nAnswer:\n"

# Tokenize to PyTorch tensors and move them to the generation device.
inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate at most 20 new tokens, then decode the first returned sequence
# with special tokens stripped.
outputs = model.generate(max_new_tokens=20, **inputs)
print(
    tokenizer.decode(
        outputs[0],
        skip_special_tokens=True,
    )
)