In [None]:
pip install transformers datasets

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint used throughout: a small GPT-2 variant (~82M params).
model_name = "distilgpt2"

# Fetch the matching tokenizer, then the causal-LM weights placed on the CPU.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to("cpu")
model

In [None]:
# Toy prompt/completion examples. Each completion starts with a space so that
# plain concatenation with its prompt yields a natural sentence.
pairs = [
    dict(prompt="Hello, my name is", completion=" OpenShift Bot."),
    dict(prompt="The capital of France is", completion=" Paris."),
    dict(prompt="2 + 2 equals", completion=" 4."),
    dict(prompt="Roses are red,", completion=" michael is blue."),
    dict(prompt="GPU stands for", completion=" Graphics Processing Unit."),
    # add a few more if you like
]

# One training line per pair: the prompt immediately followed by its completion.
lines = []
for pair in pairs:
    lines.append(pair["prompt"] + pair["completion"])

# Show the corpus, one example per output line.
print(*lines, sep="\n")

In [None]:
import pathlib, os

# Directory that holds the training corpus (adjust if your PVC is mounted elsewhere).
DATA_DIR = pathlib.Path("/mnt/hello-world-dataset")
DATA_DIR.mkdir(parents=True, exist_ok=True)
train_path = DATA_DIR / "train.txt"

# Persist the corpus as UTF-8 text: one whitespace-trimmed example per line.
with train_path.open("w", encoding="utf-8") as f:
    f.writelines(line.strip() + "\n" for line in lines)

# Confirm where the file landed and how large it is.
print(f"Saved to {train_path} ({train_path.stat().st_size} bytes)")

In [None]:
from datasets import load_dataset

# Read the corpus file written earlier through the generic "text" loader.
corpus_file = str(train_path)
dataset = load_dataset("text", data_files=corpus_file)

# Sanity check: show the dataset summary and the first record of the "train" split.
print(dataset)
first_record = dataset["train"][0]
print("Example record:", first_record)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset

# 0. load tiny corpus --------------------------------------------------------
CORPUS_FILE = "/mnt/hello-world-dataset/train.txt"
dataset = load_dataset("text", data_files=CORPUS_FILE)

# 1. tokenizer with a PAD token ---------------------------------------------
CHECKPOINT = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
# Reuse the end-of-sequence token as the pad token so that padding can be requested below.
tokenizer.pad_token = tokenizer.eos_token


# 2. tokenise ----------------------------------------------------------------
def tokenize(batch):
    """Encode the "text" entries of ``batch`` with the module-level ``tokenizer``.

    Every entry is padded or truncated to exactly 64 tokens and the encoding is
    requested as PyTorch tensors.

    Args:
        batch: mapping whose "text" entry holds the raw strings to encode.

    Returns:
        Whatever ``tokenizer(...)`` returns for these inputs.
    """
    # NOTE(review): return_tensors="pt" looks redundant, since the mapped dataset
    # is switched to torch format afterwards — confirm before removing.
    encode_options = {
        "return_tensors": "pt",
        "padding": "max_length",
        "truncation": True,
        "max_length": 64,
    }
    texts = batch["text"]
    return tokenizer(texts, **encode_options)

# Run the tokenizer over the dataset in batches, dropping the raw "text" column.
tokenised = dataset.map(function=tokenize, batched=True, remove_columns=["text"])

# Expose the remaining columns as torch.Tensor objects instead of Python lists.
tokenised.set_format(type="torch")

# 3. dataloader --------------------------------------------------------------
train_split = tokenised["train"]
loader = DataLoader(dataset=train_split, shuffle=True, batch_size=4)

# 4. tiny training loop (CPU or GPU) ----------------------------------------
SAVE_DIR = "/mnt/models"
NUM_EPOCHS = 50
LEARNING_RATE = 5e-5
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained("distilgpt2")
# Keep the embedding matrix the same size as the tokenizer's vocabulary.
model.resize_token_embeddings(len(tokenizer))
model.to(DEVICE)

optim = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

model.train()
for epoch in range(NUM_EPOCHS):
    epoch_loss, num_batches = 0.0, 0
    for batch in loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}  # now tensors ➜ .to() OK

        # Each row is padded to max_length with the pad (= EOS) token. Using
        # input_ids unchanged as labels would make the loss mostly about
        # predicting padding, so padded positions are set to -100, the label
        # value that the language-modelling loss ignores.
        labels = batch["input_ids"].clone()
        labels[batch["attention_mask"] == 0] = -100

        outputs = model(**batch, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
        optim.zero_grad()

        epoch_loss += loss.item()
        num_batches += 1

    # Report the mean over the epoch rather than whichever shuffled batch came
    # last; max() keeps the print well-defined even if the loader is empty.
    print(f"Epoch {epoch}: loss {epoch_loss / max(num_batches, 1):.4f}")

# Persist the fine-tuned weights together with the tokenizer files.
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)