In [1]:
# Switch off Weights & Biases logging via its environment flag.
# This runs in the first cell, ahead of the transformers import below.
import os

os.environ.update({"WANDB_DISABLED": "true"})


In [2]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    GPT2LMHeadModel,
    TrainingArguments,
    Trainer,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fetch WikiText-2 and keep only a small prefix of each split so that the
# rest of the notebook stays quick to run.
dataset = load_dataset("wikitext", name="wikitext-2-v1")
for split_name, row_limit in (("train", 1000), ("validation", 200)):
    # First 1k training rows / first 200 validation rows.
    dataset[split_name] = dataset[split_name].select(range(row_limit))

# Tokenizer: GPT-2 ships without a pad token, so the EOS token doubles as one.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [4]:
def tokenize(example):
    """Encode the ``text`` field of a batch with the notebook's tokenizer.

    Every sequence is truncated or padded to exactly 128 tokens.

    Args:
        example: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encoded = tokenizer(
        example["text"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded


# Tokenize in batches and drop the raw text column afterwards.
tokenized = dataset.map(tokenize, remove_columns=["text"], batched=True)

In [5]:
# Group texts into fixed-size blocks
block_size = 128


def group_texts(examples):
    """Pack a batch of tokenized rows into contiguous blocks of ``block_size``.

    All rows of the batch are concatenated into one token stream, which is
    then cut into consecutive chunks of ``block_size`` tokens. A trailing
    remainder shorter than ``block_size`` is dropped.

    When the batch has an ``"attention_mask"`` column, positions whose mask
    value is 0 (padding added by the tokenizer) are removed before packing.
    Otherwise the padding ids would end up inside the blocks and be copied
    into ``labels``, so the loss would be computed on padding.

    Args:
        examples: Mapping with an ``"input_ids"`` entry (list of token-id
            lists) and optionally an ``"attention_mask"`` entry of the same
            shape.

    Returns:
        dict with ``"input_ids"`` and ``"labels"`` (equal content, separate
        lists). If the input had an ``"attention_mask"``, the result also
        has one, filled with 1s, so the column is replaced together with
        ``"input_ids"`` and all returned columns share one row count.
    """
    ids_rows = examples["input_ids"]
    has_mask = "attention_mask" in examples.keys()

    if has_mask:
        # Keep only real tokens; the nested comprehension is linear in the
        # number of tokens (``sum(rows, [])`` re-copies the list every step).
        tokens = [
            tok
            for ids, mask in zip(ids_rows, examples["attention_mask"])
            for tok, keep in zip(ids, mask)
            if keep
        ]
    else:
        tokens = [tok for ids in ids_rows for tok in ids]

    # Largest multiple of block_size that fits in the stream.
    total_len = (len(tokens) // block_size) * block_size
    blocks = [tokens[i:i + block_size] for i in range(0, total_len, block_size)]

    result = {
        "input_ids": blocks,
        # Independent copies so later edits to one column cannot alias the other.
        "labels": [list(block) for block in blocks],
    }
    if has_mask:
        # Every surviving position is a real token.
        result["attention_mask"] = [[1] * block_size for _ in blocks]
    return result

# Apply the block grouping to every split, one batch at a time.
lm_dataset = tokenized.map(function=group_texts, batched=True)

# Pretrained GPT-2 with its embedding table sized to the tokenizer's length.
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(new_num_tokens=len(tokenizer))


Embedding(50257, 768)

In [None]:
# Training configuration (optimized for CPU)
training_args = TrainingArguments(
    # Run identity and output locations
    run_name="gpt2-local-cpu",
    output_dir="./gpt2-local-checkpoints",
    logging_dir="./logs",
    # Schedule: a single epoch, with evaluation and saving set to "epoch"
    num_train_epochs=1,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    logging_steps=50,
    # Batch sizes
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Precision and external integrations
    fp16=False,  # IMPORTANT: must be False on CPU
    report_to="none",
    push_to_hub=False,
)

# Trainer wired to the model and the block-grouped train/validation splits
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=lm_dataset["validation"],
    train_dataset=lm_dataset["train"],
)

: 

In [None]:
# Train: run the training loop on the `trainer` built in the previous cell,
# using the settings held in `training_args`.
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


In [None]:
# Continue training from a saved checkpoint instead of starting from scratch.
# This cell needs the `trainer` object created in the Trainer cell above; the
# recorded output below shows the NameError raised when it is run without it.
# NOTE(review): presumably `True` resumes from the latest checkpoint under
# `output_dir` — confirm against the Trainer documentation.
trainer.train(resume_from_checkpoint=True)

NameError: name 'trainer' is not defined

In [None]:
import math
from transformers import pipeline

# ✅ Score the trained model on the trainer's evaluation split and report
# perplexity, i.e. exp(eval loss)
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")

# ✅ Optional: Install Gradio for web interface
# (You may skip this if running offline or headless)
!pip install gradio --quiet

# ✅ Setup Gradio Interface for Next Word Prediction
import gradio as gr
import torch

def predict_next_word(prompt):
    """Extend ``prompt`` by a single generated token.

    Args:
        prompt: Text to continue. An empty (or ``None``) prompt returns an
            empty string without calling the model, since there are no
            tokens to condition on.

    Returns:
        The decoded prompt followed by one newly generated token, with
        special tokens stripped.
    """
    if not prompt:
        return ""

    # Put the encoded prompt on the model's device; the Trainer may have
    # moved the model off the CPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1,
            # Pass the pad id explicitly so generate() does not have to
            # fall back to a default.
            pad_token_id=tokenizer.pad_token_id,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Text-in / text-out web UI around predict_next_word.
demo = gr.Interface(
    fn=predict_next_word,
    inputs="text",
    outputs="text",
    title="Next Word Predictor",
)
demo.launch(share=True)

# ✅ Alternatively: generate text through a Hugging Face pipeline built on
# the same model and tokenizer
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

prompt = "The capital of France is"
output = text_generator(prompt, num_return_sequences=1, max_length=10)
first_sequence = output[0]
print("Generated Text:", first_sequence["generated_text"])


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


NameError: name 'trainer' is not defined