# Fine-tuning GPT-2 in Colab (End-to-End)

This notebook demonstrates:
1. Downloading a **small dataset**.
2. Preprocessing with **spaCy** (tokenization, lemmatization) and **NLTK** (English stopword list for stopword removal).
3. Fine-tuning GPT-2 with **PyTorch + Hugging Face**.
4. Testing the fine-tuned model with text generation.



In [None]:

# Install (or upgrade, -U) the Python packages used below, with quiet output (-q).
!pip install -qU transformers datasets torch nltk spacy
# Download spaCy's small English pipeline, loaded later via spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm
import nltk
# Fetch the NLTK resources "punkt" and "stopwords"
# (the stopword list is read in the preprocessing cell).
nltk.download("punkt")
nltk.download("stopwords")



In [None]:
from datasets import load_dataset

# Load the WikiText-2 corpus (raw variant), a small dataset.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")


def _shuffled_head(split, count):
    """Shuffle ``split`` with the fixed seed 42 and keep its first ``count`` rows."""
    return split.shuffle(seed=42).select(range(count))


# Keep fine-tuning quick: 200 training rows and 50 validation rows.
small_train = _shuffled_head(dataset["train"], 200)
small_val = _shuffled_head(dataset["validation"], 50)

# Show the first example of each subset.
for first_row in (small_train[0], small_val[0]):
    print(first_row)


In [None]:

import spacy
from nltk.corpus import stopwords

# NLTK's English stopword list (as a set for fast membership tests) and
# spaCy's small English pipeline.
stop_words = set(stopwords.words("english"))
nlp = spacy.load("en_core_web_sm")


def preprocess(text):
    """Return ``text`` reduced to space-joined, lowercased lemmas.

    Tokens that spaCy flags as punctuation or whitespace are dropped, as are
    tokens whose lowercased surface form appears in ``stop_words``.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_punct or tok.is_space:
            continue
        if tok.text.lower() in stop_words:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return " ".join(kept_lemmas)

# Apply preprocessing to the small subsets selected earlier. Mapping the full
# DatasetDict would run spaCy over every WikiText-2 row and leave
# small_train / small_val unused.
from datasets import DatasetDict

dataset = DatasetDict({"train": small_train, "validation": small_val})
dataset = dataset.map(lambda ex: {"clean_text": preprocess(ex["text"])})

# Drop rows whose cleaned text is empty: once padded to a fixed length they
# would consist of padding only and contribute no training signal.
dataset = dataset.filter(lambda ex: len(ex["clean_text"]) > 0)
print(dataset['train'][0])


In [None]:

from transformers import GPT2TokenizerFast

# Load the pretrained GPT-2 tokenizer; if it reports no padding token,
# register "<PAD>" so fixed-length padding can be used below.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(dict(pad_token="<PAD>"))


def tokenize_function(examples):
    """Tokenize the ``clean_text`` entries of ``examples``.

    Sequences are truncated and padded to exactly 64 tokens. The function is
    applied with ``batched=True`` in the next step.
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 64}
    texts = examples["clean_text"]
    return tokenizer(texts, **encode_options)

# Tokenize in batches and discard the two text columns afterwards.
text_columns = ["text", "clean_text"]
tokenized_ds = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=text_columns,
)

# Have indexing return PyTorch tensors for the listed columns.
tensor_columns = ["input_ids", "attention_mask"]
tokenized_ds.set_format(type="torch", columns=tensor_columns)
print(tokenized_ds['train'][0])


In [None]:

from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Start from the pretrained GPT-2 weights. The embedding matrix is resized to
# the tokenizer's vocabulary size because the tokenizer may have gained a
# "<PAD>" token.
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# mlm=False selects causal (next-token) language modelling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./gpt2-small-finetuned",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    save_strategy="no",  # do not write intermediate checkpoints
    logging_steps=10,
)

# Trainer expects individual splits. `tokenized_ds` is a DatasetDict, so the
# "train" and "validation" splits are passed explicitly instead of the dict.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    data_collator=data_collator,
)

trainer.train()


In [None]:

from transformers import pipeline

from transformers import set_seed

# After training, the model may still be in training mode (dropout active);
# switch to inference mode before sampling.
model.eval()

# Sampling (do_sample=True) is stochastic; fix the seed so a re-run of the
# notebook reproduces the same generated text.
set_seed(42)

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "Artificial intelligence is"
# Sample up to 40 new tokens using top-k (50) and nucleus (0.95) filtering.
outputs = generator(prompt, max_new_tokens=40, do_sample=True, top_k=50, top_p=0.95)

print("Generated text:\n", outputs[0]["generated_text"])


In [None]:

# Persist the fine-tuned model and the tokenizer into the same local directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./final_gpt2_model")
print("Model saved to ./final_gpt2_model")
