# 🚀 Language Model Implementation and Analysis — GPT-2 Fine-Tuning

**Author:** Misba Sikandar  
**Project Level:** Advanced  
**Topic:** Natural Language Processing (NLP) — Language Model Deployment  
**Model Chosen:** GPT-2 (by OpenAI)  
**Environment:** Python, Jupyter Notebook, Transformers Library  

---

### 📘 Objective
To implement and analyze a Language Model (LM) — GPT-2 — by fine-tuning it on a text dataset, exploring its text generation capabilities, and understanding its performance and limitations.

---


In [None]:
# 🪜 Step 1: Install all necessary libraries

# --- Install required packages (if not already installed) ---
import sys
import subprocess

def install(package):
    """Install ``package`` with pip, using the interpreter that runs this notebook.

    ``subprocess.check_call`` raises ``CalledProcessError`` when pip exits
    with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Install tqdm and ipywidgets, but only when they cannot already be imported.
# Each probe import binds the module name as a side effect; a failed import
# triggers a pip install through install() instead.
try:
    import tqdm
except ImportError:
    install("tqdm")

try:
    import ipywidgets
except ImportError:
    install("ipywidgets")

# --- Import required libraries ---
# NOTE: this rebinds the name `tqdm` from the module probed above to the
# notebook progress-bar class.
from tqdm.notebook import tqdm
# NOTE(review): pandas is neither imported nor installed by this cell —
# confirm it is available before relying on this call.
tqdm.pandas()  # Enable progress bars for pandas

# Only the classes are imported here; no weights are downloaded or loaded yet.
from transformers import GPT2Tokenizer, GPT2LMHeadModel

print("✅ Step 1 complete: Environment ready, tqdm & ipywidgets imported, GPT-2 tokenizer and model ready to load.")


In [None]:
# ⚙️ Step 2: Import all necessary libraries

import os  # used in Step 3 to check that data.txt exists
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# NOTE: rebinds `tqdm` once more — Step 1 imported it from tqdm.notebook,
# from here on the name refers to the tqdm.auto variant.
from tqdm.auto import tqdm
import torch

print("All libraries imported successfully ✅")


In [None]:
# 📁 Step 3: Confirm that the training corpus (data.txt) is present

data_path = "data.txt"

# Prints True when the path exists relative to the current working directory.
dataset_found = os.path.exists(data_path)
print(dataset_found)


In [None]:
# 🧠 Step 4: Read the corpus into memory and preview it

# Text mode is open()'s default, so only the encoding needs to be spelled out.
with open("data.txt", encoding="utf-8") as f:
    text_data = f.read()

# Show the first 500 characters as a quick sanity check
preview = text_data[:500]
print(preview)


In [None]:
# 🧩 Step 5: Load GPT-2 tokenizer and model

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the GPT-2 tokenizer. The code below (and the dataset code in Step 6)
# needs a `tokenizer` object, so it has to be created in this cell.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 has no padding token by default; register one so that examples can be
# padded to a fixed length (same token as the one added in Step 7).
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Because we added a pad_token to the tokenizer, we must resize the model embeddings
# Using mean_resizing=False to disable the info message about new embeddings
model.resize_token_embeddings(len(tokenizer), mean_resizing=False)

# Ensure the model uses the correct pad token
model.config.pad_token_id = tokenizer.pad_token_id

print("Tokenizer and model loaded successfully ✅")



In [None]:
# 🧱 Step 6: Prepare dataset for training

from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Fixed-length causal-LM training examples built from raw text.

    The text is tokenized in full and split into consecutive windows of
    ``block_size`` tokens, so the whole corpus contributes training examples
    rather than only its first ``block_size`` tokens. The last window of each
    document is right-padded with the tokenizer's pad token.

    Args:
        text: A single string, or an iterable of strings (documents). Each
            document is windowed separately; a document that yields no tokens
            contributes no examples.
        tokenizer: Callable that returns a mapping with an ``"input_ids"``
            list for a string, and exposes ``pad_token_id``.
        block_size: Number of tokens per example.

    Raises:
        ValueError: If ``block_size`` is not positive, or the tokenizer has
            no pad token.
    """

    def __init__(self, text, tokenizer, block_size=128):
        if block_size <= 0:
            raise ValueError("block_size must be a positive integer")

        pad_id = tokenizer.pad_token_id
        if pad_id is None:
            raise ValueError(
                "tokenizer has no pad token; add one before building the dataset"
            )

        documents = [text] if isinstance(text, str) else list(text)

        id_rows = []
        mask_rows = []
        for document in documents:
            # Tokenize without truncation so no part of the document is lost.
            token_ids = list(tokenizer(document)["input_ids"])
            for start in range(0, len(token_ids), block_size):
                window = token_ids[start:start + block_size]
                missing = block_size - len(window)
                id_rows.append(window + [pad_id] * missing)
                mask_rows.append([1] * len(window) + [0] * missing)

        # An explicit dtype and shape keep the tensors well-formed even when
        # there are zero rows.
        shape = (len(id_rows), block_size)
        self.input_ids = torch.tensor(id_rows, dtype=torch.long).reshape(shape)
        self.attn_masks = torch.tensor(mask_rows, dtype=torch.long).reshape(shape)

        # Padding positions must not contribute to the loss: -100 is the
        # label value the language-modeling loss ignores.
        self.labels = self.input_ids.clone()
        self.labels[self.attn_masks == 0] = -100

    def __len__(self):
        """Return the number of ``block_size``-token examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the tensors for example ``idx`` as a model-ready dict."""
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attn_masks[idx],
            "labels": self.labels[idx]
        }

# Wrap the raw corpus in a TextDataset (default block size) and serve it in
# shuffled mini-batches of two examples.
dataset = TextDataset(text=text_data, tokenizer=tokenizer)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=2)

print("Dataset and DataLoader ready ✅")


In [None]:
import os

# Set WANDB_DISABLED before the Trainer is created so the training run does
# not try to report to Weights & Biases.
os.environ.update({"WANDB_DISABLED": "true"})


In [None]:
# 🚀 Step 7: Fine-tune the model

# =====================================================
# STEP 7: INITIALIZE TRAINER AND START TRAINING
# =====================================================

from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# ---------------------------------------
# Reload tokenizer and model
# ---------------------------------------
# Resume from a previously saved fine-tuned checkpoint when one exists;
# otherwise start from the stock GPT-2 weights.
model_path = "./models/gpt2-finetuned"  # Change if your model path differs
try:
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
except (OSError, ValueError) as load_error:
    # Only "checkpoint missing or unreadable" is treated as recoverable.
    # Anything else (KeyboardInterrupt, out-of-memory, ...) propagates
    # instead of being silently swallowed.
    print(f"No usable checkpoint at {model_path} ({type(load_error).__name__}); starting from base GPT-2.")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    # GPT-2 has no pad token by default; add one and grow the embedding
    # matrix to cover the new vocabulary entry.
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Keep the model config in sync with the tokenizer's pad token, as Step 5 does.
model.config.pad_token_id = tokenizer.pad_token_id

# ---------------------------------------
# Data collator and training arguments
# ---------------------------------------
from datasets import load_dataset

# Load your dataset again.
# Use the same relative file that Steps 3-4 read, rather than an absolute
# path that only exists on one machine.
dataset = load_dataset('text', data_files={'train': "data.txt"})

# Tokenize again
def tokenize_function(examples):
    """Encode a batch of text lines as fixed-length (128-token) sequences.

    ``examples`` is the batch mapping handed over by ``dataset.map`` with
    ``batched=True``; only its 'text' entry is read. Every line is truncated
    or padded to exactly 128 tokens using the module-level ``tokenizer``.
    """
    lines = examples['text']
    encoded = tokenizer(lines, truncation=True, padding='max_length', max_length=128)
    return encoded

# Drop blank lines before tokenizing. An empty line would be padded into an
# example made only of pad tokens, and the collator below masks every pad
# position out of the loss, so such rows carry no training signal.
non_empty_dataset = dataset.filter(lambda example: bool(example['text'].strip()))

tokenized_datasets = non_empty_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# Prepare data collator (mlm=False selects causal language modeling: the
# labels are a copy of the inputs with padding positions ignored)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    push_to_hub=False
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# ---------------------------------------
# Start training
# ---------------------------------------
print("🚀 Training started...")
trainer.train()
print("✅ Training finished successfully!")


In [None]:
# ✅ STEP 8: Save & Test Your Fine-Tuned Model

from transformers import pipeline

# 1️⃣ Persist the trained weights and the tokenizer side by side
save_path = "./models/gpt2-finetuned"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print("✅ Model and tokenizer saved successfully at:", save_path)

# 2️⃣ Build a text-generation pipeline from the files just written
generator = pipeline(
    "text-generation",
    model=save_path,
    tokenizer=save_path,
    device=-1,  # device=-1 means CPU
)

# 3️⃣ Try the model on a custom prompt
prompt = "Hello, my name is Misba and I am working on"
generation_options = {"max_length": 80, "num_return_sequences": 1, "temperature": 0.7}
output = generator(prompt, **generation_options)

print("\n🧠 Generated Text:\n")
first_sequence = output[0]
print(first_sequence['generated_text'])


In [None]:
# =====================================================
# 🧾 STEP 9: SAVE AND RELOAD YOUR FINE-TUNED MODEL
# =====================================================

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# ✅ Directory that receives this copy of the model and tokenizer
save_directory = "./gpt2-finetuned-model"

# ✅ Write both artifacts to that directory
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
print("✅ Model and tokenizer saved successfully at:", save_directory)

# =====================================================
# 🔁 Round-trip check: load everything back from disk
# =====================================================
print("\n🔁 Reloading the model to confirm...")

# Fresh objects built purely from the saved files
reloaded_tokenizer = GPT2Tokenizer.from_pretrained(save_directory)
reloaded_model = GPT2LMHeadModel.from_pretrained(save_directory)

print("✅ Reloaded model and tokenizer successfully!")

# =====================================================
# 💬 Smoke test: sample a continuation from the reloaded model
# =====================================================

prompt = "Once upon a time in the world of AI,"
inputs = reloaded_tokenizer(prompt, return_tensors="pt")

# Sampling settings: up to 100 new tokens, nucleus sampling with temperature
sampling_settings = dict(
    max_new_tokens=100,
    temperature=0.8,
    do_sample=True,
    top_p=0.95,
)

print("\n🤖 Generating text...")
outputs = reloaded_model.generate(**inputs, **sampling_settings)

print("\n✨ Model Output:\n")
generated_text = reloaded_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
