# üöÄ Language Model Implementation and Analysis ‚Äî GPT-2 Fine-Tuning

**Author:** Misba Sikandar  
**Project Level:** Advanced  
**Topic:** Natural Language Processing (NLP) ‚Äî Language Model Deployment  
**Model Chosen:** GPT-2 (by OpenAI)  
**Environment:** Python, Jupyter Notebook, Transformers Library  

---

### üìò Objective
To implement and analyze a Language Model (LM) ‚Äî GPT-2 ‚Äî by fine-tuning it on a text dataset, exploring its text generation capabilities, and understanding its performance and limitations.

---


In [1]:
# ü™ú Step 1: Install all necessary libraries

# --- Install required packages (if not already installed) ---
import sys
import subprocess

def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install tqdm and ipywidgets
try:
    import tqdm
except ImportError:
    install("tqdm")

try:
    import ipywidgets
except ImportError:
    install("ipywidgets")

# --- Import required libraries ---
from tqdm.notebook import tqdm
tqdm.pandas()  # Enable progress bars for pandas

from transformers import GPT2Tokenizer, GPT2LMHeadModel

print("‚úÖ Step 1 complete: Environment ready, tqdm & ipywidgets imported, GPT-2 tokenizer and model ready to load.")


‚úÖ Step 1 complete: Environment ready, tqdm & ipywidgets imported, GPT-2 tokenizer and model ready to load.


In [2]:
# ‚öôÔ∏è Step 2: Import all necessary libraries

import os
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm.auto import tqdm
import torch

print("All libraries imported successfully ‚úÖ")


All libraries imported successfully ‚úÖ


In [3]:
# üìÅ Step 3: Verify that your dataset (data.txt) exists

data_path = "data.txt"
print(os.path.exists(data_path))


True


In [4]:
# üß† Step 4: Load and inspect your dataset

with open("data.txt", "r", encoding="utf-8") as f:
    text_data = f.read()

# Display the first few lines
print(text_data[:500])


Hello, I am working on a Language Model project.
This project fine-tunes GPT-2 using custom text data.
The model learns to generate sentences related to AI and NLP.



In [5]:
# üß© Step 5: Load GPT-2 tokenizer and model

from transformers import GPT2LMHeadModel

# Load GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Because we added a pad_token to the tokenizer, we must resize the model embeddings
# Using mean_resizing=False to disable the info message about new embeddings
model.resize_token_embeddings(len(tokenizer), mean_resizing=False)

# Ensure the model uses the correct pad token
model.config.pad_token_id = tokenizer.pad_token_id

print("Tokenizer and model loaded successfully ‚úÖ")



NameError: name 'tokenizer' is not defined

In [None]:
# üß± Step 6: Prepare dataset for training

from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, text, tokenizer, block_size=128):
        tokenized = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=block_size,
            return_tensors="pt"
        )
        self.input_ids = tokenized.input_ids
        self.attn_masks = tokenized.attention_mask

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attn_masks[idx],
            "labels": self.input_ids[idx]
        }

dataset = TextDataset(text_data, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

print("Dataset and DataLoader ready ‚úÖ")


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [None]:
# üöÄ Step 7: Fine-tune the model

# =====================================================
# STEP 7: INITIALIZE TRAINER AND START TRAINING
# =====================================================

from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# ---------------------------------------
# Reload tokenizer and model (so no errors)
# ---------------------------------------
model_path = "./models/gpt2-finetuned"  # Change if your model path differs
try:
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
except:
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# ---------------------------------------
# Data collator and training arguments
# ---------------------------------------
from datasets import load_dataset

# Load your dataset again
dataset = load_dataset('text', data_files={'train': r'C:\Users\misba\OneDrive\Desktop\LM_Project\data.txt'})

# Tokenize again
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# Prepare data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    push_to_hub=False
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# ---------------------------------------
# Start training
# ---------------------------------------
print("üöÄ Training started...")
trainer.train()
print("‚úÖ Training finished successfully!")


In [None]:
# ‚úÖ STEP 8: Save & Test Your Fine-Tuned Model

from transformers import pipeline

# 1Ô∏è‚É£ Save model and tokenizer
save_path = "./models/gpt2-finetuned"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print("‚úÖ Model and tokenizer saved successfully at:", save_path)

# 2Ô∏è‚É£ Load the fine-tuned model for text generation
generator = pipeline("text-generation", model=save_path, tokenizer=save_path, device=-1)  # device=-1 means CPU

# 3Ô∏è‚É£ Test your model with a custom prompt
prompt = "Hello, my name is Misba and I am working on"
output = generator(prompt, max_length=80, num_return_sequences=1, temperature=0.7)

print("\nüß† Generated Text:\n")
print(output[0]['generated_text'])


In [7]:
!where python


C:\Users\misba\OneDrive\Desktop\LM_Project\.venv\Scripts\python.exe
C:\Users\misba\AppData\Local\Microsoft\WindowsApps\python.exe


In [8]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model directory (the one you trained earlier)
model_path = "./models/gpt2-finetuned"  # or the folder where your fine-tuned model was saved

# Load model and tokenizer from there
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

print("‚úÖ Model and tokenizer loaded from:", model_path)


‚úÖ Model and tokenizer loaded from: ./models/gpt2-finetuned


In [9]:
# =====================================================
# üßæ STEP 9: SAVE AND RELOAD YOUR FINE-TUNED MODEL (FIXED)
# =====================================================

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# üîπ Load the fine-tuned model (if Jupyter was restarted)
model_path = "./models/gpt2-finetuned"  # change this if your fine-tuned model is elsewhere

model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
print("‚úÖ Model and tokenizer loaded from:", model_path)

# =====================================================
# üíæ Save the model and tokenizer
# =====================================================
save_directory = "./gpt2-finetuned-model"

model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
print("‚úÖ Model and tokenizer saved successfully at:", save_directory)

# =====================================================
# üîÅ Reload the saved model to confirm
# =====================================================
print("\nüîÅ Reloading the model to confirm...")

reloaded_tokenizer = GPT2Tokenizer.from_pretrained(save_directory)
reloaded_model = GPT2LMHeadModel.from_pretrained(save_directory)

print("‚úÖ Reloaded model and tokenizer successfully!")

# =====================================================
# üí¨ Generate text from reloaded model
# =====================================================
prompt = "Once upon a time in the world of AI,"
inputs = reloaded_tokenizer(prompt, return_tensors="pt")

print("\nü§ñ Generating text...")
outputs = reloaded_model.generate(
    **inputs,
    max_new_tokens=100,
    temperature=0.8,
    do_sample=True,
    top_p=0.95
)

print("\n‚ú® Model Output:\n")
print(reloaded_tokenizer.decode(outputs[0], skip_special_tokens=True))


‚úÖ Model and tokenizer loaded from: ./models/gpt2-finetuned
‚úÖ Model and tokenizer saved successfully at: ./gpt2-finetuned-model

üîÅ Reloading the model to confirm...
‚úÖ Reloaded model and tokenizer successfully!

ü§ñ Generating text...

‚ú® Model Output:

Once upon a time in the world of AI, the world of AI, the world of AI has been completely different. AI is not simply a language, it is also a computer. In the language of AI, the data that is being processed is being processed, and the data that is being processed is being stored. That is, the data can be written by any program, and its data can be read by any program.

On the one hand, this is true of a language such as Haskell. It is true of a language like


In [11]:
# =====================================================
# üí¨ STEP 10a: INTERACTIVE CHAT WITH FINE-TUNED MODEL
# =====================================================

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load your fine-tuned model and tokenizer
model_path = "./gpt2-finetuned-model"  # or wherever you saved it
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Put the model in evaluation mode
model.eval()

print("ü§ñ Fine-tuned GPT-2 model is ready for chat! Type 'quit' to stop.\n")

# Simple interactive chat loop
while True:
    user_input = input("üß† You: ")
    if user_input.lower() in ["quit", "exit", "stop"]:
        print("üëã Chat ended.")
        break

    # Encode input
    inputs = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors="pt")

    # Generate model response
    outputs = model.generate(
        inputs,
        max_new_tokens=100,
        temperature=0.8,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode and display response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response[len(user_input):].strip()  # remove repetition

    print(f"ü§ñ GPT-2: {response}\n")


ü§ñ Fine-tuned GPT-2 model is ready for chat! Type 'quit' to stop.



üß† You:  What Are Paleo?


ü§ñ GPT-2: BELLEVUE, Wash. - An online retailer has removed two women from their listings.

Bella Yantalova and Krista Yantalova are not on the list.

The women's listing on the online store went down Tuesday, and is now being reviewed by a customer service representative.

The women's listing was removed by Amazon.com after an internal review. Yantalova is listed as an author and Yantalova as a business



üß† You:  quit


üëã Chat ended.


In [12]:
# üíæ Step 10b: Save the fine-tuned model and tokenizer
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

print("‚úÖ Model and tokenizer saved successfully!")


‚úÖ Model and tokenizer saved successfully!


In [13]:
# üöÄ Step 11: Load the fine-tuned model
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_path = "./gpt2-finetuned"

tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

print("‚úÖ Model loaded successfully and ready for chat!")


‚úÖ Model loaded successfully and ready for chat!


In [14]:
pip install streamlit

Collecting streamlit
  Using cached streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting altair!=5.4.0,!=5.4.1,<6,>=4.0 (from streamlit)
  Using cached altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.5.0 (from streamlit)
  Using cached blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<7,>=4.0 (from streamlit)
  Downloading cachetools-6.2.1-py3-none-any.whl.metadata (5.5 kB)
Collecting pillow<12,>=7.1.0 (from streamlit)
  Using cached pillow-11.3.0-cp312-cp312-win_amd64.whl.metadata (9.2 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Using cached tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Using cached toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Using cached watchdog-6.0.0-py3-none-win_amd64.whl.metadata (44 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Using cached pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1

  You can safely remove it manually.


In [15]:
# =====================================================
# STEP 13: STREAMLIT CHATBOT FOR FINE-TUNED GPT-2
# =====================================================

import streamlit as st
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# -------------------------------
# Load fine-tuned model
# -------------------------------
model_path = "./gpt2-finetuned"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Make sure model runs on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# -------------------------------
# Streamlit UI
# -------------------------------
st.title("ü§ñ GPT-2 Chatbot")
st.write("Chat with your fine-tuned GPT-2 model!")

# User input
user_input = st.text_input("You:", "")

if user_input:
    # Tokenize input
    inputs = tokenizer(user_input, return_tensors="pt").to(device)
    
    # Generate response
    outputs = model.generate(
        **inputs, 
        max_new_tokens=150, 
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_p=0.9,
        temperature=0.8
    )
    
    # Decode output
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Display response
    st.text_area("GPT-2:", value=response, height=200)


2025-10-24 21:24:44.683 
  command:

    streamlit run c:\Users\misba\OneDrive\Desktop\LM_Project\.venv\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-10-24 21:24:44.689 Session state does not function when running a script without `streamlit run`
