In [None]:
"""
========================================================================
GPT MODELS TUTORIAL - Complete Guide with Hugging Face Transformers
========================================================================

This notebook covers:
1. Installation and Imports
2. Understanding GPT Architecture
3. Basic Model Setup
4. Text Generation Techniques
5. Creative Writing Applications
6. Advanced Use Cases
7. Best Practices and Tips

Note: fine-tuning is discussed in the best-practice notes but is not
demonstrated with code in this notebook.

Author: GPT Tutorial
Date: 2024
"""

In [None]:


# ========================================================================
# SECTION 1: INSTALLATION AND IMPORTS
# ========================================================================

"""
First, install the required packages. Run this in its own cell; `%pip`
installs into the environment of the running kernel:
%pip install transformers torch sentencepiece accelerate
"""

import torch
from transformers import (
    GPT2LMHeadModel, 
    GPT2Tokenizer,
    GPT2Config,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    Trainer,
    TrainingArguments
)
import warnings
warnings.filterwarnings('ignore')

# Tutorial banner: a 70-character rule above and below the title.
print("=" * 70, "GPT MODELS TUTORIAL", "=" * 70, sep="\n")


In [None]:


# ========================================================================
# SECTION 2: UNDERSTANDING GPT
# ========================================================================

"""
GPT (Generative Pre-trained Transformer) Overview:
--------------------------------------------------
- Architecture: Decoder-only Transformer
- Training: Autoregressive language modeling (predict next token)
- Strengths: Text generation, completion, creative writing
- Variants: GPT-2, GPT-Neo, GPT-J, etc.

Key Characteristics:
- Unidirectional (left-to-right) attention
- Generates coherent, contextually relevant text
- Can be fine-tuned for specific tasks
"""

In [None]:


# ========================================================================
# SECTION 3: BASIC MODEL SETUP
# ========================================================================

class GPTBasics:
    """Load a GPT-2 checkpoint and demonstrate basic model introspection.

    The instance owns a tokenizer/model pair (model in evaluation mode) and
    offers helpers to count parameters and to walk through how a string is
    tokenized.
    """

    def __init__(self, model_name="gpt2"):
        """
        Load the tokenizer and language-model head for ``model_name``.

        Args:
            model_name: Checkpoint name passed to ``from_pretrained``.

        Available models:
        - gpt2 (124M parameters)
        - gpt2-medium (355M parameters)
        - gpt2-large (774M parameters)
        - gpt2-xl (1.5B parameters)

        Side effects:
            Prints a loading banner followed by the vocabulary size and the
            total parameter count.
        """
        rule = "=" * 70
        print(f"\n{rule}")
        print(f"Loading {model_name} model...")
        print(rule)

        self.model_name = model_name
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)

        # Reuse the end-of-sequence token as the padding token, and record
        # the same id on the model config so both sides agree.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model.config.pad_token_id = self.tokenizer.eos_token_id

        # Inference only: switch off dropout and other training behaviour.
        self.model.eval()

        print("✓ Model loaded successfully")
        print(f"✓ Vocabulary size: {len(self.tokenizer)}")
        print(f"✓ Model parameters: {self.count_parameters():,}")

    def count_parameters(self, trainable_only=False):
        """Count the scalar parameters of the loaded model.

        Args:
            trainable_only: When True, count only parameters whose
                ``requires_grad`` flag is set. The default (False) counts
                every parameter, which is what the loading summary reports.

        Returns:
            int: Sum of ``numel()`` over the selected parameters.
        """
        return sum(
            p.numel()
            for p in self.model.parameters()
            if not trainable_only or p.requires_grad
        )

    def tokenization_demo(self, text):
        """Print how ``text`` is split into token ids and decoded back.

        Args:
            text: The string to tokenize.

        Prints the original text, the list of token ids, each id decoded on
        its own, and finally the full round-trip decode.
        """
        rule = "=" * 70
        print(f"\n{rule}")
        print("TOKENIZATION DEMONSTRATION")
        print(rule)

        print(f"\nOriginal text:\n'{text}'")

        # Encode
        tokens = self.tokenizer.encode(text)
        print(f"\nToken IDs: {tokens}")

        # Decode each id separately so the sub-word pieces are visible.
        print("\nIndividual tokens:")
        for i, token_id in enumerate(tokens):
            token_text = self.tokenizer.decode([token_id])
            print(f"  {i}: {token_id} -> '{token_text}'")

        # Decode back to text
        decoded = self.tokenizer.decode(tokens)
        print(f"\nDecoded text:\n'{decoded}'")

In [None]:


# ========================================================================
# SECTION 4: TEXT GENERATION TECHNIQUES
# ========================================================================

class TextGeneration:
    """Various text generation strategies.

    Each public method prints a banner describing the strategy, generates a
    continuation of the prompt with that strategy, and prints the result.
    All of them share one private helper so that every call passes an
    attention mask and an explicit ``pad_token_id`` to ``generate``.
    """

    def __init__(self, model_name="gpt2"):
        """Load the tokenizer and model for ``model_name`` in eval mode."""
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        # Reuse EOS as the padding token so a pad id is available.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model.eval()

    @staticmethod
    def _print_header(title, prompt, details=None):
        """Print the section banner, an optional details line, and the prompt."""
        rule = "=" * 70
        print(f"\n{rule}")
        print(title)
        print(rule)
        if details is not None:
            print(details)
        print(f"Prompt: {prompt}\n")

    def _generate_and_print(self, prompt, **generate_kwargs):
        """Encode ``prompt``, run ``generate`` with the given options, print the text."""
        # Calling the tokenizer (rather than .encode) also yields the
        # attention mask, which is forwarded to generate alongside input_ids.
        encoded = self.tokenizer(prompt, return_tensors="pt")

        outputs = self.model.generate(
            **encoded,
            # Passed explicitly so generate does not have to guess a pad id.
            pad_token_id=self.tokenizer.eos_token_id,
            **generate_kwargs,
        )

        text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Generated:\n{text}")

    def greedy_search(self, prompt, max_length=50):
        """
        Greedy Search: Always pick the most likely next token
        Pros: Fast, deterministic
        Cons: Repetitive, less creative

        Args:
            prompt: Text to continue.
            max_length: Upper bound on total length (prompt + generated tokens).
        """
        self._print_header("GREEDY SEARCH (Deterministic)", prompt)
        self._generate_and_print(
            prompt,
            max_length=max_length,
            do_sample=False,  # Greedy
        )

    def beam_search(self, prompt, max_length=50, num_beams=5):
        """
        Beam Search: Keep top K candidates at each step
        Pros: Better quality than greedy
        Cons: Still can be repetitive

        Args:
            prompt: Text to continue.
            max_length: Upper bound on total length (prompt + generated tokens).
            num_beams: Number of candidate sequences kept at each step.
        """
        self._print_header(f"BEAM SEARCH (num_beams={num_beams})", prompt)
        self._generate_and_print(
            prompt,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            no_repeat_ngram_size=2,
        )

    def sampling(self, prompt, max_length=50, temperature=1.0):
        """
        Sampling: Randomly sample from probability distribution

        Temperature controls randomness:
        - temperature < 1.0: More focused, conservative
        - temperature = 1.0: Normal sampling
        - temperature > 1.0: More random, creative

        Args:
            prompt: Text to continue.
            max_length: Upper bound on total length (prompt + generated tokens).
            temperature: Sampling temperature passed to ``generate``.
        """
        self._print_header(f"SAMPLING (temperature={temperature})", prompt)
        self._generate_and_print(
            prompt,
            max_length=max_length,
            do_sample=True,
            temperature=temperature,
        )

    def top_k_sampling(self, prompt, max_length=50, top_k=50):
        """
        Top-K Sampling: Sample from top K most likely tokens
        Reduces chance of picking unlikely words

        Args:
            prompt: Text to continue.
            max_length: Upper bound on total length (prompt + generated tokens).
            top_k: Number of highest-probability tokens kept for sampling.
        """
        self._print_header(f"TOP-K SAMPLING (k={top_k})", prompt)
        self._generate_and_print(
            prompt,
            max_length=max_length,
            do_sample=True,
            top_k=top_k,
            temperature=0.8,
        )

    def top_p_sampling(self, prompt, max_length=50, top_p=0.9):
        """
        Top-P (Nucleus) Sampling: Sample from smallest set
        whose cumulative probability exceeds P

        More dynamic than top-k, adapts to distribution shape

        Args:
            prompt: Text to continue.
            max_length: Upper bound on total length (prompt + generated tokens).
            top_p: Cumulative-probability threshold of the nucleus.
        """
        self._print_header(f"TOP-P (NUCLEUS) SAMPLING (p={top_p})", prompt)
        self._generate_and_print(
            prompt,
            max_length=max_length,
            do_sample=True,
            top_p=top_p,
            temperature=0.8,
        )

    def combined_sampling(self, prompt, max_length=100,
                         temperature=0.8, top_k=50, top_p=0.95):
        """
        Combined Strategy: Top-K + Top-P + Temperature
        Best practice for balanced generation

        Args:
            prompt: Text to continue.
            max_length: Upper bound on total length (prompt + generated tokens).
            temperature: Sampling temperature.
            top_k: Number of highest-probability tokens kept for sampling.
            top_p: Cumulative-probability threshold of the nucleus.
        """
        self._print_header(
            "COMBINED SAMPLING (Recommended)",
            prompt,
            details=f"Parameters: temp={temperature}, top_k={top_k}, top_p={top_p}",
        )
        self._generate_and_print(
            prompt,
            max_length=max_length,
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            no_repeat_ngram_size=2,  # Avoid repeating 2-grams
        )

In [None]:
# ========================================================================
# SECTION 5: CREATIVE WRITING APPLICATIONS
# ========================================================================

class CreativeWriting:
    """Use GPT for creative writing tasks.

    Wraps a Hugging Face ``text-generation`` pipeline. Every method samples
    (``do_sample=True``) so that the ``temperature`` / ``top_p`` settings
    take effect regardless of the checkpoint's default generation settings.
    """

    def __init__(self, model_name="gpt2"):
        """Create the text-generation pipeline.

        Args:
            model_name: Checkpoint to load; defaults to ``"gpt2"``.
        """
        self.generator = pipeline('text-generation', model=model_name)

    def _generate(self, prompt, **sampling_kwargs):
        """Run the pipeline once with sampling enabled and return the text."""
        result = self.generator(
            prompt,
            num_return_sequences=1,
            # Explicit, so temperature/top_p are never silently ignored.
            do_sample=True,
            # Passed explicitly so generation does not have to guess a pad id.
            pad_token_id=self.generator.tokenizer.eos_token_id,
            **sampling_kwargs,
        )
        return result[0]['generated_text']

    def story_writer(self, beginning, style="adventure"):
        """Generate a creative story continuing ``beginning``.

        Args:
            beginning: Opening text of the story.
            style: Genre label. It is shown in the banner and also placed
                in front of the prompt so that it conditions the generated
                text (the printed output therefore starts with that label).
        """
        print(f"\n{'='*70}")
        print(f"STORY GENERATION - {style.upper()} STYLE")
        print(f"{'='*70}")

        # Put the genre into the prompt itself; a banner-only label would
        # have no influence on what the model writes.
        prompt = f"{style.capitalize()} story:\n\n{beginning}"

        text = self._generate(
            prompt,
            max_length=200,
            temperature=0.9,
            top_p=0.95,
        )

        print(f"\n{text}")

    def poem_generator(self, theme):
        """Generate poetry about ``theme``.

        Args:
            theme: Subject of the poem; inserted into the prompt.
        """
        print(f"\n{'='*70}")
        print("POETRY GENERATION")
        print(f"{'='*70}")

        prompt = f"A poem about {theme}:\n\n"

        print(self._generate(prompt, max_length=100, temperature=0.9))

    def dialogue_writer(self, context):
        """Generate dialogue between characters.

        Args:
            context: Dialogue so far; the model continues from its end.
        """
        print(f"\n{'='*70}")
        print("DIALOGUE GENERATION")
        print(f"{'='*70}")

        print(self._generate(context, max_length=150, temperature=0.8))


In [None]:

# ========================================================================
# SECTION 6: ADVANCED USE CASES
# ========================================================================

class AdvancedUseCases:
    """Advanced GPT applications: prefixed prompts, batching, perplexity."""

    def __init__(self, model_name="gpt2"):
        """Load the tokenizer and model for ``model_name`` in eval mode."""
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        # Reuse EOS as the padding token so batches can be padded.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model.eval()

    def conditional_generation(self, prompt, prefix=""):
        """Generate a continuation of ``prompt``, optionally led by ``prefix``.

        Args:
            prompt: Text to continue.
            prefix: Optional text placed before the prompt, separated by a
                single space. When empty, the prompt is used unchanged.
        """
        print(f"\n{'='*70}")
        print("CONDITIONAL GENERATION")
        print(f"{'='*70}")

        full_prompt = f"{prefix} {prompt}" if prefix else prompt

        # Calling the tokenizer also returns the attention mask, which is
        # forwarded to generate together with input_ids.
        inputs = self.tokenizer(full_prompt, return_tensors="pt")

        outputs = self.model.generate(
            **inputs,
            max_length=100,
            temperature=0.8,
            top_p=0.9,
            do_sample=True,
            no_repeat_ngram_size=2,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Prompt: {full_prompt}")
        print(f"\nGenerated:\n{text}")

    def batch_generation(self, prompts):
        """Generate text for multiple prompts in one batched call.

        Args:
            prompts: List of prompt strings. They are padded to a common
                length and generated together.

        Note:
            Sets ``self.tokenizer.padding_side`` to ``"left"``.
        """
        print(f"\n{'='*70}")
        print("BATCH GENERATION")
        print(f"{'='*70}")

        # A decoder-only model continues from the last position of each row.
        # With right padding that position is a pad token for every prompt
        # shorter than the longest one, so pad on the left instead.
        self.tokenizer.padding_side = "left"

        inputs = self.tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True
        )

        outputs = self.model.generate(
            **inputs,
            max_length=80,
            temperature=0.8,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        for i, output in enumerate(outputs):
            # skip_special_tokens drops the padding added on the left.
            text = self.tokenizer.decode(output, skip_special_tokens=True)
            print(f"\nPrompt {i+1}: {prompts[i]}")
            print(f"Generated: {text}")

    def get_perplexity(self, text):
        """
        Calculate perplexity (measure of uncertainty)
        Lower perplexity = more confident/natural text

        Args:
            text: The string to score.

        Returns:
            float: ``exp`` of the model's language-modeling loss on ``text``.

        Raises:
            ValueError: If ``text`` tokenizes to fewer than two tokens; the
                loss is computed on next-token predictions, so at least one
                (context, target) pair is required.
        """
        inputs = self.tokenizer(text, return_tensors="pt")

        if inputs["input_ids"].shape[-1] < 2:
            raise ValueError(
                "get_perplexity needs text that tokenizes to at least 2 tokens"
            )

        print(f"\n{'='*70}")
        print("PERPLEXITY CALCULATION")
        print(f"{'='*70}")

        with torch.no_grad():
            # Passing the inputs as labels makes the model return its
            # language-modeling loss for this text.
            outputs = self.model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            perplexity = torch.exp(loss)

        print(f"Text: {text}")
        print(f"Perplexity: {perplexity.item():.2f}")
        print(f"(Lower is better - more natural text)")

        return perplexity.item()


In [None]:

# ========================================================================
# SECTION 7: BEST PRACTICES AND TIPS
# ========================================================================

"""
BEST PRACTICES FOR GPT TEXT GENERATION
=======================================

1. PROMPT ENGINEERING
   - Be specific and clear
   - Provide context and examples
   - Use formatting (bullets, numbers) when needed
   
2. PARAMETER TUNING
   - temperature: 0.7-0.9 for creative tasks
   - temperature: 0.3-0.5 for factual tasks
   - top_p: 0.9-0.95 is generally good
   - top_k: 40-50 works well
   
3. AVOIDING REPETITION
   - Use no_repeat_ngram_size=2 or 3
   - Use repetition_penalty slightly above 1.0 (e.g. 1.2)
   - Avoid very low temperatures; they make sampled text more repetitive
   - Use diverse beam search
   
4. CONTROLLING LENGTH
   - max_length: Set reasonable limits
   - min_length: Ensure minimum output
   - Use length_penalty in beam search
   
5. QUALITY IMPROVEMENTS
   - Use larger models for better quality
   - Fine-tune on domain-specific data
   - Post-process outputs (filtering, formatting)
   
6. PERFORMANCE OPTIMIZATION
   - Use batch processing for multiple prompts
   - Cache models in production
   - Consider quantization for inference
   - Use GPU when available
"""

In [None]:
    # Opening banner for the end-to-end demonstration: blank line, rule,
    # title, rule.
    print("", "=" * 70, "GPT TUTORIAL - COMPLETE DEMONSTRATION", "=" * 70, sep="\n")

    # 1. Basics -- load the base GPT-2 checkpoint and show how a short
    # sentence is tokenized.
    print("\n\n### PART 1: BASICS ###")
    basics = GPTBasics(model_name="gpt2")
    basics.tokenization_demo(text="Hello, how are you today?")

In [None]:
    # 2. Generation Techniques
    print("\n\n### PART 2: GENERATION TECHNIQUES ###")

    # Fix the RNG so the sampled outputs below are the same on every run
    # (the value itself is arbitrary).
    torch.manual_seed(42)

    generator = TextGeneration("gpt2")

    prompt = "The future of artificial intelligence"

    # Deterministic strategies first, then the sampling-based ones.
    generator.greedy_search(prompt, max_length=60)
    generator.beam_search(prompt, max_length=60)
    generator.sampling(prompt, max_length=60, temperature=0.7)
    generator.top_k_sampling(prompt, max_length=60)
    generator.top_p_sampling(prompt, max_length=60)
    generator.combined_sampling(prompt, max_length=80)

In [None]:
    # 3. Creative Writing
    print("\n\n### PART 3: CREATIVE WRITING ###")

    # Fix the RNG so the sampled outputs below are the same on every run
    # (the value itself is arbitrary).
    torch.manual_seed(42)

    writer = CreativeWriting()

    writer.story_writer(
        "In a world where dreams could be recorded and shared,",
        style="sci-fi"
    )

    writer.poem_generator("the ocean at night")

    # The context ends on an open speaker tag so the model writes
    # Character B's reply.
    writer.dialogue_writer(
        'Character A: "What do you think about the new technology?"\n'
        'Character B:'
    )

In [None]:
    # 4. Advanced Use Cases
    print("\n\n### PART 4: ADVANCED USE CASES ###")

    # Fix the RNG so the sampled outputs below are the same on every run
    # (the value itself is arbitrary).
    torch.manual_seed(42)

    advanced = AdvancedUseCases("gpt2")

    advanced.conditional_generation(
        "will transform society",
        prefix="Artificial intelligence"
    )

    advanced.batch_generation([
        "The best way to learn programming is",
        "Climate change affects our planet by",
        "The benefits of regular exercise include"
    ])

    # Score two sentences so their perplexities can be compared in the
    # printed output.
    advanced.get_perplexity("The quick brown fox jumps over the lazy dog.")
    advanced.get_perplexity("Colorless green ideas sleep furiously.")

    print("\n" + "="*70)
    print("DEMONSTRATION COMPLETE!")
    print("="*70)

    print("\n### KEY TAKEAWAYS ###")
    print("""
    1. GPT excels at text generation and completion
    2. Different sampling strategies produce different outputs
    3. Temperature controls creativity vs. coherence
    4. Combining top-k and top-p gives best results
    5. Proper prompting is crucial for quality output
    6. Fine-tuning can improve domain-specific performance
    """)

