In [1]:
#GPT 1

#Steps to build the GPT model in local

1. Create the Training Data (10 lines of text)
2. Build the cleaning, tokenization, vocabulary, training samples, unsupervised training sample
3. BUild the model (Transformer Decoder Architecture)
4. Pre-train model (Predict the next token)
5. Evaluation on Pre-train model
6. Fine Tuning model (sentiment classification)
7. Evaluation on Pre-trained Fine Tuned model
8. Generate the text

In [2]:
import re
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

In [3]:
# TextProcessor: NLP Text Preprocessing & Tokenization Engine
class TextProcessor:
    """
    TextProcessor - Advanced Text Preprocessing & Vocabulary Management System

    A comprehensive text processing class that handles:
    - Text cleaning and normalization
    - Frequency-based vocabulary building
    - Bidirectional text-to-token conversion
    - Special token management for NLP pipelines

    Ideal for preparing text data for machine learning models.
    """

    def __init__(self, vocab_size=500):
        """Initialize the TextProcessor"""
        print("Constructor Called")
        self.vocab_size = vocab_size
        self.vocab = {}
        self.reverse_vocab = {}
        self.word_counts = {}
        print("Constructor Initialized")

    def clean_text(self, text):
        """Clean and normalize text"""
        text = text.lower()
        # Keep basic punctuation
        text = re.sub(r'[^\w\s\.\!\?\,]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def build_vocab(self, texts):
        """Build vocabulary from texts"""
        print("Building vocabulary...")

        # Count all words
        for text in texts:
            clean_text = self.clean_text(text)
            words = clean_text.split()
            for word in words:
                self.word_counts[word] = self.word_counts.get(word, 0) + 1

        # Get most frequent words
        frequent_words = sorted(self.word_counts.items(), key=lambda x: x[1], reverse=True)

        # Build vocabulary
        self.vocab = {
            '<PAD>': 0,
            '<UNK>': 1,
            '<START>': 2,
            '<END>': 3
        }

        # Add frequent words
        for i, (word, _) in enumerate(frequent_words[:self.vocab_size-4]):
            self.vocab[word] = i + 4

        # Add common words if vocabulary is small
        common_words = ['the', 'and', 'is', 'a', 'to', 'for', 'of', 'with', 'in', 'this',
                       'that', 'it', 'on', 'as', 'are', 'was', 'at', 'be', 'or', 'an',
                       'very', 'good', 'great', 'bad', 'love', 'hate', 'like', 'nice']
        for word in common_words:
            if word not in self.vocab and len(self.vocab) < self.vocab_size:
                self.vocab[word] = len(self.vocab)

        self.reverse_vocab = {v: k for k, v in self.vocab.items()}

        print(f"Vocabulary size: {len(self.vocab)}")
        return self.vocab

    def encode(self, text):
        """Convert text to token ids"""
        clean_text = self.clean_text(text)
        words = clean_text.split()
        return [self.vocab.get(word, self.vocab['<UNK>']) for word in words]

    def decode(self, token_ids):
        """Convert token ids back to text"""
        words = []
        for token_id in token_ids:
            word = self.reverse_vocab.get(token_id, '<UNK>')
            if word not in ['<PAD>', '<START>', '<END>']:
                words.append(word)
        return ' '.join(words)


# ============== TESTING THE TextProcessor CLASS ==============

print("=" * 60)
print("TESTING TextProcessor CLASS")
print("=" * 60)

# 1. CREATE AN OBJECT
print("\n1. Creating TextProcessor object:")
print("-" * 40)
processor = TextProcessor(vocab_size=50)  # Using smaller vocab for demonstration
print(f"Created TextProcessor with vocab_size: {processor.vocab_size}")
print(f"Initial vocab: {processor.vocab}")
print(f"Initial word_counts: {processor.word_counts}")

# 2. TEST clean_text() METHOD
print("\n2. Testing clean_text() method:")
print("-" * 40)
sample_texts = [
    "Hello World! How are you?",
    "This is a GREAT example with @special #characters & symbols!",
    "Multiple    spaces   and\tTABS\nand newlines",
    "Punctuation: hello, world. Nice! Really?",
]

for text in sample_texts:
    cleaned = processor.clean_text(text)
    print(f"Original: '{text}'")
    print(f"Cleaned:  '{cleaned}'")
    print()

# 3. TEST build_vocab() METHOD
print("\n3. Testing build_vocab() method:")
print("-" * 40)

# Sample training texts
training_texts = [
    "I love machine learning and natural language processing",
    "This is a great example of text processing",
    "Machine learning is very interesting and useful",
    "I really love working with text data",
    "Natural language processing is a great field",
    "Text processing and machine learning go together",
    "This example shows how to process text data",
    "I love this great example of processing"
]

print("Training texts:")
for i, text in enumerate(training_texts, 1):
    print(f"{i}. {text}")

print("\nBuilding vocabulary from training texts...")
vocab = processor.build_vocab(training_texts)

print(f"\nWord counts (top 10):")
sorted_counts = sorted(processor.word_counts.items(), key=lambda x: x[1], reverse=True)
for word, count in sorted_counts[:10]:
    print(f"  '{word}': {count}")

print(f"\nBuilt vocabulary (first 20 items):")
vocab_items = list(processor.vocab.items())[:20]
for word, idx in vocab_items:
    print(f"  '{word}': {idx}")

print(f"\nSpecial tokens:")
for token in ['<PAD>', '<UNK>', '<START>', '<END>']:
    print(f"  {token}: {processor.vocab[token]}")

# 4. TEST encode() METHOD
print("\n4. Testing encode() method:")
print("-" * 40)

test_sentences = [
    "I love machine learning",
    "This is a great example",
    "Unknown words will be encoded as UNK tokens",
    "Text processing is useful"
]

for sentence in test_sentences:
    encoded = processor.encode(sentence)
    print(f"Text: '{sentence}'")
    print(f"Encoded: {encoded}")

    # Show word-to-token mapping
    words = processor.clean_text(sentence).split()
    word_token_pairs = []
    for word in words:
        token_id = processor.vocab.get(word, processor.vocab['<UNK>'])
        word_token_pairs.append(f"'{word}'→{token_id}")
    print(f"Mapping: {' | '.join(word_token_pairs)}")
    print()

# 5. TEST decode() METHOD
print("\n5. Testing decode() method:")
print("-" * 40)

# Test with some encoded sequences
test_sequences = [
    [4, 5, 6, 7],  # Should correspond to some words from vocab
    [0, 1, 2, 3],  # Special tokens
    [4, 1, 5, 1, 6],  # Mix of known and unknown tokens
]

for seq in test_sequences:
    decoded = processor.decode(seq)
    print(f"Token sequence: {seq}")
    print(f"Decoded text: '{decoded}'")

    # Show token-to-word mapping
    token_word_pairs = []
    for token_id in seq:
        word = processor.reverse_vocab.get(token_id, '<UNK>')
        token_word_pairs.append(f"{token_id}→'{word}'")
    print(f"Mapping: {' | '.join(token_word_pairs)}")
    print()

# 6. ROUND-TRIP TEST (encode then decode)
print("\n6. Round-trip test (encode → decode):")
print("-" * 40)

round_trip_texts = [
    "I love this great example",
    "Machine learning is interesting",
    "Some unknown words here: quantum blockchain"
]

for text in round_trip_texts:
    print(f"Original: '{text}'")
    encoded = processor.encode(text)
    print(f"Encoded: {encoded}")
    decoded = processor.decode(encoded)
    print(f"Decoded: '{decoded}'")
    print(f"Match: {text.lower() == decoded}")
    print()

# 7. SHOW VOCABULARY STATISTICS
print("\n7. Vocabulary Statistics:")
print("-" * 40)
print(f"Total vocabulary size: {len(processor.vocab)}")
print(f"Total unique words in training: {len(processor.word_counts)}")
print(f"Words included in vocab: {len(processor.vocab) - 4}")  # Minus special tokens
print(f"Special tokens: 4 (<PAD>, <UNK>, <START>, <END>)")

# Show most and least frequent words in vocab
print(f"\nMost frequent words in training data:")
for word, count in sorted_counts[:5]:
    if word in processor.vocab:
        print(f"  '{word}': {count} occurrences, token_id: {processor.vocab[word]}")

print("\n" + "=" * 60)
print("TESTING COMPLETE!")
print("=" * 60)

TESTING TextProcessor CLASS

1. Creating TextProcessor object:
----------------------------------------
Constructor Called
Constructor Initialized
Created TextProcessor with vocab_size: 50
Initial vocab: {}
Initial word_counts: {}

2. Testing clean_text() method:
----------------------------------------
Original: 'Hello World! How are you?'
Cleaned:  'hello world! how are you?'

Original: 'This is a GREAT example with @special #characters & symbols!'
Cleaned:  'this is a great example with special characters symbols!'

Original: 'Multiple    spaces   and	TABS
and newlines'
Cleaned:  'multiple spaces and tabs and newlines'

Original: 'Punctuation: hello, world. Nice! Really?'
Cleaned:  'punctuation hello, world. nice! really?'


3. Testing build_vocab() method:
----------------------------------------
Training texts:
1. I love machine learning and natural language processing
2. This is a great example of text processing
3. Machine learning is very interesting and useful
4. I really love

In [4]:

# SimpleGPT: Educational Transformer Language Model

class SimpleGPT(keras.Model):
    """
    SimpleGPT - Lightweight GPT Implementation for Learning & Experimentation

    A simplified but fully functional GPT (Generative Pre-trained Transformer) model
    designed for educational purposes. Features:
    - Causal self-attention with masking
    - Positional embeddings and multi-head attention
    - Autoregressive text generation capabilities
    - Clean, understandable transformer architecture

    Perfect for understanding how modern language models work.
    """

    def __init__(self, vocab_size, d_model=64, num_heads=4, num_layers=2, dff=128, max_length=16):
        super(SimpleGPT, self).__init__()

        self.d_model = d_model
        self.max_length = max_length
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.dff = dff

        # Embedding layers
        self.token_embedding = layers.Embedding(vocab_size, d_model)
        self.position_embedding = layers.Embedding(max_length, d_model)

        # Transformer blocks
        self.transformer_blocks = []
        for _ in range(num_layers):
            self.transformer_blocks.append([
                layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model//num_heads),
                layers.LayerNormalization(),
                layers.Dense(dff, activation='relu'),
                layers.Dense(d_model),
                layers.LayerNormalization(),
                layers.Dropout(0.1)
            ])

        # Output layer
        self.output_layer = layers.Dense(vocab_size)
        self.dropout = layers.Dropout(0.1)

    def call(self, inputs, training=None):
        seq_len = tf.shape(inputs)[1]

        # Create position indices
        positions = tf.range(start=0, limit=seq_len, delta=1)

        # Embeddings
        token_emb = self.token_embedding(inputs)
        pos_emb = self.position_embedding(positions)
        x = token_emb + pos_emb
        x = self.dropout(x, training=training)

        # Create causal mask
        mask = self.create_causal_mask(seq_len)

        # Transformer blocks
        for attention, norm1, ffn1, ffn2, norm2, dropout in self.transformer_blocks:
            # Self-attention with residual connection
            attn_output = attention(x, x, attention_mask=mask, training=training)
            x = norm1(x + attn_output)

            # Feed-forward with residual connection
            ffn_output = ffn2(ffn1(x))
            ffn_output = dropout(ffn_output, training=training)
            x = norm2(x + ffn_output)

        # Output projection
        return self.output_layer(x)

    def create_causal_mask(self, seq_len):
        """Create causal mask for self-attention"""
        mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return mask[tf.newaxis, tf.newaxis, :, :]

# ============== TESTING THE SimpleGPT CLASS ==============

print("=" * 70)
print("TESTING SimpleGPT MODEL")
print("=" * 70)

# 1. CREATE MODEL OBJECT
print("\n1. Creating SimpleGPT Model Object:")
print("-" * 50)

# Model parameters
vocab_size = 100
d_model = 64
num_heads = 4
num_layers = 2
dff = 128
max_length = 16
batch_size = 2

print(f"Model Configuration:")
print(f"  Vocabulary Size: {vocab_size}")
print(f"  Model Dimension (d_model): {d_model}")
print(f"  Number of Attention Heads: {num_heads}")
print(f"  Number of Layers: {num_layers}")
print(f"  Feed-Forward Dimension: {dff}")
print(f"  Maximum Sequence Length: {max_length}")

# Create the model
model = SimpleGPT(
    vocab_size=vocab_size,
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    dff=dff,
    max_length=max_length
)

print(f"\nModel created successfully!")
print(f"Model type: {type(model)}")

# 2. EXAMINE MODEL STRUCTURE
print("\n2. Examining Model Structure:")
print("-" * 50)

print(f"Model attributes:")
print(f"  d_model: {model.d_model}")
print(f"  max_length: {model.max_length}")
print(f"  vocab_size: {model.vocab_size}")
print(f"  num_heads: {model.num_heads}")
print(f"  num_layers: {model.num_layers}")

print(f"\nEmbedding layers:")
print(f"  Token embedding: {model.token_embedding}")
print(f"  Position embedding: {model.position_embedding}")

print(f"\nTransformer blocks: {len(model.transformer_blocks)} layers")
for i, block in enumerate(model.transformer_blocks):
    print(f"  Layer {i+1}: {len(block)} components")
    print(f"    - MultiHeadAttention: {block[0]}")
    print(f"    - LayerNormalization: {block[1]}")
    print(f"    - Dense (FFN1): {block[2]}")
    print(f"    - Dense (FFN2): {block[3]}")
    print(f"    - LayerNormalization: {block[4]}")
    print(f"    - Dropout: {block[5]}")

print(f"\nOutput layer: {model.output_layer}")

# 3. TEST create_causal_mask METHOD
print("\n3. Testing create_causal_mask() Method:")
print("-" * 50)

# Test with different sequence lengths
test_lengths = [3, 5, 8]

for seq_len in test_lengths:
    print(f"\nTesting causal mask for sequence length {seq_len}:")
    mask = model.create_causal_mask(seq_len)
    print(f"Mask shape: {mask.shape}")
    print(f"Mask (first batch, first head):")

    # Convert to numpy for easier viewing
    mask_np = mask.numpy()[0, 0]  # Remove batch and head dimensions

    # Print the mask in a readable format
    for i in range(seq_len):
        row_str = "  "
        for j in range(seq_len):
            row_str += f"{int(mask_np[i, j])} "
        print(row_str)

    print(f"Purpose: This mask ensures each position can only attend to previous positions")
    print(f"1 = can attend, 0 = cannot attend (future positions)")

# 4. CREATE SAMPLE INPUT DATA
print("\n4. Creating Sample Input Data:")
print("-" * 50)

# Create sample input sequences (token IDs)
sequence_length = 8
sample_input = tf.random.uniform(
    shape=(batch_size, sequence_length),
    maxval=vocab_size,
    dtype=tf.int32
)

print(f"Sample input shape: {sample_input.shape}")
print(f"Sample input data (token IDs):")
for i in range(batch_size):
    print(f"  Batch {i}: {sample_input[i].numpy()}")

print(f"\nInput explanation:")
print(f"  - Shape: (batch_size={batch_size}, sequence_length={sequence_length})")
print(f"  - Values: Random token IDs from 0 to {vocab_size-1}")
print(f"  - This represents {batch_size} sequences of {sequence_length} tokens each")

# 5. TEST MODEL FORWARD PASS (call method)
print("\n5. Testing Model Forward Pass (call method):")
print("-" * 50)

print("Running forward pass...")
try:
    # Forward pass
    output = model(sample_input, training=False)

    print(f"✓ Forward pass successful!")
    print(f"Output shape: {output.shape}")
    print(f"Expected shape: (batch_size={batch_size}, seq_len={sequence_length}, vocab_size={vocab_size})")

    print(f"\nOutput statistics:")
    print(f"  Min value: {tf.reduce_min(output):.4f}")
    print(f"  Max value: {tf.reduce_max(output):.4f}")
    print(f"  Mean value: {tf.reduce_mean(output):.4f}")

    # Show sample predictions for first sequence, first few positions
    print(f"\nSample output (logits) for first sequence, first 3 positions:")
    for pos in range(min(3, sequence_length)):
        top_3_indices = tf.nn.top_k(output[0, pos], k=3).indices
        top_3_values = tf.nn.top_k(output[0, pos], k=3).values
        print(f"  Position {pos}: Top 3 tokens = {top_3_indices.numpy()}, scores = {top_3_values.numpy()}")

except Exception as e:
    print(f"✗ Forward pass failed: {e}")

# 6. TEST WITH TRAINING MODE
print("\n6. Testing with Training Mode:")
print("-" * 50)

try:
    # Forward pass in training mode
    output_train = model(sample_input, training=True)
    print(f"✓ Training mode forward pass successful!")
    print(f"Training output shape: {output_train.shape}")

    # Compare with inference mode
    output_inference = model(sample_input, training=False)

    # Check if outputs are different (due to dropout)
    difference = tf.reduce_mean(tf.abs(output_train - output_inference))
    print(f"Difference between training and inference: {difference:.6f}")
    print(f"(Difference > 0 indicates dropout is working)")

except Exception as e:
    print(f"✗ Training mode failed: {e}")

# 7. MODEL SUMMARY
print("\n7. Model Summary:")
print("-" * 50)

try:
    # Build the model first by calling it
    _ = model(sample_input)

    print("Model Summary:")
    model.summary()

    # Count parameters
    total_params = model.count_params()
    print(f"\nTotal trainable parameters: {total_params:,}")

except Exception as e:
    print(f"Could not generate summary: {e}")

# 8. UNDERSTANDING THE ARCHITECTURE
print("\n8. Understanding the GPT Architecture:")
print("-" * 50)

print("🧠 SIMPLEGPT ARCHITECTURE BREAKDOWN:")
print()
print("INPUT PROCESSING:")
print("  1. Token Embedding: Converts token IDs to dense vectors")
print("  2. Position Embedding: Adds positional information to tokens")
print("  3. Dropout: Regularization during training")
print()
print("TRANSFORMER LAYERS (x2):")
print("  1. Multi-Head Self-Attention:")
print("     - Allows each token to attend to previous tokens")
print("     - Uses causal mask to prevent looking at future tokens")
print("  2. Layer Normalization + Residual Connection")
print("  3. Feed-Forward Network:")
print("     - Two dense layers with ReLU activation")
print("     - Processes attended information")
print("  4. Layer Normalization + Residual Connection")
print("  5. Dropout for regularization")
print()
print("OUTPUT:")
print("  - Dense layer projects to vocabulary size")
print("  - Each position outputs probability distribution over vocabulary")
print("  - Can be used for next token prediction")

print("\n" + "=" * 70)
print("TESTING COMPLETE!")
print("=" * 70)

# 9. PRACTICAL USAGE EXAMPLE
print("\n9. Practical Usage Example:")
print("-" * 50)

print("Here's how you would typically use this model:")
print()
print("# For training:")
print("model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')")
print("model.fit(train_data, epochs=10)")
print()
print("# For text generation:")
print("def generate_text(model, start_tokens, max_length=20):")
print("    for _ in range(max_length):")
print("        predictions = model(start_tokens)")
print("        next_token = tf.argmax(predictions[:, -1, :], axis=-1)")
print("        start_tokens = tf.concat([start_tokens, next_token[:, None]], axis=1)")
print("    return start_tokens")

TESTING SimpleGPT MODEL

1. Creating SimpleGPT Model Object:
--------------------------------------------------
Model Configuration:
  Vocabulary Size: 100
  Model Dimension (d_model): 64
  Number of Attention Heads: 4
  Number of Layers: 2
  Feed-Forward Dimension: 128
  Maximum Sequence Length: 16

Model created successfully!
Model type: <class '__main__.SimpleGPT'>

2. Examining Model Structure:
--------------------------------------------------
Model attributes:
  d_model: 64
  max_length: 16
  vocab_size: 100
  num_heads: 4
  num_layers: 2

Embedding layers:
  Token embedding: <Embedding name=embedding, built=False>
  Position embedding: <Embedding name=embedding_1, built=False>

Transformer blocks: 2 layers
  Layer 1: 6 components
    - MultiHeadAttention: <MultiHeadAttention name=multi_head_attention, built=False>
    - LayerNormalization: <LayerNormalization name=layer_normalization, built=False>
    - Dense (FFN1): <Dense name=dense, built=False>
    - Dense (FFN2): <Dense nam


Total trainable parameters: 80,868

8. Understanding the GPT Architecture:
--------------------------------------------------
🧠 SIMPLEGPT ARCHITECTURE BREAKDOWN:

INPUT PROCESSING:
  1. Token Embedding: Converts token IDs to dense vectors
  2. Position Embedding: Adds positional information to tokens
  3. Dropout: Regularization during training

TRANSFORMER LAYERS (x2):
  1. Multi-Head Self-Attention:
     - Allows each token to attend to previous tokens
     - Uses causal mask to prevent looking at future tokens
  2. Layer Normalization + Residual Connection
  3. Feed-Forward Network:
     - Two dense layers with ReLU activation
     - Processes attended information
  4. Layer Normalization + Residual Connection
  5. Dropout for regularization

OUTPUT:
  - Dense layer projects to vocabulary size
  - Each position outputs probability distribution over vocabulary
  - Can be used for next token prediction

TESTING COMPLETE!

9. Practical Usage Example:
----------------------------------

# Model Summary:
# Model: "simple_gpt"
# ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
# ┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
# ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
# │ embedding (Embedding)           │ (2, 8, 64)             │         6,400 │
# ├─────────────────────────────────┼────────────────────────┼───────────────┤
# │ embedding_1 (Embedding)         │ (8, 64)                │         1,024 │
# ├─────────────────────────────────┼────────────────────────┼───────────────┤
# │ multi_head_attention            │ (2, 8, 64)             │        16,640 │
# │ (MultiHeadAttention)            │                        │               │
# ├─────────────────────────────────┼────────────────────────┼───────────────┤
# │ layer_normalization             │ (2, 8, 64)             │           128 │
# │ (LayerNormalization)            │                        │               │
# ├─────────────────────────────────┼────────────────────────┼───────────────┤
# │ dense (Dense)                   │ (2, 8, 128)            │         8,320 │
# ├─────────────────────────────────┼────────────────────────┼───────────────┤
# │ dense_1 (Dense)                 │ (2, 8, 64)             │         8,256 │
# ├─────────────────────────────────┼────────────────────────┼───────────────┤
# │ layer_normalization_1           │ (2, 8, 64)             │           128 │
# │ (LayerNormalization)            │                        │               │
# ├─────────────────────────────────┼────────────────────────┼───────────────┤
# │ dropout (Dropout)               │ ?                      │             0 │
# ├─────────────────────────────────┼────────────────────────┼───────────────┤
# │ multi_head_attention_1          │ (2, 8, 64)             │        16,640 │
# │ (MultiHeadAttention)            │                        │               │
# ├─────────────────────────────────┼────────────────────────┼───────────────┤
# │ layer_normalization_2           │ (2, 8, 64)             │           128 │
# │ (LayerNormalization)            │                        │               │
# ├─────────────────────────────────┼────────────────────────┼───────────────┤
# │ dense_2 (Dense)                 │ (2, 8, 128)            │         8,320 │
# ├─────────────────────────────────┼────────────────────────┼───────────────┤
# │ dense_3 (Dense)                 │ (2, 8, 64)             │         8,256 │
# ├─────────────────────────────────┼────────────────────────┼───────────────┤
# │ layer_normalization_3           │ (2, 8, 64)             │           128 │
# │ (LayerNormalization)            │                        │               │
# ├─────────────────────────────────┼────────────────────────┼───────────────┤
# │ dropout_1 (Dropout)             │ ?                      │             0 │
# ├─────────────────────────────────┼────────────────────────┼───────────────┤
# │ dense_4 (Dense)                 │ (2, 8, 100)            │         6,500 │
# ├─────────────────────────────────┼────────────────────────┼───────────────┤
# │ dropout_2 (Dropout)             │ ?                      │             0 │
# └─────────────────────────────────┴────────────────────────┴───────────────┘
#  Total params: 80,868 (315.89 KB)
#  Trainable params: 80,868 (315.89 KB)
#  Non-trainable params: 0 (0.00 B)

# Total trainable parameters: 80,868


#Explain this numbers with justification?

# 🎯 Why Token Embedding Has 6,400 Parameters

## The Math
```
vocab_size × d_model = parameters
100 × 64 = 6,400 parameters
```

## 📊 What This Looks Like

The token embedding is essentially a **lookup table** (matrix) where:
- **Rows**: Each of the 100 possible tokens in vocabulary  
- **Columns**: Each of the 64 dimensions in the embedding vector

```
        Dimension →
Token ↓  1    2    3    4   ...  64
──────────────────────────────────────
  0   │ 0.12 -0.45 0.78 -0.23 ... 0.56 │
  1   │ -0.67 0.34 -0.12 0.89 ... -0.45│  
  2   │ 0.23 0.91 -0.56 0.12 ... 0.78 │
  3   │ -0.34 -0.12 0.67 -0.89 ... 0.23│
 ... │  ...   ...   ...   ...  ...  ...│
 99   │ 0.45 -0.78 0.23 0.56 ... -0.12│
──────────────────────────────────────
     100 rows × 64 columns = 6,400 numbers
```

## 🔍 How It Works

### Input Process:
1. **Token ID comes in**: `[5, 23, 7, 12]` (sequence of token IDs)
2. **Lookup happens**: Each ID gets its corresponding row from the matrix
3. **Vectors retrieved**:
   - Token 5 → `[0.12, -0.45, 0.78, ..., 0.56]` (64 numbers)
   - Token 23 → `[-0.67, 0.34, -0.12, ..., -0.45]` (64 numbers)
   - Token 7 → `[0.23, 0.91, -0.56, ..., 0.78]` (64 numbers)
   - Token 12 → `[-0.34, -0.12, 0.67, ..., 0.23]` (64 numbers)

### Why 64 Dimensions?
- **Rich Representation**: 64 numbers can capture many aspects of a word's meaning
- **Computational Efficiency**: Not too large (like 1024) but not too small (like 8)
- **Learning Capacity**: Enough dimensions to distinguish between different token meanings

## 🧠 What the Model Learns

During training, these 6,400 parameters adjust so that:

### Similar Tokens Get Similar Vectors:
```
"good"     → [0.2, 0.8, -0.1, 0.5, ...]
"great"    → [0.3, 0.7, -0.2, 0.4, ...]  # Similar values!
"excellent"→ [0.1, 0.9, -0.1, 0.6, ...]
```

### Different Tokens Get Different Vectors:
```
"good"  → [0.2, 0.8, -0.1, 0.5, ...]
"cat"   → [-0.4, 0.1, 0.9, -0.2, ...]  # Very different!
"run"   → [0.6, -0.3, 0.2, 0.8, ...]
```

## 📈 Parameter Breakdown Across Model

| Layer Type | Parameters | Calculation |
|------------|------------|-------------|
| **Token Embedding** | **6,400** | **100 × 64** |
| Position Embedding | 1,024 | 16 × 64 |
| Multi-Head Attention | 16,640 | Complex matrix ops |
| Feed-Forward | 16,576 | Dense layer weights |
| Layer Norm | 128 | Scaling parameters |
| Output Layer | 6,500 | 64 × 100 + 100 |
| **Total** | **80,868** | |

## 💡 Key Insights

1. **Embedding = Learned Dictionary**: Each token learns what it "means" as a 64-dimensional vector
2. **No Fixed Rules**: The model discovers these representations during training
3. **Context Aware**: Similar contexts push similar tokens to have similar embeddings
4. **Foundation Layer**: All other processing builds on these learned token representations

## 🎯 Real Example

If your vocabulary included:
```
Token 0: <PAD>     → [0.0, 0.0, 0.0, ...]     # Padding
Token 1: <UNK>     → [0.1, 0.1, 0.1, ...]     # Unknown
Token 4: "love"    → [0.8, 0.9, 0.2, ...]     # Positive emotion
Token 15: "hate"   → [-0.8, -0.9, 0.1, ...]   # Opposite emotion
Token 23: "book"   → [0.3, 0.1, 0.8, ...]     # Object
```

The model learns these patterns automatically by seeing millions of examples during training!


In [5]:
# Now the main GPTTrainer class
class GPTTrainer:
    """Main trainer class for educational GPT demonstration"""

    def __init__(self, vocab_size=400, d_model=64, num_heads=4, num_layers=2, dff=128, max_length=12):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.dff = dff
        self.max_length = max_length

        self.processor = TextProcessor(vocab_size)
        self.model = None
        self.classification_model = None
        self.history = {'pretrain': None, 'finetune': None}

    def create_training_data(self):
        """Create comprehensive training data"""

        pretrain_texts = [
            "I love reading books about science and technology.",
            "The weather today is sunny and beautiful.",
            "Learning new things is always exciting and fun.",
            "Movies and music make me happy and relaxed.",
            "Good food brings people together for dinner.",
            "Exercise is important for health and wellness.",
            "Travel helps us see new places and cultures.",
            "Education opens doors to many opportunities.",
            "Art and creativity inspire people every day.",
            "Friends and family are very important to me.",
            "Technology changes how we work and live.",
            "Nature and animals are beautiful and amazing.",
            "Sports and games bring joy to many people.",
            "Reading helps improve vocabulary and knowledge.",
            "Music has the power to change our mood.",
            "Cooking is both art and science combined.",
            "Science helps us understand the world better.",
            "History teaches us about past events.",
            "Mathematics is useful in everyday life.",
            "Languages connect people from different countries.",
            "Gardens and flowers make spaces more beautiful.",
            "Photography captures important memories forever.",
            "Writing helps express thoughts and ideas clearly.",
            "Dancing is a wonderful form of artistic expression.",
            "Architecture combines beauty with practical function."
        ]

        positive_texts = [
            "I absolutely love this amazing product!",
            "This is fantastic and wonderful experience.",
            "Great job and excellent work done here.",
            "Beautiful day with perfect weather today.",
            "Amazing food and outstanding service provided.",
            "Wonderful time with family and friends.",
            "Excellent quality and great value for money.",
            "Perfect solution to my problem solved.",
            "Outstanding performance and great results achieved.",
            "Incredible experience that exceeded all expectations.",
            "Fantastic movie with great acting throughout.",
            "Delicious meal at this wonderful restaurant.",
            "Amazing vacation with beautiful scenery everywhere.",
            "Great book that I really enjoyed reading.",
            "Wonderful music that makes me feel happy."
        ]

        negative_texts = [
            "This is terrible and completely disappointing.",
            "Awful experience and poor quality service.",
            "Horrible weather ruined my entire day.",
            "Bad food and slow service at restaurant.",
            "Disappointing movie with boring plot line.",
            "Poor quality product that broke immediately.",
            "Terrible customer service and rude staff.",
            "Awful traffic and long delays everywhere.",
            "Bad news and disappointing results received.",
            "Horrible experience that wasted my time.",
            "Terrible book with confusing story line.",
            "Poor performance and disappointing outcome achieved.",
            "Bad weather cancelled all outdoor plans.",
            "Awful mistake that caused many problems.",
            "Disappointing vacation with many problems encountered."
        ]

        finetune_texts = positive_texts + negative_texts
        finetune_labels = [1] * len(positive_texts) + [0] * len(negative_texts)

        return pretrain_texts, finetune_texts, finetune_labels

    def prepare_sequences(self, texts, seq_length=8):
        """Create training sequences"""
        sequences = []

        for text in texts:
            tokens = self.processor.encode(text)

            for i in range(len(tokens) - seq_length):
                if i + seq_length + 1 <= len(tokens):
                    seq = tokens[i:i + seq_length + 1]
                    sequences.append(seq)

            if len(tokens) >= 4 and len(tokens) <= seq_length:
                padded = tokens + [0] * (seq_length + 1 - len(tokens))
                sequences.append(padded[:seq_length + 1])

        inputs = [seq[:-1] for seq in sequences]
        targets = [seq[1:] for seq in sequences]

        return np.array(inputs), np.array(targets)

    def build_model(self):
        """Build the GPT model"""
        print("Building GPT model...")
        self.model = SimpleGPT(
            vocab_size=self.vocab_size,
            d_model=self.d_model,
            num_heads=self.num_heads,
            num_layers=self.num_layers,
            dff=self.dff,
            max_length=self.max_length
        )

        dummy_input = tf.constant([[1, 2, 3, 4, 5]])
        self.model(dummy_input)

        print(f"Model built with {self.model.count_params()} parameters")

    def pretrain(self, texts, epochs=10, batch_size=8, seq_length=8):
        """Pre-training phase"""
        print("\n" + "="*50)
        print("PHASE 1: UNSUPERVISED PRE-TRAINING")
        print("="*50)

        self.processor.build_vocab(texts)
        X, y = self.prepare_sequences(texts, seq_length)

        print(f"Created {len(X)} training sequences")

        if len(X) == 0:
            raise ValueError("No training sequences created!")

        if self.model is None:
            self.build_model()

        self.model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=0.001),
            loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=['accuracy']
        )

        print("Starting pre-training...")
        history = self.model.fit(
            X, y,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.1,
            verbose=1
        )

        self.history['pretrain'] = history.history
        print("Pre-training completed!")

        print("\n📝 Testing text generation:")
        test_prompts = ["i love", "this is", "the weather"]
        for prompt in test_prompts:
            try:
                generated = self.generate_text(prompt, max_length=6)
                print(f"'{prompt}' → '{generated}'")
            except Exception as e:
                print(f"'{prompt}' → Generation failed: {e}")

        return history.history

    def finetune_classification(self, texts, labels, epochs=8):
        """Fine-tuning for classification"""
        print("\n" + "="*50)
        print("PHASE 2: SUPERVISED FINE-TUNING")
        print("="*50)

        if self.model is None:
            raise ValueError("Model must be pre-trained first!")

        sequences = []
        for text in texts:
            tokens = self.processor.encode(text)
            if len(tokens) > self.max_length - 1:
                tokens = tokens[:self.max_length - 1]
            else:
                tokens = tokens + [0] * (self.max_length - 1 - len(tokens))
            sequences.append(tokens)

        X = np.array(sequences)
        y = np.array(labels)

        print(f"Fine-tuning on {len(X)} examples")

        inputs = keras.Input(shape=(self.max_length - 1,))

        for layer in self.model.layers:
            layer.trainable = False

        gpt_features = self.model(inputs)

        pooled = layers.GlobalAveragePooling1D()(gpt_features)
        dense1 = layers.Dense(32, activation='relu')(pooled)
        dropout1 = layers.Dropout(0.5)(dense1)
        outputs = layers.Dense(2, activation='softmax')(dropout1)

        self.classification_model = keras.Model(inputs, outputs)

        self.classification_model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=0.001),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        print("Training classification head...")
        history = self.classification_model.fit(
            X, y,
            epochs=epochs,
            batch_size=4,
            validation_split=0.2,
            verbose=1
        )

        self.history['finetune'] = history.history
        print("Fine-tuning completed!")

        return history.history

    def generate_text(self, prompt, max_length=8, temperature=0.8):
        """Generate text safely"""
        if self.model is None:
            raise ValueError("Model must be trained first!")

        tokens = self.processor.encode(prompt)
        if len(tokens) == 0:
            tokens = [1]

        for _ in range(max_length):
            current_tokens = tokens[-(self.max_length-1):]

            input_tokens = current_tokens + [0] * (self.max_length - 1 - len(current_tokens))
            input_tokens = input_tokens[:self.max_length - 1]

            input_tensor = tf.constant([input_tokens])
            predictions = self.model(input_tensor, training=False)

            last_pos = min(len(current_tokens) - 1, self.max_length - 2)
            if last_pos >= 0:
                logits = predictions[0, last_pos, :]

                logits = logits / temperature
                probabilities = tf.nn.softmax(logits)
                next_token = tf.random.categorical([tf.math.log(probabilities)], 1)[0, 0]
                next_token = int(next_token)

                if next_token == 0 or next_token >= len(self.processor.vocab):
                    break

                tokens.append(next_token)

        return self.processor.decode(tokens)

    def predict_sentiment(self, text):
        """Predict sentiment safely"""
        if self.classification_model is None:
            raise ValueError("Classification model not trained!")

        tokens = self.processor.encode(text)
        if len(tokens) > self.max_length - 1:
            tokens = tokens[:self.max_length - 1]
        else:
            tokens = tokens + [0] * (self.max_length - 1 - len(tokens))

        prediction = self.classification_model(tf.constant([tokens]))
        return prediction.numpy()[0]

    def evaluate_model(self, test_texts, test_labels):
        """Evaluate the classification model"""
        print("\n🔍 Model Evaluation:")
        print("-" * 40)

        correct = 0
        total = len(test_texts)

        for text, true_label in zip(test_texts, test_labels):
            pred_probs = self.predict_sentiment(text)
            pred_label = 1 if pred_probs[1] > pred_probs[0] else 0

            if pred_label == true_label:
                correct += 1

            sentiment = "Positive" if pred_label == 1 else "Negative"
            confidence = max(pred_probs) * 100
            mark = "✓" if pred_label == true_label else "✗"

            print(f"{mark} '{text[:40]}{'...' if len(text) > 40 else ''}'")
            print(f"   → {sentiment} ({confidence:.1f}%)")

        accuracy = correct / total * 100
        print(f"\n📊 Final Accuracy: {accuracy:.1f}% ({correct}/{total})")
        return accuracy

# ============== TESTING THE GPTTrainer CLASS ==============

print("=" * 80)
print("COMPREHENSIVE GPTTrainer TESTING")
print("=" * 80)

# 1. CREATE GPTTrainer OBJECT
print("\n1. Creating GPTTrainer Object:")
print("-" * 60)

trainer = GPTTrainer(
    vocab_size=200,    # Smaller for demo
    d_model=32,        # Smaller for faster training
    num_heads=2,       # Fewer heads
    num_layers=1,      # Single layer
    dff=64,           # Smaller feed-forward
    max_length=10     # Shorter sequences
)

print(f"✓ GPTTrainer created with configuration:")
print(f"  Vocabulary Size: {trainer.vocab_size}")
print(f"  Model Dimension: {trainer.d_model}")
print(f"  Attention Heads: {trainer.num_heads}")
print(f"  Transformer Layers: {trainer.num_layers}")
print(f"  Feed-Forward Size: {trainer.dff}")
print(f"  Max Sequence Length: {trainer.max_length}")

print(f"\nInitial state:")
print(f"  Text Processor: {type(trainer.processor)}")
print(f"  GPT Model: {trainer.model}")
print(f"  Classification Model: {trainer.classification_model}")
print(f"  Training History: {trainer.history}")

# 2. TEST create_training_data METHOD
print("\n2. Testing create_training_data() Method:")
print("-" * 60)

pretrain_texts, finetune_texts, finetune_labels = trainer.create_training_data()

print(f"✓ Training data created:")
print(f"  Pre-training texts: {len(pretrain_texts)} samples")
print(f"  Fine-tuning texts: {len(finetune_texts)} samples")
print(f"  Fine-tuning labels: {len(finetune_labels)} labels")

print(f"\nSample pre-training texts (first 3):")
for i, text in enumerate(pretrain_texts[:3]):
    print(f"  {i+1}. {text}")

print(f"\nSample fine-tuning data:")
print(f"  Positive examples: {sum(finetune_labels)} samples")
print(f"  Negative examples: {len(finetune_labels) - sum(finetune_labels)} samples")

for i in range(2):
    label = "Positive" if finetune_labels[i] == 1 else "Negative"
    print(f"  {label}: {finetune_texts[i]}")

# 3. TEST prepare_sequences METHOD
print("\n3. Testing prepare_sequences() Method:")
print("-" * 60)

# Build vocabulary first (needed for sequence preparation)
trainer.processor.build_vocab(pretrain_texts[:5])  # Use subset for demo

# Test with a few texts
test_texts = pretrain_texts[:3]
X, y = trainer.prepare_sequences(test_texts, seq_length=6)

print(f"✓ Sequences prepared from {len(test_texts)} texts:")
print(f"  Input sequences (X): {X.shape}")
print(f"  Target sequences (y): {y.shape}")

print(f"\nExample sequences:")
for i in range(min(3, len(X))):
    print(f"  Sequence {i+1}:")
    print(f"    Input:  {X[i]} → '{trainer.processor.decode(X[i])}'")
    print(f"    Target: {y[i]} → '{trainer.processor.decode(y[i])}'")

# 4. TEST build_model METHOD
print("\n4. Testing build_model() Method:")
print("-" * 60)

trainer.build_model()

print(f"✓ Model built successfully:")
print(f"  Model type: {type(trainer.model)}")
print(f"  Model parameters: {trainer.model.count_params():,}")

# Test model with dummy input
dummy_input = tf.constant([[1, 2, 3, 4, 5]])
dummy_output = trainer.model(dummy_input)
print(f"  Test output shape: {dummy_output.shape}")
print(f"  Expected shape: [1, 5, {trainer.vocab_size}]")

# 5. DEMO PRE-TRAINING (shortened for demo)
print("\n5. Testing pretrain() Method:")
print("-" * 60)

print("Starting mini pre-training session...")
try:
    # Use subset for quick demo
    demo_texts = pretrain_texts[:10]
    history = trainer.pretrain(demo_texts, epochs=3, batch_size=4, seq_length=6)

    print(f"✓ Pre-training completed!")
    print(f"  Final loss: {history['loss'][-1]:.4f}")
    print(f"  Final accuracy: {history['accuracy'][-1]:.4f}")

except Exception as e:
    print(f"✗ Pre-training failed: {e}")

# 6. TEST generate_text METHOD
print("\n6. Testing generate_text() Method:")
print("-" * 60)

if trainer.model is not None:
    test_prompts = ["i love", "the weather", "this is"]

    print("Testing text generation:")
    for prompt in test_prompts:
        try:
            generated = trainer.generate_text(prompt, max_length=5, temperature=0.7)
            print(f"  '{prompt}' → '{generated}'")
        except Exception as e:
            print(f"  '{prompt}' → Failed: {e}")
else:
    print("Model not available for text generation")

# 7. DEMO FINE-TUNING (shortened for demo)
print("\n7. Testing finetune_classification() Method:")
print("-" * 60)

if trainer.model is not None:
    try:
        # Use subset for quick demo
        demo_finetune_texts = finetune_texts[:20]
        demo_finetune_labels = finetune_labels[:20]

        print("Starting mini fine-tuning session...")
        ft_history = trainer.finetune_classification(
            demo_finetune_texts,
            demo_finetune_labels,
            epochs=3
        )

        print(f"✓ Fine-tuning completed!")
        print(f"  Final loss: {ft_history['loss'][-1]:.4f}")
        print(f"  Final accuracy: {ft_history['val_accuracy'][-1]:.4f}")

    except Exception as e:
        print(f"✗ Fine-tuning failed: {e}")
else:
    print("Model not available for fine-tuning")

# 8. TEST predict_sentiment METHOD
print("\n8. Testing predict_sentiment() Method:")
print("-" * 60)

if trainer.classification_model is not None:
    test_sentences = [
        "This is amazing and wonderful!",
        "I hate this terrible product.",
        "Great job and excellent work!",
        "Awful experience and poor service."
    ]

    print("Testing sentiment prediction:")
    for sentence in test_sentences:
        try:
            probs = trainer.predict_sentiment(sentence)
            sentiment = "Positive" if probs[1] > probs[0] else "Negative"
            confidence = max(probs) * 100

            print(f"  '{sentence}'")
            print(f"    → {sentiment} ({confidence:.1f}% confidence)")
            print(f"    → Probabilities: [Neg: {probs[0]:.3f}, Pos: {probs[1]:.3f}]")

        except Exception as e:
            print(f"  '{sentence}' → Failed: {e}")
else:
    print("Classification model not available")

# 9. TEST evaluate_model METHOD
print("\n9. Testing evaluate_model() Method:")
print("-" * 60)

if trainer.classification_model is not None:
    test_eval_texts = [
        "This is fantastic and amazing!",
        "Terrible and disappointing experience.",
        "Great quality and excellent service!",
        "Poor performance and bad results."
    ]
    test_eval_labels = [1, 0, 1, 0]  # 1=positive, 0=negative

    try:
        accuracy = trainer.evaluate_model(test_eval_texts, test_eval_labels)
        print(f"✓ Evaluation completed!")

    except Exception as e:
        print(f"✗ Evaluation failed: {e}")
else:
    print("Classification model not available for evaluation")

print("\n" + "=" * 80)
print("GPTTrainer TESTING COMPLETE!")
print("=" * 80)

# 10. SUMMARY OF WHAT WE LEARNED
print("\n10. Summary - What Each Method Does:")
print("-" * 60)

print("🏗️  GPTTrainer.__init__(): Sets up trainer with model configuration")
print("📚  create_training_data(): Generates pre-training and classification datasets")
print("🔢  prepare_sequences(): Converts texts to training sequences for language modeling")
print("🧠  build_model(): Creates the SimpleGPT neural network")
print("📖  pretrain(): Trains model to predict next tokens (unsupervised learning)")
print("🎯  finetune_classification(): Adds classification head for sentiment analysis")
print("✍️   generate_text(): Uses trained model to generate new text")
print("😊  predict_sentiment(): Classifies text as positive/negative")
print("📊  evaluate_model(): Tests classification accuracy on new data")

print("\n🎓 LEARNING OUTCOMES:")
print("   ✓ Understand GPT training pipeline")
print("   ✓ See pre-training → fine-tuning workflow")
print("   ✓ Learn autoregressive text generation")
print("   ✓ Experience transfer learning in NLP")
print("   ✓ Practice model evaluation techniques")

COMPREHENSIVE GPTTrainer TESTING

1. Creating GPTTrainer Object:
------------------------------------------------------------
Constructor Called
Constructor Initialized
✓ GPTTrainer created with configuration:
  Vocabulary Size: 200
  Model Dimension: 32
  Attention Heads: 2
  Transformer Layers: 1
  Feed-Forward Size: 64
  Max Sequence Length: 10

Initial state:
  Text Processor: <class '__main__.TextProcessor'>
  GPT Model: None
  Classification Model: None
  Training History: {'pretrain': None, 'finetune': None}

2. Testing create_training_data() Method:
------------------------------------------------------------
✓ Training data created:
  Pre-training texts: 25 samples
  Fine-tuning texts: 30 samples
  Fine-tuning labels: 30 labels

Sample pre-training texts (first 3):
  1. I love reading books about science and technology.
  2. The weather today is sunny and beautiful.
  3. Learning new things is always exciting and fun.

Sample fine-tuning data:
  Positive examples: 15 samples
 

In [6]:
def main():
    """Main demonstration function"""
    print("🤖 GPT Training Demonstration")
    print("="*50)

    # Initialize trainer
    trainer = GPTTrainer(
        vocab_size=300,
        d_model=48,
        num_heads=4,
        num_layers=2,
        dff=96,
        max_length=10
    )

    # Get training data
    pretrain_texts, finetune_texts, finetune_labels = trainer.create_training_data()

    print(f"📚 Training data prepared:")
    print(f"  - Pre-training texts: {len(pretrain_texts)}")
    print(f"  - Fine-tuning examples: {len(finetune_texts)}")

    # Phase 1: Pre-training
    trainer.pretrain(pretrain_texts, epochs=8, seq_length=6)

    # Phase 2: Fine-tuning
    trainer.finetune_classification(finetune_texts, finetune_labels, epochs=6)

    # Evaluation
    test_texts = [
        "I love this wonderful experience!",
        "This is terrible and awful.",
        "Great job and amazing work!",
        "Poor quality and disappointing.",
        "Fantastic results and excellent!",
        "Bad service and horrible experience."
    ]
    test_labels = [1, 0, 1, 0, 1, 0]

    accuracy = trainer.evaluate_model(test_texts, test_labels)

    # Final text generation test
    print("\n📝 Final Text Generation:")
    print("-" * 30)

    final_prompts = ["i love", "this is great", "the weather"]
    for prompt in final_prompts:
        try:
            generated = trainer.generate_text(prompt, max_length=5)
            print(f"'{prompt}' → '{generated}'")
        except Exception as e:
            print(f"'{prompt}' → Error: {e}")

    # Summary
    print(f"\n🎯 DEMONSTRATION COMPLETE!")
    print(f"📊 Classification Accuracy: {accuracy:.1f}%")
    print(f"🧠 Model Parameters: {trainer.model.count_params():,}")
    print(f"📖 Vocabulary Size: {len(trainer.processor.vocab)}")

    if accuracy > 70:
        print("🎉 Excellent! The model learned effectively!")
    elif accuracy > 50:
        print("👍 Good! The model shows clear learning!")
    else:
        print("📚 Model demonstrates core concepts!")

    print("\nKey concepts demonstrated:")
    print("✓ Transformer architecture")
    print("✓ Self-attention mechanisms")
    print("✓ Transfer learning pipeline")
    print("✓ Language model pre-training")
    print("✓ Supervised fine-tuning")
    print("✓ Text generation")

if __name__ == "__main__":
    main()

🤖 GPT Training Demonstration
Constructor Called
Constructor Initialized
📚 Training data prepared:
  - Pre-training texts: 25
  - Fine-tuning examples: 30

PHASE 1: UNSUPERVISED PRE-TRAINING
Building vocabulary...
Vocabulary size: 147
Created 33 training sequences
Building GPT model...
Model built with 67500 parameters
Starting pre-training...
Epoch 1/8
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 281ms/step - accuracy: 0.0083 - loss: 5.7472 - val_accuracy: 0.0000e+00 - val_loss: 5.8358
Epoch 2/8
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.0627 - loss: 5.2620 - val_accuracy: 0.0417 - val_loss: 5.8016
Epoch 3/8
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.1210 - loss: 4.9465 - val_accuracy: 0.0417 - val_loss: 5.7897
Epoch 4/8
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.1212 - loss: 4.7307 - val_accuracy: 0.0417 - val_loss: 5.7851
Epoch 5/8
[1m4/4

In [1]:
final_prompts = ["Cooking is a ", "life is beautiful", "the beauty of knowledge"]
for prompt in final_prompts:
  try:
    generated = trainer.generate_text(prompt, max_length=50)
    print(f"'{prompt}' → '{generated}'")
  except Exception as e:
    print(f"'{prompt}' → Error: {e}")

'Cooking is a ' → Error: name 'trainer' is not defined
'life is beautiful' → Error: name 'trainer' is not defined
'the beauty of knowledge' → Error: name 'trainer' is not defined


In [42]:
#How to train the structured data in the domain specific cases in organization?

# Domain-Specific GPT Pretraining with Structured Data
## Strategic Guide for Organizations

### **🎯 The Challenge: From Tables to Text**

Traditional GPT models are trained on natural language text, but organizational data is often in structured formats:
- **Databases**: Customer records, financial transactions, inventory data
- **CSV files**: Sales reports, employee data, operational metrics  
- **Spreadsheets**: Financial models, project tracking, performance data
- **JSON/XML**: API responses, configuration files, log data

**The key insight**: We need to convert structured data into meaningful text that GPT can learn from.

---

## **🔄 The Structured Data → Text Transformation Process**

### **1. Data Serialization Strategy**

**Turn Tables into Stories**

Instead of keeping data in rows and columns, transform it into natural language descriptions:

**Before (CSV format):**
```
Customer_ID, Name, Purchase_Amount, Product, Date, Satisfaction
12345, John Smith, $299.99, Laptop, 2024-01-15, 4.5
```

**After (Text format):**
```
"Customer John Smith (ID: 12345) purchased a Laptop for $299.99 on January 15th, 2024.
The customer rated their satisfaction as 4.5 out of 5 stars, indicating high satisfaction
with the purchase experience."
```

### **2. Context-Rich Data Narratives**

**Add Business Context**

Transform raw data points into business-meaningful narratives:

**Financial Data Example:**
- **Raw**: `Q1_Revenue: $2.5M, Growth: 15%, Profit_Margin: 22%`
- **Narrative**: `"In Q1, the company achieved $2.5 million in revenue, representing a 15% growth compared to the previous quarter. The profit margin of 22% indicates strong operational efficiency and cost management."`

**HR Data Example:**
- **Raw**: `Employee_ID: 789, Department: Engineering, Performance: 4.2, Promotion: Yes`
- **Narrative**: `"Employee 789 from the Engineering department achieved a performance rating of 4.2 out of 5.0, demonstrating excellent work quality and was approved for promotion based on consistent high performance."`


## **🏢 Organization-Specific Implementation Strategies**

### **For Financial Services:**

**Transform Trading Data:**
- **Raw**: `AAPL, 150.25, +2.3%, 1M_shares, 09:30`
- **Narrative**: `"Apple (AAPL) opened at $150.25, up 2.3% from yesterday's close. Trading volume of 1 million shares at 9:30 AM indicates strong investor interest, potentially driven by positive earnings expectations."`

**Transform Credit Risk Data:**
- **Raw**: `Customer_567, Credit_Score: 720, Debt_Ratio: 0.35, Income: $75K`
- **Narrative**: `"Customer 567 presents a moderate credit risk profile with a credit score of 720, indicating good creditworthiness. Their debt-to-income ratio of 35% is within acceptable limits for their $75,000 annual income."`

### **For Healthcare Organizations:**

**Transform Patient Data:**
- **Raw**: `Patient_123, Age: 45, BP: 140/90, BMI: 28.5, Risk: Medium`
- **Narrative**: `"45-year-old patient presents with elevated blood pressure (140/90) and BMI of 28.5, indicating overweight status. Combined factors suggest medium cardiovascular risk requiring lifestyle modifications and regular monitoring."`

### **For Retail Companies:**

**Transform Inventory Data:**
- **Raw**: `SKU_789, Stock: 45, Reorder: 20, Sales_Velocity: 5/day`
- **Narrative**: `"Product SKU-789 currently has 45 units in stock with a reorder point set at 20 units. Based on current sales velocity of 5 units per day, the product will reach reorder levels in 5 days, ensuring adequate inventory for customer demand."`

### **For Manufacturing:**

**Transform Production Data:**
- **Raw**: `Line_A, Efficiency: 87%, Downtime: 2hrs, Output: 1250_units`
- **Narrative**: `"Production Line A operated at 87% efficiency today with 2 hours of planned maintenance downtime. The line produced 1,250 units, meeting daily targets despite the maintenance window."`
