In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found anywhere under the Kaggle input directory.
for path in (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np

# ---------------------
# 1. Activation Functions
# ---------------------
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``, element-wise.

    The naive formula overflows in ``exp(-x)`` for large negative ``x``
    (RuntimeWarning, or FloatingPointError under a strict ``np.errstate``).
    This version only ever exponentiates ``-|x|``, which lies in (0, 1].

    Args:
        x: real-valued scalar or array-like.

    Returns:
        A scalar for scalar input, otherwise an ndarray with the shape of ``x``.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        # Integer / bool input: promote so the exponential is taken in floating point.
        x = x.astype(float)

    decay = np.exp(-np.abs(x))  # never overflows
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function, safe on both sides.
    out = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves real arrays unchanged.
    return out[()]

def tanh(x):
    """Element-wise hyperbolic tangent; a thin wrapper around ``np.tanh``."""
    squashed = np.tanh(x)
    return squashed

# ---------------------
# 2. LSTM Cell Implementation
# ---------------------
class LSTMCell:
    """Single LSTM cell whose four gates share one stacked weight matrix.

    The rows of ``W`` and ``b`` are laid out gate by gate, ``hidden_size`` rows
    each, in the order: forget, input, cell candidate, output.
    """

    def __init__(self, input_size, hidden_size):
        self.hidden_size = hidden_size

        # One small-random weight matrix and one zero bias cover all four gates.
        n_rows = 4 * hidden_size
        n_cols = input_size + hidden_size
        self.W = np.random.randn(n_rows, n_cols) * 0.01
        self.b = np.zeros((n_rows, 1))

        # Per-gate row blocks of W (views, not copies). forward() only reads self.W.
        self.W_f, self.W_i, self.W_c, self.W_o = np.split(self.W, 4, axis=0)

    def forward(self, x, h_prev, c_prev):
        """Advance the cell by one time step.

        x: input vector (input_size, 1)
        h_prev: previous hidden state (hidden_size, 1)
        c_prev: previous cell state (hidden_size, 1)

        Returns the pair (h_next, c_next).
        """
        n = self.hidden_size

        # Previous hidden state goes on top, the current input underneath.
        stacked = np.vstack((h_prev, x))

        # One affine map yields the pre-activations of all four gates at once.
        pre_act = self.W @ stacked + self.b

        forget_gate = sigmoid(pre_act[:n])
        input_gate = sigmoid(pre_act[n:2 * n])
        candidate = tanh(pre_act[2 * n:3 * n])
        output_gate = sigmoid(pre_act[3 * n:])

        # Keep part of the old memory, add part of the new candidate.
        c_next = forget_gate * c_prev + input_gate * candidate
        # Expose a gated, squashed view of the memory as the new hidden state.
        h_next = output_gate * tanh(c_next)

        return h_next, c_next

# ---------------------
# 3. Usage Example
# ---------------------
# Hyperparameters
# Problem dimensions for the demo
input_size, hidden_size, seq_length = 3, 2, 4

# Build the cell first so its weights are drawn before the sample inputs
lstm = LSTMCell(input_size, hidden_size)

# Both recurrent states start out as zero column vectors of shape (hidden_size, 1)
h = np.zeros((hidden_size, 1))
c = np.zeros_like(h)

# A list of seq_length random column vectors, each of shape (input_size, 1)
inputs = [np.random.randn(input_size, 1) for _ in range(seq_length)]

# Unroll the cell over the sequence, threading (h, c) from one step to the next
print("Step-by-Step LSTM Processing:")
for t, step_input in enumerate(inputs):
    h, c = lstm.forward(step_input, h, c)
    print(f"Time Step {t+1}:\nHidden State:\n{h.round(4)}\nCell State:\n{c.round(4)}\n")


Step-by-Step LSTM Processing:
Time Step 1:
Hidden State:
[[0.0027]
 [0.0004]]
Cell State:
[[0.0053]
 [0.0008]]

Time Step 2:
Hidden State:
[[0.0063]
 [0.0007]]
Cell State:
[[0.0124]
 [0.0014]]

Time Step 3:
Hidden State:
[[ 0.0113]
 [-0.0006]]
Cell State:
[[ 0.0224]
 [-0.0012]]

Time Step 4:
Hidden State:
[[0.0067]
 [0.0001]]
Cell State:
[[0.0134]
 [0.0003]]



In [None]:
import numpy as np

def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``, element-wise.

    The naive formula overflows in ``exp(-x)`` for large negative ``x``
    (RuntimeWarning, or FloatingPointError under a strict ``np.errstate``).
    This version only ever exponentiates ``-|x|``, which lies in (0, 1].

    Args:
        x: real-valued scalar or array-like.

    Returns:
        A scalar for scalar input, otherwise an ndarray with the shape of ``x``.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        # Integer / bool input: promote so the exponential is taken in floating point.
        x = x.astype(float)

    decay = np.exp(-np.abs(x))  # never overflows
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function, safe on both sides.
    out = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves real arrays unchanged.
    return out[()]

def tanh(x):
    """Element-wise hyperbolic tangent; a thin wrapper around ``np.tanh``."""
    squashed = np.tanh(x)
    return squashed

class LSTMCell:
    """LSTM cell that keeps a separate weight matrix and bias for each gate.

    Every weight has shape (hidden_size, hidden_size + input_size) and acts on
    the column vector formed by stacking ``h_prev`` on top of ``x_t``.
    """

    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size

        weight_shape = (hidden_size, hidden_size + input_size)
        # Gates are created in the order forget, input, candidate, output so the
        # global NumPy RNG is consumed in that sequence.
        for gate in ("f", "i", "C", "o"):
            setattr(self, f"W_{gate}", np.random.randn(*weight_shape) * 0.1)
            setattr(self, f"b_{gate}", np.zeros((hidden_size, 1)))

    def forward(self, x_t, h_prev, C_prev):
        """Run one time step and return ``(h_t, C_t)``.

        x_t shape: (input_size, 1)
        h_prev shape: (hidden_size, 1)
        C_prev shape: (hidden_size, 1)
        """
        # Stack h_prev above x_t: shape (hidden_size + input_size, 1)
        stacked = np.vstack((h_prev, x_t))

        def gate(weight, bias, squash):
            # Affine map of the stacked vector followed by the gate's non-linearity.
            return squash(weight.dot(stacked) + bias)

        f_t = gate(self.W_f, self.b_f, sigmoid)      # how much old memory to keep
        i_t = gate(self.W_i, self.b_i, sigmoid)      # how much new content to write
        C_tilde = gate(self.W_C, self.b_C, tanh)     # proposed new content
        o_t = gate(self.W_o, self.b_o, sigmoid)      # how much memory to expose

        C_t = f_t * C_prev + i_t * C_tilde
        h_t = o_t * tanh(C_t)

        return h_t, C_t

# Example usage:

input_size, hidden_size = 3, 2

lstm_cell = LSTMCell(input_size, hidden_size)

# Zero-initialised hidden and cell states, each of shape (hidden_size, 1)
h_prev = np.zeros((hidden_size, 1))
C_prev = np.zeros_like(h_prev)

# One random input column vector of shape (input_size, 1)
x_t = np.random.randn(input_size, 1)

# A single step of the cell
h_next, C_next = lstm_cell.forward(x_t, h_prev, C_prev)

for label, state in (
    ("Next hidden state (h_t):\n", h_next),
    ("Next cell state (C_t):\n", C_next),
):
    print(label, state)


PYTORCH IMPLEMENTATION TO SHOW THE WORKING OF ALL INTERMEDIATE STATES

In [1]:
import torch
import torch.nn as nn

class LSTMCellExplained(nn.Module):
    """LSTM cell that exposes every intermediate gate value for inspection.

    A single linear layer produces the pre-activations of all four gates, which
    are split along the feature dimension in the order i, f, g, o.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # One linear layer for all 4 gates: i, f, g, o
        self.W = nn.Linear(input_size + hidden_size, 4 * hidden_size)

    def forward(self, x_t, h_prev, c_prev, verbose=True):
        """Run one LSTM step.

        Args:
            x_t: input, shape [batch_size, input_size].
            h_prev: previous hidden state, shape [batch_size, hidden_size].
            c_prev: previous cell state, shape [batch_size, hidden_size].
            verbose: when True (the default, matching the original behaviour)
                print the shape of every intermediate tensor. Pass False to use
                the cell inside a loop without flooding stdout.

        Returns:
            ``(h_t, c_t, details)`` where ``details`` maps 'input_gate',
            'forget_gate', 'cell_candidate', 'output_gate', 'hidden_state' and
            'cell_state' to the corresponding tensors.
        """
        # Concatenate input and previous hidden state
        combined = torch.cat((x_t, h_prev), dim=1)  # [batch_size, input_size + hidden_size]

        # Pre-activations of all gates in one matmul
        gates = self.W(combined)  # [batch_size, 4 * hidden_size]

        # Split into the four equally sized gate blocks
        i_t, f_t, g_t, o_t = gates.chunk(4, dim=1)

        # Apply nonlinearities
        i_t = torch.sigmoid(i_t)  # Input gate
        f_t = torch.sigmoid(f_t)  # Forget gate
        g_t = torch.tanh(g_t)     # Cell candidate
        o_t = torch.sigmoid(o_t)  # Output gate

        # Update cell and hidden state
        c_t = f_t * c_prev + i_t * g_t
        h_t = o_t * torch.tanh(c_t)

        if verbose:
            # Print dimensions of all components
            print("🔍 Dimensions:")
            print("Input x_t:", x_t.shape)
            print("Hidden state h_prev:", h_prev.shape)
            print("Cell state c_prev:", c_prev.shape)
            print("Combined [x_t, h_prev]:", combined.shape)
            print("All gates (linear output):", gates.shape)
            print("Input gate i_t:", i_t.shape)
            print("Forget gate f_t:", f_t.shape)
            print("Cell candidate g_t:", g_t.shape)
            print("Output gate o_t:", o_t.shape)
            print("New cell state c_t:", c_t.shape)
            print("New hidden state h_t:", h_t.shape)

        return h_t, c_t, {
            'input_gate': i_t,
            'forget_gate': f_t,
            'cell_candidate': g_t,
            'output_gate': o_t,
            'hidden_state': h_t,
            'cell_state': c_t
        }


In [2]:
# Parameters
input_size, hidden_size, batch_size = 3, 4, 1

# Build the cell (its weights are drawn before the sample input)
lstm_cell = LSTMCellExplained(input_size, hidden_size)

# One random input row per batch element; both recurrent states start at zero
x_t = torch.randn(batch_size, input_size)
h_prev, c_prev = (torch.zeros(batch_size, hidden_size) for _ in range(2))

# A single step; the cell reports the shape of every intermediate tensor
h_t, c_t, gates = lstm_cell(x_t, h_prev, c_prev)


🔍 Dimensions:
Input x_t: torch.Size([1, 3])
Hidden state h_prev: torch.Size([1, 4])
Cell state c_prev: torch.Size([1, 4])
Combined [x_t, h_prev]: torch.Size([1, 7])
All gates (linear output): torch.Size([1, 16])
Input gate i_t: torch.Size([1, 4])
Forget gate f_t: torch.Size([1, 4])
Cell candidate g_t: torch.Size([1, 4])
Output gate o_t: torch.Size([1, 4])
New cell state c_t: torch.Size([1, 4])
New hidden state h_t: torch.Size([1, 4])


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import Counter
import string

class LSTMTextGenerator(nn.Module):
    """Embedding -> (stacked) LSTM -> dropout -> linear projection onto the vocabulary.

    Args:
        vocab_size: number of distinct tokens.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        dropout: probability used for the dropout on the LSTM output and, when
            ``num_layers > 1``, for the dropout between LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout=0.3):
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Embedding layer: converts token indices to dense vectors
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # nn.LSTM only applies its dropout *between* stacked layers; with a single
        # layer it has no effect and PyTorch emits a UserWarning, so pass 0 there.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers,
                            dropout=inter_layer_dropout, batch_first=True)

        # Dropout on the LSTM output, for regularization
        self.dropout = nn.Dropout(dropout)

        # Output layer: maps hidden states to one score (logit) per vocabulary entry
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None, verbose=False):
        """
        Forward pass with optional dimension tracking.

        Args:
            x: Input tensor of token indices [batch_size, sequence_length]
            hidden: Initial (h_0, c_0) tuple; None lets the LSTM start from zeros
            verbose: Print intermediate dimensions

        Returns:
            (output, (h_n, c_n)) where output has shape
            [batch_size, sequence_length, vocab_size] and h_n / c_n have shape
            [num_layers, batch_size, hidden_dim].
        """
        batch_size, seq_len = x.shape

        if verbose:
            print(f"Input dimensions: {x.shape} [batch_size={batch_size}, seq_len={seq_len}]")

        # Step 1: Embedding lookup
        # [batch_size, seq_len] -> [batch_size, seq_len, embedding_dim]
        embedded = self.embedding(x)

        if verbose:
            print(f"After embedding: {embedded.shape} [batch_size, seq_len, embedding_dim]")
            print(f"Embedding weights shape: {self.embedding.weight.shape} [vocab_size, embedding_dim]")

        # Step 2: LSTM processing (nn.LSTM treats hidden=None as an all-zero initial state)
        # [batch_size, seq_len, embedding_dim] -> [batch_size, seq_len, hidden_dim], (h_n, c_n)
        lstm_out, (h_n, c_n) = self.lstm(embedded, hidden)

        if verbose:
            print(f"LSTM output: {lstm_out.shape} [batch_size, seq_len, hidden_dim]")
            print(f"Final hidden state (h_n): {h_n.shape} [num_layers, batch_size, hidden_dim]")
            print(f"Final cell state (c_n): {c_n.shape} [num_layers, batch_size, hidden_dim]")

        # Step 3: Apply dropout
        lstm_out = self.dropout(lstm_out)

        # Step 4: Project to vocabulary size
        # Flatten: [batch_size, seq_len, hidden_dim] -> [batch_size * seq_len, hidden_dim]
        lstm_out_reshaped = lstm_out.reshape(-1, self.hidden_dim)

        if verbose:
            print(f"Reshaped for FC: {lstm_out_reshaped.shape} [batch_size*seq_len, hidden_dim]")

        # Linear layer: [batch_size * seq_len, hidden_dim] -> [batch_size * seq_len, vocab_size]
        output = self.fc(lstm_out_reshaped)

        if verbose:
            print(f"FC output: {output.shape} [batch_size*seq_len, vocab_size]")

        # Restore the batch/time axes: -> [batch_size, seq_len, vocab_size]
        output = output.reshape(batch_size, seq_len, self.vocab_size)

        if verbose:
            print(f"Final output: {output.shape} [batch_size, seq_len, vocab_size]")

        return output, (h_n, c_n)

    def init_hidden(self, batch_size, device):
        """Return zero (h_0, c_0) tensors of shape [num_layers, batch_size, hidden_dim] on ``device``."""
        shape = (self.num_layers, batch_size, self.hidden_dim)
        # Allocate directly on the target device instead of creating on CPU and copying.
        h0 = torch.zeros(shape, device=device)
        c0 = torch.zeros(shape, device=device)
        return (h0, c0)

class TextPreprocessor:
    """Character-level vocabulary with two-way lookup between characters and integer ids."""

    def __init__(self):
        self.char_to_idx = {}
        self.idx_to_char = {}
        self.vocab_size = 0

    def fit(self, text):
        """Learn the vocabulary: every distinct character of ``text``, ids assigned in sorted order."""
        chars = sorted(set(text))
        self.vocab_size = len(chars)

        # Build id -> char first, then invert it for char -> id.
        self.idx_to_char = dict(enumerate(chars))
        self.char_to_idx = {ch: idx for idx, ch in self.idx_to_char.items()}

        print(f"Vocabulary size: {self.vocab_size}")
        print(f"Characters: {chars}")

    def encode(self, text):
        """Map each character of ``text`` to its id (KeyError for characters not seen by fit)."""
        lookup = self.char_to_idx
        return [lookup[ch] for ch in text]

    def decode(self, indices):
        """Map a sequence of ids back to the string they spell."""
        lookup = self.idx_to_char
        return ''.join(lookup[idx] for idx in indices)

def create_sequences(data, seq_length):
    """Slice ``data`` into overlapping training windows.

    For every start offset ``s`` in ``range(len(data) - seq_length)`` the input
    is ``data[s:s + seq_length]`` and the target is the same window shifted one
    position to the right.

    Returns:
        (sequences, targets): two parallel lists of windows.
    """
    starts = range(len(data) - seq_length)
    sequences = [data[s:s + seq_length] for s in starts]
    targets = [data[s + 1:s + seq_length + 1] for s in starts]
    return sequences, targets

def demonstrate_embeddings_and_processing():
    """Walk a toy sentence through preprocessing, the model and a token-by-token
    LSTM pass, printing the shapes and values seen along the way."""

    rule = "=" * 60

    def banner(title, lead="\n"):
        # Section header: the title framed by two 60-character rules.
        print(lead + rule)
        print(title)
        print(rule)

    # Sample text data
    text = "hello world this is a simple example for lstm text generation"

    # Build the character vocabulary and encode the text
    preprocessor = TextPreprocessor()
    preprocessor.fit(text)
    encoded_text = preprocessor.encode(text)

    banner("TEXT PREPROCESSING", lead="")
    print(f"Original text: '{text}'")
    print(f"Encoded text: {encoded_text}")
    print(f"Vocabulary mapping: {preprocessor.char_to_idx}")

    # Input/target windows of 10 tokens each
    seq_length = 10
    sequences, targets = create_sequences(encoded_text, seq_length)

    # Only the first 5 windows are needed for the demo
    X = torch.tensor(sequences[:5])
    y = torch.tensor(targets[:5])

    print(f"\nSequence shape: {X.shape}")
    print(f"Target shape: {y.shape}")

    # Model hyperparameters
    vocab_size = preprocessor.vocab_size
    embedding_dim, hidden_dim, num_layers = 16, 32, 2

    banner("MODEL ARCHITECTURE")
    print(f"Vocabulary size: {vocab_size}")
    print(f"Embedding dimension: {embedding_dim}")
    print(f"Hidden dimension: {hidden_dim}")
    print(f"Number of LSTM layers: {num_layers}")

    model = LSTMTextGenerator(vocab_size, embedding_dim, hidden_dim, num_layers)

    print("\nModel parameters:")
    for name, param in model.named_parameters():
        print(f"  {name}: {param.shape}")

    # Verbose forward pass: the model prints each intermediate shape itself
    banner("FORWARD PASS WITH DIMENSION TRACKING")
    model.eval()
    with torch.no_grad():
        model(X, verbose=True)

    # Look at the embeddings of the first 3 tokens of the first window
    banner("EMBEDDING LAYER DETAILS")
    sample_input = X[0:1, :3]
    sample_tokens = sample_input.squeeze().tolist()
    print(f"Sample input tokens: {sample_tokens}")
    print(f"Corresponding characters: '{preprocessor.decode(sample_tokens)}'")

    sample_embeddings = model.embedding(sample_input)
    print(f"Sample embeddings shape: {sample_embeddings.shape}")
    print(f"First token embedding:\n{sample_embeddings[0, 0, :].detach().numpy()}")

    # Feed the first window one token at a time, carrying the state forward
    banner("LSTM STATE EVOLUTION")
    hidden = model.init_hidden(1, X.device)

    print("Processing tokens one by one:")
    for i in range(min(5, X.shape[1])):
        token = X[0:1, i:i+1]
        token_idx = token.item()
        char = preprocessor.idx_to_char[token_idx]

        with torch.no_grad():
            _, hidden = model(token, hidden)
        h_n, c_n = hidden

        print(f"Token {i}: '{char}' (idx: {token_idx})")
        print(f"  Hidden state norm: {torch.norm(h_n).item():.4f}")
        print(f"  Cell state norm: {torch.norm(c_n).item():.4f}")

def generate_text(model, preprocessor, seed_text, length=50, temperature=1.0):
    """Sample ``length`` new characters from ``model``, continuing ``seed_text``.

    The seed is fed through the model once to prime the recurrent state; after
    that only the newly sampled token is fed at each step, because the carried
    hidden state already summarises everything seen so far. (Re-feeding the
    whole window together with the carried state would process every token
    more than once.)

    Args:
        model: object providing ``eval()``, ``parameters()``,
            ``init_hidden(batch_size, device)`` and ``model(x, hidden)``
            returning ``(logits[batch, seq, vocab], hidden)``.
        preprocessor: object providing ``encode(str) -> list[int]`` and
            ``decode(list[int]) -> str``.
        seed_text: non-empty prompt; all characters must be known to ``preprocessor``.
        length: number of characters to generate.
        temperature: softmax temperature (> 0); lower values sample more greedily.

    Returns:
        The seed followed by the generated characters, as one string.

    Raises:
        ValueError: if ``seed_text`` is empty or ``temperature`` is not positive.
    """
    # Note: the model is left in eval mode, as before.
    model.eval()

    if temperature <= 0:
        raise ValueError("temperature must be positive")

    generated = list(preprocessor.encode(seed_text))
    if not generated:
        raise ValueError("seed_text must contain at least one character")

    # Build inputs on the same device as the model so GPU models work too.
    device = next(model.parameters()).device
    hidden = model.init_hidden(1, device)

    # First step consumes the whole seed; later steps consume one token each.
    step_input = list(generated)

    with torch.no_grad():
        for _ in range(length):
            x = torch.tensor([step_input], dtype=torch.long, device=device)

            # Forward pass; `hidden` now reflects every token fed so far
            output, hidden = model(x, hidden)

            # Distribution over the next token, taken at the last time step
            probs = F.softmax(output[0, -1] / temperature, dim=0)

            # Sample next token
            next_token = torch.multinomial(probs, 1).item()

            generated.append(next_token)
            step_input = [next_token]

    return preprocessor.decode(generated)

# Run the demonstration
if __name__ == "__main__":
    print("LSTM TEXT GENERATION WITH EMBEDDINGS AND DIMENSION TRACKING")
    print("=" * 80)

    # Part 1: the shape-tracking walkthrough
    demonstrate_embeddings_and_processing()

    # Part 2: a tiny full-batch training run followed by sampling
    print("\n" + "=" * 60, "TRAINING EXAMPLE", "=" * 60, sep="\n")

    # A short phrase repeated 20 times gives the model something to memorise
    text = "hello world " * 20
    preprocessor = TextPreprocessor()
    preprocessor.fit(text)
    encoded_text = preprocessor.encode(text)

    sequences, targets = create_sequences(encoded_text, 10)
    X_train, y_train = torch.tensor(sequences), torch.tensor(targets)

    model = LSTMTextGenerator(preprocessor.vocab_size, 16, 32, 2)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    print("Training for 50 epochs...")
    model.train()
    for epoch in range(50):
        optimizer.zero_grad()

        output, _ = model(X_train)
        # Merge the batch and time axes so every position is one classification example
        loss = criterion(output.flatten(0, 1), y_train.flatten())

        loss.backward()
        optimizer.step()

        # Report on every 10th epoch (10, 20, ..., 50)
        if epoch % 10 == 9:
            print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

    # Sample 30 characters continuing the prompt "hello"
    print("\nGenerated text:")
    generated = generate_text(model, preprocessor, "hello", 30)
    print(f"'{generated}'")

LSTM TEXT GENERATION WITH EMBEDDINGS AND DIMENSION TRACKING
Vocabulary size: 18
Characters: [' ', 'a', 'd', 'e', 'f', 'g', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'w', 'x']
TEXT PREPROCESSING
Original text: 'hello world this is a simple example for lstm text generation'
Encoded text: [6, 3, 8, 8, 11, 0, 16, 11, 13, 8, 2, 0, 15, 6, 7, 14, 0, 7, 14, 0, 1, 0, 14, 7, 9, 12, 8, 3, 0, 3, 17, 1, 9, 12, 8, 3, 0, 4, 11, 13, 0, 8, 14, 15, 9, 0, 15, 3, 17, 15, 0, 5, 3, 10, 3, 13, 1, 15, 7, 11, 10]
Vocabulary mapping: {' ': 0, 'a': 1, 'd': 2, 'e': 3, 'f': 4, 'g': 5, 'h': 6, 'i': 7, 'l': 8, 'm': 9, 'n': 10, 'o': 11, 'p': 12, 'r': 13, 's': 14, 't': 15, 'w': 16, 'x': 17}

Sequence shape: torch.Size([5, 10])
Target shape: torch.Size([5, 10])

MODEL ARCHITECTURE
Vocabulary size: 18
Embedding dimension: 16
Hidden dimension: 32
Number of LSTM layers: 2

Model parameters:
  embedding.weight: torch.Size([18, 16])
  lstm.weight_ih_l0: torch.Size([128, 16])
  lstm.weight_hh_l0: torch.Size([128, 32]