In [7]:
import torch as t
from data import get_data
from main import Config, PoetryModel
import numpy as np

def test_data_loading():
    """Test data loading and examine data structures"""
    # Initialize config
    opt = Config()
    opt.pickle_path = './data/tang.npz'
    
    # Get data
    data, word2ix, ix2word = get_data(opt)
    print("\n=== Data Overview ===")
    print(f"Data shape: {data.shape}")
    print(f"Vocabulary size: {len(word2ix)}")
    
    # Check special tokens and handle potential missing tokens
    print("\n=== Special Tokens ===")
    special_tokens = ['<START>', '<EOP>', ' ']
    for token in special_tokens:
        try:
            print(f"{token}: {word2ix.get(token, 'Not found')}")
        except:
            print(f"{token}: Not in vocabulary")
    
    # Show some sample vocabulary items
    print("\n=== Sample Characters ===")
    sample_chars = list("春江花月夜")
    for char in sample_chars:
        if char in word2ix:
            print(f"{char}: {word2ix[char]}")
    
    # Show first few items in ix2word
    print("\n=== First Few ix2word Mappings ===")
    for i in range(min(10, len(ix2word))):
        print(f"{i}: {ix2word[i]}")
    
    return data, word2ix, ix2word, opt

def test_dataloader(data, opt, ix2word):
    """Test DataLoader functionality"""
    print("\n=== DataLoader Test ===")
    data = t.from_numpy(data)
    dataloader = t.utils.data.DataLoader(
        data,
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=1
    )
    
    # Get one batch
    batch = next(iter(dataloader))
    print(f"Batch shape: {batch.shape}")
    
    # # Show contents of first few poems in the batch
    # print("\n=== Sample Poems from Batch ===")
    # for poem_idx in range(min(3, batch.shape[0])):  # Show first 3 poems
    #     poem = batch[poem_idx]
    #     # Convert indices to characters and filter out padding
    #     chars = [ix2word[idx.item()] for idx in poem if idx.item() < len(ix2word)]
    #     print(f"\nPoem {poem_idx + 1}:")
    #     print(''.join(chars))
    #     print(f"Length: {len(chars)} characters")
        
    #     # Show raw indices for debugging
    #     print("First 10 indices:", poem[:10].tolist())
    
    return dataloader

def test_single_example(data, opt, ix2word):
    """Show a single example from the dataset with better formatting"""
    print("\n=== Single Poem Example ===")
    
    # Convert numpy array to tensor
    data = t.from_numpy(data)
    single_poem = data[0]
    
    # Convert indices to characters
    chars = [ix2word[idx.item()] for idx in single_poem]
    
    # Split into padding and actual poem
    padding = []
    actual_poem = []
    for char in chars:
        if char == '</s>':
            padding.append(char)
        else:
            actual_poem.append(char)
    
    print("Structure breakdown:")
    print(f"Total length: {len(chars)} characters")
    print(f"Padding length: {len(padding)} </s> tokens")
    print(f"Actual poem length: {len(actual_poem)} characters")
    
    print("\nActual poem text:")
    print(''.join(actual_poem))
    
    # Show first few mappings of the actual poem (skipping padding)
    print("\nFirst few character mappings (excluding padding):")
    start_idx = len(padding)  # Skip padding tokens
    for i in range(start_idx, min(start_idx + 10, len(chars))):
        print(f"Index {single_poem[i].item()} -> Character '{chars[i]}'")



def main():
    # Run all tests
    data, word2ix, ix2word, opt = test_data_loading()
    dataloader = test_dataloader(data, opt, ix2word)
    test_single_example(data, opt, ix2word)

if __name__ == "__main__":
    main() 


=== Data Overview ===
Data shape: (57580, 125)
Vocabulary size: 8293

=== Special Tokens ===
<START>: 8291
<EOP>: 8290
 : Not found

=== Sample Characters ===
春: 3189
江: 193
花: 2808
月: 6933
夜: 7440

=== First Few ix2word Mappings ===
0: 憁
1: 耀
2: 枅
3: 涉
4: 谈
5: 伊
6: 鈌
7: 薙
8: 亟
9: 洞

=== DataLoader Test ===
Batch shape: torch.Size([128, 125])

=== Single Poem Example ===
Structure breakdown:
Total length: 125 characters
Padding length: 75 </s> tokens
Actual poem length: 50 characters

Actual poem text:
<START>度门能不访，冒雪屡西东。已想人如玉，遥怜马似骢。乍迷金谷路，稍变上阳宫。还比相思意，纷纷正满空。<EOP>

First few character mappings (excluding padding):
Index 8291 -> Character '<START>'
Index 6731 -> Character '度'
Index 4770 -> Character '门'
Index 1787 -> Character '能'
Index 8118 -> Character '不'
Index 7577 -> Character '访'
Index 7066 -> Character '，'
Index 4817 -> Character '冒'
Index 648 -> Character '雪'
Index 7121 -> Character '屡'


In [8]:
import torch
import torch.nn as nn

# Create a small vocabulary (let's say we have 10 Chinese characters)
vocab_size = 10
embedding_dim = 4  # Small dimension for demonstration
batch_size = 3
seq_length = 2

# Create an embedding layer
embedding = nn.Embedding(vocab_size, embedding_dim)

# Create some sample input indices
# Let's say we have 3 sequences, each with 2 characters
input_indices = torch.tensor([
    [0, 1],  # First sequence: character 0, character 1
    [2, 3],  # Second sequence: character 2, character 3
    [4, 5]   # Third sequence: character 4, character 5
])

# Convert input to expected shape [seq_len, batch_size]
input_indices = input_indices.transpose(0, 1)

print("Input indices shape:", input_indices.shape)
print("Input indices:\n", input_indices)

# Get embeddings
embedded = embedding(input_indices)

print("\nEmbedding shape:", embedded.shape)
print("Embedded vectors:\n", embedded)

# Let's look at one specific embedding
print("\nEmbedding for index 0:\n", embedded[0][0])

Input indices shape: torch.Size([2, 3])
Input indices:
 tensor([[0, 2, 4],
        [1, 3, 5]])

Embedding shape: torch.Size([2, 3, 4])
Embedded vectors:
 tensor([[[ 2.1206, -0.2110, -0.0433, -1.3023],
         [-1.8514, -0.8825, -0.9430, -0.4287],
         [ 0.5552, -0.5138,  0.7194,  0.4082]],

        [[ 1.6580,  0.1504, -0.8308,  0.5962],
         [ 1.5268, -0.2469, -1.1589,  0.1992],
         [-1.5513, -1.4598, -0.0251,  1.0985]]], grad_fn=<EmbeddingBackward0>)

Embedding for index 0:
 tensor([ 2.1206, -0.2110, -0.0433, -1.3023], grad_fn=<SelectBackward0>)


In [10]:
import torch
import torch.nn as nn

# Setup example
batch_size = 128
embedding_dim = 256
hidden_size = 512
num_layers = 2
vocab_size = 5000  # Example vocabulary size

# Let's say we have first character '春' (represented by index 42) for all sequences
first_char = torch.tensor([[42] * batch_size])  # Shape: [1, 128]

# 1. First, embed the character
embedding = nn.Embedding(vocab_size, embedding_dim)
embedded = embedding(first_char)  # Shape: [1, 128, 256]

# 2. Initialize LSTM
lstm = nn.LSTM(embedding_dim, hidden_size, num_layers)
h0 = torch.zeros(num_layers, batch_size, hidden_size)
c0 = torch.zeros(num_layers, batch_size, hidden_size)

# 3. Process through LSTM
output, (hn, cn) = lstm(embedded, (h0, c0))  # hn: final hidden state [num_layers, batch_size, hidden_size]
                                            # cn: final cell state [num_layers, batch_size, hidden_size]
# Print shapes of LSTM outputs
print("\nOutput shape:", output.shape)  # Should be [1, 128, 512]
print("Hidden state (hn) shape:", hn.shape)  # Should be [2, 128, 512] 
print("Cell state (cn) shape:", cn.shape)  # Should be [2, 128, 512]

# 4. Get predictions for next character
linear = nn.Linear(hidden_size, vocab_size)
predictions = linear(output.squeeze(0))  # Shape: [128, 5000]

# 5. Get most likely next characters
top_predictions = torch.topk(predictions, k=5, dim=1)

print("For input character '春':")
print("Top 5 predicted next characters for first sequence:")
print(f"Probabilities: {top_predictions.values[0]}")
print(f"Indices: {top_predictions.indices[0]}")


Output shape: torch.Size([1, 128, 512])
Hidden state (hn) shape: torch.Size([2, 128, 512])
Cell state (cn) shape: torch.Size([2, 128, 512])
For input character '春':
Top 5 predicted next characters for first sequence:
Probabilities: tensor([0.0655, 0.0649, 0.0645, 0.0641, 0.0638], grad_fn=<SelectBackward0>)
Indices: tensor([2920, 1614, 3569, 4749, 1323])
