In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import time
import math
import os
from collections import Counter

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("bestwater/wikitext-2-v1")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/zkweng/.cache/kagglehub/datasets/bestwater/wikitext-2-v1/versions/1


**THEN MOVE THE FILE DIRECTORY TO THIS DIRECTORY**

In [3]:
import os

os.environ['CUDA_LAUNCH_BLOCKING']="1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

# Set random seeds for reproducibility
random.seed(2)
np.random.seed(2)
torch.manual_seed(2)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# WikiText2 corpus paths
corpus_path = './wikitext-2'
train_path = os.path.join(corpus_path, 'wiki.train.tokens')
valid_path = os.path.join(corpus_path, 'wiki.valid.tokens')
test_path = os.path.join(corpus_path, 'wiki.test.tokens')

In [5]:
# Hyperparameters
embed_dim = 100
hidden_dim = 256
dropout_prob = 0.5
num_epochs = 20
batch_size = 32
seq_length = 20  # Number of unrolled time steps
learning_rate = 0.001
vocab_size = 10000  # Reduced vocabulary size
unk_threshold = 5  # Frequency threshold for unknown tokens

In [6]:
def read_corpus(path):
    """Read corpus file and return list of whitespace-tokenized words"""
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read().replace('\n', ' <eos> ')
    return text.split(' ')

def build_vocab(tokens, threshold=unk_threshold):
    """Build vocabulary from tokens with frequency threshold"""
    counter = Counter(tokens)
    # Sort tokens by frequency (descending)
    sorted_tokens = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    
    # Create vocabulary: reserve 0 for padding, 1 for <unk>
    vocab = {'<pad>': 0, '<unk>': 1, '<eos>': 2}
    idx = 3
    
    # Add tokens that appear more than threshold times
    for token, count in sorted_tokens:
        if count >= threshold and idx < vocab_size:
            if token and token != '<eos>':  # Skip empty tokens and already added special tokens
                vocab[token] = idx
                idx += 1
        if idx >= vocab_size:
            break
    
    return vocab

def tokens_to_indices(tokens, vocab):
    """Convert tokens to indices using vocabulary"""
    return [vocab.get(token, vocab['<unk>']) for token in tokens if token]

def batchify(data, bsz):
    """Divide dataset into batches and arrange for back-propagation through time"""
    # Work out how cleanly we can divide the dataset into bsz parts
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit
    data = data[:nbatch * bsz]
    # Evenly divide the data across the bsz batches
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

def get_batch(source, i, seq_length):
    """Get a batch for training"""
    seq_len = min(seq_length, source.size(0) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

In [7]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, dropout_prob):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=False, dropout=dropout_prob)
        self.dropout = nn.Dropout(dropout_prob)
        self.decoder = nn.Linear(hidden_dim, vocab_size)
        self.hidden_dim = hidden_dim
        self.init_weights()
        
    def init_weights(self):
        """Initialize weights for better training"""
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)
        
    def init_hidden(self, batch_size):
        """Initialize hidden state"""
        return torch.zeros(1, batch_size, self.hidden_dim).to(device)
        
    def forward(self, x, hidden):
        """Forward pass"""
        # x shape: (seq_len, batch_size)
        emb = self.dropout(self.embedding(x))  # (seq_len, batch_size, embed_dim)
        output, hidden = self.rnn(emb, hidden)  # output: (seq_len, batch_size, hidden_dim)
        output = self.dropout(output)
        decoded = self.decoder(output.view(-1, self.hidden_dim))  # (seq_len*batch_size, vocab_size)
        return decoded, hidden


In [8]:
def train(model, train_data, criterion, optimizer, seq_length):
    """Train model for one epoch"""
    model.train()
    total_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    
    # Get total number of batches
    num_batches = (train_data.size(0) - 1) // seq_length
    
    for batch, i in enumerate(range(0, train_data.size(0) - 1, seq_length)):
        # Skip if we'd go out of bounds
        if i > train_data.size(0) - 2:
            continue
            
        data, targets = get_batch(train_data, i, seq_length)
        
        # Initialize hidden state for new batch
        hidden = hidden.detach()
        
        # Forward pass
        output, hidden = model(data, hidden)
        loss = criterion(output, targets)
        
        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        
        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        
        total_loss += loss.item()
        
        if batch % 50 == 0 and batch > 0:
            cur_loss = total_loss / 50
            elapsed = time.time() - start_time
            print(f'| epoch {epoch:3d} | batch {batch:5d}/{num_batches:5d} | '
                  f'ms/batch {elapsed * 1000 / 50:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {math.exp(cur_loss):8.2f}')
            total_loss = 0
            start_time = time.time()
    
def evaluate(model, eval_data, criterion, seq_length):
    """Evaluate the model"""
    model.eval()
    total_loss = 0.
    hidden = model.init_hidden(batch_size)
    num_tokens = 0
    
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, seq_length):
            # Skip if we'd go out of bounds
            if i > eval_data.size(0) - 2:
                continue
                
            data, targets = get_batch(eval_data, i, seq_length)
            hidden = hidden.detach()
            output, hidden = model(data, hidden)
            
            loss = criterion(output, targets)
            total_loss += loss.item() * targets.size(0)
            num_tokens += targets.size(0)
    
    # Calculate perplexity
    avg_loss = total_loss / num_tokens if num_tokens > 0 else float('inf')
    perplexity = math.exp(avg_loss)
    return perplexity

In [9]:
try:
    # Load and preprocess data
    print("Loading and preprocessing corpus...")
    train_tokens = read_corpus(train_path)
    valid_tokens = read_corpus(valid_path)
    test_tokens = read_corpus(test_path)

    # Build vocabulary from training tokens
    vocab = build_vocab(train_tokens)
    print(f"Vocabulary size: {len(vocab)}")

    # Convert tokens to indices
    train_indices = tokens_to_indices(train_tokens, vocab)
    valid_indices = tokens_to_indices(valid_tokens, vocab)
    test_indices = tokens_to_indices(test_tokens, vocab)

    # Safety check: ensure all indices are valid
    max_idx = max(train_indices + valid_indices + test_indices)
    if max_idx >= len(vocab):
        print(f"Warning: Found index {max_idx} but vocab size is {len(vocab)}")
        print("Fixing indices...")
        train_indices = [min(idx, len(vocab) - 1) for idx in train_indices]
        valid_indices = [min(idx, len(vocab) - 1) for idx in valid_indices]
        test_indices = [min(idx, len(vocab) - 1) for idx in test_indices]

    # Convert to tensors and batchify - use CPU first for safety
    train_data = batchify(torch.tensor(train_indices, dtype=torch.long), batch_size)
    valid_data = batchify(torch.tensor(valid_indices, dtype=torch.long), batch_size)
    test_data = batchify(torch.tensor(test_indices, dtype=torch.long), batch_size)

    # Initialize model
    model = RNNModel(len(vocab), embed_dim, hidden_dim, dropout_prob).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
except Exception as e:
    import traceback
    print(f"An error occurred: {e}")
    traceback.print_exc()

Loading and preprocessing corpus...
Vocabulary size: 9999
Fixing indices...




In [10]:
# Training loop
try:
    best_val_ppl = float('inf')
    print("Starting training...")

    for epoch in range(1, num_epochs + 1):
        epoch_start_time = time.time()
        train(model, train_data, criterion, optimizer, seq_length)
        val_ppl = evaluate(model, valid_data, criterion, seq_length)
        
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {(time.time() - epoch_start_time):5.2f}s | '
              f'valid ppl {val_ppl:8.2f}')
        print('-' * 89)
        
        # Save the model if validation performance improves
        if val_ppl < best_val_ppl:
            best_val_ppl = val_ppl
            torch.save(model.state_dict(), 'best_rnn_model.pth')
            print(f"New best model saved with perplexity: {val_ppl:8.2f}")

    # Load best model and evaluate on test set
    try:
        model.load_state_dict(torch.load('best_rnn_model.pth'))
        test_ppl = evaluate(model, test_data, criterion, seq_length)
        print('=' * 89)
        print(f'| End of training | test ppl {test_ppl:8.2f}')
        print('=' * 89)
    except Exception as e:
        print(f"Error loading best model: {e}")
        print("Evaluating with current model instead.")
        test_ppl = evaluate(model, test_data, criterion, seq_length)
        print('=' * 89)
        print(f'| End of training | test ppl {test_ppl:8.2f}')
        print('=' * 89)

except Exception as e:
    import traceback
    print(f"An error occurred: {e}")
    traceback.print_exc()

Starting training...
| epoch   1 | batch    50/ 3263 | ms/batch  7.87 | loss  7.19 | ppl  1331.74
| epoch   1 | batch   100/ 3263 | ms/batch  4.24 | loss  6.38 | ppl   592.34
| epoch   1 | batch   150/ 3263 | ms/batch  4.20 | loss  6.19 | ppl   486.03
| epoch   1 | batch   200/ 3263 | ms/batch  4.12 | loss  6.08 | ppl   437.33
| epoch   1 | batch   250/ 3263 | ms/batch  4.24 | loss  5.96 | ppl   388.67
| epoch   1 | batch   300/ 3263 | ms/batch  4.22 | loss  5.88 | ppl   359.57
| epoch   1 | batch   350/ 3263 | ms/batch  4.31 | loss  5.88 | ppl   357.22
| epoch   1 | batch   400/ 3263 | ms/batch  4.30 | loss  5.88 | ppl   356.52
| epoch   1 | batch   450/ 3263 | ms/batch  4.25 | loss  5.86 | ppl   351.13
| epoch   1 | batch   500/ 3263 | ms/batch  4.15 | loss  5.78 | ppl   323.83
| epoch   1 | batch   550/ 3263 | ms/batch  4.25 | loss  5.80 | ppl   329.24
| epoch   1 | batch   600/ 3263 | ms/batch  4.30 | loss  5.70 | ppl   299.36
| epoch   1 | batch   650/ 3263 | ms/batch  4.20 | loss

  model.load_state_dict(torch.load('best_rnn_model.pth'))


| End of training | test ppl   107.55


In [11]:
# Function to generate text
def generate_text(model, vocab, seed_text="the", max_length=50):
    """Generate text using the trained model"""
    model.eval()
    
    # Create reverse vocab (index to token)
    idx_to_token = {idx: token for token, idx in vocab.items()}
    
    # Convert seed text to tensor
    if seed_text in vocab:
        input_idx = vocab[seed_text]
    else:
        input_idx = vocab['<unk>']
    
    input_tensor = torch.tensor([[input_idx]], device=device)
    hidden = model.init_hidden(1)
    
    generated_tokens = [seed_text]
    
    with torch.no_grad():
        for _ in range(max_length):
            output, hidden = model(input_tensor, hidden)
            
            # Sample from the output distribution
            probs = torch.softmax(output, dim=1)
            next_token_idx = torch.multinomial(probs, 1).item()
            
            # Add generated token to output
            generated_tokens.append(idx_to_token.get(next_token_idx, '<unk>'))
            
            # Update input for next iteration
            input_tensor = torch.tensor([[next_token_idx]], device=device)
    
    return ' '.join(generated_tokens)

# Generate and print some text
print("\nGenerated Text Sample:")
print(generate_text(model, vocab, seed_text="the", max_length=50))


Generated Text Sample:
the <unk> , while both <unk> the 2010 , <unk> was H. Wright of the state of 1994 . Despite the <unk> , the characters were noted that it was not come for level in <unk> , modern years , though she told is commentators considered mainly in its <unk> itself
