In [None]:
from utils import ModelConfig, TrainingConfig, StreamingTextDataset, Trainer, ToyTransformer
import json
import torch

# Model hyperparameters. model_type selects the architecture variant to
# experiment with; vocab_size matches the GPT-2 tokenizer.
model_config = ModelConfig(
    model_type='transformer_1L',
    vocab_size=50257,
    d_model=512,   # moderate width for experiments
    n_head=8,
    n_ctx=512,     # shorter context for faster training
    dropout=0.1,
)

# Optimisation schedule. batch_size should be tuned to the available GPU memory.
training_config = TrainingConfig(
    model_config=model_config,
    batch_size=16,
    learning_rate=3e-3,
    max_iters=10000,
    eval_interval=500,
    log_interval=50,
)

# Arguments shared by the train and validation streams. Both use the same
# corpus subset (the 10B-token sample; 'openwebtext' or 'EleutherAI/pile' are
# noted alternatives) and the same validation_ratio (0.1%) so the two splits
# are carved consistently.
dataset_kwargs = dict(
    dataset_name='HuggingFaceFW/fineweb',
    subset='sample-10BT',
    seq_length=model_config.n_ctx,
    validation_ratio=0.001,
)

# Create data loaders
print("Initializing datasets...")
# split='train' is expected to exclude the validation samples and
# split='validation' to keep only them (behaviour lives in StreamingTextDataset).
train_dataset = StreamingTextDataset(split='train', **dataset_kwargs)
val_dataset = StreamingTextDataset(split='validation', **dataset_kwargs)

Initializing datasets...


Resolving data files:   0%|          | 0/27468 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/27468 [00:00<?, ?it/s]

In [None]:

# Instantiate the network described by model_config and report its size
model = ToyTransformer(model_config)
print(f"Model type: {model_config.model_type}")
n_params = sum(param.numel() for param in model.parameters())
print(f"Parameters: {n_params:,}")


# The trainer owns the optimisation step for this model/config pair
trainer = Trainer(model, training_config)

# Training loop
#
# Each step pulls one batch from the training stream and hands it to the
# trainer. Every `log_interval` steps the training loss and learning rate are
# printed; every `eval_interval` steps (except step 0) the model is scored on
# 20 validation batches and a short text sample is generated.
print("Starting training...")
# `step` rather than `iter`, so the builtin iter() is not shadowed for the rest
# of the notebook.
for step in range(training_config.max_iters):
    # Get batch of real data
    x, y = train_dataset.get_batch(training_config.batch_size)

    # Train step
    loss, lr = trainer.train_step(x, y)

    # Logging
    if step % training_config.log_interval == 0:
        print(f"Iter {step}: loss={loss:.4f}, lr={lr:.6f}")

    # Evaluation
    if step % training_config.eval_interval == 0 and step > 0:
        # Switch to eval mode *before* measuring validation loss so dropout is
        # disabled, and skip autograd bookkeeping for the whole evaluation.
        model.eval()
        with torch.no_grad():
            val_losses = []
            for _ in range(20):  # Evaluate on 20 batches
                x_val, y_val = val_dataset.get_batch(training_config.batch_size)
                _, batch_loss = model(x_val, y_val)
                val_losses.append(batch_loss.item())
            # Plain-Python mean: numpy is never imported in this notebook.
            val_loss = sum(val_losses) / len(val_losses)
            print(f"Validation loss: {val_loss:.4f}")

            # Generate sample text, seeded with the single token id 0 and
            # placed on whatever device the model's parameters live on.
            device = next(model.parameters()).device
            context = torch.zeros((1, 1), dtype=torch.long, device=device)
            generated = model.generate(context, max_new_tokens=100, temperature=0.8)
            print(f"Sample generation: {train_dataset.tokenizer.decode(generated[0].tolist())}")
        model.train()

print("Training complete!")

# Persist the trained weights; the file name carries the architecture tag
checkpoint_path = f'toy_transformer_{model_config.model_type}.pt'
torch.save(model.state_dict(), checkpoint_path)
print(f"Model saved to {checkpoint_path}")

# Store the config's attribute dictionary next to the weights as JSON
model_config_dict = vars(model_config)
config_path = f'{model_config.model_type}_config.json'
with open(config_path, 'w') as config_file:
    json.dump(model_config_dict, config_file)

Model type: transformer_1L
Parameters: 29,933,632
Starting training...
Iter 0: loss=11.2547, lr=0.000000
Iter 50: loss=11.0478, lr=0.001500
Iter 100: loss=10.5238, lr=0.003000
Iter 150: loss=9.5576, lr=0.002999
Iter 200: loss=8.7601, lr=0.002997
Iter 250: loss=8.3081, lr=0.002994
Iter 300: loss=7.7128, lr=0.002989
Iter 350: loss=7.8214, lr=0.002983
Iter 400: loss=7.7710, lr=0.002975
Iter 450: loss=7.7964, lr=0.002966
Iter 500: loss=7.3637, lr=0.002956


Token indices sequence length is longer than the specified maximum sequence length for this model (1961 > 1024). Running this sequence through the model will result in indexing errors


Validation loss: 7.4487
Sample generation: !
’s is a lot of the day and juice Colleg Colleg Colleg of my own intensity. When 168 168, but the State superherootor, she was not only a lot of his life has a great way. The blues was much more than a result in this season. It is a little more than years 820. The architects, but alsoraped who have been the new theatrical work and she is going to be ancand the bursting ethanol I thought wasUnix Talks.
In the (< to
Iter 550: loss=7.7004, lr=0.002944
Iter 600: loss=7.5519, lr=0.002931
Iter 650: loss=8.2446, lr=0.002917
Iter 700: loss=7.0425, lr=0.002901
Iter 750: loss=6.7437, lr=0.002884
Iter 800: loss=7.0528, lr=0.002866
Iter 850: loss=6.9519, lr=0.002847
Iter 900: loss=6.7577, lr=0.002826
Iter 950: loss=6.7265, lr=0.002804
Iter 1000: loss=6.8433, lr=0.002781
Validation loss: 6.7015
Sample generation: ! We are at all times, the door and more.
When I do I'm not have a different areas. The first never had been found that very much with food in a

In [None]:
from utils import visualize_prediction_error

# Pull the next single-sequence batch from the training stream
x, y = train_dataset.get_batch(1)

# Decode with the tokenizer of the same dataset that produced the token ids,
# so ids and vocabulary cannot drift apart, then visualize the prediction error
tokenizer = train_dataset.tokenizer
text = tokenizer.decode(x[0].tolist())
visualize_prediction_error(model, tokenizer, text)

 on VHS, and even a letter penned by Holden expressing his apologies for not being able to attend the town's centennial celebration. Sitting atop powder-blue sheets on the baby bed is a copy of Holden's birth certificate.
Keller said he's unsure if any of Holden's kin has been back to O'Fallon to see the baby bed, but it's one the must-see items of visitors to the museum.
"Pictures with the bed," he said, "are very popular."
How to see it: 101 W. State St., O'Fallon
618-624-8409 or ofallonhistory.net
Contact reporter Sarah Baraba 618-344-0264, ext. 105FlirtySparks is one of the most user-friendly dating apps for singles seeking their soulmates. It is oriented directly toward making the matching process as simple as it can be, so all your dating needs are fulfilled.
Check out the options FlirtySparks has for you:
- Browse profiles of local charmers
- View lovely photos
- Chat with singles in friendly chat rooms
- Add users to your friend list
Install FlirtySparks and meet your bright fu

In [1]:
import torch
# Ask PyTorch whether a CUDA device can be used from this kernel
cuda = torch.cuda.is_available()
print(f"CUDA is available: {cuda}")

# Follow up with the name of device 0, or an explicit notice when there is none
if not cuda:
    print("CUDA is not available")
else:
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")

CUDA is available: True
CUDA device name: NVIDIA RTX A4000


In [None]:
import torch
# Report the installed PyTorch build and the CUDA version it was built with
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA built with PyTorch: {torch.version.cuda}")

PyTorch version: 2.8.0+cu126
CUDA built with PyTorch: 12.6


In [None]:
import os
# Show the CUDA-related environment variables seen by this kernel process,
# printing 'Not set' for any that are absent
for env_name in ('CUDA_VISIBLE_DEVICES', 'LD_LIBRARY_PATH'):
    print(f"{env_name}: {os.environ.get(env_name, 'Not set')}")

In [None]:
import os
# Drop CUDA_VISIBLE_DEVICES from this process's environment if it is present
os.environ.pop('CUDA_VISIBLE_DEVICES', None)

# Alternative: pin it explicitly to the first device instead
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# NOTE(review): if torch was already imported in this kernel, the import below
# is a no-op (the module is cached), and the change above presumably only takes
# effect if CUDA has not been initialised yet -- restart the kernel and run
# this cell first to be sure.
import torch
print(torch.cuda.is_available())