In [1]:
from dataset import TinyShakespeareDataset
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt

from diffusion_transformer import TextDiffusionModel
from loss import DiffusionLoss

# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Corpus read from input.txt, requested with a sequence length of 16.
dataset = TinyShakespeareDataset("input.txt", seq_len=16)

# Shuffled mini-batches of 32 examples each.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
)

In [3]:
# Pull the first item the dataset yields so one sample can be inspected by eye.
next(iter(dataset))

(tensor([17, 18, 27, 30, 17,  1, 35, 17,  1, 28, 30, 27, 15, 17, 17, 16]),
 tensor([18, 21, 30, 31, 32,  1, 15, 21, 32, 21, 38, 17, 26, 10,  0, 14]))

In [4]:
# Hyperparameters
# NOTE(review): hidden_dim and num_layers are defined here but are not passed
# to the TextDiffusionModel constructor below — confirm whether they are needed.
num_iterations = 20  # number of iterative refinement steps (4th constructor argument)
nhead = 4  # number of attention heads (5th constructor argument)
num_layers = 4
max_seq_len = 16  # maximum sequence length
hidden_dim = 64
embedding_dim = 64  # embedding width
vocab_size = dataset.vocab_size  # vocabulary size as reported by the dataset

# Build the model and move it to the selected device.
# Constructor signature as noted by the original author (verify against
# diffusion_transformer): vocab_size, embed_dim, max_seq_length, num_steps, num_heads=8
model = TextDiffusionModel(
    vocab_size,
    embedding_dim,
    max_seq_len,
    num_iterations,
    nhead,
).to(device)

# Show the layer layout of the freshly built model.
print(model)

TextDiffusionModel(
  (embedding): Embedding(39, 64)
  (transformer_encoder): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
    )
    (linear1): Linear(in_features=64, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=64, bias=True)
    (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (guided_attention): GuidedAttentionLayer(
    (multihead_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
    )
  )
  (noise_predictor): Linear(in_features=64, out_features=64, bias=True)
)


In [5]:

# Loss object; it receives the model itself at construction time.
criterion = DiffusionLoss(model)

# Adam over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)


In [7]:
# Shape of one input batch. Drawn directly from the dataloader so this cell
# works on a fresh kernel instead of relying on `batch` leaking out of the
# training loop in a later cell.
next(iter(dataloader))[0].shape

torch.Size([32, 16])

In [6]:
losses = []  # average loss per epoch, consumed by the plotting code below
num_epochs = 10
num_steps = 20  # exclusive upper bound for the sampled timesteps
batch_size = 32


for epoch in range(num_epochs):
    # generate_sequence() switches the model to eval mode; make sure training
    # always runs in train mode even if cells are re-executed out of order.
    model.train()
    epoch_loss = 0
    for batch, _ in tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        batch = batch.to(device)

        # Sample one random timestep per sequence. Use the actual batch length
        # rather than the configured batch_size so that a batch of a different
        # size (e.g. a short final batch) still gets a matching `t`.
        t = torch.randint(0, num_steps, (batch.size(0),), device=batch.device)

        # Clear stale gradients before the forward/backward pass.
        optimizer.zero_grad()

        # Forward pass: the model returns the predicted and the actual noise.
        predicted_noise, actual_noise = model(batch, t)

        loss = criterion(predicted_noise, actual_noise)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = epoch_loss / len(dataloader)
    losses.append(avg_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

# Draw the per-epoch average training loss as a single line chart.
fig, ax = plt.subplots(figsize=(10, 5))
epochs = range(1, num_epochs + 1)
ax.plot(epochs, losses)
ax.set_title("Training Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Average Loss")
plt.show()


Epoch 1/10:   0%|          | 0/34856 [00:00<?, ?it/s]

torch.Size([32, 16, 64]) torch.Size([32, 16, 64]) torch.Size([32, 1, 1])





AssertionError: query should be unbatched 2D or batched 3D tensor but received 4-D query tensor

In [None]:
import torch

def generate_sequence(model, input, seq_len, num_iterations, device=None):
    """Iteratively refine a prompt with the model and return the resulting token ids.

    Args:
        model: The trained model; it is called once per iteration on the
            current token ids.
        input: Prompt string. Every character must be a key of
            ``dataset.char_to_idx`` (a ``KeyError`` is raised otherwise).
        seq_len: Accepted for interface compatibility; currently unused.
        num_iterations: Number of times the model is applied.
        device: Device to run on. When ``None`` (default) the device of the
            model's parameters is used, falling back to CPU for a model
            without parameters.

    Returns:
        A CPU tensor holding the result of the last iteration with its
        leading batch dimension of size 1 squeezed away.
    """
    if device is None:
        # Follow the model instead of assuming CUDA, so generation also works
        # on CPU-only machines.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()

    with torch.no_grad():
        # Encode the prompt as a single-sequence batch of token ids.
        x = torch.tensor(
            [dataset.char_to_idx[char] for char in input], dtype=torch.long
        ).unsqueeze(0).to(device)

        # NOTE(review): the training loop in this notebook calls
        # `model(batch, t)` and unpacks two return values, whereas here the
        # model is called with token ids only and its output is treated as
        # per-token scores — confirm the model's forward signature.
        for _ in range(num_iterations):
            x = model(x)
            # Collapse the last dimension to the index of its largest entry.
            x = torch.argmax(x, dim=-1)

        return x.squeeze(0).cpu()

# Example usage:
seq_len = 100  # Desired length of the generated sequence (not used by generate_sequence's body)
# NOTE(review): this rebinds `num_iterations`, which the hyperparameter cell
# also sets (to 20) before building the model — confirm 1000 is intended here.
num_iterations = 1000  # Number of refinement passes run by generate_sequence

# Generate a sequence on the device selected at the top of the notebook rather
# than a hard-coded 'cuda', so this cell also runs on CPU-only machines.
generated_tokens = generate_sequence(model, "oh no romeo", seq_len, num_iterations, device=device)

# Convert tokens to characters
# Assuming `dataset.idx_to_char` maps indices to characters
generated_text = ''.join(dataset.idx_to_char[token.item()] for token in generated_tokens.cpu())


In [None]:
# Display the value returned by generate_sequence for inspection.
generated_tokens

tensor([[ 0.7485,  2.1508, -2.1154, -5.1974, -4.9926, -1.0194, -0.0119, -2.2634,
         -0.6855, -4.8638, -0.6134, -1.6207, -1.9330,  1.1898, -0.1320,  0.0816,
          0.6501,  1.6042,  0.0556, -0.2745,  1.0356,  1.0558, -2.8067, -0.7493,
          0.6877,  0.1805,  0.9682,  1.4411, -0.3341, -2.7760,  1.1259,  1.0783,
          1.3638,  0.5503, -0.7214,  0.1840, -3.4219,  0.2789, -3.3638],
        [ 0.7485,  2.1508, -2.1154, -5.1974, -4.9926, -1.0194, -0.0119, -2.2634,
         -0.6855, -4.8638, -0.6134, -1.6207, -1.9330,  1.1898, -0.1320,  0.0816,
          0.6501,  1.6042,  0.0556, -0.2745,  1.0356,  1.0558, -2.8067, -0.7493,
          0.6877,  0.1805,  0.9682,  1.4411, -0.3341, -2.7760,  1.1259,  1.0783,
          1.3638,  0.5503, -0.7214,  0.1840, -3.4219,  0.2789, -3.3638],
        [ 0.7485,  2.1508, -2.1154, -5.1974, -4.9926, -1.0194, -0.0119, -2.2634,
         -0.6855, -4.8638, -0.6134, -1.6207, -1.9330,  1.1898, -0.1320,  0.0816,
          0.6501,  1.6042,  0.0556, -0.2745,

In [None]:
# Map every generated token id back to its character and print the decoded string.
print(
    "".join(
        dataset.idx_to_char[token_id.item()]
        for token_id in generated_tokens
    )
)

           
