In [None]:
%%capture
!pip install torch transformers datasets tokenizers wandb tqdm numpy huggingface-hub accelerate gradio

In [None]:
# Cell 2 - Imports and Setup
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import get_cosine_schedule_with_warmup
import wandb
from tqdm.auto import tqdm
import numpy as np
from typing import Optional, List
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
import os
import gc
from contextlib import contextmanager
import jsonpointer
import json

# Clear GPU memory and cache
torch.cuda.empty_cache()
gc.collect()

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

@contextmanager
def nullcontext():
    yield

Using device: cuda


In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, max_seq_length: int = 512):
        super().__init__()
        position = torch.arange(max_seq_length).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_seq_length, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """x: [batch_size, seq_len, d_model]"""
        return x + self.pe[:, :x.size(1), :]

class DecoderBlock(nn.Module):
    def __init__(self, d_model: int, n_heads: int, d_ff: int = 2048, dropout: float = 0.1):
        super().__init__()
        self.self_attention = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)
        self.norm1 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output, _ = self.self_attention(x, x, x, attn_mask=mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.ff(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

class TransformerDecoder(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 d_model: int = 1024,
                 n_layers: int = 12,
                 n_heads: int = 16,
                 d_ff: int = 4096,
                 max_seq_length: int = 256,
                 dropout: float = 0.1):
        super().__init__()

        self.max_seq_length = max_seq_length
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        self.dropout = nn.Dropout(dropout)

        self.layers = nn.ModuleList([
            DecoderBlock(d_model, n_heads, d_ff, dropout)
            for _ in range(n_layers)
        ])

        self.final_layer = nn.Linear(d_model, vocab_size)
        self._init_weights()

    def _init_weights(self):
        nn.init.normal_(self.token_embedding.weight, mean=0.0, std=0.01)

        for layer in self.layers:
            nn.init.normal_(layer.self_attention.in_proj_weight, mean=0.0, std=0.01)
            nn.init.normal_(layer.self_attention.out_proj.weight, mean=0.0, std=0.01)

            for name, param in layer.ff.named_parameters():
                if 'weight' in name:
                    nn.init.normal_(param, mean=0.0, std=0.01)
                elif 'bias' in name:
                    nn.init.zeros_(param)

        nn.init.normal_(self.final_layer.weight, mean=0.0, std=0.01)
        nn.init.zeros_(self.final_layer.bias)

    def forward(self, x, mask=None):
        # Create causal mask if not provided
        if mask is None:
            seq_length = x.size(1)
            mask = torch.triu(torch.ones(seq_length, seq_length), diagonal=1).bool()
            mask = mask.to(x.device)

        x = self.token_embedding(x)
        x = x.transpose(0, 1)  # Convert to sequence-first format
        x = self.positional_encoding(x)
        x = self.dropout(x)
        x = x.transpose(0, 1)  # Convert back to batch-first

        for layer in self.layers:
            x = layer(x, mask=mask)

        output = self.final_layer(x)
        return output

    @classmethod
    def from_pretrained(cls, model_path: str, device: str = 'cpu'):
        """Load a pretrained model from a directory"""
        try:
            # Load config
            config_path = os.path.join(model_path, "config.json")
            if not os.path.exists(config_path):
                raise FileNotFoundError(f"Config not found at {config_path}")

            with open(config_path) as f:
                config = json.load(f)

            # Create model instance
            model = cls(
                vocab_size=config['vocab_size'],
                d_model=config['d_model'],
                n_layers=config['n_layers'],
                n_heads=config['n_heads'],
                d_ff=config['d_ff'],
                max_seq_length=config['max_seq_length'],
                dropout=config.get('dropout', 0.1)
            )

            # Load weights
            weights_path = os.path.join(model_path, "pytorch_model.bin")
            if not os.path.exists(weights_path):
                raise FileNotFoundError(f"Weights not found at {weights_path}")

            state_dict = torch.load(weights_path, map_location=device)
            model.load_state_dict(state_dict)

            return model.to(device)

        except Exception as e:
            raise Exception(f"Error loading model from {model_path}: {str(e)}")

    def generate(self, tokenizer, prompt, max_length=100, temperature=0.5, device='cpu', top_k=20):
      """Generate text from a prompt"""
      self.eval()

      # Tokenize the prompt
      tokens = tokenizer.encode(prompt).ids
      input_ids = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)

      with torch.no_grad():
          for _ in range(max_length):
              # Forward pass to get logits
              outputs = self(input_ids)
              next_token_logits = outputs[:, -1, :].float()

              # Scale logits by temperature
              next_token_logits = next_token_logits / temperature

              # Get the top_k tokens and their probabilities
              probs = torch.softmax(next_token_logits, dim=-1)
              probs, indices = torch.topk(probs, top_k, dim=-1)

              # Normalize probabilities
              probs = probs / torch.sum(probs, dim=-1, keepdim=True)

              # Sample from the top_k probabilities
              next_token = indices[0, torch.multinomial(probs[0], num_samples=1).item()].unsqueeze(0)

              # Append the sampled token to the input sequence
              input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)

              # Stop if the EOS token is generated
              if next_token.item() == tokenizer.token_to_id("[EOS]"):
                  break

      # Decode the generated sequence
      return tokenizer.decode(input_ids[0].tolist())




In [None]:
class TextDataset(Dataset):
    def __init__(self, texts: List[str], tokenizer, max_length: int = 256, stride: int = 128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        print("Encoding texts...")
        self.encoded_texts = self._encode_texts(texts)

    def _encode_texts(self, texts):
        encoded = []
        for text in texts:
            tokens = self.tokenizer.encode(text).ids
            if len(tokens) < self.max_length:
                tokens = tokens + [self.tokenizer.token_to_id("[PAD]")] * (self.max_length - len(tokens))
            else:
                tokens = tokens[:self.max_length]
            encoded.append(tokens)
        return encoded

    def __len__(self):
        return len(self.encoded_texts)

    def __getitem__(self, idx):
        return torch.tensor(self.encoded_texts[idx])

def create_tokenizer(texts, vocab_size=50000):
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["[PAD]", "[UNK]", "[BOS]", "[EOS]"],
        min_frequency=2
    )
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train_from_iterator(texts, trainer)
    return tokenizer

def save_tokenizer(tokenizer, path: str):
    """Save tokenizer to disk"""
    tokenizer.save(f"{path}/tokenizer.json")

def load_tokenizer(path: str):
    """Load tokenizer from disk"""
    return Tokenizer.from_file(f"{path}/tokenizer.json")

In [None]:
class TransformerTrainer:
    def __init__(
        self,
        model,
        train_dataloader,
        val_dataloader: Optional = None,
        lr: float = 3e-4,
        device = None,
        gradient_accumulation_steps: int = 32
    ):
        self.device = device if device is not None else torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Training on device: {self.device}")

        self.model = model.to(self.device)
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.gradient_accumulation_steps = gradient_accumulation_steps

        # Optimizer setup
        self.optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=lr,
            weight_decay=0.01,
            eps=1e-8,
            betas=(0.9, 0.95)
        )

        # Learning rate scheduler
        num_training_steps = len(train_dataloader) * 50 // gradient_accumulation_steps
        num_warmup_steps = num_training_steps // 5

        self.scheduler = get_cosine_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )

        self.train_losses = []
        self.best_loss = float('inf')
        self.scaler = torch.amp.GradScaler('cuda') if self.device.type == 'cuda' else None

    def _compute_loss(self, batch):
      x = batch.to(self.device)

      # Create causal mask
      seq_length = x.size(1)
      mask = (torch.triu(torch.ones(seq_length, seq_length)) == 1).transpose(0, 1)
      mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
      mask = mask.to(self.device)

      with torch.amp.autocast('cuda') if self.device.type == 'cuda' else nullcontext():
          logits = self.model(x, mask=mask)
          shift_logits = logits[:, :-1, :].contiguous()
          shift_labels = x[:, 1:].contiguous()

          # Fix: Use -100 as default ignore_index for padding
          loss = F.cross_entropy(
              shift_logits.view(-1, shift_logits.size(-1)),
              shift_labels.view(-1),
              label_smoothing=0.1,
              ignore_index=-100  # Fixed: Use -100 as default padding index
          )

      return loss

    def save_checkpoint(self, epoch, loss, path='checkpoints'):
        os.makedirs(path, exist_ok=True)
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'loss': loss,
            'train_losses': self.train_losses
        }
        torch.save(checkpoint, f'{path}/checkpoint_epoch_{epoch}.pt')

    def train(self, epochs=50):
        print("\nStarting training...")
        wandb.init(project="transformer-training")

        for epoch in range(epochs):
            self.model.train()
            epoch_loss = 0

            pbar = tqdm(total=len(self.train_dataloader),
                       desc=f"Epoch {epoch+1}/{epochs}",
                       leave=True)

            running_loss = 0
            self.optimizer.zero_grad()

            for batch_idx, batch in enumerate(self.train_dataloader, 1):
                try:
                    loss = self._compute_loss(batch)
                    scaled_loss = loss / self.gradient_accumulation_steps

                    self.scaler.scale(scaled_loss).backward()

                    if batch_idx % self.gradient_accumulation_steps == 0:
                        self.scaler.unscale_(self.optimizer)
                        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)

                        self.scaler.step(self.optimizer)
                        self.scaler.update()
                        self.scheduler.step()
                        self.optimizer.zero_grad()

                        running_loss += loss.item()
                        avg_loss = running_loss / self.gradient_accumulation_steps
                        self.train_losses.append(avg_loss)

                        wandb.log({
                            'loss': avg_loss,
                            'learning_rate': self.scheduler.get_last_lr()[0]
                        })

                        pbar.set_postfix({
                            'loss': f'{avg_loss:.4f}',
                            'lr': f'{self.scheduler.get_last_lr()[0]:.2e}'
                        })
                        pbar.update(self.gradient_accumulation_steps)

                        epoch_loss += running_loss
                        running_loss = 0

                        if batch_idx % 500 == 0:
                            torch.cuda.empty_cache()

                except RuntimeError as e:
                    if "out of memory" in str(e):
                        print(f"\nOOM error in batch {batch_idx}. Skipping...")
                        if hasattr(torch.cuda, 'empty_cache'):
                            torch.cuda.empty_cache()
                        continue
                    else:
                        raise e

            avg_epoch_loss = epoch_loss / len(self.train_dataloader)
            print(f"\nEpoch {epoch+1} Summary:")
            print(f"Average Loss: {avg_epoch_loss:.4f}")
            print(f"Learning Rate: {self.scheduler.get_last_lr()[0]:.2e}")

            if avg_epoch_loss < self.best_loss:
                self.best_loss = avg_epoch_loss
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'loss': avg_epoch_loss,
                }, 'best_model.pt')
                print("► New best model saved!")

            if avg_epoch_loss < 0.099999:
                print(f"\n✓ Reached target loss!")
                break

In [None]:
print("Loading data...")
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
texts = [t.strip() for t in text.split('\n') if t.strip()]
print(f"Loaded {len(texts)} text segments")

print("\nCreating tokenizer...")
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(
    vocab_size=50000,
    special_tokens=["[PAD]", "[UNK]", "[BOS]", "[EOS]"],
)
tokenizer.pre_tokenizer = Whitespace()
tokenizer.train_from_iterator(texts, trainer)
print(f"Vocabulary size: {tokenizer.get_vocab_size()}")

# Save tokenizer
print("Saving tokenizer...")
tokenizer.save("tokenizer.json")
print(f"Vocabulary size: {tokenizer.get_vocab_size()}")

print("\nTesting tokenizer...")
test_text = texts[0][:100]
encoded = tokenizer.encode(test_text)
decoded = tokenizer.decode(encoded.ids)
print(f"Original: {test_text}")
print(f"Decoded : {decoded}")

print("\nCreating dataset...")
dataset = TextDataset(texts, tokenizer, max_length=256, stride=128)
print(f"Created dataset with {len(dataset)} sequences")

# Create train/val split
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print("\nCreating dataloaders...")
train_loader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    pin_memory=True
)

print("\nInitializing model...")
model = TransformerDecoder(
    vocab_size=tokenizer.get_vocab_size(),
    d_model=1024,
    n_layers=12,
    n_heads=16,
    d_ff=4096,
    max_seq_length=256
)

total_params = sum(p.numel() for p in model.parameters())
print(f"Model initialized with {total_params:,} parameters")
print("Verifying random initialization...")
print(f"Embedding mean: {model.token_embedding.weight.mean().item():.4f}")
print(f"Embedding std: {model.token_embedding.weight.std().item():.4f}")

trainer = TransformerTrainer(
    model=model,
    train_dataloader=train_loader,
    val_dataloader=None,
    lr=3e-4,
    device=device,
    gradient_accumulation_steps=16
)

# Start training
trainer.train(epochs=10)

Loading data...
Loaded 32777 text segments

Creating tokenizer...
Vocabulary size: 18150
Saving tokenizer...
Vocabulary size: 18150

Testing tokenizer...
Original: First Citizen:
Decoded : First Citizen :

Creating dataset...
Encoding texts...
Created dataset with 32777 sequences

Creating dataloaders...

Initializing model...
Model initialized with 188,344,038 parameters
Verifying random initialization...
Embedding mean: 0.0000
Embedding std: 0.0100
Training on device: cuda

Starting training...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch 1/10:   0%|          | 0/8195 [00:00<?, ?it/s]


Epoch 1 Summary:
Average Loss: 0.1356
Learning Rate: 3.00e-05
► New best model saved!


Epoch 2/10:   0%|          | 0/8195 [00:00<?, ?it/s]


Epoch 2 Summary:
Average Loss: 0.0913
Learning Rate: 6.00e-05
► New best model saved!

✓ Reached target loss!


In [None]:
# Cell 8 - Verify Training Results
import os

def verify_training():
    """Verify that training completed successfully and files exist"""
    print("Verifying training results...")

    # 1. Check if best_model.pt exists and load it
    if not os.path.exists('best_model.pt'):
        raise FileNotFoundError("best_model.pt not found! Training may not have completed.")

    checkpoint = torch.load('best_model.pt', map_location='cpu')
    print(f"\n✓ Found best_model.pt")
    print(f"  • Best loss: {checkpoint['loss']:.4f}")
    print(f"  • Saved at epoch: {checkpoint['epoch']}")

    # 2. Check if tokenizer.json exists
    if not os.path.exists('tokenizer.json'):
        raise FileNotFoundError("tokenizer.json not found! Tokenizer was not saved.")

    # Load tokenizer to verify it works
    test_tokenizer = Tokenizer.from_file('tokenizer.json')
    vocab_size = test_tokenizer.get_vocab_size()
    print(f"\n✓ Found tokenizer.json")
    print(f"  • Vocabulary size: {vocab_size}")

    # 3. Test model generation
    print("\nTesting model generation...")
    model = TransformerDecoder(
        vocab_size=vocab_size,
        d_model=1024,
        n_layers=12,
        n_heads=16,
        d_ff=4096,
        max_seq_length=256
    )
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    test_prompt = "First Citizen"
    try:
        generated = model.generate(test_tokenizer, test_prompt, max_length=100,top_k=50,device='cpu')
        print(f"\n✓ Model generation test successful")
        print(f"  • Input: {test_prompt}")
        print(f"  • Output: {generated}")
        return True
    except Exception as e:
        print(f"\n❌ Model generation test failed: {str(e)}")
        return False

# Run verification
is_ready = verify_training()

if is_ready:
    print("\n✅ All checks passed! Ready for deployment.")
else:
    print("\n❌ Please fix the issues before deploying.")

Verifying training results...


  checkpoint = torch.load('best_model.pt', map_location='cpu')



✓ Found best_model.pt
  • Best loss: 0.0913
  • Saved at epoch: 1

✓ Found tokenizer.json
  • Vocabulary size: 18150

Testing model generation...

✓ Model generation test successful
  • Input: First Citizen
  • Output: First Citizen :

✅ All checks passed! Ready for deployment.


In [None]:
# Cell 8 - Verify and Deploy
import os
import json
from huggingface_hub import HfApi, create_repo, login
import shutil

def prepare_and_deploy(username="ninagala", model_repo_name="shakespeare-model", space_repo_name="shakespeare-app"):
    """Prepare and deploy model and demo"""
    print("Starting deployment process...")

    # Create directories
    os.makedirs("model_repo", exist_ok=True)
    os.makedirs("space_repo", exist_ok=True)

    try:
        # 1. Verify files exist
        if not os.path.exists('best_model.pt'):
            raise FileNotFoundError("best_model.pt not found!")
        if not os.path.exists('tokenizer.json'):
            raise FileNotFoundError("tokenizer.json not found!")

        # 2. Prepare model files
        print("\nPreparing model files...")

        # Load checkpoint
        checkpoint = torch.load('best_model.pt', map_location='cpu', weights_only=True)

        # Save config
        config = {
            'vocab_size': tokenizer.get_vocab_size(),
            'd_model': 1024,
            'n_layers': 12,
            'n_heads': 16,
            'd_ff': 4096,
            'max_seq_length': 256,
            'dropout': 0.1,
            'best_loss': checkpoint['loss']
        }

        with open("model_repo/config.json", "w") as f:
            json.dump(config, f, indent=2)

        # Save model weights
        torch.save(checkpoint['model_state_dict'], "model_repo/pytorch_model.bin")

        # Copy tokenizer
        shutil.copy('tokenizer.json', "model_repo/tokenizer.json")

        # 3. Create Gradio demo files
        print("Creating Gradio files...")

        # Update app.py with correct model path
        with open("space_repo/app.py", "w") as f:
            f.write(f"""
import gradio as gr
import torch
import torch.nn as nn
import torch.nn.functional as F
from tokenizers import Tokenizer
from huggingface_hub import hf_hub_download
import os
import json
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, max_seq_length: int = 512):
        super().__init__()
        position = torch.arange(max_seq_length).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_seq_length, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class DecoderBlock(nn.Module):
    def __init__(self, d_model: int, n_heads: int, d_ff: int = 2048, dropout: float = 0.1):
        super().__init__()
        self.self_attention = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)
        self.norm1 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output, _ = self.self_attention(x, x, x, attn_mask=mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.ff(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

class TransformerDecoder(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 d_model: int = 1024,
                 n_layers: int = 12,
                 n_heads: int = 16,
                 d_ff: int = 4096,
                 max_seq_length: int = 256,
                 dropout: float = 0.1):
        super().__init__()

        self.max_seq_length = max_seq_length
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        self.dropout = nn.Dropout(dropout)

        self.layers = nn.ModuleList([
            DecoderBlock(d_model, n_heads, d_ff, dropout)
            for _ in range(n_layers)
        ])

        self.final_layer = nn.Linear(d_model, vocab_size)
        self._init_weights()

    def _init_weights(self):
        nn.init.normal_(self.token_embedding.weight, mean=0.0, std=0.01)

        for layer in self.layers:
            nn.init.normal_(layer.self_attention.in_proj_weight, mean=0.0, std=0.01)
            nn.init.normal_(layer.self_attention.out_proj.weight, mean=0.0, std=0.01)

            for name, param in layer.ff.named_parameters():
                if 'weight' in name:
                    nn.init.normal_(param, mean=0.0, std=0.01)
                elif 'bias' in name:
                    nn.init.zeros_(param)

        nn.init.normal_(self.final_layer.weight, mean=0.0, std=0.01)
        nn.init.zeros_(self.final_layer.bias)

    def forward(self, x, mask=None):
        # Create causal mask if not provided
        if mask is None:
            seq_length = x.size(1)
            mask = torch.triu(torch.ones(seq_length, seq_length), diagonal=1).bool()
            mask = mask.to(x.device)

        x = self.token_embedding(x)
        x = x.transpose(0, 1)  # Convert to sequence-first format
        x = self.positional_encoding(x)
        x = self.dropout(x)
        x = x.transpose(0, 1)  # Convert back to batch-first

        for layer in self.layers:
            x = layer(x, mask=mask)

        output = self.final_layer(x)
        return output

    @classmethod
    def from_pretrained(cls, model_id: str, device: str = 'cpu'):
        config_file = hf_hub_download(repo_id=model_id, filename="config.json")
        with open(config_file) as f:
            config = json.load(f)
        model = cls(
            vocab_size=config['vocab_size'],
            d_model=config['d_model'],
            n_layers=config['n_layers'],
            n_heads=config['n_heads'],
            d_ff=config['d_ff'],
            max_seq_length=config['max_seq_length'],
            dropout=config.get('dropout', 0.1)
        )
        weights_file = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin")
        state_dict = torch.load(weights_file, map_location=device)
        model.load_state_dict(state_dict)
        return model.to(device)


def generate_text(prompt, max_length=100, temperature=0.7):
    try:
        # Load model and tokenizer from Hugging Face Hub
        model_id = "{username}/{model_repo_name}"
        tokenizer_file = hf_hub_download(repo_id=model_id, filename="tokenizer.json")

        model = TransformerDecoder.from_pretrained(model_id)
        tokenizer = Tokenizer.from_file(tokenizer_file)

        # Generate text
        model.eval()
        tokens = tokenizer.encode(prompt).ids
        input_ids = torch.tensor(tokens).unsqueeze(0)

        with torch.no_grad():
            for _ in range(max_length):
                outputs = model(input_ids)
                next_token_logits = outputs[:, -1, :] / temperature
                probs = F.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
                input_ids = torch.cat([input_ids, next_token], dim=1)

                if next_token.item() == tokenizer.token_to_id("[EOS]"):
                    break

        return tokenizer.decode(input_ids[0].tolist())
    except Exception as e:
        return f"Error: {{str(e)}}"

# Create Gradio interface
demo = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=3, placeholder="Enter your prompt here...", label="Prompt"),
        gr.Slider(20, 200, value=100, step=1, label="Maximum Length"),
        gr.Slider(0.1, 2.0, value=0.7, label="Temperature")
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="Shakespeare Text Generator",
    description="Generate Shakespeare-style text using a transformer decoder.",
    examples=[
        ["To be, or not to be"],
        ["Friends, Romans, countrymen"],
        ["Now is the winter of our discontent"]
    ]
)

if __name__ == "__main__":
    demo.launch()
""")

        # Create requirements.txt
        with open("space_repo/requirements.txt", "w") as f:
            f.write("""
torch>=2.0.0
transformers>=4.30.0
gradio>=3.40.0
tokenizers>=0.14.0
huggingface-hub>=0.16.4
""".strip())

        # 4. Deploy to Hugging Face
        print("\nDeploying to Hugging Face...")

        # Login
        HF_TOKEN = input("Enter your Hugging Face token: ")
        login(token=HF_TOKEN)

        api = HfApi()

        # Push model
        print("\nPushing model...")
        model_repo = f"{username}/{model_repo_name}"
        create_repo(model_repo, exist_ok=True)
        api.upload_folder(
            folder_path="model_repo",
            repo_id=model_repo,
            repo_type="model"
        )
        print(f"✓ Model pushed to: https://huggingface.co/{model_repo}")

        # Push space
        print("\nPushing Gradio demo...")
        space_repo = f"{username}/{space_repo_name}"
        create_repo(space_repo, repo_type="space", space_sdk="gradio", exist_ok=True)
        api.upload_folder(
            folder_path="space_repo",
            repo_id=space_repo,
            repo_type="space"
        )
        print(f"✓ Demo pushed to: https://huggingface.co/spaces/{space_repo}")

    except Exception as e:
        print(f"\n❌ Error during deployment: {str(e)}")
        raise e
    finally:
        # Cleanup
        shutil.rmtree("model_repo", ignore_errors=True)
        shutil.rmtree("space_repo", ignore_errors=True)

# Execute deployment
prepare_and_deploy(
    username="ninagala",
    model_repo_name="shakespeare-model-decoder",  # Updated name
    space_repo_name="shakespeare-decoder-app"     # Updated name
)

# https://huggingface.co/ninagala/shakespeare-model-decoder/tree/main

Starting deployment process...

Preparing model files...
Creating Gradio files...

Deploying to Hugging Face...
Enter your Hugging Face token: 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

Pushing model...


No files have been modified since last commit. Skipping to prevent empty commit.


✓ Model pushed to: https://huggingface.co/ninagala/shakespeare-model-decoder

Pushing Gradio demo...
✓ Demo pushed to: https://huggingface.co/spaces/ninagala/shakespeare-decoder-app
