# 🔍 Diagnostic: Why 0% Accuracy with Near-Zero Loss?

Something is wrong. Let's debug step by step.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random

# Seed every random number generator the notebook relies on (PyTorch, NumPy,
# and the stdlib) with the same value so repeated runs are comparable.
SEED = 42
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    _seed_fn(SEED)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Device: {device}")

In [None]:
# ============================================================
# MINIMAL TEST DATA
# ============================================================

# Five (command, expected action sequence) pairs: three single verbs plus
# two "twice" variants whose output repeats the action.
EXAMPLES = [
    ('walk', 'WALK'), ('run', 'RUN'), ('jump', 'JUMP'),
    ('walk twice', 'WALK WALK'), ('run twice', 'RUN RUN'),
]

# Show the data set once so the later diagnostics have something to compare to.
print("Training examples:")
print("\n".join(f"  '{cmd}' ‚Üí '{out}'" for cmd, out in EXAMPLES))

In [None]:
# ============================================================
# VOCABULARY
# ============================================================

class Vocabulary:
    """Bidirectional word <-> integer-id mapping with four reserved tokens.

    Ids 0-3 are always ``<PAD>``, ``<SOS>``, ``<EOS>`` and ``<UNK>``; every
    other word receives the next free id the first time it is added.
    """

    def __init__(self):
        reserved = ('<PAD>', '<SOS>', '<EOS>', '<UNK>')
        self.word2idx = {token: i for i, token in enumerate(reserved)}
        self.idx2word = dict(enumerate(reserved))
        self.n_words = len(reserved)

    def add_sentence(self, sentence):
        """Register every whitespace-separated word of *sentence* not seen before."""
        for token in sentence.split():
            if token in self.word2idx:
                continue
            new_id = self.n_words
            self.word2idx[token] = new_id
            self.idx2word[new_id] = token
            self.n_words = new_id + 1

    def encode(self, sentence, add_sos=False, add_eos=True):
        """Return the list of ids for *sentence*.

        Unknown words map to the ``<UNK>`` id. ``<SOS>`` is prepended when
        *add_sos* is true and ``<EOS>`` is appended when *add_eos* is true.
        """
        unk_id = self.word2idx['<UNK>']
        ids = [self.word2idx['<SOS>']] if add_sos else []
        ids += [self.word2idx.get(token, unk_id) for token in sentence.split()]
        if add_eos:
            ids.append(self.word2idx['<EOS>'])
        return ids

    def decode(self, indices):
        """Turn ids (plain ints or 0-d tensors) back into a space-joined string.

        Decoding stops at the first ``<EOS>``; ``<PAD>`` and ``<SOS>`` are
        dropped, and ids with no known word are rendered as ``<UNK>``.
        """
        eos_id = self.word2idx['<EOS>']
        silent_ids = (self.word2idx['<PAD>'], self.word2idx['<SOS>'])

        pieces = []
        for raw in indices:
            token_id = raw.item() if isinstance(raw, torch.Tensor) else raw
            if token_id == eos_id:
                break
            if token_id in silent_ids:
                continue
            pieces.append(self.idx2word.get(token_id, '<UNK>'))
        return ' '.join(pieces)

# One vocabulary per side: lower-cased commands feed the source vocabulary,
# action strings feed the target vocabulary.
src_vocab, tgt_vocab = Vocabulary(), Vocabulary()

for command, actions in EXAMPLES:
    src_vocab.add_sentence(command.lower())
    tgt_vocab.add_sentence(actions)

# Dump both mappings so the token ids seen in later cells can be read by eye.
print(f"Source vocab ({src_vocab.n_words}): {src_vocab.word2idx}")
print(f"Target vocab ({tgt_vocab.n_words}): {tgt_vocab.word2idx}")

In [None]:
# ============================================================
# TEST ENCODING/DECODING
# ============================================================

# Round-trip every example through encode() and decode(); the target side
# must come back unchanged, otherwise later accuracy numbers are meaningless.
print("Testing encode/decode:")
for cmd, out in EXAMPLES:
    src_enc, tgt_enc = src_vocab.encode(cmd.lower()), tgt_vocab.encode(out)
    src_dec, tgt_dec = src_vocab.decode(src_enc), tgt_vocab.decode(tgt_enc)

    print(f"  '{cmd}' ‚Üí {src_enc} ‚Üí '{src_dec}'")
    print(f"  '{out}' ‚Üí {tgt_enc} ‚Üí '{tgt_dec}'")
    print(f"  Match: {tgt_dec == out}")
    print()

In [None]:
# ============================================================
# DATASET
# ============================================================

class SimpleDataset(Dataset):
    """Map-style dataset turning (command, output) string pairs into id tensors.

    Each item is a ``(src, tgt)`` pair of tensors, both cut or zero-padded to
    exactly ``max_len`` ids. The source is the lower-cased command followed by
    EOS; the target is wrapped in SOS ... EOS so it can be shifted for
    teacher forcing.
    """

    def __init__(self, examples, src_vocab, tgt_vocab, max_len=10):
        self.examples, self.max_len = examples, max_len
        self.src_vocab, self.tgt_vocab = src_vocab, tgt_vocab

    def __len__(self):
        return len(self.examples)

    def _fit(self, ids):
        """Truncate *ids* to ``max_len`` and right-pad the remainder with 0."""
        clipped = ids[:self.max_len]
        return clipped + [0] * (self.max_len - len(clipped))

    def __getitem__(self, idx):
        command, actions = self.examples[idx]

        src_ids = self.src_vocab.encode(command.lower())
        # SOS is required on the target side: the decoder input is the target
        # shifted right by one position.
        tgt_ids = self.tgt_vocab.encode(actions, add_sos=True, add_eos=True)

        return torch.tensor(self._fit(src_ids)), torch.tensor(self._fit(tgt_ids))

# All five examples fit into a single shuffled batch.
dataset = SimpleDataset(EXAMPLES, src_vocab, tgt_vocab)
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=5)

# Pull one batch and print its shapes plus the first row of each tensor to
# confirm that padding and SOS/EOS placement look as expected.
batch_iter = iter(loader)
src_batch, tgt_batch = next(batch_iter)
print(f"Source batch shape: {src_batch.shape}")
print(f"Target batch shape: {tgt_batch.shape}")
print(f"\nFirst source: {src_batch[0].tolist()}")
print(f"First target: {tgt_batch[0].tolist()}")
print(f"Decoded target: '{tgt_vocab.decode(tgt_batch[0])}'")

In [None]:
# ============================================================
# SIMPLE TRANSFORMER
# ============================================================

class SimpleTransformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=64, nhead=4, num_layers=2, max_len=10):
        super().__init__()
        
        self.d_model = d_model
        self.max_len = max_len
        self.tgt_vocab_size = tgt_vocab_size
        
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_emb = nn.Embedding(max_len, d_model)
        
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=d_model*4,
            dropout=0.1,
            batch_first=True
        )
        
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
    
    def forward(self, src, tgt):
        B = src.size(0)
        src_len = src.size(1)
        tgt_len = tgt.size(1)
        
        # Positions
        src_pos = torch.arange(src_len, device=src.device).unsqueeze(0).expand(B, -1)
        tgt_pos = torch.arange(tgt_len, device=tgt.device).unsqueeze(0).expand(B, -1)
        
        # Embeddings
        src_emb = self.src_emb(src) + self.pos_emb(src_pos)
        tgt_emb = self.tgt_emb(tgt) + self.pos_emb(tgt_pos)
        
        # Masks
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt_len, device=src.device)
        src_pad_mask = (src == 0)
        tgt_pad_mask = (tgt == 0)
        
        # Forward
        out = self.transformer(
            src_emb, tgt_emb,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_pad_mask,
            tgt_key_padding_mask=tgt_pad_mask
        )
        
        return self.fc_out(out)
    
    def generate(self, src, max_len=10, verbose=False):
        """Autoregressive generation."""
        self.eval()
        B = src.size(0)
        
        # Start with SOS token
        tgt = torch.ones(B, 1, dtype=torch.long, device=src.device)  # SOS = 1
        
        if verbose:
            print(f"  Starting generation with SOS token")
        
        for step in range(max_len):
            with torch.no_grad():
                output = self.forward(src, tgt)
                logits = output[:, -1, :]  # Last position
                next_token = logits.argmax(dim=-1, keepdim=True)
                
                if verbose:
                    probs = torch.softmax(logits, dim=-1)
                    top_prob, top_idx = probs[0].topk(3)
                    print(f"  Step {step}: predicted token {next_token[0].item()}, "
                          f"top3: {[(idx.item(), f'{p:.2f}') for idx, p in zip(top_idx, top_prob)]}")
                
                tgt = torch.cat([tgt, next_token], dim=1)
                
                # Stop if EOS
                if (next_token == 2).all():  # EOS = 2
                    break
        
        return tgt

# Build the network from the two vocabulary sizes and move it to the
# selected device.
model = SimpleTransformer(
    src_vocab.n_words,
    tgt_vocab.n_words,
    d_model=64,
    nhead=4,
    num_layers=2,
)
model = model.to(device)

# Count the elements of every parameter tensor for a quick size report.
n_params = 0
for weight in model.parameters():
    n_params += weight.numel()
print(f"Model parameters: {n_params:,}")

In [None]:
# ============================================================
# TRAINING
# ============================================================

def train(model, loader, epochs=200):
    """Train *model* with teacher forcing and return the last epoch's mean loss.

    Uses Adam (lr=0.001) and cross-entropy that ignores PAD (id 0). Batches
    are moved to the device the model's parameters live on, so the function
    does not depend on any module-level ``device`` variable. The mean loss is
    printed every 50 epochs.

    Args:
        model: Module called as ``model(src, tgt_input)`` that returns
            ``(batch, tgt_len, vocab)`` logits.
        loader: Sized iterable of ``(src, tgt)`` batches; ``tgt`` must start
            with SOS so it can be shifted by one position.
        epochs: Number of passes over *loader*.

    Returns:
        Mean batch loss of the final epoch, or ``nan`` when ``epochs`` is 0.

    Raises:
        ValueError: If *loader* contains no batches.
    """
    num_batches = len(loader)
    if num_batches == 0:
        raise ValueError("loader must yield at least one batch")

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore PAD

    # Follow the model's own placement instead of a global device.
    model_device = next(model.parameters()).device

    model.train()

    # Stays NaN only when no epoch runs at all.
    avg_loss = float('nan')

    for epoch in range(epochs):
        total_loss = 0.0

        for src, tgt in loader:
            src, tgt = src.to(model_device), tgt.to(model_device)

            # Teacher forcing: the decoder sees tgt without its last token and
            # is scored against tgt without its first token (SOS).
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            optimizer.zero_grad()

            logits = model(src, tgt_input)

            # Flatten batch and time so CrossEntropyLoss gets (N, C) vs (N,).
            loss = criterion(
                logits.reshape(-1, logits.size(-1)),
                tgt_output.reshape(-1),
            )
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / num_batches

        if (epoch + 1) % 50 == 0:
            print(f"Epoch {epoch+1}: Loss = {avg_loss:.6f}")

    return avg_loss

# Fit the model on the toy data and report the mean loss of the last epoch.
final_loss = train(model=model, loader=loader, epochs=200)
print(f"\nFinal loss: {final_loss:.6f}")

In [None]:
# ============================================================
# DETAILED EVALUATION
# ============================================================

banner = "=" * 60
print(banner)
print("DETAILED EVALUATION")
print(banner)

model.eval()

# Decode every training example greedily, tracing each step, and compare the
# decoded string with the expected one.
for cmd, expected in EXAMPLES:
    print(f"\n--- Testing: '{cmd}' ‚Üí expected: '{expected}' ---")

    # Source ids cut to 10 entries and zero-padded back up to 10, matching
    # the dataset's fixed length.
    src = src_vocab.encode(cmd.lower())[:10]
    src = src + [0] * (10 - len(src))
    src_tensor = torch.tensor([src], device=device)

    print(f"Source tokens: {src}")

    output = model.generate(src_tensor, max_len=8, verbose=True)

    print(f"Output tokens: {output[0].tolist()}")

    predicted = tgt_vocab.decode(output[0])

    print(f"Predicted: '{predicted}'")
    print(f"Expected:  '{expected}'")
    print(f"Match: {predicted == expected}")

    if predicted == expected:
        continue

    # On a mismatch, dump code points to expose invisible differences.
    print(f"  Predicted bytes: {[ord(c) for c in predicted]}")
    print(f"  Expected bytes:  {[ord(c) for c in expected]}")

In [None]:
# ============================================================
# CHECK: What does the model predict at each position?
# ============================================================

rule = "=" * 60
print("\n" + rule)
print("TEACHER FORCING CHECK")
print(rule)

# With the ground-truth prefix fed to the decoder, predictions should be
# almost perfect if training worked.
model.eval()

for cmd, expected in EXAMPLES[:2]:
    print(f"\n--- '{cmd}' ‚Üí '{expected}' ---")

    # Both sequences are cut to 10 ids and zero-padded back up to 10.
    src = src_vocab.encode(cmd.lower())[:10]
    src = src + [0] * (10 - len(src))

    tgt = tgt_vocab.encode(expected, add_sos=True, add_eos=True)[:10]
    tgt = tgt + [0] * (10 - len(tgt))

    src_tensor = torch.tensor([src], device=device)
    tgt_tensor = torch.tensor([tgt], device=device)

    print(f"Target tokens: {tgt}")

    # Decoder input drops the last token; predictions line up with tgt[1:].
    with torch.no_grad():
        predictions = model(src_tensor, tgt_tensor[:, :-1]).argmax(dim=-1)

    print(f"Predictions:   {predictions[0].tolist()}")
    print(f"Expected:      {tgt[1:]}")

    # Token accuracy over non-PAD target positions only.
    tgt_output = tgt_tensor[:, 1:]
    mask = tgt_output != 0
    correct = mask & (predictions == tgt_output)
    acc = correct.sum().item() / mask.sum().item()
    print(f"Token accuracy: {acc:.1%}")

In [None]:
# ============================================================
# FINAL ACCURACY CALCULATION
# ============================================================

divider = "=" * 60
print("\n" + divider)
print("FINAL ACCURACY")
print(divider)

# Exact-match accuracy: an example counts only when the whole decoded string
# equals the expected one.
correct = 0
total = len(EXAMPLES)

for cmd, expected in EXAMPLES:
    src = src_vocab.encode(cmd.lower())[:10]
    src = src + [0] * (10 - len(src))
    src_tensor = torch.tensor([src], device=device)

    output = model.generate(src_tensor, max_len=8, verbose=False)
    predicted = tgt_vocab.decode(output[0])

    is_correct = predicted == expected
    correct += int(is_correct)

    status = {True: "‚úì", False: "‚úó"}[is_correct]
    print(f"{status} '{cmd}' ‚Üí '{predicted}' (expected: '{expected}')")

print(f"\nAccuracy: {correct}/{total} = {correct/total:.1%}")