# Do Llamas Work in English? - NDIF Cookbook Replication

> **Paper:** "Do Llamas Work in English? On the Latent Language of Multilingual Transformers"  
> Wendler, Veselovsky, Monea, West (2024) - [arXiv:2402.10588](https://arxiv.org/abs/2402.10588)

## Scope

**Implemented:**
- Translation experiment (French → Chinese) with logit lens analysis
- Cloze experiment (French fill-in-the-blank)
- Probability tracking across all 32 layers
- Entropy and energy metric computation
- Results saved to `.pt` files for reproducibility

**Not Implemented:**
- Other language pairs (de→zh, ru→zh)
- Larger models (13B, 70B)
- Intervention experiments (Section 5)

This implementation uses **nnsight** with **NDIF remote execution** (no local GPU required).

In [None]:
# Cell 0: Install Dependencies
# Run this cell first to install all required packages

%pip install torch nnsight transformers pandas numpy matplotlib seaborn tqdm --quiet

print("Dependencies installed successfully!")

Note: you may need to restart the kernel to use updated packages.
Dependencies installed successfully!



[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


## Section 1: Setup and Configuration

In [None]:
# Cell 1.1: Imports
import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# nnsight for model access
from nnsight import LanguageModel, CONFIG

# Set random seeds for reproducibility
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Plot settings
plt.rcParams.update({'font.size': 16})

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Cell 1.2: Configuration
# ============================================
# REPLACE THESE WITH YOUR TOKENS
# ============================================
NDIF_TOKEN = ""  # Get from https://login.ndif.us
HF_TOKEN = ""    # Get from https://huggingface.co/settings/tokens

# Set NDIF API key
if NDIF_TOKEN:
    CONFIG.set_default_api_key(NDIF_TOKEN)

# Model configuration
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
NUM_LAYERS = 32
HIDDEN_DIM = 4096
VOCAB_SIZE = 32000

# Experiment configuration
INPUT_LANG = 'fr'   # Source language for translation
TARGET_LANG = 'zh'  # Target language for translation
CLOZE_LANG = 'fr'   # Language for cloze task

# Use remote execution (set to False if running locally with GPU)
USE_REMOTE = True

print(f"Model: {MODEL_NAME}")
print(f"Layers: {NUM_LAYERS}")
print(f"Remote execution: {USE_REMOTE}")

Model: meta-llama/Llama-2-7b-hf
Layers: 32
Remote execution: True


In [4]:
# Cell 1.3: Load Model
print("Loading model...")
model = LanguageModel(
    MODEL_NAME,
    device_map="auto",
    token=HF_TOKEN if HF_TOKEN else None
)
tokenizer = model.tokenizer
print(f"Model loaded: {MODEL_NAME}")
print(f"Vocabulary size: {len(tokenizer)}")

Loading model...
Model loaded: meta-llama/Llama-2-7b-hf
Vocabulary size: 32000


## Section 2: Utility Functions

In [5]:
# Cell 2.1: Token Processing Functions

def token_prefixes(token_str: str):
    """Generate all prefixes of a token string."""
    n = len(token_str)
    return [token_str[:i] for i in range(1, n + 1)]


def add_spaces(tokens):
    """Add SentencePiece space marker variants."""
    return ['\u2581' + t for t in tokens] + tokens


def capitalizations(tokens):
    """Get unique tokens (handles case variations)."""
    return list(set(tokens))


def unicode_prefix_tokid(char, tokenizer):
    """
    Get token ID for Unicode byte representation.
    Used for Chinese/Russian characters that may be tokenized as byte sequences.
    """
    try:
        encoded = char.encode()
        start = encoded.__str__()[2:-1].split('\\x')[1]
        unicode_format = '<0x%s>'
        start_key = unicode_format % start.upper()
        vocab = tokenizer.get_vocab()
        if start_key in vocab:
            return vocab[start_key]
    except:
        pass
    return None


def process_tokens(token_str: str, tokenizer, lang):
    """
    Find all valid token IDs for a word.
    Handles prefixes, space markers, and Unicode bytes for zh/ru.
    """
    vocab = tokenizer.get_vocab()
    with_prefixes = token_prefixes(token_str)
    with_spaces = add_spaces(with_prefixes)
    with_capitalizations = capitalizations(with_spaces)
    
    final_tokens = []
    for tok in with_capitalizations:
        if tok in vocab:
            final_tokens.append(vocab[tok])
    
    # Add Unicode byte token for Chinese/Russian
    if lang in ['zh', 'ru']:
        tokid = unicode_prefix_tokid(token_str, tokenizer)
        if tokid is not None:
            final_tokens.append(tokid)
    
    return final_tokens


# Create reverse vocabulary mapping
id2voc = {id: voc for voc, id in tokenizer.get_vocab().items()}


def get_tokens(token_ids, id2voc=id2voc):
    """Convert token IDs to token strings."""
    return [id2voc[tokid] for tokid in token_ids]


# Test token processing
print("Token processing test:")
print(f"  'cloud' -> {process_tokens('cloud', tokenizer, 'en')}")
print(f"  'nuage' -> {process_tokens('nuage', tokenizer, 'fr')}")
print(f"  '\u4e91' -> {process_tokens('\u4e91', tokenizer, 'zh')}")

Token processing test:
  'cloud' -> [29883, 15126, 274, 695, 1067, 9570, 17184, 9274, 23642]
  'nuage' -> [4948, 29876, 3433, 302]
  '云' -> [31784, 231]


In [6]:
# Cell 2.2: Metric Computation Functions

def compute_entropy(probas):
    """
    Compute entropy of probability distribution.
    H = -sum(p * log2(p))
    """
    # Clamp to avoid log(0)
    probas = probas.clamp(min=1e-10)
    return (-probas * torch.log2(probas)).sum(dim=-1)


def compute_token_probs(probs, token_ids):
    """
    Sum probabilities for a set of token IDs.
    This aggregates probability mass across all valid token representations.
    """
    if len(token_ids) == 0:
        return torch.zeros(probs.shape[0])
    return probs[:, torch.tensor(token_ids)].sum(dim=-1)

In [None]:
# Cell 2.3: Energy Calculation

def setup_energy_calculation(model, remote=True):
    """
    Prepare matrices for energy calculation.
    
    Energy measures how aligned the latent representation is with
    the unembedding space (token directions).
    
    For NDIF remote execution, we access weights inside a trace context
    using .save() to transmit only the specific weights we need (~500MB)
    rather than downloading the full model (~13GB).
    
    Args:
        model: nnsight LanguageModel
        remote: Whether to use NDIF remote execution
    
    Returns:
        U_normalized: Normalized unembedding matrix
        avgUU: Average self-similarity baseline for normalization
    """
    print("Setting up energy calculation...")
    print("  Fetching model weights via trace context...")
    
    # Helper to handle both old nnsight (.value) and new nnsight 0.5+ (direct tensor)
    def get_tensor(saved):
        return saved.value if hasattr(saved, 'value') else saved
    
    # Access weights inside trace context - this works with remote execution
    with model.trace("hello", remote=remote):
        U_saved = model.lm_head.weight.save()
        norm_saved = model.model.norm.weight.save()
    
    # Extract actual values after trace completes
    U = get_tensor(U_saved).cpu().float()  # [vocab, hidden]
    norm_weights = get_tensor(norm_saved).cpu().float()  # [hidden]
    
    print(f"  U shape: {U.shape}")
    print(f"  Norm weights shape: {norm_weights.shape}")
    
    # Weight U by RMSNorm weights (as done in original)
    U_weighted = U.clone()
    U_weighted *= norm_weights.unsqueeze(0)
    
    # Normalize U
    U_normalized = U_weighted / ((U_weighted**2).sum(dim=1, keepdim=True))**0.5
    
    # Compute average self-similarity baseline
    v = U.shape[0]  # vocab size
    avgUU = (((U_normalized.T @ U_normalized)**2).sum() / v**2)**0.5
    
    print(f"  avgUU: {avgUU.item():.6f}")
    print("  Energy calculation setup complete!")
    
    return U_normalized, avgUU


def compute_energy(latents, U_normalized, avgUU):
    """
    Compute energy metric for latent representations.
    
    Energy measures how much the latent vector projects onto
    the token directions in the unembedding space.
    
    Args:
        latents: Hidden states [num_layers, hidden_dim]
        U_normalized: Normalized unembedding matrix
        avgUU: Baseline for normalization
    
    Returns:
        Energy values per layer [num_layers]
    """
    # Normalize latents
    latents_normalized = latents.float()
    latents_normalized = latents_normalized / (((latents_normalized**2).mean(dim=-1, keepdim=True))**0.5)
    latents_normalized = latents_normalized / latents_normalized.norm(dim=-1, keepdim=True)
    
    # Project onto U and compute energy
    norm = ((U_normalized @ latents_normalized.T)**2).mean(dim=0)**0.5
    
    return norm / avgUU

In [8]:
# Cell 2.4: Visualization Functions

PLT_PARAMS = {'linewidth': 2.2}


def plot_ci(ax, data, label, color='blue', linestyle='-', tik_step=5, 
            do_lines=True, plt_params=PLT_PARAMS):
    """
    Plot mean with 95% confidence interval.
    
    Args:
        ax: Matplotlib axis
        data: Tensor of shape [n_samples, n_layers]
        label: Legend label
        color: Line color
        linestyle: Line style
        tik_step: X-axis tick interval
        do_lines: Whether to draw vertical grid lines
    """
    mean = data.mean(dim=0).numpy()
    std = data.std(dim=0).numpy()
    n = data.shape[0]
    
    x = np.arange(data.shape[1]) + 1  # Layers start from 1
    y_upper = mean + (1.96 / np.sqrt(n)) * std
    y_lower = mean - (1.96 / np.sqrt(n)) * std
    
    if do_lines:
        upper = max(round(data.shape[1] / 10) * 10 + 1, data.shape[1] + 1)
        ax.set_xticks(np.arange(0, upper, tik_step))
        for i in range(0, upper, tik_step):
            ax.axvline(i, color='black', linestyle='--', alpha=0.2, linewidth=1)
    
    ax.plot(x, mean, label=label, color=color, linestyle=linestyle, **plt_params)
    ax.fill_between(x, y_lower, y_upper, color=color, alpha=0.3)
    ax.spines[['right', 'top']].set_visible(False)


def plot_ci_plus_heatmap(data, heat, label, color='tab:orange', tik_step=5,
                         do_colorbar=True, nums=[0.99, 0.18, 0.025, 0.6],
                         labelpad=10):
    """
    Plot probability CI with entropy heatmap above.
    
    Args:
        data: Probability data [n_samples, n_layers]
        heat: Entropy data [n_samples, n_layers]
        label: Legend label
        color: Line color
        tik_step: X-axis tick interval
        do_colorbar: Whether to show colorbar
        nums: Colorbar position [left, bottom, width, height]
    
    Returns:
        fig, ax (heatmap), ax2 (probability plot)
    """
    fig, (ax, ax2) = plt.subplots(
        nrows=2, sharex=True,
        gridspec_kw={'height_ratios': [1, 10]},
        figsize=(5, 3)
    )
    
    if do_colorbar:
        fig.subplots_adjust(right=0.8)
    
    # Plot CI
    plot_ci(ax2, data, label, color=color, tik_step=tik_step)
    
    # Plot entropy heatmap
    y = heat.mean(dim=0).numpy()
    x = np.arange(y.shape[0]) + 1
    shift = 0.5
    extent = [x[0] - (x[1] - x[0]) / 2. - shift, 
              x[-1] + (x[1] - x[0]) / 2. + shift, 0, 1]
    img = ax.imshow(y[np.newaxis, :], cmap="plasma", aspect="auto",
                    extent=extent, vmin=0, vmax=14)
    ax.set_yticks([])
    
    if do_colorbar:
        cbar_ax = fig.add_axes(nums)
        cbar = plt.colorbar(img, cax=cbar_ax)
        cbar.set_label('entropy', rotation=90, labelpad=labelpad)
    
    plt.tight_layout()
    return fig, ax, ax2

## Section 3: Data Loading

In [9]:
# Cell 3.1: Load Language Data

DATA_DIR = "./data/langs"

# Load French data (for both translation and cloze)
df_fr = pd.read_csv(f'{DATA_DIR}/fr/clean.csv')
print(f"French words loaded: {len(df_fr)}")

# Load Chinese data (for translation target)
df_zh = pd.read_csv(f'{DATA_DIR}/zh/clean.csv')
print(f"Chinese words loaded: {len(df_zh)}")

# Display sample
print("\nSample French data:")
df_fr[['word_original', 'word_translation']].head()

French words loaded: 118
Chinese words loaded: 139

Sample French data:


Unnamed: 0,word_original,word_translation
0,cloud,nuage
1,bag,sac
2,mouth,bouche
3,soil,sol
4,mountain,montagne


In [10]:
# Cell 3.2: Prepare Merged Dataset for Translation

# Merge French and Chinese data on English word
df_merged = df_zh.merge(df_fr, on='word_original', suffixes=('_zh', '_fr'))
df_merged.rename(columns={
    'word_original': 'en',
    'word_translation_zh': 'zh',
    'word_translation_fr': 'fr'
}, inplace=True)

# Filter out cases where English word appears in target translation
# (to avoid trivial matches)
df_merged = df_merged[~df_merged.apply(
    lambda row: row['en'].lower() in row['zh'].lower(), axis=1
)].reset_index(drop=True)

print(f"Merged dataset size: {len(df_merged)}")
print("\nSample merged data:")
df_merged[['en', 'fr', 'zh']].head()

Merged dataset size: 118

Sample merged data:


Unnamed: 0,en,fr,zh
0,cloud,nuage,云
1,bag,sac,包
2,mouth,bouche,口
3,soil,sol,土
4,mountain,montagne,山


## Section 4: Core Analysis Functions

In [None]:
# Cell 4.1: Hidden State Extraction with Logit Lens

def get_all_layer_probs_and_latents(model, prompt, remote=True):
    """
    Extract probabilities and latents at all layers using logit lens.
    
    The logit lens applies the final layer norm and unembedding (lm_head)
    to intermediate hidden states, allowing us to see what tokens the
    model would predict at each layer.
    
    IMPORTANT: With remote=True, we must:
    1. Create lists INSIDE the trace context
    2. Append proxy objects (not .save() results) to lists
    3. Stack inside the trace
    4. Call .save() only on the final stacked tensors
    
    Args:
        model: nnsight LanguageModel
        prompt: Input text
        remote: Whether to use NDIF remote execution
    
    Returns:
        probs: Tensor [num_layers, vocab_size] - probabilities at last token position
        latents: Tensor [num_layers, hidden_dim] - raw hidden states at last token position
    """
    with model.trace(prompt, remote=remote):
        # Create lists INSIDE the trace context
        all_latents = []
        all_probs = []

        for layer_idx in range(NUM_LAYERS):
            hs = model.model.layers[layer_idx].output

            # Append proxy objects (NOT .save() results) to lists
            all_latents.append(hs[:, -1, :])

            # Apply logit lens to last token position
            last_hs = hs[:, -1:, :]  # [batch, 1, hidden]
            normalized = model.model.norm(last_hs)
            logits = model.lm_head(normalized)  # [batch, 1, vocab]
            all_probs.append(F.softmax(logits[:, 0, :].float(), dim=-1))

        # Stack INSIDE trace, then save only the final stacked tensors
        stacked_latents = torch.stack(all_latents, dim=0).save()
        stacked_probs = torch.stack(all_probs, dim=0).save()

    # Extract values after trace exits
    def get_tensor(saved):
        val = saved.value if hasattr(saved, 'value') else saved
        # Shape is [num_layers, batch, dim] - squeeze batch dimension
        return val.squeeze(1) if val.dim() == 3 else val

    latents = get_tensor(stacked_latents)
    probs = get_tensor(stacked_probs)

    return probs, latents

In [12]:
# Cell 4.2: Complete Analysis Pipeline

def analyze_prompt(model, prompt, latent_token_ids, out_token_ids,
                   U_normalized, avgUU, remote=True):
    """
    Run complete analysis for a single prompt.
    
    Args:
        model: nnsight LanguageModel
        prompt: Input text
        latent_token_ids: Token IDs for the "latent" (English) word
        out_token_ids: Token IDs for the target language word
        U_normalized: Normalized unembedding matrix (for energy)
        avgUU: Energy normalization baseline
        remote: Whether to use NDIF remote execution
    
    Returns:
        dict with keys:
            - latent_token_probs: [num_layers] English token probability per layer
            - out_token_probs: [num_layers] Target language token probability per layer
            - entropy: [num_layers] Distribution entropy per layer
            - energy: [num_layers] Energy metric per layer
            - latents: [num_layers, hidden_dim] Raw hidden states
    """
    # Get probabilities and latents at all layers
    probs, latents = get_all_layer_probs_and_latents(model, prompt, remote=remote)
    
    # Move to CPU for metric computation
    probs = probs.cpu()
    latents = latents.cpu()
    
    # Compute metrics
    latent_probs = compute_token_probs(probs, latent_token_ids)
    out_probs = compute_token_probs(probs, out_token_ids)
    entropy = compute_entropy(probs)
    energy = compute_energy(latents, U_normalized, avgUU)
    
    return {
        'latent_token_probs': latent_probs,
        'out_token_probs': out_probs,
        'entropy': entropy,
        'energy': energy,
        'latents': latents
    }

## Section 5: Translation Experiment

Test the latent language hypothesis using few-shot translation prompts (French -> Chinese).

In [13]:
# Cell 5.1: Translation Prompt Generation

LANG_NAMES = {
    'fr': 'Fran\u00e7ais',
    'de': 'Deutsch',
    'ru': '\u0420\u0443\u0441\u0441\u043a\u0438\u0439',
    'en': 'English',
    'zh': '\u4e2d\u6587'
}


def create_translation_prompt(df, idx, input_lang='fr', output_lang='zh',
                               latent_lang='en', n_examples=5):
    """
    Create few-shot translation prompt.
    
    Format:
        Fran\u00e7ais: "word1" - \u4e2d\u6587: "translation1"
        Fran\u00e7ais: "word2" - \u4e2d\u6587: "translation2"
        ...
        Fran\u00e7ais: "target_word" - \u4e2d\u6587: "
    
    Returns:
        dict with prompt and token IDs, or None if invalid
    """
    df = df.reset_index(drop=True)
    
    # Sample examples (excluding current word)
    other_indices = [i for i in range(len(df)) if i != idx]
    example_indices = np.random.choice(other_indices, n_examples - 1, replace=False)
    
    # Build prompt with examples
    prompt = ""
    for ex_idx in example_indices:
        row = df.iloc[ex_idx]
        prompt += f'{LANG_NAMES[input_lang]}: "{row[input_lang]}" - {LANG_NAMES[output_lang]}: "{row[output_lang]}"\n'
    
    # Add target (incomplete)
    target_row = df.iloc[idx]
    prompt += f'{LANG_NAMES[input_lang]}: "{target_row[input_lang]}" - {LANG_NAMES[output_lang]}: "'
    
    # Get token IDs
    out_token_ids = process_tokens(target_row[output_lang], tokenizer, output_lang)
    latent_token_ids = process_tokens(target_row[latent_lang], tokenizer, 'en')
    
    # Skip if tokens are empty or overlap
    if len(out_token_ids) == 0 or len(latent_token_ids) == 0:
        return None
    if output_lang != 'en' and len(set(out_token_ids) & set(latent_token_ids)) > 0:
        return None
    
    return {
        'prompt': prompt,
        'out_token_ids': out_token_ids,
        'latent_token_ids': latent_token_ids,
        'out_token_str': target_row[output_lang],
        'latent_token_str': target_row[latent_lang],
        'in_token_str': target_row[input_lang]
    }


# Test prompt generation
test_prompt = create_translation_prompt(df_merged, 0, INPUT_LANG, TARGET_LANG)
if test_prompt:
    print("Example translation prompt:")
    print(test_prompt['prompt'])
    print(f"\nExpected output: {test_prompt['out_token_str']}")
    print(f"Latent (English): {test_prompt['latent_token_str']}")

Example translation prompt:
Français: "cinq" - 中文: "五"
Français: "cœur" - 中文: "心"
Français: "partie" - 中文: "部"
Français: "trois" - 中文: "三"
Français: "nuage" - 中文: "

Expected output: 云
Latent (English): cloud


In [14]:
# Cell 5.2: Build Translation Dataset

print(f"Building translation dataset ({INPUT_LANG} -> {TARGET_LANG})...")

translation_dataset = []
for idx in range(len(df_merged)):
    data = create_translation_prompt(
        df_merged, idx,
        input_lang=INPUT_LANG,
        output_lang=TARGET_LANG,
        latent_lang='en'
    )
    if data is not None:
        translation_dataset.append(data)

print(f"Translation dataset size: {len(translation_dataset)}")

Building translation dataset (fr -> zh)...
Translation dataset size: 118


In [None]:
# Cell 5.2.1: Data Save/Load Utilities

RESULTS_DIR = "./results"
os.makedirs(RESULTS_DIR, exist_ok=True)

def save_results(results, name):
    """Save experiment results to .pt file (excluding latents to save space)."""
    to_save = {k: v for k, v in results.items() if k != 'latents'}
    path = f"{RESULTS_DIR}/{name}_results.pt"
    torch.save(to_save, path)
    print(f"Saved {name} results to {path}")
    return path

def load_results(name):
    """Load experiment results from .pt file."""
    path = f"{RESULTS_DIR}/{name}_results.pt"
    if os.path.exists(path):
        results = torch.load(path, weights_only=False)
        print(f"Loaded {name} results from {path}")
        return results
    return None

print(f"Results directory: {RESULTS_DIR}")

In [15]:
# Cell 5.3: Run Translation Analysis

# Setup energy calculation (fetches weights via trace context)
U_normalized, avgUU = setup_energy_calculation(model, remote=USE_REMOTE)

# Collect results
translation_results = {
    'latent_token_probs': [],
    'out_token_probs': [],
    'entropy': [],
    'energy': [],
    'latents': []
}

print(f"\nRunning translation analysis on {len(translation_dataset)} prompts...")
for data in tqdm(translation_dataset, desc="Translation"):
    try:
        result = analyze_prompt(
            model,
            data['prompt'],
            data['latent_token_ids'],
            data['out_token_ids'],
            U_normalized,
            avgUU,
            remote=USE_REMOTE
        )
        
        for key in translation_results:
            translation_results[key].append(result[key])
    except Exception as e:
        print(f"Error processing prompt: {e}")
        continue

# Stack results into tensors
for key in translation_results:
    if len(translation_results[key]) > 0:
        translation_results[key] = torch.stack(translation_results[key])

print(f"\nResults collected: {translation_results['latent_token_probs'].shape[0]} samples")

# Save results to file
save_results(translation_results, "translation")

Setting up energy calculation...
  Fetching model weights via trace context...


⬇ Downloading: 100%|██████████| 262M/262M [00:19<00:00] 


  U shape: torch.Size([32000, 4096])
  Norm weights shape: torch.Size([4096])
  avgUU: 0.058622
  Energy calculation setup complete!

Running translation analysis on 118 prompts...


Translation:   0%|          | 0/118 [00:00<?, ?it/s]

⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:   1%|          | 1/118 [00:03<06:52,  3.52s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation:   2%|▏         | 2/118 [00:05<05:12,  2.69s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:   3%|▎         | 3/118 [00:07<04:44,  2.48s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation:   3%|▎         | 4/118 [00:10<04:33,  2.40s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:   4%|▍         | 5/118 [00:12<04:24,  2.34s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:   5%|▌         | 6/118 [00:14<04:18,  2.31s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:   6%|▌         | 7/118 [00:16<04:11,  2.27s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation:   7%|▋         | 8/118 [00:19<04:15,  2.32s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 793k/793k [00:00<00:00]
Translation:   8%|▊         | 9/118 [00:21<04:22,  2.41s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:   8%|▊         | 10/118 [00:24<04:21,  2.42s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:   9%|▉         | 11/118 [00:26<04:17,  2.41s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  10%|█         | 12/118 [00:29<04:14,  2.40s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 793k/793k [00:00<00:00]
Translation:  11%|█         | 13/118 [00:32<04:34,  2.61s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 793k/793k [00:00<00:00]
Translation:  12%|█▏        | 14/118 [00:34<04:24,  2.54s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 785k/785k [00:00<00:00]
Translation:  13%|█▎        | 15/118 [00:37<04:21,  2.54s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  14%|█▎        | 16/118 [00:45<07:10,  4.22s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  14%|█▍        | 17/118 [00:49<06:57,  4.13s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  15%|█▌        | 18/118 [00:51<06:00,  3.60s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation:  16%|█▌        | 19/118 [00:55<06:04,  3.68s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation:  17%|█▋        | 20/118 [01:02<07:50,  4.81s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  18%|█▊        | 21/118 [01:05<06:33,  4.06s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  19%|█▊        | 22/118 [01:07<05:40,  3.54s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 785k/785k [00:00<00:00]
Translation:  19%|█▉        | 23/118 [01:09<05:01,  3.18s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation:  20%|██        | 24/118 [01:12<04:35,  2.93s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  21%|██        | 25/118 [01:14<04:19,  2.79s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  22%|██▏       | 26/118 [01:16<04:06,  2.68s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation:  23%|██▎       | 27/118 [01:19<03:54,  2.58s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  24%|██▎       | 28/118 [01:26<06:07,  4.09s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  25%|██▍       | 29/118 [01:29<05:17,  3.57s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  25%|██▌       | 30/118 [01:31<04:39,  3.17s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation:  26%|██▋       | 31/118 [01:33<04:12,  2.90s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  27%|██▋       | 32/118 [01:36<03:53,  2.72s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  28%|██▊       | 33/118 [01:39<04:03,  2.86s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  29%|██▉       | 34/118 [01:41<03:44,  2.67s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  30%|██▉       | 35/118 [01:43<03:32,  2.56s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  31%|███       | 36/118 [01:46<03:24,  2.50s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 785k/785k [00:00<00:00]
Translation:  31%|███▏      | 37/118 [01:48<03:16,  2.43s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  32%|███▏      | 38/118 [01:50<03:11,  2.40s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  33%|███▎      | 39/118 [01:53<03:05,  2.35s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation:  34%|███▍      | 40/118 [01:55<03:02,  2.34s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  35%|███▍      | 41/118 [01:57<02:57,  2.30s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation:  36%|███▌      | 42/118 [01:59<02:56,  2.33s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  36%|███▋      | 43/118 [02:02<02:52,  2.30s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  37%|███▋      | 44/118 [02:04<02:50,  2.30s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  38%|███▊      | 45/118 [02:06<02:45,  2.27s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  39%|███▉      | 46/118 [02:08<02:42,  2.25s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation:  40%|███▉      | 47/118 [02:12<03:13,  2.73s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  41%|████      | 48/118 [02:20<04:48,  4.12s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  42%|████▏     | 49/118 [02:22<04:06,  3.57s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  42%|████▏     | 50/118 [02:24<03:37,  3.20s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  43%|████▎     | 51/118 [02:27<03:18,  2.96s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  44%|████▍     | 52/118 [02:31<03:37,  3.29s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  45%|████▍     | 53/118 [02:33<03:15,  3.01s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  46%|████▌     | 54/118 [02:35<02:58,  2.79s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  47%|████▋     | 55/118 [02:38<02:47,  2.65s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  47%|████▋     | 56/118 [02:40<02:39,  2.57s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 785k/785k [00:00<00:00]
Translation:  48%|████▊     | 57/118 [02:43<02:47,  2.75s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 736k/736k [00:00<00:00]
Translation:  49%|████▉     | 58/118 [02:46<02:52,  2.88s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  50%|█████     | 59/118 [02:49<02:39,  2.71s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  51%|█████     | 60/118 [02:52<02:42,  2.80s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  52%|█████▏    | 61/118 [02:58<03:43,  3.92s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  53%|█████▎    | 62/118 [03:02<03:30,  3.76s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  53%|█████▎    | 63/118 [03:04<03:01,  3.30s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  54%|█████▍    | 64/118 [03:07<02:55,  3.24s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  55%|█████▌    | 65/118 [03:09<02:38,  2.99s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 736k/736k [00:00<00:00]
Translation:  56%|█████▌    | 66/118 [03:12<02:25,  2.80s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  57%|█████▋    | 67/118 [03:14<02:15,  2.65s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  58%|█████▊    | 68/118 [03:22<03:30,  4.22s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  58%|█████▊    | 69/118 [03:24<02:59,  3.66s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  59%|█████▉    | 70/118 [03:27<02:36,  3.26s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  60%|██████    | 71/118 [03:29<02:20,  3.00s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  61%|██████    | 72/118 [03:31<02:09,  2.80s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  62%|██████▏   | 73/118 [03:34<02:00,  2.69s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 785k/785k [00:00<00:00]
Translation:  63%|██████▎   | 74/118 [03:36<01:53,  2.57s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  64%|██████▎   | 75/118 [03:39<01:58,  2.77s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation:  64%|██████▍   | 76/118 [03:42<01:50,  2.64s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  65%|██████▌   | 77/118 [03:44<01:44,  2.54s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  66%|██████▌   | 78/118 [03:46<01:39,  2.48s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation:  67%|██████▋   | 79/118 [03:48<01:33,  2.40s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  68%|██████▊   | 80/118 [03:51<01:29,  2.36s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  69%|██████▊   | 81/118 [03:53<01:29,  2.42s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 736k/736k [00:00<00:00]
Translation:  69%|██████▉   | 82/118 [03:56<01:25,  2.37s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  70%|███████   | 83/118 [03:59<01:32,  2.65s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  71%|███████   | 84/118 [04:09<02:51,  5.05s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 793k/793k [00:00<00:00]
Translation:  72%|███████▏  | 85/118 [04:12<02:18,  4.18s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  73%|███████▎  | 86/118 [04:14<01:54,  3.57s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  74%|███████▎  | 87/118 [04:18<01:52,  3.62s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  75%|███████▍  | 88/118 [04:20<01:37,  3.27s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  75%|███████▌  | 89/118 [04:22<01:25,  2.95s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  76%|███████▋  | 90/118 [04:24<01:16,  2.73s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  77%|███████▋  | 91/118 [04:27<01:09,  2.57s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  78%|███████▊  | 92/118 [04:29<01:01,  2.38s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  79%|███████▉  | 93/118 [04:31<01:02,  2.51s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  80%|███████▉  | 94/118 [04:35<01:09,  2.90s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 785k/785k [00:00<00:00]
Translation:  81%|████████  | 95/118 [04:37<01:01,  2.66s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  81%|████████▏ | 96/118 [04:39<00:55,  2.51s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 785k/785k [00:00<00:00]
Translation:  82%|████████▏ | 97/118 [04:42<00:50,  2.40s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation:  83%|████████▎ | 98/118 [04:53<01:39,  5.00s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 744k/744k [00:00<00:00]
Translation:  84%|████████▍ | 99/118 [04:55<01:18,  4.12s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  85%|████████▍ | 100/118 [04:57<01:03,  3.51s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 802k/802k [00:00<00:00]
Translation:  86%|████████▌ | 101/118 [04:59<00:53,  3.12s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  86%|████████▋ | 102/118 [05:02<00:48,  3.04s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  87%|████████▋ | 103/118 [05:04<00:40,  2.71s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  88%|████████▊ | 104/118 [05:06<00:35,  2.53s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation:  89%|████████▉ | 105/118 [05:08<00:31,  2.40s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 785k/785k [00:00<00:00]
Translation:  90%|████████▉ | 106/118 [05:10<00:27,  2.31s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation:  91%|█████████ | 107/118 [05:12<00:24,  2.25s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  92%|█████████▏| 108/118 [05:14<00:21,  2.17s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  92%|█████████▏| 109/118 [05:16<00:19,  2.12s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  93%|█████████▎| 110/118 [05:18<00:16,  2.10s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  94%|█████████▍| 111/118 [05:20<00:14,  2.11s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 793k/793k [00:00<00:00]
Translation:  95%|█████████▍| 112/118 [05:22<00:12,  2.08s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  96%|█████████▌| 113/118 [05:25<00:10,  2.11s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 761k/761k [00:00<00:00]
Translation:  97%|█████████▋| 114/118 [05:27<00:08,  2.08s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 785k/785k [00:00<00:00]
Translation:  97%|█████████▋| 115/118 [05:36<00:13,  4.36s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 769k/769k [00:00<00:00]
Translation:  98%|█████████▊| 116/118 [05:38<00:07,  3.68s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 777k/777k [00:00<00:00]
Translation:  99%|█████████▉| 117/118 [05:40<00:03,  3.20s/it]

Error processing prompt: stack expects a non-empty TensorList


⬇ Downloading: 100%|██████████| 752k/752k [00:00<00:00]
Translation: 100%|██████████| 118/118 [05:42<00:00,  2.91s/it]


Error processing prompt: stack expects a non-empty TensorList


AttributeError: 'list' object has no attribute 'shape'

In [None]:
# Cell 5.4: Translation Visualizations

# Load results if not in memory (allows running visualization without re-running analysis)
if 'translation_results' not in dir() or translation_results is None:
    translation_results = load_results("translation")


print("=" * 60)
print(f"TRANSLATION EXPERIMENT: {INPUT_LANG} -> {TARGET_LANG}")
print("=" * 60)

# Plot 1: Probability curves with entropy heatmap
fig, ax, ax2 = plot_ci_plus_heatmap(
    translation_results['latent_token_probs'],
    translation_results['entropy'],
    'English (latent)',
    color='tab:orange',
    tik_step=5,
    do_colorbar=True
)
plot_ci(ax2, translation_results['out_token_probs'],
        TARGET_LANG, color='tab:blue', do_lines=False)

ax2.set_xlabel('Layer')
ax2.set_ylabel('Probability')
ax2.set_xlim(0, NUM_LAYERS + 1)
ax2.set_ylim(0, 1)
ax2.legend(loc='upper left')
plt.suptitle(f'Translation: {INPUT_LANG} \u2192 {TARGET_LANG}', y=1.02)
plt.tight_layout()
plt.show()

# Plot 2: Energy
fig, ax = plt.subplots(figsize=(5, 3))
plot_ci(ax, translation_results['energy'], 'energy', color='tab:green', tik_step=5)
ax.set_xlabel('Layer')
ax.set_ylabel('Energy')
ax.set_xlim(0, NUM_LAYERS + 1)
plt.title(f'Energy: Translation {INPUT_LANG} \u2192 {TARGET_LANG}')
plt.tight_layout()
plt.show()

# Print statistics
peak_en_layer = translation_results['latent_token_probs'].mean(dim=0).argmax().item() + 1
peak_out_layer = translation_results['out_token_probs'].mean(dim=0).argmax().item() + 1
print(f"\nPeak English probability at layer: {peak_en_layer}")
print(f"Peak {TARGET_LANG} probability at layer: {peak_out_layer}")
print(f"Max English probability: {translation_results['latent_token_probs'].mean(dim=0).max():.3f}")
print(f"Max {TARGET_LANG} probability: {translation_results['out_token_probs'].mean(dim=0).max():.3f}")

## Section 6: Cloze Experiment

Test the latent language hypothesis using fill-in-the-blank prompts in the target language.

In [None]:
# Cell 6.1: Cloze Prompt Generation

def create_cloze_prompt(df, idx, target_lang='fr', n_examples=2):
    """
    Create cloze (fill-in-blank) prompt.
    
    Uses the blank_prompt_translation_masked column which contains
    prompts like:
        Un "___" est une masse visible... R\u00e9ponse : "nuage".
    
    We truncate before the answer to create the completion task.
    
    Returns:
        dict with prompt and token IDs, or None if invalid
    """
    df = df.reset_index(drop=True)
    
    # Sample example prompts
    other_indices = [i for i in range(len(df)) if i != idx]
    example_indices = np.random.choice(other_indices, n_examples, replace=False)
    
    # Build prompt with complete examples
    prompt = ""
    for ex_idx in example_indices:
        row = df.iloc[ex_idx]
        prompt += f"{row['blank_prompt_translation_masked']}\n"
    
    # Add target prompt (truncated before answer)
    target_row = df.iloc[idx]
    target_prompt = target_row['blank_prompt_translation_masked']
    
    # Truncate to just before the answer
    # Chinese uses full-width colon (\uff1a), others use regular colon
    if target_lang == 'zh':
        prompt += target_prompt.split("\uff1a")[0] + ': "'
    else:
        prompt += target_prompt.split(":")[0] + ': "'
    
    # Get token IDs
    out_token_ids = process_tokens(target_row['word_translation'], tokenizer, target_lang)
    latent_token_ids = process_tokens(target_row['word_original'], tokenizer, 'en')
    
    # Skip if tokens are empty or overlap
    if len(out_token_ids) == 0 or len(latent_token_ids) == 0:
        return None
    if target_lang != 'en' and len(set(out_token_ids) & set(latent_token_ids)) > 0:
        return None
    
    return {
        'prompt': prompt,
        'out_token_ids': out_token_ids,
        'latent_token_ids': latent_token_ids,
        'out_token_str': target_row['word_translation'],
        'latent_token_str': target_row['word_original']
    }


# Test cloze prompt generation
test_cloze = create_cloze_prompt(df_fr, 0, CLOZE_LANG)
if test_cloze:
    print("Example cloze prompt:")
    print(test_cloze['prompt'])
    print(f"\nExpected output: {test_cloze['out_token_str']}")
    print(f"Latent (English): {test_cloze['latent_token_str']}")

In [None]:
# Cell 6.2: Build Cloze Dataset

print(f"Building cloze dataset ({CLOZE_LANG})...")

cloze_dataset = []
for idx in range(len(df_fr)):
    data = create_cloze_prompt(df_fr, idx, target_lang=CLOZE_LANG)
    if data is not None:
        cloze_dataset.append(data)

print(f"Cloze dataset size: {len(cloze_dataset)}")

In [None]:
# Cell 6.3: Run Cloze Analysis

# Collect results
cloze_results = {
    'latent_token_probs': [],
    'out_token_probs': [],
    'entropy': [],
    'energy': [],
    'latents': []
}

print(f"\nRunning cloze analysis on {len(cloze_dataset)} prompts...")
for data in tqdm(cloze_dataset, desc="Cloze"):
    try:
        result = analyze_prompt(
            model,
            data['prompt'],
            data['latent_token_ids'],
            data['out_token_ids'],
            U_normalized,
            avgUU,
            remote=USE_REMOTE
        )
        
        for key in cloze_results:
            cloze_results[key].append(result[key])
    except Exception as e:
        print(f"Error processing prompt: {e}")
        continue

# Stack results into tensors
for key in cloze_results:
    if len(cloze_results[key]) > 0:
        cloze_results[key] = torch.stack(cloze_results[key])

print(f"\nResults collected: {cloze_results['latent_token_probs'].shape[0]} samples")

# Save results to file
save_results(cloze_results, "cloze")

In [None]:
# Cell 6.4: Cloze Visualizations

# Load results if not in memory (allows running visualization without re-running analysis)
if 'cloze_results' not in dir() or cloze_results is None:
    cloze_results = load_results("cloze")


print("=" * 60)
print(f"CLOZE EXPERIMENT: {CLOZE_LANG}")
print("=" * 60)

# Plot 1: Probability curves with entropy heatmap
fig, ax, ax2 = plot_ci_plus_heatmap(
    cloze_results['latent_token_probs'],
    cloze_results['entropy'],
    'English (latent)',
    color='tab:orange',
    tik_step=5,
    do_colorbar=True
)
plot_ci(ax2, cloze_results['out_token_probs'],
        CLOZE_LANG, color='tab:blue', do_lines=False)

ax2.set_xlabel('Layer')
ax2.set_ylabel('Probability')
ax2.set_xlim(0, NUM_LAYERS + 1)
ax2.set_ylim(0, 1)
ax2.legend(loc='upper left')
plt.suptitle(f'Cloze Task: {CLOZE_LANG}', y=1.02)
plt.tight_layout()
plt.show()

# Plot 2: Energy
fig, ax = plt.subplots(figsize=(5, 3))
plot_ci(ax, cloze_results['energy'], 'energy', color='tab:green', tik_step=5)
ax.set_xlabel('Layer')
ax.set_ylabel('Energy')
ax.set_xlim(0, NUM_LAYERS + 1)
plt.title(f'Energy: Cloze {CLOZE_LANG}')
plt.tight_layout()
plt.show()

# Print statistics
peak_en_layer = cloze_results['latent_token_probs'].mean(dim=0).argmax().item() + 1
peak_out_layer = cloze_results['out_token_probs'].mean(dim=0).argmax().item() + 1
print(f"\nPeak English probability at layer: {peak_en_layer}")
print(f"Peak {CLOZE_LANG} probability at layer: {peak_out_layer}")
print(f"Max English probability: {cloze_results['latent_token_probs'].mean(dim=0).max():.3f}")
print(f"Max {CLOZE_LANG} probability: {cloze_results['out_token_probs'].mean(dim=0).max():.3f}")

## Section 7: Summary

In [None]:
# Cell 7.0: Compile and Save All Results

# Compile all experimental data (excluding latents to save space)
all_results = {
    'translation': {
        'latent_token_probs': translation_results['latent_token_probs'],
        'out_token_probs': translation_results['out_token_probs'],
        'entropy': translation_results['entropy'],
        'energy': translation_results['energy'],
    },
    'cloze': {
        'latent_token_probs': cloze_results['latent_token_probs'],
        'out_token_probs': cloze_results['out_token_probs'],
        'entropy': cloze_results['entropy'],
        'energy': cloze_results['energy'],
    },
    'metadata': {
        'model': MODEL_NAME,
        'num_layers': NUM_LAYERS,
        'input_lang': INPUT_LANG,
        'target_lang': TARGET_LANG,
        'cloze_lang': CLOZE_LANG,
        'num_translation_samples': int(translation_results['latent_token_probs'].shape[0]),
        'num_cloze_samples': int(cloze_results['latent_token_probs'].shape[0]),
    }
}

# Save compiled results
torch.save(all_results, f"{RESULTS_DIR}/all_results.pt")
print(f"Compiled results saved to {RESULTS_DIR}/all_results.pt")

# Print summary of saved data
print("
Saved data structure:")
for exp_name, exp_data in all_results.items():
    if exp_name == 'metadata':
        print(f"  {exp_name}:")
        for k, v in exp_data.items():
            print(f"    {k}: {v}")
    else:
        print(f"  {exp_name}:")
        for key, tensor in exp_data.items():
            print(f"    {key}: {tensor.shape}")

In [None]:
# Cell 7.1: Summary Statistics

print("=" * 70)
print("SUMMARY: Latent Language Hypothesis Results")
print("=" * 70)

print(f"\nModel: {MODEL_NAME}")
print(f"Number of layers: {NUM_LAYERS}")

print("\n" + "-" * 70)
print(f"TRANSLATION TASK ({INPUT_LANG} \u2192 {TARGET_LANG})")
print("-" * 70)
trans_peak_en = translation_results['latent_token_probs'].mean(dim=0).argmax().item() + 1
trans_peak_out = translation_results['out_token_probs'].mean(dim=0).argmax().item() + 1
print(f"  Samples analyzed: {translation_results['latent_token_probs'].shape[0]}")
print(f"  Peak English (latent) probability at layer: {trans_peak_en}")
print(f"  Peak {TARGET_LANG} (output) probability at layer: {trans_peak_out}")
print(f"  Max English probability: {translation_results['latent_token_probs'].mean(dim=0).max():.4f}")
print(f"  Max {TARGET_LANG} probability: {translation_results['out_token_probs'].mean(dim=0).max():.4f}")

print("\n" + "-" * 70)
print(f"CLOZE TASK ({CLOZE_LANG})")
print("-" * 70)
cloze_peak_en = cloze_results['latent_token_probs'].mean(dim=0).argmax().item() + 1
cloze_peak_out = cloze_results['out_token_probs'].mean(dim=0).argmax().item() + 1
print(f"  Samples analyzed: {cloze_results['latent_token_probs'].shape[0]}")
print(f"  Peak English (latent) probability at layer: {cloze_peak_en}")
print(f"  Peak {CLOZE_LANG} (output) probability at layer: {cloze_peak_out}")
print(f"  Max English probability: {cloze_results['latent_token_probs'].mean(dim=0).max():.4f}")
print(f"  Max {CLOZE_LANG} probability: {cloze_results['out_token_probs'].mean(dim=0).max():.4f}")

print("\n" + "=" * 70)
print("INTERPRETATION")
print("=" * 70)
print("""
The latent language hypothesis predicts that:
1. English tokens should have higher probability in MIDDLE layers
2. Target language tokens should have higher probability in LATER layers

This pattern suggests the model:
- First processes input and maps to English-centric concept space
- Then translates from English concepts to target language output

If English peaks before the target language across layers, this supports
the hypothesis that English serves as an internal "pivot language".
""")
print("=" * 70)