# Transcript-Conditioned LLM Reasoning Demo

This notebook demonstrates that a language model does not internally commit to a secret in a 20-questions game. Instead, it reconstructs the secret from the visible transcript.

**Method:** We define a small secret space of animals, generate a unique Q&A transcript for each secret, and show that the same model instance declares a different secret depending solely on the transcript it sees.

## 1. Setup

In [None]:
!pip install --upgrade filelock Pillow "numpy>=1.24,<2" torch torchvision transformers accelerate huggingface_hub bitsandbytes

In [None]:
import getpass
from huggingface_hub import login

# Ask for the Hugging Face token interactively so it is never written into the notebook.
# NOTE(review): `token` stays defined in the kernel namespace after this cell runs.
token = getpass.getpass("Enter your HuggingFace token: ")
login(token=token)

In [None]:
import re

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

## 2. Secret Space Definition

In [None]:
# Feature columns shared by every secret, in the order they appear in each record.
_FEATURE_COLUMNS = (
    "mammal", "flies", "swims", "has_fur", "fish", "predator", "pet", "barks",
)

# One row per secret; each value lines up with the column of the same position above.
_Y, _N = True, False
_SECRET_ROWS = [
    #              mammal flies swims fur fish pred pet barks
    ("cat",        (_Y,   _N,   _N,   _Y, _N,  _Y,  _Y, _N)),
    ("dog",        (_Y,   _N,   _N,   _Y, _N,  _Y,  _Y, _Y)),
    ("eagle",      (_N,   _Y,   _N,   _N, _N,  _Y,  _N, _N)),
    ("salmon",     (_N,   _N,   _Y,   _N, _Y,  _N,  _N, _N)),
    ("shark",      (_N,   _N,   _Y,   _N, _Y,  _Y,  _N, _N)),
    ("crocodile",  (_N,   _N,   _Y,   _N, _N,  _Y,  _N, _N)),
]

# Mapping of secret name -> {feature name -> bool}, built from the matrix above.
SECRETS = {
    secret: dict(zip(_FEATURE_COLUMNS, row))
    for secret, row in _SECRET_ROWS
}

# Every secret must expose the same features in the same order.
feature_keys = list(next(iter(SECRETS.values())))
assert all(list(record) == feature_keys for record in SECRETS.values()), "Feature keys mismatch"

# No two secrets may share a feature vector, otherwise a transcript could not single one out.
vectors = [tuple(record.values()) for record in SECRETS.values()]
assert len(set(vectors)) == len(vectors), "Duplicate feature vectors found"

print(f"Defined {len(SECRETS)} secrets with {len(feature_keys)} features each.")
print(f"Features: {feature_keys}")

## 3. Question Templates

In [None]:
# Yes/no question asked for each feature; keys must mirror the feature names used in SECRETS.
QUESTIONS = dict(
    mammal="Is it a mammal?",
    flies="Does it fly?",
    swims="Does it swim?",
    has_fur="Does it have fur?",
    fish="Is it a fish?",
    predator="Is it a predator?",
    pet="Is it commonly kept as a pet?",
    barks="Does it bark?",
)

# Each feature needs exactly one question and vice versa.
assert set(QUESTIONS) == set(feature_keys), "Questions don't match features"
print("Question templates defined.")

## 4. Transcript Generator

In [None]:
def build_transcript(secret_name: str) -> str:
    """Render the question/answer dialogue implied by one secret.

    Questions are asked in ``feature_keys`` order. Each turn is a ``User:``
    line carrying the question from ``QUESTIONS`` followed by an
    ``Assistant:`` line answering ``Yes.`` or ``No.`` according to the
    secret's feature value.

    Args:
        secret_name: Key into ``SECRETS``.

    Returns:
        The dialogue as newline-separated lines, without a trailing newline.

    Raises:
        KeyError: If ``secret_name`` is not a key of ``SECRETS``.
    """
    answers = SECRETS[secret_name]
    turns = (
        f"User: {QUESTIONS[key]}\nAssistant: {'Yes.' if answers[key] else 'No.'}"
        for key in feature_keys
    )
    return "\n".join(turns)


# Preview one transcript so the User:/Assistant: line format can be checked by eye
print("=== Transcript for 'eagle' ===")
print(build_transcript("eagle"))

## 5. Prompt Builder

In [None]:
# Candidate names passed to build_prompt, in the insertion order of SECRETS.
SECRET_LIST = list(SECRETS.keys())


def build_prompt(secret_list: list[str], transcript: str) -> str:
    """Frame a transcript as a 20-questions game that ends with a reveal request.

    Args:
        secret_list: Candidate secret names; listed in the prompt, comma-separated.
        transcript: The Q&A dialogue so far, inserted verbatim.

    Returns:
        The full prompt text. It ends with ``Assistant:`` (no trailing
        whitespace) so the model's continuation is the revealed secret.
    """
    template = (
        "You are playing 20 questions. You have secretly chosen one of the following: "
        "{candidates}.\n"
        "Here is the conversation so far:\n\n"
        "{dialogue}\n\n"
        "User: I give up. What was the secret?\n"
        "Assistant:"
    )
    return template.format(candidates=", ".join(secret_list), dialogue=transcript)


# Preview the complete prompt the model would receive for the 'shark' transcript
print(build_prompt(SECRET_LIST, build_transcript("shark")))

## 6. Feasibility Checker

In [None]:
def compute_feasible_set(transcript: str) -> list[str]:
    """List the secrets whose features agree with every answer in a transcript.

    The transcript is scanned for a ``User:`` line immediately followed by an
    ``Assistant:`` line. A pair is used only when the question text equals one
    of the ``QUESTIONS`` values; the answer counts as True when it starts with
    "yes" (case-insensitive) and as False otherwise. If a question appears more
    than once, the last answer wins.

    Args:
        transcript: Dialogue text with one turn per line.

    Returns:
        Names from ``SECRETS`` (in its iteration order) that match all observed
        answers. With no recognised pairs, every secret is returned.
    """
    rows = [row.strip() for row in transcript.strip().split("\n")]

    observed = {}
    cursor = 0
    while cursor + 1 < len(rows):
        asked, replied = rows[cursor], rows[cursor + 1]
        if not (asked.startswith("User:") and replied.startswith("Assistant:")):
            # Not a question/answer pair starting here; slide forward one line.
            cursor += 1
            continue

        question_text = asked[len("User:"):].strip()
        said_yes = replied[len("Assistant:"):].strip().lower().startswith("yes")

        # First feature whose question text matches exactly; unknown questions are ignored.
        feature = next(
            (key for key, text in QUESTIONS.items() if text == question_text),
            None,
        )
        if feature is not None:
            observed[feature] = said_yes

        # Both lines of the pair are consumed, recognised or not.
        cursor += 2

    return [
        name
        for name, features in SECRETS.items()
        if all(features.get(key) == value for key, value in observed.items())
    ]


# Sanity check: a secret's own transcript must narrow the candidates down to exactly that secret
for name in SECRETS:
    feasible_names = compute_feasible_set(build_transcript(name))
    assert feasible_names == [name], f"Expected [{name}], got {feasible_names}"
    print(f"{name}: feasible set = {feasible_names}  ✓")

## 7. Model Loading

In [None]:
from transformers import BitsAndBytesConfig

# Hugging Face model identifier used for both the tokenizer and the weights.
MODEL_NAME = "meta-llama/Llama-3.1-70B-Instruct"

# Request 4-bit weight loading, with float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

print(f"Loading tokenizer from {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print(f"Loading model from {MODEL_NAME} (4-bit quantized)...")
# device_map="auto" leaves placement of the weights across devices to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=quantization_config,
)
# The notebook only runs inference, so switch the model to evaluation mode.
model.eval()

print("Model loaded.")

## 8. Inference Function

In [None]:
def run_experiment(transcript: str, seed: int = 0) -> str:
    """Ask the model to reveal the secret for a transcript and return its reply.

    The transcript is wrapped with ``build_prompt`` (using ``SECRET_LIST``),
    tokenized as plain text, and continued with sampled generation.

    Args:
        transcript: Q&A dialogue to show the model.
        seed: Seed applied to PyTorch's global RNG immediately before
            generation. Because sampling is enabled, fixing the seed makes the
            reply a function of the transcript alone, so repeated runs on the
            same transcript are comparable. Note this resets the global torch
            RNG state as a side effect.

    Returns:
        The newly generated text only (prompt tokens removed, special tokens
        skipped, surrounding whitespace stripped).
    """
    # NOTE(review): the prompt is tokenized as raw text; the tokenizer's chat
    # template is not applied even though MODEL_NAME is an instruct model.
    prompt = build_prompt(SECRET_LIST, transcript)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_len = inputs["input_ids"].shape[1]

    # Seed right before sampling so differences between runs come from the
    # transcript, not from whatever RNG state earlier cells left behind.
    torch.manual_seed(seed)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=50,
            temperature=0.1,
            do_sample=True,
            # EOS is reused as the padding id for generation.
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens
    generated = tokenizer.decode(output[0][input_len:], skip_special_tokens=True).strip()
    return generated

## 9. Experiment Loop

In [None]:
# Build the transcript implied by every secret up front
transcripts = {}
for name in SECRETS:
    transcripts[name] = build_transcript(name)

# Query the model once per transcript and record what it claims the secret was
results = {}
for name, transcript in transcripts.items():
    prediction = run_experiment(transcript)
    feasible = compute_feasible_set(transcript)
    results[name] = prediction

    print(
        f"Transcript implied: {name}",
        f"Feasible set size: {len(feasible)}",
        f"Feasible set: {feasible}",
        f"Model said: {prediction}",
        "-" * 40,
        sep="\n",
    )

## 10. Contradictory Case

This transcript is logically impossible — no secret matches all answers.

In [None]:
# Answers that no single secret can satisfy (mammal AND fish AND flies).
contradictory_transcript = """User: Is it a mammal?
Assistant: Yes.
User: Is it a fish?
Assistant: Yes.
User: Does it fly?
Assistant: Yes."""

feasible = compute_feasible_set(contradictory_transcript)
# Guard the premise of this section: if SECRETS is ever extended so that some
# secret fits these answers, the case is no longer contradictory.
assert not feasible, f"Transcript was meant to be contradictory, but matches {feasible}"
print(f"Feasible set size: {len(feasible)}")
print(f"Feasible set: {feasible}")
print()

contradictory_result = run_experiment(contradictory_transcript)
print(f"Contradictory transcript result:")
print(f"Model said: {contradictory_result}")
print("-" * 40)
print()

# Only draw the conclusion when the model actually named a candidate secret.
# Whole-word, case-insensitive matching avoids counting e.g. "cat" inside "indicate".
named_secrets = [
    secret
    for secret in SECRETS
    if re.search(rf"\b{re.escape(secret)}\b", contradictory_result, flags=re.IGNORECASE)
]
if named_secrets:
    print("The model produced an answer despite no valid secret existing.")
    print("This confirms the model reconstructs state from the transcript")
    print("rather than maintaining a committed hidden state.")
else:
    print("The model did not name any of the candidate secrets for this transcript.")
    print("Read its reply above before drawing a conclusion from this case.")

## 11. Summary

In [None]:
print("=" * 50)
print("RESULTS SUMMARY")
print("=" * 50)
print()

# Check if different transcripts produced different outputs
unique_outputs = set(results.values())
print(f"Unique model outputs: {len(unique_outputs)} (from {len(results)} transcripts)")
print()

for name, prediction in results.items():
    # Whole-word, case-insensitive match. A plain substring test would report
    # a hit for "cat" inside words such as "indicate" or "category".
    named = re.search(rf"\b{re.escape(name)}\b", prediction, flags=re.IGNORECASE)
    match_marker = "" if named else "  <-- MISMATCH"
    print(f"  {name:>12s} -> {prediction}{match_marker}")

print()
print(f"Contradictory -> {contradictory_result}")
print()
print("If most consistent transcripts yielded the correct secret,")
print("the model is reconstructing state from the transcript.")

## 12. Reproducibility Info

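The results above were produced in the environment recorded in the table below. Re-running the cell after the table prints the current environment for comparison; note that the generation settings it prints (temperature, max_new_tokens, do_sample) are fixed strings that mirror the inference function rather than values read back from the model.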

| Component      | Value                                  |
|----------------|----------------------------------------|
| Platform       | Linux 6.8.0-1040-nvidia-64k (Ubuntu 22.04.5 LTS) |
| Architecture   | aarch64                                |
| Python         | 3.10.12                                |
| PyTorch        | 2.7.0                                  |
| Transformers   | 5.1.0                                  |
| Accelerate     | 1.12.0                                 |
| Bitsandbytes   | 0.49.1                                 |
| NumPy          | ≥1.24, <2 (version range)              |
| GPU            | NVIDIA GH200 480GB (97,871 MiB)        |
| Model          | meta-llama/Llama-3.1-70B-Instruct      |
| Quantization   | 4-bit (BitsAndBytesConfig, fp16 compute) |
| Temperature    | 0.1                                    |
| max_new_tokens | 50                                     |
| do_sample      | True                                   |

In [None]:
import platform
import subprocess
import transformers
import bitsandbytes
import accelerate

def get_gpu_info():
    """Describe the visible GPUs using ``nvidia-smi``.

    Returns:
        The stripped ``nvidia-smi`` CSV output (name, total memory, driver
        version; one line per GPU) on success. Otherwise a short explanatory
        string: when the executable is missing, when it times out, when it
        exits with a non-zero status, or when it reports nothing. An empty
        string is never returned, so the caller's printout is never blank.
    """
    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=name,memory.total,driver_version", "--format=csv,noheader"],
            capture_output=True, text=True,
            # Do not let a hung driver query block the notebook indefinitely.
            timeout=30,
        )
    except FileNotFoundError:
        return "nvidia-smi not found"
    except subprocess.TimeoutExpired:
        return "nvidia-smi timed out"

    if result.returncode != 0:
        # Surface the tool's own error text instead of silently returning "".
        detail = (result.stderr or result.stdout or "").strip()
        message = f"nvidia-smi failed (exit code {result.returncode})"
        return f"{message}: {detail}" if detail else message

    return result.stdout.strip() or "nvidia-smi reported no GPUs"

# Print the environment of the current kernel so it can be compared with the reference table.
print("=" * 50)
print("REPRODUCIBILITY INFO")
print("=" * 50)
print()
print(f"Platform:        {platform.platform()}")
print(f"Architecture:    {platform.machine()}")
print(f"Python:          {platform.python_version()}")
print(f"PyTorch:         {torch.__version__}")
print(f"CUDA available:  {torch.cuda.is_available()}")
# The CUDA version line is printed only when CUDA is reported as available.
if torch.cuda.is_available():
    print(f"CUDA version:    {torch.version.cuda}")
print(f"Transformers:    {transformers.__version__}")
print(f"Accelerate:      {accelerate.__version__}")
print(f"Bitsandbytes:    {bitsandbytes.__version__}")
# NumPy is imported inline here; this cell does not import it by name.
print(f"NumPy:           {__import__('numpy').__version__}")
print()
print(f"GPU:             {get_gpu_info()}")
print()
print(f"Model:           {MODEL_NAME}")
# NOTE: the values below are literal strings, not read from the quantization
# config or from run_experiment; keep them in sync with model.generate()'s arguments.
print(f"Quantization:    4-bit (BitsAndBytesConfig)")
print(f"Temperature:     0.1")
print(f"max_new_tokens:  50")
print(f"do_sample:       True")