In [2]:
from pathlib import Path
from typing import List, Tuple, Sequence, Dict, Any
import numpy as np
import re
import os
from itertools import combinations
import matplotlib.pyplot as plt
import random
import string

from llama_cpp import Llama

# Repository root, resolved relative to the current working directory.
# This assumes the kernel runs from LlmStenoExplore/notebooks, so ".." is the repo root.
REPO_ROOT = Path("..").resolve()

# Maps a short model key to the location of its GGUF weights file below REPO_ROOT.
MODEL_REGISTRY = {
    "phi3_mini_q4": REPO_ROOT / "models/phi3/Phi-3-mini-4k-instruct-q4.gguf",
    "llama3_8b_q4_k_m": REPO_ROOT / "models/llama3_8b/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
}

def load_language_model(model_key: str) -> Llama:
    """
    Build a Llama instance for one of the models listed in MODEL_REGISTRY.

    Args:
        model_key: Key into MODEL_REGISTRY. Keys containing "llama3" get an
            8192-token context window; every other key gets 4096.

    Returns:
        A Llama object with no layers offloaded to the GPU (n_gpu_layers=0)
        and logits_all=True.

    Raises:
        KeyError: if model_key is not in MODEL_REGISTRY.
        FileNotFoundError: if the registered weights file does not exist.
    """
    gguf_path = MODEL_REGISTRY[model_key]
    if not gguf_path.exists():
        raise FileNotFoundError(f"Model file not found: {gguf_path}")

    context_window = 4096
    if "llama3" in model_key:
        context_window = 8192

    llama_options = dict(
        model_path=str(gguf_path),
        n_ctx=context_window,
        n_gpu_layers=0,
        # os.cpu_count() may return None; fall back to 4 threads in that case.
        n_threads=os.cpu_count() or 4,
        n_batch=256,
        # The rank helpers in this notebook read rows of `model.scores`;
        # presumably logits_all=True is what keeps those rows populated.
        logits_all=True,
        verbose=False,
    )
    return Llama(**llama_options)

# Load the model once at notebook scope; later cells use `llm` as the default `model` argument.
llm = load_language_model("llama3_8b_q4_k_m")

In [3]:

def _make_prefix_ids(prefix: str, model: Llama) -> List[int]:
    """
    Turn a textual prefix (k or k') into initial context token ids.

    - If prefix is non-empty: tokenize it and drop the BOS token
      (this matches the authors' implementation).
    - If prefix is empty: use a single BOS token.

    This is used both in encoding (get_token_ranks_like_paper)
    and decoding (decode_from_ranks_like_paper), so empty/non-empty
    keys are treated consistently everywhere.
    """
    if prefix:
        # Tokenize with BOS, then drop BOS (index 0)
        token_ids = model.tokenize(prefix.encode("utf-8"), add_bos=True)
        return token_ids[1:]
    else:
        # No textual prefix: start context from BOS
        return [model.token_bos()]

In [4]:

def get_token_ranks_like_paper(
    text: str,
    model: Llama,
    prefix: str = "A text:",
) -> List[int]:
    """
    Compute the 1-based rank of every token of `text` under the model.

    Following the paper's recipe, the context starts from the prefix k'
    (built by _make_prefix_ids). For each token e_i of `text`, the rank is
    the position of e_i when the logits for the next token, given
    k', e_1, ..., e_{i-1}, are sorted in descending order.

    Args:
        text: Text e to rank. Characters that cannot be encoded as UTF-8 are
            dropped, and the text is tokenized with a leading space.
        model: Llama instance. Its evaluation state is reset by this call.
        prefix: Context k' placed before the text ("" means BOS only).

    Returns:
        One rank per token of the tokenized text (1 = highest logit).

    Raises:
        RuntimeError: if a token id is not found among the sorted logit indices.
    """
    # Initial context tokens from k' (or BOS if prefix == "").
    prefix_ids = _make_prefix_ids(prefix, model)

    # Drop anything that is not valid UTF-8, then tokenize with a leading space.
    text = text.encode("utf-8", errors="ignore").decode("utf-8")
    text_ids = list(model.tokenize((" " + text).encode("utf-8"), add_bos=True))

    # add_bos=True only allows a BOS to be prepended; remove it only when it
    # is really there, so a tokenizer that emits no BOS keeps its first token.
    if text_ids and text_ids[0] == model.token_bos():
        text_ids = text_ids[1:]

    model.reset()
    model.eval(prefix_ids)

    ranks: List[int] = []

    for token_id in text_ids:
        # Logits for the next token given everything evaluated so far.
        logits = np.array(model.scores[model.n_tokens - 1], dtype=np.float32)

        # Descending order via argsort + reverse. decode_from_ranks_like_paper
        # sorts the same way; keep the two in sync so ranks round-trip.
        sorted_indices = np.argsort(logits)[::-1]
        positions = np.where(sorted_indices == token_id)[0]
        if positions.size == 0:
            raise RuntimeError(f"Token id {token_id} not found in logits")
        ranks.append(int(positions[0]) + 1)

        # Extend the context with the token that was just ranked.
        model.eval([token_id])

    return ranks

In [5]:

def decode_from_ranks_like_paper(
    prompt: str,
    ranks: List[int],
    model: Llama,
) -> str:
    """
    Generate text by following a fixed rank sequence under a prompt.

    The prompt (k or k') becomes the initial context via _make_prefix_ids.
    For every rank r_i the next-token logits are sorted in descending order,
    the r_i-th entry is chosen, appended, and fed back to the model. The
    whole id sequence (context + chosen tokens) is then detokenized.

    Args:
        prompt: Key or prefix text; may be "".
        ranks: 1-based ranks, one per token to generate.
        model: Llama instance. Its evaluation state is reset by this call.

    Returns:
        The detokenized text. For a non-empty prompt that the text starts
        with, the prompt and any following leading whitespace are removed;
        otherwise the detokenized text is returned unchanged.

    Raises:
        ValueError: if a rank is below 1 or above the number of logits.
    """
    context_ids = _make_prefix_ids(prompt, model)

    model.reset()
    model.eval(context_ids)

    # Work on a copy so the context list itself is never extended.
    sequence_ids = list(context_ids)

    for rank in ranks:
        step_logits = np.array(model.scores[model.n_tokens - 1], dtype=np.float32)
        descending_order = np.argsort(step_logits)[::-1]

        if rank < 1 or rank > len(descending_order):
            raise ValueError(
                f"Rank {rank} out of range for vocabulary size {len(descending_order)}"
            )

        chosen_id = int(descending_order[rank - 1])
        sequence_ids.append(chosen_id)
        model.eval([chosen_id])

    full_text = model.detokenize(sequence_ids).decode("utf-8", errors="ignore")

    # With an empty prompt the context was only BOS, so nothing is stripped here.
    if not prompt:
        return full_text
    if full_text.startswith(prompt):
        return full_text[len(prompt):].lstrip()
    return full_text

In [6]:

def hide_text_token_level(
    secret_text: str,
    secret_prefix: str,
    secret_key: str,
    model: Llama = llm,
) -> Tuple[str, List[int]]:
    """
    Encode a secret text into a stegotext (e -> ranks -> s).

    The ranks of `secret_text` are computed after the prefix k'
    (`secret_prefix`); the stegotext is then generated from the key k
    (`secret_key`) by following those ranks.

    Returns:
        (stegotext, ranks) - the generated text and the rank sequence used.
    """
    secret_ranks = get_token_ranks_like_paper(
        secret_text,
        model=model,
        prefix=secret_prefix,
    )
    generated = decode_from_ranks_like_paper(
        secret_key,
        ranks=secret_ranks,
        model=model,
    )
    return generated, secret_ranks


def reveal_text_token_level(
    stegotext: str,
    secret_prefix: str,
    secret_key: str,
    model: Llama = llm,
) -> str:
    """
    Decode a stegotext back into the secret text (s -> ranks -> e).

    The ranks of `stegotext` are computed under the key k (`secret_key`);
    following the same ranks under the prefix k' (`secret_prefix`) yields
    the reconstructed secret.

    Returns:
        The text reconstructed from the recovered ranks.
    """
    ranks_from_stego = get_token_ranks_like_paper(
        stegotext,
        model=model,
        prefix=secret_key,
    )
    return decode_from_ranks_like_paper(
        secret_prefix,
        ranks=ranks_from_stego,
        model=model,
    )

In [7]:
def run_example(
    secret_text: str,
    secret_prefix: str,
    secret_key: str,
    model: Llama = llm,
) -> None:
    """
    Encode `secret_text`, decode it again, and print each intermediate result.

    Printed, in order: the secret text, the rank sequence and its length,
    the stegotext, the token counts of secret and stegotext, the recovered
    text, and whether it equals the secret.

    Raises:
        AssertionError: if the token count of the secret or of the stegotext
            differs from the number of ranks. This check runs before decoding.
    """
    def token_count(text: str) -> int:
        # Leading space and first token dropped, the same scheme
        # get_token_ranks_like_paper uses for its text tokens.
        return len(model.tokenize((" " + text).encode("utf-8"), add_bos=True)[1:])

    print("=" * 80)
    print("Secret text e:", secret_text, "", sep="\n")

    # Encode: e -> ranks_e -> stegotext
    stegotext, ranks_e = hide_text_token_level(
        secret_text=secret_text,
        secret_prefix=secret_prefix,
        secret_key=secret_key,
        model=model,
    )
    rank_count = len(ranks_e)

    print("ranks_e (len = {}):".format(rank_count))
    print(ranks_e)
    print()

    print("Stegotext s:")
    print(stegotext)
    print()

    secret_token_count = token_count(secret_text)
    stego_token_count = token_count(stegotext)

    print("Secret tokens :", secret_token_count)
    print("Stego tokens  :", stego_token_count)
    print("len(ranks_e)  :", rank_count)
    print()

    # Sanity checks: both texts must re-tokenize to exactly one token per rank.
    assert secret_token_count == rank_count, "Token count for e does not match len(ranks_e)"
    assert stego_token_count == rank_count, "Token count for s does not match len(ranks_e)"

    # Decode: s -> ranks -> e
    recovered_text = reveal_text_token_level(
        stegotext=stegotext,
        secret_prefix=secret_prefix,
        secret_key=secret_key,
        model=model,
    )

    print("Recovered e:")
    print(recovered_text)
    print("Recovered == secret_text:", recovered_text == secret_text)
    print("-" * 60)


In [8]:
# Example 1: upper-case secret, non-empty prefix k', recipe-style key k.
secret_text_1  = "THE CURRENT SYSTEM HAS REPEATEDLY FAILED"  # e
secret_prefix_1 = "A text:"   # k'
secret_key_1    = "Here it is: the infamous British roasted boar with mint sauce. How to make it perfect."  # k

run_example(secret_text_1, secret_prefix_1, secret_key_1, model=llm)

# Example 2: empty prefix k', so the secret is ranked starting from BOS only.
secret_text_2  = "The cats like to meow all the time. It is annoying."  # e
secret_prefix_2 = ""  # k'
secret_key_2    = "The cat is a feline member just like lions and tigers but much smaller."  # k

run_example(secret_text_2, secret_prefix_2, secret_key_2, model=llm)


Secret text e:
THE CURRENT SYSTEM HAS REPEATEDLY FAILED

ranks_e (len = 9):
[164, 639, 21, 10, 10, 17, 1, 1, 1]

Stegotext s:
Get sufficient roas tting time. The

Secret tokens : 9
Stego tokens  : 9
len(ranks_e)  : 9

Recovered e:
THE CURRENT SYSTEM HAS REPEATEDLY FAILED
Recovered == secret_text: True
------------------------------------------------------------
Secret text e:
The cats like to meow all the time. It is annoying.

ranks_e (len = 14):
[89803, 14969, 58, 1, 106, 1, 26, 1, 1, 1, 6, 2, 6, 1]

Stegotext s:
önemlidir Ringvorstellung Schriftgutachten. They have sharp claws

Secret tokens : 14
Stego tokens  : 14
len(ranks_e)  : 14

Recovered e:
The cats like to meow all the time. It is annoying.
Recovered == secret_text: True
------------------------------------------------------------


# goal context

Conduct research on the sensitivity of the stegotext to the key used to produce it. For instance, given one key, e.g. 'I like cats', and another, 'I like kittens', calculate the distance between the two keys and then the corresponding distance between the two stegotexts, to see how a difference between keys maps to a difference between stegotexts. Do this for a few examples to see how it works out. Effectively, we want some simple examples of $(d_k, d_s \mid e)$: for the same message `e` and a pair of keys, produce the stegotexts `s` and compute the distances `d_k` and `d_s`.

What we are studying here is the map

k -> s(k;e)

For a fixed hidden message e: how much does changing the key k change the stegotext s? We want empirical pairs

$$\big(d_k(k_1, k_2),\ d_s(s_1, s_2) \mid e\big), \quad \text{with } s_i = s(k_i; e)$$


## Distances for keys and stegotexts

1. Character level edit distance, Normalized Levenshtein:

$$d_k^{\text{char}}(k_1, k_2) = \frac{\text{edit\_distance}(k_1, k_2)}{\max(|k_1|, |k_2|, 1)}$$

This says how much you literally had to edit the string.

(%pip install python-Levenshtein)

2. Token level distance using the same tokenizer as the LLM. Since the protocol is token based, it is natural to look at key distance in token space.

Let key_token_ids(k) be the tokenization you already use for prompts (via _make_prefix_ids, but without the BOS heuristic). For two keys with token sequences of possibly different lengths we can use a normalized edit distance in token space.

3. Embedding distance

Using a sentence embedding model (for example a small sentence transformer), we can also measure the cosine distance between embeddings of keys:

$$d_k^{\text{emb}}(k_1, k_2) = 1 - \cos\big(\text{emb}(k_1), \text{emb}(k_2)\big)$$

This tells you how far apart the prompts are semantically, not just lexically.

4. Token level Hamming distance under the Llama tokenizer.

For a fixed e, all stegotexts have exactly the same number of tokens (they always follow the same rank sequence), so this distance is very clean:

$$d_s^{\text{tok}}(s_1, s_2) = \frac{1}{n} \sum_{i=1}^{n} \mathbb{1}\big[t_i^{(1)} \neq t_i^{(2)}\big]$$

where $t_i^{(j)}$ is the i-th token of stegotext $s_j$ in the Llama tokenizer and $n$ is the common length. This is a per-position token mismatch rate.


# plan

scatter of key distance vs stego distance

Fix a secret text e (something like 10-ish words).
Fix a prefix k' (or "").
Generate many keys of similar length (for example, 5 words).
For many key pairs (k1, k2):
compute d_k (Levenshtein between keys),
compute d_s (Levenshtein between corresponding stegotexts).

Plot d_k on the x axis, d_s on the y axis, and save to results

In [9]:
import Levenshtein

def levenshtein_raw(a: str, b: str) -> int:
    """
    Return the unnormalized Levenshtein edit distance between `a` and `b`,
    as computed by `Levenshtein.distance`.
    """
    edit_distance = Levenshtein.distance(a, b)
    return edit_distance


def levenshtein_normalized(a: str, b: str) -> float:
    """
    Return the Levenshtein distance between `a` and `b` divided by the
    length of the longer string.

    Identical strings give 0.0 and larger values mean more different
    strings. Two empty strings give 0.0 rather than dividing by zero.
    """
    edit_distance = Levenshtein.distance(a, b)
    longest = len(a) if len(a) >= len(b) else len(b)
    return edit_distance / longest if longest else 0.0

In [12]:
def compute_key_and_stego_distances(
    secret_text: str,
    secret_prefix: str,
    secret_key_one: str,
    secret_key_two: str,
    model: Llama = llm,
) -> Dict[str, Any]:
    """
    For a fixed secret message e and prefix k', generate the stegotexts for
    two keys and measure how far apart the keys and the stegotexts are.

    The rank sequence depends only on (secret_text, secret_prefix, model),
    so it is computed once and reused for the second key instead of running
    the ranking pass over the secret a second time.

    Args:
        secret_text: Hidden message e.
        secret_prefix: Prefix k' used when ranking e.
        secret_key_one: First key k1.
        secret_key_two: Second key k2.
        model: Llama instance used for ranking and generation.

    Returns:
        A dictionary with:
          - secret_key_one, secret_key_two
          - stegotext_one, stegotext_two
          - d_k_raw, d_k_norm: Levenshtein distance between the keys
          - d_s_raw, d_s_norm: Levenshtein distance between the stegotexts
    """
    # Key 1 goes through the full encode pipeline, which also yields the ranks.
    stegotext_one, ranks = hide_text_token_level(
        secret_text=secret_text,
        secret_prefix=secret_prefix,
        secret_key=secret_key_one,
        model=model,
    )

    # Key 2 reuses those ranks; only the generation under the new key is needed.
    stegotext_two = decode_from_ranks_like_paper(
        prompt=secret_key_two,
        ranks=ranks,
        model=model,
    )

    return {
        "secret_key_one": secret_key_one,
        "secret_key_two": secret_key_two,
        "stegotext_one": stegotext_one,
        "stegotext_two": stegotext_two,
        "d_k_raw": levenshtein_raw(secret_key_one, secret_key_two),
        "d_k_norm": levenshtein_normalized(secret_key_one, secret_key_two),
        "d_s_raw": levenshtein_raw(stegotext_one, stegotext_two),
        "d_s_norm": levenshtein_normalized(stegotext_one, stegotext_two),
    }

In [13]:
# Fixed secret e and prefix k' for a single two-key comparison.
secret_text = "The cats like to meow all the time. It is annoying."
secret_prefix = ""  # or "A text:" if you want to use a prefix

# Two keys that differ only in their last word.
secret_key_one = "I like cats"
secret_key_two = "I like kittens"

result = compute_key_and_stego_distances(
    secret_text=secret_text,
    secret_prefix=secret_prefix,
    secret_key_one=secret_key_one,
    secret_key_two=secret_key_two,
    model=llm,
)

# Print every field of the result: keys, stegotexts, and the raw/normalized distances.
for key, value in result.items():
    print(f"{key}: {value}")

secret_key_one: I like cats
secret_key_two: I like kittens
stegotext_one: asticsearch slack-github**

I need to find the common interests that are
stegotext_two: uyếnTek và các chuyến bay an toàn. I have been following your
d_k_raw: 5
d_k_norm: 0.35714285714285715
d_s_raw: 58
d_s_norm: 0.8055555555555556


In [11]:

def precompute_ranks_for_secret(
    secret_text: str,
    secret_prefix: str,
    model: Llama = llm,
) -> List[int]:
    """
    Return the rank sequence of `secret_text` (e) under `secret_prefix` (k').

    Intended to be called once per secret so the same ranks can be reused
    for many different keys k.
    """
    return get_token_ranks_like_paper(
        secret_text,
        model=model,
        prefix=secret_prefix,
    )


def generate_stegotext_from_ranks(
    ranks: List[int],
    secret_key: str,
    model: Llama = llm,
) -> str:
    """
    Return the stegotext s(k; e) obtained by following `ranks` with the key
    `secret_key` as the prompt.
    """
    return decode_from_ranks_like_paper(
        secret_key,
        ranks=ranks,
        model=model,
    )

In [17]:
# Word pool that random keys are drawn from when no vocabulary is supplied.
DEFAULT_KEY_VOCABULARY = [
    "cats", "kittens", "dogs", "puppies", "music",
    "books", "coffee", "travel", "coding", "movies",
    "reading", "walking", "running", "summer", "winter",
    "sunny", "rainy", "happy", "sad", "quiet",
]


def generate_random_keys(
    number_of_keys: int,
    number_of_words: int,
    random_seed: int = 0,
    vocabulary: Sequence[str] = DEFAULT_KEY_VOCABULARY,
) -> List[str]:
    """
    Build `number_of_keys` keys, each made of `number_of_words` words drawn
    independently (with replacement) from `vocabulary`.

    The words are joined by single spaces, the result is passed through
    str.capitalize(), and a full stop is appended, e.g.

        "Puppies cats coding travel travel."

    The same `random_seed` always yields the same list. Keys are not
    de-duplicated, so the list may contain repeats.
    """
    rng = random.Random(random_seed)

    def draw_key() -> str:
        chosen_words = []
        for _ in range(number_of_words):
            chosen_words.append(rng.choice(vocabulary))
        return " ".join(chosen_words).capitalize() + "."

    return [draw_key() for _ in range(number_of_keys)]


In [24]:
def generate_stegotexts_for_keys(
    ranks_for_secret: List[int],
    keys: Sequence[str],
    model: Llama = llm,
) -> Dict[str, str]:
    """
    Map every key to the stegotext generated from it for one fixed secret.

    The secret is represented by `ranks_for_secret`; each key is used as the
    prompt for one generation. A key that occurs more than once in `keys`
    is generated again and overwrites its earlier entry.
    """
    return {
        key: generate_stegotext_from_ranks(
            ranks=ranks_for_secret,
            secret_key=key,
            model=model,
        )
        for key in keys
    }


def compute_pairwise_key_and_stego_distances(
    keys: Sequence[str],
    stegotext_by_key: Dict[str, str],
) -> List[Dict[str, Any]]:
    """
    Compute key and stegotext Levenshtein distances for every unordered
    pair of keys.

    Args:
        keys: Keys to pair up (all 2-combinations, in input order).
        stegotext_by_key: Stegotext for each key; must contain every key.

    Returns:
        One record per pair with key_one, key_two, stego_one, stego_two and
        the raw / normalized distances d_k_raw, d_k_norm, d_s_raw, d_s_norm.
        Normalization divides by the longer string's length (at least 1).

    Raises:
        KeyError: if a key has no entry in `stegotext_by_key`.
    """
    def raw_and_normalized(left, right):
        raw = Levenshtein.distance(left, right)
        return raw, raw / max(len(left), len(right), 1)

    records: List[Dict[str, Any]] = []

    for key_one, key_two in combinations(keys, 2):
        stego_one, stego_two = stegotext_by_key[key_one], stegotext_by_key[key_two]

        d_k_raw, d_k_norm = raw_and_normalized(key_one, key_two)
        d_s_raw, d_s_norm = raw_and_normalized(stego_one, stego_two)

        records.append(
            {
                "key_one": key_one,
                "key_two": key_two,
                "stego_one": stego_one,
                "stego_two": stego_two,
                "d_k_raw": d_k_raw,
                "d_k_norm": d_k_norm,
                "d_s_raw": d_s_raw,
                "d_s_norm": d_s_norm,
            }
        )

    return records

In [8]:
def plot_key_vs_stego_levenshtein(
    pairwise_records: Sequence[Dict[str, Any]],
    use_normalized: bool = True,
    output_directory: Path = REPO_ROOT / "results",
    output_filename: str = "key_vs_stego_levenshtein_scatter.png",
) -> Path:
    """
    Save a scatter plot of key distance (x) against stegotext distance (y).

    Args:
        pairwise_records: Records carrying d_k_* and d_s_* fields.
        use_normalized: Plot the *_norm fields if True, the *_raw fields otherwise.
        output_directory: Target directory; created if it does not exist.
        output_filename: File name of the image inside `output_directory`.

    Returns:
        Path of the saved image.
    """
    output_directory.mkdir(parents=True, exist_ok=True)

    if use_normalized:
        x_field, y_field = "d_k_norm", "d_s_norm"
        x_label = "Key Levenshtein distance (normalized)"
        y_label = "Stegotext Levenshtein distance (normalized)"
    else:
        x_field, y_field = "d_k_raw", "d_s_raw"
        x_label = "Key Levenshtein distance (raw)"
        y_label = "Stegotext Levenshtein distance (raw)"

    x_values = [record[x_field] for record in pairwise_records]
    y_values = [record[y_field] for record in pairwise_records]

    figure, axes = plt.subplots()
    axes.scatter(x_values, y_values, alpha=0.7)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    axes.set_title("Sensitivity of stegotext to key (Levenshtein distance)")
    axes.grid(True)

    output_path = output_directory / output_filename
    figure.savefig(output_path, dpi=200, bbox_inches="tight")
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(figure)

    print(f"Saved scatter plot to: {output_path}")
    return output_path


In [25]:
# Choose the secret text e and prefix k' that stay fixed for the whole experiment
secret_text = "Cats like to meow all the time, it is annoying."
secret_prefix = ""  # or "A text:" if you want to condition e

# Precompute ranks for the secret once; every key below reuses them
ranks_for_secret = precompute_ranks_for_secret(
    secret_text=secret_text,
    secret_prefix=secret_prefix,
    model=llm,
)

# Generate a bunch of keys with the same number of words each
number_of_keys = 30
number_of_words_per_key = 5

keys = generate_random_keys(
    number_of_keys=number_of_keys,
    number_of_words=number_of_words_per_key,
    random_seed=42,
)

# Stegotexts for each key (one generation per key)
stegotext_by_key = generate_stegotexts_for_keys(
    ranks_for_secret=ranks_for_secret,
    keys=keys,
    model=llm,
)

# All pairwise distances (d_k, d_s | e)
pairwise_records = compute_pairwise_key_and_stego_distances(
    keys=keys,
    stegotext_by_key=stegotext_by_key,
)

# Inspect the first three records
for record in pairwise_records[:3]:
    print("===")
    print("key_one:", record["key_one"])
    print("key_two:", record["key_two"])
    print("d_k_norm:", record["d_k_norm"])
    print("d_s_norm:", record["d_s_norm"])

# Plot and save under REPO_ROOT / "results" (the function's default output directory)
plot_key_vs_stego_levenshtein(
    pairwise_records=pairwise_records,
    use_normalized=True,
)


===
key_one: Puppies cats coding travel travel.
key_two: Music puppies happy dogs sad.
d_k_norm: 0.7941176470588235
d_s_norm: 0.9074074074074074
===
key_one: Puppies cats coding travel travel.
key_two: Summer kittens cats dogs coffee.
d_k_norm: 0.7647058823529411
d_s_norm: 0.8703703703703703
===
key_one: Puppies cats coding travel travel.
key_two: Travel rainy quiet cats happy.
d_k_norm: 0.7352941176470589
d_s_norm: 0.9464285714285714
Saved scatter plot to: /home/meow/Documents/repos/LlmStenoExplore/results/key_vs_stego_levenshtein_scatter.png


PosixPath('/home/meow/Documents/repos/LlmStenoExplore/results/key_vs_stego_levenshtein_scatter.png')

In [9]:
import random
import string
import Levenshtein


def make_key_with_substitution_edits(
    base_key: str,
    number_of_edits: int,
    random_seed: int = None,
) -> str:
    """
    Return a variant of `base_key` in which `number_of_edits` alphabetic
    characters have been substituted (no insertions or deletions).

    Positions are sampled without replacement among the alphabetic
    characters, and each one is replaced by a different ASCII letter, so
    the result differs from `base_key` in exactly that many positions and
    has the same length. Spaces, digits and punctuation are left intact.

    The number of substituted positions is only an upper bound on the
    Levenshtein distance: substitutions that happen to shift the text
    (e.g. "abcd" -> "bcda", 4 substitutions, Levenshtein distance 2) can
    be explained by fewer edits. Callers that need the true distance must
    measure it themselves; this function does not assert it.

    Args:
        base_key: Key to mutate.
        number_of_edits: Number of characters to substitute. Values <= 0
            return `base_key` unchanged; values above the number of
            alphabetic characters are clamped to that number.
        random_seed: Seed for the private random generator; the same seed
            and arguments always yield the same variant.

    Returns:
        The mutated key, or `base_key` itself if nothing can be edited.
    """
    if number_of_edits <= 0:
        return base_key

    random_generator = random.Random(random_seed)

    characters = list(base_key)
    editable_positions = [
        index for index, character in enumerate(characters) if character.isalpha()
    ]

    if not editable_positions:
        # Nothing alphabetic to edit.
        return base_key

    # Clamp the edits to the number of editable positions.
    edits_to_apply = min(number_of_edits, len(editable_positions))
    positions_to_edit = random_generator.sample(editable_positions, edits_to_apply)

    alphabet = string.ascii_letters

    for index in positions_to_edit:
        original_character = characters[index]
        # Exclude the current character so every chosen position really changes.
        possible_replacements = [ch for ch in alphabet if ch != original_character]
        characters[index] = random_generator.choice(possible_replacements)

    return "".join(characters)


from typing import List, Dict, Any, Sequence
from pathlib import Path
import matplotlib.pyplot as plt


def key_distance_sweep_against_base(
    secret_text: str,
    secret_prefix: str,
    base_key: str,
    edit_counts: Sequence[int],
    samples_per_edit: int,
    model: Llama = llm,
) -> List[Dict[str, Any]]:
    """
    Compare the stegotext of a base key with the stegotexts of mutated keys.

    The secret's ranks and the base stegotext are computed once. Then, for
    every edit count and every sample index, a variant of `base_key` is
    created with make_key_with_substitution_edits (seeded with
    1000 + edit_count * 100 + sample_index), its stegotext is generated,
    and both distances to the base are measured.

    Returns:
        One record per (edit_count, sample) pair with base_key, mutated_key,
        edit_count, stego_base, stego_mutated and the raw / normalized
        Levenshtein distances d_k_raw, d_k_norm, d_s_raw, d_s_norm.
        Normalization divides by the longer string's length (at least 1).
    """
    # Ranks of the secret, shared by every generation below.
    secret_ranks = precompute_ranks_for_secret(
        secret_text=secret_text,
        secret_prefix=secret_prefix,
        model=model,
    )

    # Reference stegotext produced by the unmodified key.
    base_stegotext = generate_stegotext_from_ranks(
        ranks=secret_ranks,
        secret_key=base_key,
        model=model,
    )

    def normalize(raw_distance, first, second):
        return raw_distance / max(len(first), len(second), 1)

    sweep_records: List[Dict[str, Any]] = []

    for edit_count in edit_counts:
        for sample_index in range(samples_per_edit):
            variant_seed = 1000 + edit_count * 100 + sample_index
            variant_key = make_key_with_substitution_edits(
                base_key=base_key,
                number_of_edits=edit_count,
                random_seed=variant_seed,
            )

            variant_stegotext = generate_stegotext_from_ranks(
                ranks=secret_ranks,
                secret_key=variant_key,
                model=model,
            )

            key_raw = Levenshtein.distance(base_key, variant_key)
            key_norm = normalize(key_raw, base_key, variant_key)

            stego_raw = Levenshtein.distance(base_stegotext, variant_stegotext)
            stego_norm = normalize(stego_raw, base_stegotext, variant_stegotext)

            sweep_records.append(
                {
                    "base_key": base_key,
                    "mutated_key": variant_key,
                    "edit_count": edit_count,
                    "stego_base": base_stegotext,
                    "stego_mutated": variant_stegotext,
                    "d_k_raw": key_raw,
                    "d_k_norm": key_norm,
                    "d_s_raw": stego_raw,
                    "d_s_norm": stego_norm,
                }
            )

    return sweep_records


def build_uniform_edit_counts(base_key: str, number_of_levels: int) -> List[int]:
    """
    Return sorted, de-duplicated edit counts spread evenly between 0 and the
    number of alphabetic characters in `base_key`.

    Level i of `number_of_levels` maps to
    round(i / (number_of_levels - 1) * alphabetic_count). A key without
    alphabetic characters yields [0]; `number_of_levels` <= 0 yields [].
    Because only alphabetic characters are counted, the largest count is
    the alphabetic count, not the full key length.
    """
    alphabetic_total = len([character for character in base_key if character.isalpha()])
    if not alphabetic_total:
        return [0]

    last_level = max(number_of_levels - 1, 1)
    # A set drops the duplicates that rounding produces when there are more
    # levels than alphabetic characters.
    distinct_counts = {
        int(round(level / last_level * alphabetic_total))
        for level in range(number_of_levels)
    }
    return sorted(distinct_counts)

def plot_sweep_key_vs_stego_levenshtein(
    records: Sequence[Dict[str, Any]],
    output_directory: Path = REPO_ROOT / "results",
    output_filename: str = "key_vs_stego_levenshtein_sweep.png",
) -> Path:
    """
    Save a scatter plot of d_k_norm (x) against d_s_norm (y) for the
    base-vs-variant sweep.

    Args:
        records: Sweep records carrying "d_k_norm" and "d_s_norm".
        output_directory: Target directory; created if it does not exist.
        output_filename: File name of the image inside `output_directory`.

    Returns:
        Path of the saved image.
    """
    output_directory.mkdir(parents=True, exist_ok=True)

    key_distances = [record["d_k_norm"] for record in records]
    stego_distances = [record["d_s_norm"] for record in records]

    figure, axes = plt.subplots()
    axes.scatter(key_distances, stego_distances, alpha=0.7)
    axes.set_xlabel("Key Levenshtein distance (normalized, base vs variant)")
    axes.set_ylabel("Stegotext Levenshtein distance (normalized)")
    axes.set_title("Sensitivity of stegotext to key edits (base key sweep)")
    axes.grid(True)

    output_path = output_directory / output_filename
    figure.savefig(output_path, dpi=200, bbox_inches="tight")
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(figure)

    print(f"Saved sweep scatter plot to: {output_path}")
    return output_path




In [12]:
# Fixed secret e, prefix k' and base key for the sweep
secret_text = "The cats like to meow all the time. It is annoying."
secret_prefix = ""  # or "A text:" if you prefer
base_key = "Puppies cats coding travel travel."

# Build up to 30 edit levels, from 0 edits to every alphabetic character of base_key
edit_counts = build_uniform_edit_counts(base_key=base_key, number_of_levels=30)

samples_per_edit = 5  # number of mutated keys generated per edit count

sweep_records = key_distance_sweep_against_base(
    secret_text=secret_text,
    secret_prefix=secret_prefix,
    base_key=base_key,
    edit_counts=edit_counts,
    samples_per_edit=samples_per_edit,
    model=llm,
)

# Save the scatter plot; the returned path is shown as the cell output
plot_sweep_key_vs_stego_levenshtein(sweep_records)


Saved sweep scatter plot to: /home/meow/Documents/repos/LlmStenoExplore/results/key_vs_stego_levenshtein_sweep.png


PosixPath('/home/meow/Documents/repos/LlmStenoExplore/results/key_vs_stego_levenshtein_sweep.png')

The new plot *does* show a very strong "avalanche-like" sensitivity, but the map is not literally a cryptographic hash. It is more like:

> For almost any nonzero character level change to the key, the resulting stegotext is almost as different as it can be (by Levenshtein), given the fixed length and language constraints.


---

## 1. What the plot is really showing

- We fix:
  - secret text `e`
  - prefix `k'`
  - base key `k_base`
- For each mutated key `k_variant` :
  - change some number of characters in `k_base` (no insertions, only substitutions),
  - generate stegotext `s_base = s(k_base; e)`,
  - generate `s_variant = s(k_variant; e)`,
  - compute `d_k_norm(k_base, k_variant)` and `d_s_norm(s_base, s_variant)`.

The plot shows:

- one point at `(0, 0)` (same key -> same stegotext), and  
- for any nonzero `d_k_norm`, almost all `d_s_norm` values are in roughly `[0.8, 0.95]`.


> Once the key differs at all in characters, the stegotext looks almost maximally different at the character level.

That is exactly the flat band seen in the plot.

---

## 2. Why a tiny change in the key can produce a huge change in the stegotext

Two effects combine here.

### 2.1 Character edits are huge from the model's perspective

Your `make_key_with_substitution_edits` changes individual characters inside words. To Levenshtein this is a tiny change. But to the Llama tokenizer and model it can be catastrophic:

- `"coding"` -> `"coxing"` or `"codxng"` often changes the tokenization completely.
- Many mutated words become out of distribution or extremely rare subword tokens.

So from the LLM's point of view, even one or two character substitutions often mean:

- "clean normal prompt" vs "weird noisy prompt".

The model's internal representation of the context (and therefore its probability ordering over the next token) can change dramatically, even though Levenshtein says "distance 1 or 2".

The sweep is really probing something like:

> "How different are stegotexts when I move the key from a natural English sentence to various corrupted versions of that sentence?"

Given the paper's protocol, the key `k` only enters through the conditional distribution `p(. | k, s_{<i})`. If the context embedding changes a lot, the sorted rank order of tokens at each step changes a lot, and with a fixed rank sequence `r_i` you get almost entirely different tokens.

### 2.2 Fixed rank sequence + different contexts is like random relabeling

For each token position `i`, the protocol does:

- For key `k_1`: choose the token at rank `r_i` under `p(. | k_1, s^{(1)}_<i>)`.
- For key `k_2`: choose the token at the same rank `r_i` but under `p(. | k_2, s^{(2)}_<i>)`.

If you roughly model "sorted vocab by probability under context 1" and "sorted vocab under context 2" as two different permutations of the vocabulary, then "rank `r_i` under context 1" and "rank `r_i` under context 2" will almost never be the same token. That means at each position:

- Probability that the two stegotexts share the same token is very small.
- So the expected fraction of matching tokens is tiny, and the normalized Levenshtein distance is close to 1.

Because your stegotexts are of moderate length and use natural-language tokens, you see roughly `0.8-0.95` rather than literally `1.0`, but it is clearly very high.

---

## 3. Is this "like a hash function"?

In spirit, yes in one important sense, but no in several others.

### 3.1 How it is similar to a hash

We are seeing an avalanche-like effect:

> Any nonzero small character change to `k` almost always produces a stegotext that is "maximally scrambled" compared to `s(k_base; e)` under your distance measure.

This is exactly the kind of behavior we qualitatively expect from a good hash: output looks essentially unrelated even for tiny input changes.

Given the protocol in the paper, that is not surprising: the key's job is to set the entire probability landscape from which we pick tokens by pre-fixed ranks. A slightly different landscape generally yields a completely different path.

So, for character level perturbations of this type, our empirical plots are telling us:

> The map `k -> s(k; e)` behaves almost like a chaotic function: once you depart from exactly the same key, the resulting stegotexts are very far apart.

### 3.2 How it differs from a cryptographic hash

However, it is not a cryptographic hash:

1. **Not designed for uniformity or bit level independence**

   In a hash, changing one input bit flips each output bit with probability 0.5 independently. Here:

   - Output is constrained to be natural language.
   - There are correlations between tokens due to grammar, semantics, and the fixed rank sequence.
   - Distances saturate around `0.8-0.95`, not `1.0`, and in token space there may be more structure than Levenshtein exposes.

2. **Metric mismatch**

   - You measure distance between keys by character Levenshtein.
   - The model "feels" keys in token or embedding space. Two keys that are very close in Levenshtein can be very far to the model (your current experiment), and two keys that are quite far in Levenshtein but semantically similar (word level edits) might have much more similar stegotexts.
   - So the apparent avalanche is partly an artifact of using a metric that damages words, not just changes their semantics.

3. **No formal collision resistance or preimage resistance**

   - Many different keys will lead to broadly similar stegotexts, and the protocol is not designed to minimize such collisions.
   - An attacker who knows `e`, `k'`, and the model can trivially generate infinitely many different keys that produce stegotexts for the same `e` (just change `k`).
   - Security in the paper relies on the secrecy of the key and the need to match both the model and the key, not on hash-like one-wayness.

4. **Local versus global behavior**

   - A hash is equally scrambling everywhere in its input space.
   - Here, behavior might depend on the region of prompt space you are in. Your current experiment mutates a single base key with fairly aggressive character noise; if you instead moved between semantically close, clean prompts (word substitutions, added detail, style tweaks), you might see more structure and less perfect scrambling.

---

## 4. How to sharpen this picture with further experiments

If we want to make the "hash like" statement more precise, there are a few natural next experiments:

1. **Word level edits instead of character noise**

   - Replace the base key's words with other words from your vocabulary (keeping grammar and structure intact).
   - Measure `(d_k, d_s)` again.
   - If stegotext distances are still high even when keys remain grammatical and similar in meaning, that is stronger evidence of intrinsic sensitivity rather than just "the model hates corrupted strings".

2. **Token level Hamming distance**

   - Compute the fraction of token positions where two stegotexts disagree.
   - Compare that to character level Levenshtein. Your `0.8-0.95` may simply reflect morphological and subword similarities.

3. **Compare local vs global**

   - Put the base key sweep points and the random key pair points on the same scatter (different colors).
   - If both clouds sit in the same high `d_s` band, that supports the "almost any difference in key -> huge difference in stegotext" story.

---

## 5. Bottom line

- Our latest plot does show that, under our current notion of "small change in key" (character substitutions), the mapping `k -> s(k; e)` is extremely sensitive: once `d_k > 0`, `d_s` is already very large and does not grow much further.
- This is qualitatively hash like in the avalanche sense: tiny key changes yield almost maximally different stegotexts.
- But it is not a cryptographic hash: the output space is constrained, the metric is not bit level, and there is no formal security guarantee or uniformity.

Conceptually, we can say:

> For this protocol and this model, the stegotext behaves approximately like a chaotic function of the key - more like a hash than like a smooth function - especially when you look at character level perturbations.

If we repeat this with word-level or prompt-style variations, we will get a more nuanced picture of how "hashy" it really is when the key changes are ones the model perceives as small rather than corrupted.


In [13]:
import random
import re
import Levenshtein


def make_key_with_word_replacements(
    base_key: str,
    number_of_word_replacements: int,
    replacement_vocabulary: list[str],
    random_seed: int | None = None,
) -> str:
    """
    Replace `number_of_word_replacements` word tokens in base_key with words
    drawn from replacement_vocabulary.

    This keeps the key grammatical and avoids introducing corrupted tokens,
    so the model is more likely to see these changes as "small" than raw
    character noise.
    """
    if number_of_word_replacements <= 0:
        return base_key

    random_generator = random.Random(random_seed)

    tokens = base_key.split()

    # Only consider tokens that contain at least one alphabetic character
    editable_positions = [
        index
        for index, token in enumerate(tokens)
        if any(character.isalpha() for character in token)
    ]

    if not editable_positions:
        return base_key

    replacements_to_apply = min(number_of_word_replacements, len(editable_positions))
    positions_to_replace = random_generator.sample(
        editable_positions,
        replacements_to_apply,
    )

    mutated_tokens = list(tokens)

    for index in positions_to_replace:
        original_token = mutated_tokens[index]

        # Separate core word from trailing punctuation, e.g. "cats." -> ("cats", ".")
        match = re.match(r"^([A-Za-z']+)([^A-Za-z']*)$", original_token)
        if match is None:
            # If we cannot parse it nicely, just skip this token
            continue

        original_word = match.group(1)
        trailing_punctuation = match.group(2)

        # Choose a replacement word distinct from the original (case insensitive)
        candidate_words = [
            word for word in replacement_vocabulary
            if word.lower() != original_word.lower()
        ]
        if not candidate_words:
            continue

        replacement_word = random_generator.choice(candidate_words)

        # Preserve capitalization pattern of the original word
        if original_word.istitle():
            replacement_word = replacement_word.capitalize()
        elif original_word.isupper():
            replacement_word = replacement_word.upper()

        mutated_tokens[index] = replacement_word + trailing_punctuation

    mutated_key = " ".join(mutated_tokens)
    return mutated_key


# Pool of clean, everyday words used as substitutes in word-level key edits.
# Keep the order stable: seeded random choices index into this list.
REPLACEMENT_VOCABULARY = [
    # animals
    "cats",
    "kittens",
    "dogs",
    "puppies",
    # things and pastimes
    "music",
    "books",
    "coffee",
    "tea",
    "travel",
    "coding",
    "movies",
    "reading",
    "walking",
    "running",
    # seasons
    "summer",
    "winter",
    # descriptors
    "sunny",
    "rainy",
    "happy",
    "quiet",
    "busy",
    "calm",
    # times of day
    "evening",
    "morning",
]

# Optional lead-ins for stylistic key variants; "" means "no prefix".
STYLE_PREFIXES = [
    "",
    "In my opinion,",
    "Honestly,",
    "From my perspective,",
    "To be honest,",
]

# Optional endings for stylistic key variants; "" means "no suffix".
STYLE_SUFFIXES = [
    "",
    "and that is just how I see it.",
    "most of the time.",
    "when I have some free time.",
    "especially on weekends.",
]


def make_key_with_style_variation(
    base_key: str,
    random_seed: int | None = None,
) -> str:
    """
    Build a stylistic variant of base_key by wrapping it in a randomly chosen
    prefix and suffix from STYLE_PREFIXES / STYLE_SUFFIXES.

    Either choice may be the empty string, in which case that side is left
    untouched; if both are empty the result equals base_key. Non-empty parts
    are attached with a single space. The suffix is appended after whatever
    base_key ends with, including terminal punctuation.

    Args:
        base_key: The key text to decorate; it is never modified internally.
        random_seed: Seed for a private random.Random instance.

    Returns:
        The decorated key string.
    """
    rng = random.Random(random_seed)

    # Draw order (prefix, then suffix) is part of the seeded behaviour.
    chosen_prefix = rng.choice(STYLE_PREFIXES)
    chosen_suffix = rng.choice(STYLE_SUFFIXES)

    # base_key is always kept, even when empty, so separators stay identical.
    parts = [base_key]
    if chosen_prefix:
        parts.insert(0, chosen_prefix)
    if chosen_suffix:
        parts.append(chosen_suffix)

    return " ".join(parts)


from typing import List


def build_uniform_word_edit_counts(base_key: str, number_of_levels: int) -> List[int]:
    """
    Return sorted, de-duplicated word replacement counts spread evenly from 0
    up to the number of whitespace-separated tokens in base_key.

    Level i (0-based) maps to round(i / (number_of_levels - 1) * token_count),
    so requesting more levels than there are tokens yields fewer distinct
    counts than levels. An empty key always gives [0]; a non-positive
    number_of_levels on a non-empty key gives an empty list.
    """
    token_total = len(base_key.split())
    if token_total == 0:
        return [0]

    # Clamp the denominator so a single level maps to fraction 0 instead of
    # dividing by zero.
    last_level = max(number_of_levels - 1, 1)

    distinct_counts = {
        int(round((level / last_level) * token_total))
        for level in range(number_of_levels)
    }
    return sorted(distinct_counts)


from typing import Dict, Any, Sequence
from pathlib import Path


def key_distance_sweep_word_level(
    secret_text: str,
    secret_prefix: str,
    base_key: str,
    edit_counts: Sequence[int],
    samples_per_edit: int,
    replacement_vocabulary: list[str],
    model: Llama = llm,
) -> List[Dict[str, Any]]:
    """
    Sweep WORD-LEVEL mutations of a base key and record how far the resulting
    stegotexts drift from the base stegotext.

    The secret's ranks are computed once, then replayed under k_base and under
    every mutated key. For each (edit_count, sample_index) pair one record is
    produced containing:

      - d_k_raw / d_k_norm: Levenshtein distance between k_base and the variant
      - d_s_raw / d_s_norm: Levenshtein distance between the two stegotexts

    Normalised distances divide by the longer string's length (at least 1).
    Mutation seeds are 2000 + edit_count * 100 + sample_index, so the sweep is
    reproducible from its arguments.
    """

    def distance_pair(first: str, second: str) -> tuple[int, float]:
        # Raw character Levenshtein distance plus its length-normalised form.
        raw = Levenshtein.distance(first, second)
        return raw, raw / max(len(first), len(second), 1)

    # Ranks depend only on the secret and its prefix, so compute them once.
    secret_ranks = precompute_ranks_for_secret(
        secret_text=secret_text,
        secret_prefix=secret_prefix,
        model=model,
    )

    # Reference stegotext every variant is compared against.
    base_stegotext = generate_stegotext_from_ranks(
        ranks=secret_ranks,
        secret_key=base_key,
        model=model,
    )

    trial_plan = [
        (edit_count, sample_index)
        for edit_count in edit_counts
        for sample_index in range(samples_per_edit)
    ]

    sweep_rows: List[Dict[str, Any]] = []
    for edit_count, sample_index in trial_plan:
        variant_key = make_key_with_word_replacements(
            base_key=base_key,
            number_of_word_replacements=edit_count,
            replacement_vocabulary=replacement_vocabulary,
            random_seed=2000 + edit_count * 100 + sample_index,
        )
        variant_stegotext = generate_stegotext_from_ranks(
            ranks=secret_ranks,
            secret_key=variant_key,
            model=model,
        )

        key_raw, key_norm = distance_pair(base_key, variant_key)
        stego_raw, stego_norm = distance_pair(base_stegotext, variant_stegotext)

        sweep_rows.append(
            dict(
                base_key=base_key,
                mutated_key=variant_key,
                edit_count=edit_count,
                stegotext_base=base_stegotext,
                stegotext_mutated=variant_stegotext,
                d_k_raw=key_raw,
                d_k_norm=key_norm,
                d_s_raw=stego_raw,
                d_s_norm=stego_norm,
            )
        )

    return sweep_rows

def key_distance_sweep_style_variations(
    secret_text: str,
    secret_prefix: str,
    base_key: str,
    number_of_variants: int,
    model: Llama = llm,
) -> List[Dict[str, Any]]:
    """
    Sweep STYLISTIC variants of a base key (random prefix and/or suffix) and
    record key and stegotext distances relative to the base.

    The secret's ranks are computed once and replayed under k_base and under
    each variant key. Variant i is generated with seed 3000 + i. Each record
    holds the two keys, the two stegotexts, the variant index, and the raw and
    normalised (divided by the longer string's length, at least 1)
    Levenshtein distances for keys (d_k_*) and stegotexts (d_s_*).
    """

    def distance_pair(first: str, second: str) -> tuple[int, float]:
        # Raw character Levenshtein distance plus its length-normalised form.
        raw = Levenshtein.distance(first, second)
        return raw, raw / max(len(first), len(second), 1)

    secret_ranks = precompute_ranks_for_secret(
        secret_text=secret_text,
        secret_prefix=secret_prefix,
        model=model,
    )

    # Reference stegotext every variant is compared against.
    base_stegotext = generate_stegotext_from_ranks(
        ranks=secret_ranks,
        secret_key=base_key,
        model=model,
    )

    sweep_rows: List[Dict[str, Any]] = []
    variant_index = 0
    while variant_index < number_of_variants:
        variant_key = make_key_with_style_variation(
            base_key=base_key,
            random_seed=3000 + variant_index,
        )
        variant_stegotext = generate_stegotext_from_ranks(
            ranks=secret_ranks,
            secret_key=variant_key,
            model=model,
        )

        key_raw, key_norm = distance_pair(base_key, variant_key)
        stego_raw, stego_norm = distance_pair(base_stegotext, variant_stegotext)

        sweep_rows.append(
            dict(
                base_key=base_key,
                mutated_key=variant_key,
                variant_index=variant_index,
                stegotext_base=base_stegotext,
                stegotext_mutated=variant_stegotext,
                d_k_raw=key_raw,
                d_k_norm=key_norm,
                d_s_raw=stego_raw,
                d_s_norm=stego_norm,
            )
        )
        variant_index += 1

    return sweep_rows





In [14]:
# Fixed inputs shared by both sweeps: the secret e, its prefix k', and the base key k_base.
secret_text = "The cats like to meow all the time. It is annoying."
secret_prefix = ""  # or "A text:" if you want a k' prefix
base_key = "Puppies cats coding travel travel."

# Word-level sweep
# base_key has 5 whitespace-separated tokens, so the 10 requested levels
# collapse to the distinct edit counts 0..5 after rounding and de-duplication.
word_edit_counts = build_uniform_word_edit_counts(
    base_key=base_key,
    number_of_levels=10,
)

# Mutated keys generated per edit count. For edit_count = 0 the key is returned
# unchanged, so those samples all have d_k = 0.
samples_per_edit = 5

# One record per (edit_count, sample) pair; each one runs a stegotext generation
# with the model, so this call is the expensive part of the cell.
word_sweep_records = key_distance_sweep_word_level(
    secret_text=secret_text,
    secret_prefix=secret_prefix,
    base_key=base_key,
    edit_counts=word_edit_counts,
    samples_per_edit=samples_per_edit,
    replacement_vocabulary=REPLACEMENT_VOCABULARY,
    model=llm,
)

# plot_sweep_key_vs_stego_levenshtein is defined in an earlier cell; judging by
# the recorded output it saves the scatter plot under results/ - confirm there.
plot_sweep_key_vs_stego_levenshtein(
    records=word_sweep_records,
    output_filename="key_vs_stego_word_level_sweep.png",
)

# Style-variation sweep
# Only 5 prefixes x 5 suffixes = 25 distinct variants exist, so 40 draws are
# guaranteed to contain duplicates, and the (no prefix, no suffix) combination
# reproduces base_key exactly.
style_sweep_records = key_distance_sweep_style_variations(
    secret_text=secret_text,
    secret_prefix=secret_prefix,
    base_key=base_key,
    number_of_variants=40,
    model=llm,
)

# Last expression of the cell: its return value is what the notebook displays
# as the cell output.
plot_sweep_key_vs_stego_levenshtein(
    records=style_sweep_records,
    output_filename="key_vs_stego_style_variation_sweep.png",
)


Saved sweep scatter plot to: /home/meow/Documents/repos/LlmStenoExplore/results/key_vs_stego_word_level_sweep.png
Saved sweep scatter plot to: /home/meow/Documents/repos/LlmStenoExplore/results/key_vs_stego_style_variation_sweep.png


PosixPath('/home/meow/Documents/repos/LlmStenoExplore/results/key_vs_stego_style_variation_sweep.png')