In [1]:
import re
import json
from itertools import product
from collections import OrderedDict

In [2]:
# Token boundary marker: the ASCII unit-separator control character (0x1F),
# used as an unambiguous delimiter between tokens.
DELIM = "\x1F"

# Seed the vocabulary exactly as in the original generator: the four special
# tokens take ids 0-3 in this order. New tokens are appended during expansion.
vocab = OrderedDict(
    (special, index)
    for index, special in enumerate(("<pad>", "<unk>", "<bos>", "<eos>"))
)

In [3]:
def _split_alternatives(body):
    """
    Split a rule body into its '|'-separated alternatives.
    A '|' inside a quoted terminal ('...' or "...") belongs to that terminal
    and does not separate alternatives. Quoting follows the same rules as
    tokenize(): a literal only starts where a token starts, so an apostrophe
    inside a bare word (it's) is ordinary text.
    Empty alternatives are kept as empty strings.
    """
    pieces = []
    start = 0
    # Branch order matters: quoted literals and bare words are consumed whole,
    # so only a '|' standing between tokens reaches the bare-pipe branch.
    for match in re.finditer(r"'[^']*'|\"[^\"]*\"|\||[^\s|]+", body):
        if match.group(0) == "|":
            pieces.append(body[start:match.start()].strip())
            start = match.end()
    pieces.append(body[start:].strip())
    return pieces


def load_grammar(filename):
    """
    Load the grammar from file.

    Blank lines and lines starting with "//" are skipped, and one trailing
    ';' is removed from each remaining line. A line of the form
    "name: body" or "name := body" starts (or replaces) the rule `name`;
    any other line is appended, space-separated, to the body of the most
    recent rule. Such lines appearing before the first rule are ignored.

    Each rule body is then split on '|' into alternatives. Pipes inside
    quoted terminals such as '|' or "a|b" are preserved, not treated as
    separators.

    Returns a dict mapping rule name -> list of alternative strings, in
    file order.
    """
    bodies = {}
    current_rule = None
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("//"):
                continue
            if line.endswith(";"):
                line = line[:-1]
            # Match both ":" and ":=" as separators.
            match = re.match(r"^([a-zA-Z0-9_]+)\s*(?::=|:)\s*(.*)$", line)
            if match:
                current_rule = match.group(1)
                bodies[current_rule] = match.group(2).strip()
            elif current_rule:
                # Continuation of a multi-line rule.
                bodies[current_rule] += " " + line.strip()
    # Now split each rule's alternatives, respecting quoted terminals.
    return {rule: _split_alternatives(body) for rule, body in bodies.items()}

def tokenize(production):
    """
    Break a production string into its tokens, keeping each quoted string
    (single- or double-quoted) together as one token, quotes included.
    Anything else is split on whitespace.
    For example:  "'ps' opSem 'ls -al'"  →  ["'ps'", "opSem", "'ls -al'"]
    """
    token_pattern = re.compile(r"'[^']*'|\"[^\"]*\"|\S+")
    return [found.group(0) for found in token_pattern.finditer(production)]

def unquote(token):
    """
    Remove one pair of matching surrounding quotes ('...' or "..."), if any.
    Tokens without a matching pair are returned unchanged. That includes a
    lone quote character, which is a single character serving as both
    "start" and "end" and must not collapse to an empty string.
    """
    # len >= 2 guarantees the opening and closing quote are distinct characters.
    if len(token) >= 2 and token[0] == token[-1] and token[0] in ("'", '"'):
        return token[1:-1]
    return token

def expand_all(rule, grammar, _active=None):
    """
    Recursively expands a given rule into every token sequence it can produce.

    rule    -- a rule name, or a terminal token (anything that is not a key
               of `grammar`).
    grammar -- dict mapping rule name -> list of alternative strings.
    _active -- internal: names of the rules currently being expanded, used
               to detect cycles. Callers should not pass it.

    Returns a list of token lists (tokens are kept separate rather than
    concatenated). An empty alternative contributes one empty token list.

    Side effect: each terminal is unquoted and, if new, added to the
    module-level `vocab` with an id equal to the current vocabulary size.

    Raises RecursionError if a rule can reach itself. Every alternative is
    expanded exhaustively, so such a grammar can never terminate.
    """
    if rule not in grammar:
        token = unquote(rule)
        if token not in vocab:
            vocab[token] = len(vocab)
        # Return a list with a single token list.
        return [[token]]

    if _active is None:
        _active = set()
    if rule in _active:
        # Fail fast with the offending rule instead of exhausting the stack.
        raise RecursionError(
            "grammar rule %r is recursive and cannot be fully expanded" % (rule,)
        )

    _active.add(rule)
    try:
        results = []
        for production in grammar[rule]:
            # Each token expands to a list of token lists.
            expanded_tokens = [
                expand_all(tok, grammar, _active) for tok in tokenize(production)
            ]
            # Cartesian product picks one expansion per token; flatten each pick.
            for prod in product(*expanded_tokens):
                results.append([tok for token_list in prod for tok in token_list])
    finally:
        # The same rule may legitimately appear again in a sibling position.
        _active.discard(rule)
    return results

def generate_payloads(grammar, start_rule):
    """
    Build every payload derivable from start_rule.
    The grammar is fully expanded via expand_all, and the tokens of each
    resulting sequence are joined with DELIM into a single string.
    Returns the list of payload strings (duplicates are not removed here).
    """
    return [DELIM.join(tokens) for tokens in expand_all(start_rule, grammar)]

In [4]:
# Expand the RCE grammar from its "start" rule (adjust the path as needed).
grammar = load_grammar("grammar/RCE.txt")
all_payloads = generate_payloads(grammar, "start")

# Drop duplicate expansions and sort so the output file is deterministic.
unique_payloads = list(set(all_payloads))
unique_payloads.sort()
print("Total unique payloads:", len(unique_payloads))  # Expect 37302

# Persist the results: one payload per line...
with open("rce.txt", "w") as f:
    for payload in unique_payloads:
        print(payload, file=f)

# ...and the token -> id vocabulary collected during expansion, as JSON.
with open("vocab.json", "w") as f:
    f.write(json.dumps(vocab, indent=2))

Total unique payloads: 37302
