In [1]:
import json
import itertools
import random
import re

In [2]:
vocab = {
    "<pad>": 0,
    "<unk>": 1,
    "<bos>": 2,
    "<eos>": 3
}

In [3]:
idx = 0 # starting vocab id

def load_grammar_from_file(filename):
    grammar = {}
    current_rule = None

    with open(filename, 'r') as file:
        for line in file:
            line = line.strip()
            if not line or line.startswith('//'):  # Skip comments and empty lines
                continue
            # Remove the trailing semicolon or other ending character
            line = line[:-1]
            # Check for a rule definition
            rule_match = re.match(r'([a-zA-Z0-9_]+)\s*:\s*(.*)', line)
            if rule_match:
                current_rule = rule_match.group(1)
                expansions = rule_match.group(2).split('|')  # Split different productions
                grammar[current_rule] = [expansion.strip() for expansion in expansions]
            # Continuation of rules on the next line (for multiline rules)
            elif current_rule:
                expansions = line.split('|')
                grammar[current_rule].extend([expansion.strip() for expansion in expansions])

    return grammar

def remove_first_last_quote(input_string):
    if input_string.startswith("'") and input_string.endswith("'"):
        return input_string[1:-1]
    return input_string

def replace_escaped_quote(input_string):
    if input_string == "\\'":
        return "'"
    return input_string

def generate(rule):
    if rule in grammar:
        expansion = random.choice(grammar[rule]).split()

        original_payload = []
        tokenized_payload = []

        for token in expansion:
            original, token_ids = generate(token)
            original_payload.append(original)
            tokenized_payload.extend(token_ids)

        return ''.join(original_payload), tokenized_payload
    else:
        processed_rule = replace_escaped_quote(remove_first_last_quote(rule))
        if processed_rule not in vocab:
            vocab[processed_rule] = len(vocab)
        return processed_rule, [vocab[processed_rule]]

# Load the RCE grammar
grammar = load_grammar_from_file('grammar/XSS.txt')

In [4]:
def token_ids_to_original_payload(tokenized_payload, vocab):
    id_to_token = {v: k for k, v in vocab.items()}
    original_payload = [id_to_token[token_id] for token_id in tokenized_payload]
    return ''.join(original_payload)

# Quick check to verify tokenization consistency
for i in range(100):
    rce = generate('start')
    if rce[0] != token_ids_to_original_payload(rce[1], vocab):
        print("Mismatch at iteration:", i)

In [5]:
print(vocab)

{'<pad>': 0, '<unk>': 1, '<bos>': 2, '<eos>': 3, '<': 4, 'input': 5, '/+/': 6, 'onfocus': 7, '+': 8, '=': 9, '%09': 10, 'alertScript': 11, '%0dx': 12, '/': 13, 'XSS': 14, 'audio': 15, 'src': 16, 'type': 17, 'img': 18, 'onload': 19, '%0a': 20, 'onerror': 21, 'iframe2': 22, 'contenteditable': 23, 'onclick': 24, 'embed': 25, "'\\u0061lert(XSS)": 26, '>': 27, '%0d': 28, 'onauxclick': 29, 'form': 30, 'tabindex': 31, '1': 32, 'submit': 33, 'action': 34, 'alert(XSS)': 35, 'onsubmit': 36, '"': 37, '\\u0061lert(XSS)': 38, 'jav%0Dascript%26colon%3B\\u0061lert(XSS)': 39, 'onblur': 40, 'jav%09ascript%26colon%3B\\u0061lert%26%23x28;XSS%26%23x29;': 41, 'button': 42, 'jav\\u0061script%26colon;alert(XSS)': 43, 'details': 44, 'ontoggle': 45, '=terDQuote': 46, 'javascript%26%2300058;alert(XSS)': 47, 'jav%0Dascript%26colon%3B\\u0061lert%26%23x28;XSS%26%23x29;': 48, 'jav%26Tab%3Bascript%26colon%3B\\u0061lert%26%23x28;XSS%26%23x29;': 49, 'a': 50, 'href': 51, 'jav%0Dascript:\\u0061lert(XSS)': 52, 'object': 

In [6]:
output = []
tokenized_output = []
num_payloads = 100000
xss_set = set()

for i in range(num_payloads):
    while True:
        xss = generate('start')
        if xss[0] not in xss_set:
            xss_set.add(xss[0])
            tokenized_output.append(xss[1])
            output.append(f"{xss[0]}\n")
            if xss[0] != token_ids_to_original_payload(xss[1], vocab):
                print("Mismatch at payload:", i)
            break

# Write all generated payloads to file
with open('xss.txt', 'w') as f:
    f.writelines(output)
with open('tokenized_xss.json', 'w') as f:
    json.dump(tokenized_output, f)

In [7]:
with open('vocab.json', 'w') as json_file:
    json.dump(vocab, json_file, indent=4)