In [1]:
# Token -> id table. The four reserved special tokens take ids 0-3 in this
# order; generate() appends further entries as it emits new terminals.
vocab = {
    token: token_id
    for token_id, token in enumerate(("<pad>", "<unk>", "<bos>", "<eos>"))
}

In [2]:
import random
import re
# NOTE(review): `idx` is not read anywhere in the visible cells; generate()
# derives new ids from len(vocab) instead. Candidate for removal — confirm.
idx = 0 # starting vocab id

def load_grammar_from_file(filename, terminator=';'):
    """Parse a grammar file into a ``{rule_name: [production, ...]}`` dict.

    Line shapes recognised (after surrounding whitespace is stripped):

    * ``name : alt1 | alt2 ;`` starts a rule; alternatives are split on ``|``.
    * any other non-empty line continues the most recently started rule,
      e.g. ``| alt3``.
    * blank lines and lines starting with ``//`` are skipped.

    Args:
        filename: Path of the grammar file to read.
        terminator: Marker removed from the end of a line when present.
            Defaults to ``';'`` (assumed to be the terminator used by the
            grammar files — TODO confirm). Lines that do not end with it are
            kept intact instead of losing their last character.

    Returns:
        dict mapping each rule name to its list of production strings, each
        stripped of surrounding whitespace. An empty string is a valid
        (empty) production, e.g. from ``name : alt | ;``.

    Raises:
        OSError: If ``filename`` cannot be opened.
    """
    grammar = {}
    current_rule = None

    with open(filename, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith('//'):  # Skip comments and empty lines
                continue

            # Drop the terminator only when it is actually there; slicing
            # unconditionally would eat a real character on continuation lines.
            if terminator and line.endswith(terminator):
                line = line[:-len(terminator)].rstrip()
                if not line:
                    # A line holding nothing but the terminator just closes a
                    # multi-line rule; it must not add an empty production.
                    continue

            # NOTE(review): splitting on '|' ignores quoting, so a quoted
            # literal that contains '|' is split as well.
            rule_match = re.match(r'([a-zA-Z0-9_]+)\s*:\s*(.*)', line)
            if rule_match:
                current_rule = rule_match.group(1)
                alternatives = rule_match.group(2).split('|')
                grammar[current_rule] = [alt.strip() for alt in alternatives]
            elif current_rule:
                # A continuation usually starts with the '|' separator; remove
                # it so it does not yield a spurious leading empty production.
                body = line[1:] if line.startswith('|') else line
                grammar[current_rule].extend(alt.strip() for alt in body.split('|'))

    return grammar

def remove_first_last_quote(input_string):
    """Strip one pair of enclosing single quotes from ``input_string``.

    Args:
        input_string: Grammar symbol, possibly written as ``'literal'``.

    Returns:
        The text between the quotes when the string both starts and ends with
        ``'`` and is at least two characters long; otherwise the input
        unchanged. ``"''"`` therefore yields the empty string, while a lone
        ``"'"`` is returned as-is.
    """
    # The length check matters: a single "'" is simultaneously its own first
    # and last character, and would otherwise be reduced to "".
    is_quoted = (
        len(input_string) >= 2
        and input_string.startswith("'")
        and input_string.endswith("'")
    )
    return input_string[1:-1] if is_quoted else input_string

def replace_escaped_quote(input_string):
    """Turn the two-character escape ``\\'`` into a bare single quote.

    Only an exact match is rewritten; every other string, including ones that
    merely contain the escape, is returned unchanged.
    """
    return "'" if input_string == "\\'" else input_string

def generate(rule):
    """Randomly expand ``rule`` into a payload string and its token ids.

    Reads the module-level ``grammar`` and updates the module-level ``vocab``.

    Args:
        rule: A rule name found in ``grammar``, or any other symbol, which is
            then treated as a terminal.

    Returns:
        ``(text, token_ids)``: the concatenated terminal strings and the list
        of their vocabulary ids, in emission order.
    """
    if rule not in grammar:
        # Terminal: unquote / unescape it, and give it the next free id the
        # first time it is seen.
        terminal = replace_escaped_quote(remove_first_last_quote(rule))
        token_id = vocab.setdefault(terminal, len(vocab))
        return terminal, [token_id]

    # Non-terminal: pick one production at random and expand its
    # whitespace-separated symbols left to right.
    symbols = random.choice(grammar[rule]).split()
    pieces = [generate(symbol) for symbol in symbols]

    text = ''.join(piece_text for piece_text, _ in pieces)
    token_ids = [token_id for _, piece_ids in pieces for token_id in piece_ids]
    return text, token_ids

# Parse the grammar once; generate() looks rules up in this module-level dict.
grammar = load_grammar_from_file('grammar/SQLi.txt')

In [None]:
def token_ids_to_original_payload(tokenized_payload, vocab):
    """Rebuild the payload string from a sequence of token ids.

    Args:
        tokenized_payload: Iterable of ids taken from ``vocab``'s values.
        vocab: Mapping of token string -> id.

    Returns:
        The tokens for the given ids concatenated without a separator.

    Raises:
        KeyError: If an id has no corresponding token in ``vocab``.
    """
    # Invert the mapping on every call so ids added since the last call resolve.
    token_by_id = dict(zip(vocab.values(), vocab.keys()))
    return ''.join([token_by_id[token_id] for token_id in tokenized_payload])

# Round-trip sanity check: decode each sample's token ids and print the sample
# index whenever the decoded text differs from the generated text.
for i in range(100):
    sqli = generate('start')
    payload_text, payload_ids = sqli
    if token_ids_to_original_payload(payload_ids, vocab) != payload_text:
        print(i)

In [4]:
# Show the vocabulary as it stands; generate() adds an entry the first time
# it emits each terminal.
print(vocab)

{'<pad>': 0, '<unk>': 1, '<bos>': 2, '<eos>': 3, '0': 4, '%27': 5, '%0b': 6, 'and': 7, '+': 8, '!': 9, '~': 10, 'false': 11, 'or': 12, "'": 13, '': 14, 'is': 15, '6': 16, '1': 17, '<@=1.': 18, '&&': 19, 'not': 20, '%7e': 21, '!@<@': 22, 'true': 23, '=': 24, '(': 25, ')': 26, '%23': 27, '<@!=1.': 28, ';': 29, 'select': 30, 'sleep': 31, '--': 32, 'extractvalue': 33, ',': 34, 'concat': 35, '0x7e': 36, '@@version': 37, '%2C': 38, '9': 39, '@<@': 40, '2': 41, '1<@': 42, '{a': 43, '1}=1': 44, 'union': 45, '5': 46, '{x': 47, '1)}=1': 48, '<@=.1': 49, 'updatexml': 50, '4': 51, '8': 52, '<@!=.1': 53, '@<@.': 54, '-': 55, '<@=1': 56, '{`if`': 57, 'like': 58, '3': 59, '<@!=1': 60, '<': 61, '!@<@.': 62, '7': 63, '1<@.': 64}


In [5]:
import json

num_payloads = 1000000
sqli_set = set()        # payload strings already emitted, for de-duplication
output = []             # newline-terminated payload strings, in emission order
tokenized_output = []   # token-id lists, parallel to `output`

# Collect `num_payloads` distinct payloads.
# NOTE(review): the inner retry loop has no upper bound; it never finishes if
# the grammar cannot produce `num_payloads` distinct strings.
for i in range(num_payloads):
    sqli = generate('start')
    while sqli[0] in sqli_set:
        sqli = generate('start')

    payload_text, payload_ids = sqli
    sqli_set.add(payload_text)
    tokenized_output.append(payload_ids)
    output.append(f"{payload_text}\n")

    # Print the index of any payload whose ids do not decode back to its text.
    if token_ids_to_original_payload(payload_ids, vocab) != payload_text:
        print(i)

# Write both corpora after generation has finished.
with open('sqli.txt', 'w') as f:
    f.writelines(output)
with open('tokenized_sqli.json', 'w') as f:
    json.dump(tokenized_output, f)

In [6]:
# Save the token -> id mapping as pretty-printed JSON.
with open('vocab.json', 'w') as json_file:
    json_file.write(json.dumps(vocab, indent=4))

In [7]:
# Number of distinct payload strings held in the de-duplication set.
print(len(sqli_set))

1000000


In [None]:
# Token -> id table. Rebinding `vocab` here discards any ids added by earlier
# generate() calls; the four reserved special tokens take ids 0-3 in this order.
vocab = {
    token: token_id
    for token_id, token in enumerate(("<pad>", "<unk>", "<bos>", "<eos>"))
}

import random
import re
# NOTE(review): `idx` is not read anywhere in the visible cells; generate()
# derives new ids from len(vocab) instead. Candidate for removal — confirm.
idx = 0 # starting vocab id

def load_grammar_from_file(filename, terminator=';'):
    """Parse a grammar file into a ``{rule_name: [production, ...]}`` dict.

    Line shapes recognised (after surrounding whitespace is stripped):

    * ``name : alt1 | alt2 ;`` starts a rule; alternatives are split on ``|``.
    * any other non-empty line continues the most recently started rule,
      e.g. ``| alt3``.
    * blank lines and lines starting with ``//`` are skipped.

    Args:
        filename: Path of the grammar file to read.
        terminator: Marker removed from the end of a line when present.
            Defaults to ``';'`` (assumed to be the terminator used by the
            grammar files — TODO confirm). Lines that do not end with it are
            kept intact instead of losing their last character.

    Returns:
        dict mapping each rule name to its list of production strings, each
        stripped of surrounding whitespace. An empty string is a valid
        (empty) production, e.g. from ``name : alt | ;``.

    Raises:
        OSError: If ``filename`` cannot be opened.
    """
    grammar = {}
    current_rule = None

    with open(filename, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith('//'):  # Skip comments and empty lines
                continue

            # Drop the terminator only when it is actually there; slicing
            # unconditionally would eat a real character on continuation lines.
            if terminator and line.endswith(terminator):
                line = line[:-len(terminator)].rstrip()
                if not line:
                    # A line holding nothing but the terminator just closes a
                    # multi-line rule; it must not add an empty production.
                    continue

            # NOTE(review): splitting on '|' ignores quoting, so a quoted
            # literal that contains '|' is split as well.
            rule_match = re.match(r'([a-zA-Z0-9_]+)\s*:\s*(.*)', line)
            if rule_match:
                current_rule = rule_match.group(1)
                alternatives = rule_match.group(2).split('|')
                grammar[current_rule] = [alt.strip() for alt in alternatives]
            elif current_rule:
                # A continuation usually starts with the '|' separator; remove
                # it so it does not yield a spurious leading empty production.
                body = line[1:] if line.startswith('|') else line
                grammar[current_rule].extend(alt.strip() for alt in body.split('|'))

    return grammar

def remove_first_last_quote(input_string):
    """Strip one pair of enclosing single quotes from ``input_string``.

    Args:
        input_string: Grammar symbol, possibly written as ``'literal'``.

    Returns:
        The text between the quotes when the string both starts and ends with
        ``'`` and is at least two characters long; otherwise the input
        unchanged. ``"''"`` therefore yields the empty string, while a lone
        ``"'"`` is returned as-is.
    """
    # The length check matters: a single "'" is simultaneously its own first
    # and last character, and would otherwise be reduced to "".
    is_quoted = (
        len(input_string) >= 2
        and input_string.startswith("'")
        and input_string.endswith("'")
    )
    return input_string[1:-1] if is_quoted else input_string

def replace_escaped_quote(input_string):
    """Turn the two-character escape ``\\'`` into a bare single quote.

    Only an exact match is rewritten; every other string, including ones that
    merely contain the escape, is returned unchanged.
    """
    return "'" if input_string == "\\'" else input_string

def generate(rule):
    """Randomly expand ``rule`` into a payload string and its token ids.

    Reads the module-level ``grammar`` and updates the module-level ``vocab``.

    Args:
        rule: A rule name found in ``grammar``, or any other symbol, which is
            then treated as a terminal.

    Returns:
        ``(text, token_ids)``: the concatenated terminal strings and the list
        of their vocabulary ids, in emission order.
    """
    if rule not in grammar:
        # Terminal: unquote / unescape it, and give it the next free id the
        # first time it is seen.
        terminal = replace_escaped_quote(remove_first_last_quote(rule))
        token_id = vocab.setdefault(terminal, len(vocab))
        return terminal, [token_id]

    # Non-terminal: pick one production at random and expand its
    # whitespace-separated symbols left to right.
    symbols = random.choice(grammar[rule]).split()
    pieces = [generate(symbol) for symbol in symbols]

    text = ''.join(piece_text for piece_text, _ in pieces)
    token_ids = [token_id for _, piece_ids in pieces for token_id in piece_ids]
    return text, token_ids

# Parse the grammar once; generate() looks rules up in this module-level dict.
grammar = load_grammar_from_file('grammar/SQLi.txt')

def token_ids_to_original_payload(tokenized_payload, vocab):
    """Rebuild the payload string from a sequence of token ids.

    Args:
        tokenized_payload: Iterable of ids taken from ``vocab``'s values.
        vocab: Mapping of token string -> id.

    Returns:
        The tokens for the given ids concatenated without a separator.

    Raises:
        KeyError: If an id has no corresponding token in ``vocab``.
    """
    # Invert the mapping on every call so ids added since the last call resolve.
    token_by_id = dict(zip(vocab.values(), vocab.keys()))
    return ''.join([token_by_id[token_id] for token_id in tokenized_payload])

# Round-trip sanity check: decode each sample's token ids and print the sample
# index whenever the decoded text differs from the generated text.
for i in range(100):
    sqli = generate('start')
    payload_text, payload_ids = sqli
    if token_ids_to_original_payload(payload_ids, vocab) != payload_text:
        print(i)

import json

num_payloads = 1000000
sqli_set = set()        # payload strings already emitted, for de-duplication
output = []             # newline-terminated payload strings, in emission order
tokenized_output = []   # token-id lists, parallel to `output`

# Collect `num_payloads` distinct payloads.
# NOTE(review): the inner retry loop has no upper bound; it never finishes if
# the grammar cannot produce `num_payloads` distinct strings.
for i in range(num_payloads):
    sqli = generate('start')
    while sqli[0] in sqli_set:
        sqli = generate('start')

    payload_text, payload_ids = sqli
    sqli_set.add(payload_text)
    tokenized_output.append(payload_ids)
    output.append(f"{payload_text}\n")

    # Print the index of any payload whose ids do not decode back to its text.
    if token_ids_to_original_payload(payload_ids, vocab) != payload_text:
        print(i)

# Write both corpora after generation has finished.
with open('sqli.txt', 'w') as f:
    f.writelines(output)
with open('tokenized_sqli.json', 'w') as f:
    json.dump(tokenized_output, f)

# Save the token -> id mapping as pretty-printed JSON.
with open('vocab.json', 'w') as json_file:
    json_file.write(json.dumps(vocab, indent=4))