In [None]:
# Import necessary packages
from tokenizers import Tokenizer, AddedToken
from transformers import PreTrainedTokenizerFast, AutoTokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
import sentencepiece as spm
import regex as re
from itertools import chain
from copy import deepcopy
import json
from pprint import pprint
import os
import sys

# Define an example text

In [None]:
# Example corpus for the tokenizer experiments below: a single OpenQASM 3.0
# program that defines the gates mcx, mcmt, Oracle and Diffuser, applies
# Oracle/Diffuser three times to a 4-qubit register and measures every qubit.
texts = ["""OPENQASM 3.0;
include "stdgates.inc";
gate mcx _gate_q_0, _gate_q_1, _gate_q_2, _gate_q_3 {
  h _gate_q_3;
  p(pi/8) _gate_q_0;
  p(pi/8) _gate_q_1;
  p(pi/8) _gate_q_2;
  p(pi/8) _gate_q_3;
  cx _gate_q_0, _gate_q_1;
  p(-pi/8) _gate_q_1;
  cx _gate_q_0, _gate_q_1;
  cx _gate_q_1, _gate_q_2;
  p(-pi/8) _gate_q_2;
  cx _gate_q_0, _gate_q_2;
  p(pi/8) _gate_q_2;
  cx _gate_q_1, _gate_q_2;
  p(-pi/8) _gate_q_2;
  cx _gate_q_0, _gate_q_2;
  cx _gate_q_2, _gate_q_3;
  p(-pi/8) _gate_q_3;
  cx _gate_q_1, _gate_q_3;
  p(pi/8) _gate_q_3;
  cx _gate_q_2, _gate_q_3;
  p(-pi/8) _gate_q_3;
  cx _gate_q_0, _gate_q_3;
  p(pi/8) _gate_q_3;
  cx _gate_q_2, _gate_q_3;
  p(-pi/8) _gate_q_3;
  cx _gate_q_1, _gate_q_3;
  p(pi/8) _gate_q_3;
  cx _gate_q_2, _gate_q_3;
  p(-pi/8) _gate_q_3;
  cx _gate_q_0, _gate_q_3;
  h _gate_q_3;
}
gate mcmt _gate_q_0, _gate_q_1, _gate_q_2, _gate_q_3 {
  h _gate_q_3;
  mcx _gate_q_0, _gate_q_1, _gate_q_2, _gate_q_3;
  h _gate_q_3;
}
gate Oracle _gate_q_0, _gate_q_1, _gate_q_2, _gate_q_3 {
  x _gate_q_0;
  x _gate_q_1;
  x _gate_q_2;
  x _gate_q_3;
  mcmt _gate_q_0, _gate_q_1, _gate_q_2, _gate_q_3;
  x _gate_q_0;
  x _gate_q_1;
  x _gate_q_2;
  x _gate_q_3;
}
gate Diffuser _gate_q_0, _gate_q_1, _gate_q_2, _gate_q_3 {
  h _gate_q_0;
  h _gate_q_1;
  h _gate_q_2;
  h _gate_q_3;
  x _gate_q_0;
  x _gate_q_1;
  x _gate_q_2;
  x _gate_q_3;
  h _gate_q_3;
  mcx _gate_q_0, _gate_q_1, _gate_q_2, _gate_q_3;
  h _gate_q_3;
  x _gate_q_0;
  x _gate_q_1;
  x _gate_q_2;
  x _gate_q_3;
  h _gate_q_0;
  h _gate_q_1;
  h _gate_q_2;
  h _gate_q_3;
}
bit[4] c;
qubit[4] q;
h q[0];
h q[1];
h q[2];
h q[3];
Oracle q[0], q[1], q[2], q[3];
Diffuser q[0], q[1], q[2], q[3];
Oracle q[0], q[1], q[2], q[3];
Diffuser q[0], q[1], q[2], q[3];
Oracle q[0], q[1], q[2], q[3];
Diffuser q[0], q[1], q[2], q[3];
c[0] = measure q[0];
c[1] = measure q[1];
c[2] = measure q[2];
c[3] = measure q[3];
"""]

# Test the Base Tokenizer

In [None]:
# Load the base (unextended) tokenizer from its on-disk location
tokenizer = AutoTokenizer.from_pretrained("/home/cm/model/Meta-Llama-3-8B-Instruct")

for text in texts:
    # Encode to ids, then decode every id on its own so that the individual
    # token boundaries become visible in the printed list.
    tokens_encoded = tokenizer.encode(text)
    tokens_decoded = [tokenizer.decode(token) for token in tokens_encoded]

    print('\n--------------\nNumber of tokens:', len(tokens_decoded))
    print(tokens_decoded)

In [None]:
# Show how the currently loaded tokenizer splits a representative indexed qubit name.
print(tokenizer.tokenize("_gate_q_14")) 

# Define and Test the Tokenization Rule

In [None]:
def _tokenize_line(command):
    command = command.strip()
    if not command:
        return []

    # gate
    if command.startswith("gate"):
        gate_match = re.match(r"gate\s+(\w+)(?:\s*\((.*?)\))?\s+([^{]+)\s*{", command)
        if not gate_match:
            raise SyntaxError(f"Invalid gate definition: {command}")
        gate_name = gate_match.group(1)
        params_part = gate_match.group(2) or ""
        qubits_part = gate_match.group(3)
        params = [p.strip() for p in params_part.split(",") if p.strip()]
        qubits = [q.strip() for q in qubits_part.split(",") if q.strip()]
        tokens = ["gate", gate_name] + params + qubits + ["{"]
        # process the digit after the token
        tokens = [re.sub(r'^(_gate_q_|unitary_|mcx_vchain_)\d+$', r'\1', t) for t in tokens]
        return tokens

    groups = re.match(r"^(\w+)(?:\((.*?)\))?\s+([^;]+);", command)
    if groups:
        op_name = groups.group(1)
        params = groups.group(2)
        targets = groups.group(3)
        tokens = [op_name]
        if params:
            tokens += ["("] + [p.strip() for p in params.split(",")] + [")"]
        tokens += [t.strip() for t in targets.split(",")]
        tokens = [token for token in tokens if token]
        # process the digit after the token
        tokens = [re.sub(r'^(_gate_q_|unitary_|mcx_vchain_)\d+$', r'\1', t) for t in tokens]
        return tokens

    if command == "}":
        return ["}"]

    raise SyntaxError(f"Unrecognized command: {command}")


# Apply the rule to the example program, one non-blank line at a time
text = texts[0]
lines = list(filter(None, map(str.strip, text.split("\n"))))

total_tokens = []
for line in lines:
    try:
        total_tokens += _tokenize_line(line)
    except SyntaxError as err:
        # Report lines the rule cannot parse and move on to the next one
        print(f"Error in line: {line}")
        print(err)

print(total_tokens)

In [None]:
# Vocabulary of the tokenizer as currently loaded; consulted below so that
# tokens it already contains are not collected again.
base_vocab = tokenizer.get_vocab()

def load_json_to_list(file_path):
    """Read a JSON file and return its decoded content.

    The file is always decoded as UTF-8 rather than with the platform's
    default encoding, so the result does not depend on the machine's locale.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file. The caller in this
        notebook iterates over it as a list of strings.
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        return json.load(fh)

# Collect candidate tokens from the gathered QASM programs
file_path = 'inputs_list_all_0.json'
data_list = load_json_to_list(file_path)

# new_vocab keeps first-seen order (it is what gets handed to the tokenizer);
# seen_tokens mirrors it as a set so the duplicate check is a constant-time
# lookup instead of a scan over the growing list.
new_vocab = []
seen_tokens = set()
skipped_lines = 0

for data_item in data_list:
    for line in data_item.split("\n"):
        line = line.strip()
        if not line:
            continue
        try:
            tokens = _tokenize_line(line)
        except SyntaxError:
            # Best-effort: a line the rule cannot parse contributes no tokens,
            # but it is counted so the amount of dropped input stays visible.
            skipped_lines += 1
            continue
        for token in tokens:
            if token and token not in base_vocab and token not in seen_tokens:
                seen_tokens.add(token)
                new_vocab.append(token)

print(new_vocab)
print(f"Skipped {skipped_lines} unparseable line(s)")

# Extend the tokenizer with the collected tokens and write it to disk
tokenizer.add_tokens(new_vocab)

tokenizer.save_pretrained('./Grover_Extend_Tokenizer')

# Test the New Tokenizer

In [None]:
from pprint import pprint

for text in texts:
    # Decode each id separately so the token boundaries are visible
    tokens_encoded = tokenizer.encode(text)
    tokens_decoded = [tokenizer.decode(token) for token in tokens_encoded]

    print('\n--------------\nNumber of tokens:', len(tokens_decoded))
    pprint(tokens_decoded)


##### The new tokenizer files can now be copied into ~/model/Meta-Llama-3-8B-Instruct/. Back up the original tokenizer files to another location first so that they remain available for later use.