In [None]:
!rm -rf all.txt

In [2]:
!pip install sentencepiece
!sudo apt-get update
!sudo apt-get install -y sentencepiece

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
Collecting sentencepiece
  Downloading http://mirrors.aliyun.com/pypi/packages/04/88/14f2f4a2b922d8b39be45bf63d79e6cd3a9b2f248b2fcb98a69b12af12f5/sentencepiece-0.2.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.1
[0m

In [1]:
#bash
spm_train \
  --input=all.txt \
  --model_prefix=lua-bpe-32k \
  --vocab_size=32768 \
  --character_coverage=0.9995 \
  --model_type=bpe \
  --byte_fallback=true \
  --split_digits=true \
  --split_by_unicode_script=true \
  --allow_whitespace_only_pieces=true \
  --remove_extra_whitespaces=false \
  --normalization_rule_name=nfkc \
  --unk_piece='<|TOKEN:unk|>' \
  --bos_piece='<|endoftext|>' \
  --eos_piece='<|im_end|>' \
  --pad_piece='<|TOKEN:pad|>' \
  --unk_id=0 \
  --bos_id=1 \
  --eos_id=2 \
  --pad_id=3

IndentationError: unexpected indent (3720292483.py, line 2)

In [None]:
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input='./all.txt',
    model_prefix='lua-bpe-32k',
    vocab_size=32*1024,
    character_coverage=0.9995,
    model_type='bpe',
    byte_fallback=True,
    split_digits=True,
    split_by_unicode_script=True,
    allow_whitespace_only_pieces=True,
    remove_extra_whitespaces=False,
    normalization_rule_name="nfkc",
    unk_piece="<|TOKEN:unk|>",
    bos_piece="<|endoftext|>",
    eos_piece="<|im_end|>",
    pad_piece="<|TOKEN:pad|>",
    unk_id=0,
    bos_id=1,
    eos_id=2,
    pad_id=3,
)

In [3]:
#!/usr/bin/env python3
"""
merge_lua.py
把 A 目录下所有 xxx.jsonl.gz 文件里的 "lua" 字段提取出来
写入到 all.txt（一行一条 lua 脚本内容）。
"""

import gzip
import json
import os
import glob
from pathlib import Path

SRC_DIR = Path("./")           # 原始数据目录
PATTERN = "*.jsonl.gz"        # 匹配规则
DST_FILE = Path("all.txt")    # 输出文件

def extract_lua_fields():
    # 如果目标文件已存在，可改为 'a' 追加；这里用 'w' 覆盖。
    with DST_FILE.open("w", encoding="utf-8") as fout:
        # 按文件名排序，保持确定性顺序（可选）
        for gz_path in sorted(SRC_DIR.glob(PATTERN)):
            print(f"Processing {gz_path} ...")
            with gzip.open(gz_path, "rt", encoding="utf-8") as fin:
                for line_no, line in enumerate(fin, 1):
                    line = line.strip()
                    if not line:          # 跳过空行
                        continue
                    try:
                        obj = json.loads(line)
                    except json.JSONDecodeError as e:
                        print(f"[WARN] {gz_path}:{line_no} JSON 解析失败：{e}")
                        continue
                    lua_code = obj.get("lua")
                    if lua_code is None:
                        print(f"[WARN] {gz_path}:{line_no} 没有 'lua' 字段")
                        continue
                    # 每段 lua 内容占一行；如内容本身含换行需额外处理
                    fout.write(lua_code.rstrip("\n") + "\n")
    print(f"Done. 全部 lua 内容已写入 {DST_FILE.resolve()}")

if __name__ == "__main__":
    extract_lua_fields()

Processing CODE-001.jsonl.gz ...
Processing CODE-002.jsonl.gz ...
Processing CODE-003.jsonl.gz ...
Processing CODE-004.jsonl.gz ...
Processing CODE-005.jsonl.gz ...
Done. 全部 lua 内容已写入 /root/autodl-tmp/all.txt


In [11]:
import sentencepiece as spm
# https://zhuanlan.zhihu.com/p/669328671
sp = spm.SentencePieceProcessor()
sp.load("./lua-bpe-32k/lua-bpe-32k.model")

print(sp.encode_as_pieces("这老者姓左，名叫子穆，是“无量剑”东宗的掌门。那道姑姓辛，道号双清，是“无量剑”西宗掌门。"))
print(sp.encode_as_ids("这老者姓左，名叫子穆，是“无量剑”东宗的掌门。那道姑姓辛，道号双清，是“无量剑”西宗掌门。"))

['▁', '这', '老', '者', '<0xE5>', '<0xA7>', '<0x93>', '左', ',', '名', '<0xE5>', '<0x8F>', '<0xAB>', '子', '<0xE7>', '<0xA9>', '<0x86>', ',', '是', '“', '无', '量', '剑', '”', '东', '<0xE5>', '<0xAE>', '<0x97>', '的', '<0xE6>', '<0x8E>', '<0x8C>', '门', '。', '那', '道', '<0xE5>', '<0xA7>', '<0x91>', '<0xE5>', '<0xA7>', '<0x93>', '<0xE8>', '<0xBE>', '<0x9B>', ',', '道', '号', '双', '清', ',', '是', '“', '无', '量', '剑', '”', '西', '<0xE5>', '<0xAE>', '<0x97>', '<0xE6>', '<0x8E>', '<0x8C>', '门', '。']
[30888, 31391, 32061, 31109, 233, 171, 151, 32014, 30900, 31177, 233, 147, 175, 31161, 235, 173, 138, 30900, 31117, 31331, 31353, 31245, 31834, 31299, 32555, 233, 178, 155, 31025, 234, 146, 144, 31976, 31050, 32281, 31688, 233, 171, 149, 233, 171, 151, 236, 194, 159, 30900, 31688, 31694, 32299, 31970, 30900, 31117, 31331, 31353, 31245, 31834, 31299, 32092, 233, 178, 155, 234, 146, 144, 31976, 31050]


In [None]:
!pip uninstall protobuf
!pip install protobuf==3.20.3

In [None]:
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"]="python"

from transformers import LlamaTokenizer
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
import sentencepiece as spm

llama_tokenizer_dir = 'llama-2-7b-bin'
chinese_sp_model_file = './lua-bpe-32k/lua-bpe-32k.model'

# 分词器加载
llama_tokenizer = LlamaTokenizer.from_pretrained(llama_tokenizer_dir)
chinese_sp_model = spm.SentencePieceProcessor()
chinese_sp_model.Load(chinese_sp_model_file)

# 解析
llama_spm = sp_pb2_model.ModelProto()
llama_spm.ParseFromString(llama_tokenizer.sp_model.serialized_model_proto())
chinese_spm = sp_pb2_model.ModelProto()
chinese_spm.ParseFromString(chinese_sp_model.serialized_model_proto())

# 词表长度
print(len(llama_tokenizer),len(chinese_sp_model))

# 添加新token到llama词表
llama_spm_tokens_set=set(p.piece for p in llama_spm.pieces)
print(len(llama_spm_tokens_set))
print(f"Before:{len(llama_spm_tokens_set)}")
for p in chinese_spm.pieces:
    piece = p.piece
    if piece not in llama_spm_tokens_set:
        new_p = sp_pb2_model.ModelProto().SentencePiece()
        new_p.piece = piece
        new_p.score = 0
        llama_spm.pieces.append(new_p)
print(f"New model pieces: {len(llama_spm.pieces)}")

output_sp_dir = '../merged_tokenizer_sp'
output_hf_dir = '../merged_tokenizer_hf'

vocab_content = ''
for p in llama_spm.pieces:
    vocab_content += f"{p.piece} {p.score}\n"
# 保存词表
with open(output_sp_dir+'/llama.vocab', "w", encoding="utf-8") as f:
    f.write(vocab_content)
# 保存spm模型
with open(output_sp_dir+'/llama.model', 'wb') as f:
    f.write(llama_spm.SerializeToString())

# 保存llama新tokenizer
tokenizer = LlamaTokenizer(vocab_file=output_sp_dir+'/llama.model')
tokenizer.save_pretrained(output_hf_dir)
print(f"Chinese-LLaMA tokenizer has been saved to {output_hf_dir}")

In [None]:
!zip -r lua-bpe-32k.zip lua-bpe-32k.model lua-bpe-32k.vocab

In [3]:
!pip install "transformers[sentencepiece]" "protobuf" "tokenizers" "transformers"

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
[0m

In [3]:
!transformers-cli convert --model_type gpt2 --tokenizer_name_or_path ./lua-bpe-32k.model --output_dir ./hf-tokenizer --tokenizer_class PreTrainedTokenizerFast

usage: transformers <command> [<args>] convert [-h] --model_type MODEL_TYPE
                                               --tf_checkpoint TF_CHECKPOINT
                                               --pytorch_dump_output
                                               PYTORCH_DUMP_OUTPUT
                                               [--config CONFIG]
                                               [--finetuning_task_name FINETUNING_TASK_NAME]
transformers <command> [<args>] convert: error: the following arguments are required: --tf_checkpoint, --pytorch_dump_output


In [2]:
import subprocess
import os
# source /etc/network_turbo
result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [4]:
!wget --no-check-certificate https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py

--2025-08-23 18:38:34--  https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py
Connecting to 10.37.1.23:12798... connected.
  Self-signed certificate encountered.
Proxy request sent, awaiting response... 200 OK
Length: 6257 (6.1K) [text/plain]
Saving to: ‘sentencepiece_model_pb2.py’


2025-08-23 18:38:35 (190 KB/s) - ‘sentencepiece_model_pb2.py’ saved [6257/6257]



In [None]:
!pip uninstall protobuf
!pip install "protobuf<3.20"
!pip install tokenizers sentencepiece transformers
!pip install tiktoken

In [None]:
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

# 选择适合的 Tokenizer 类，通常 SentencePiece 适合 T5Tokenizer / LlamaTokenizer
from transformers import LlamaTokenizerFast# T5Tokenizer

# 加载 SentencePiece model
tokenizer = LlamaTokenizerFast(
    vocab_file="lua-bpe-32k/lua-bpe-32k.model",
    unk_token="<|TOKEN:unk|>",
    bos_token="<|endoftext|>",
    eos_token="<|im_end|>",
    pad_token="<|TOKEN:pad|>",
)

# 保存为 Huggingface 规范目录，会自动生成 tokenizer.json
tokenizer.save_pretrained("lua-bpe-32k-hf")

# 然后加载 fast 版本，并自动读取 tokenizer.json
from transformers import AutoTokenizer
fast_tokenizer = AutoTokenizer.from_pretrained("lua-bpe-32k-hf")
fast_tokenizer.save_pretrained("lua-bpe-32k-hf-V2")


('lua-bpe-32k-hf-V2/tokenizer_config.json',
 'lua-bpe-32k-hf-V2/special_tokens_map.json',
 'lua-bpe-32k-hf-V2/tokenizer.model',
 'lua-bpe-32k-hf-V2/added_tokens.json',
 'lua-bpe-32k-hf-V2/tokenizer.json')

In [None]:
TOKEN_SET = set()
class OpMode:
    iNone = -1
    iABC = 0
    iABx = 1
    iAsBx = 2
    iAx = 3
    @staticmethod
    def tostr(mode):
        if mode == OpMode.iABC:
            return "iABC"
        elif mode == OpMode.iABx:
            return "iABx"
        elif mode == OpMode.iAsBx:
            return "iAsBx"
        elif mode == OpMode.iAx:
            return "iAx"
        else:
            return "iNone"
class OpArgMask:
    OpArgNone = -1
    OpArgN = 0
    OpArgU = 1
    OpArgR = 2
    OpArgK = 3
    @staticmethod
    def tostr(mask):
        if mask == OpArgMask.OpArgN:
            return "OpArgN"
        elif mask == OpArgMask.OpArgU:
            return "OpArgU"
        elif mask == OpArgMask.OpArgR:
            return "OpArgR"
        elif mask == OpArgMask.OpArgK:
            return "OpArgK"
        else:
            return "OpArgNone"
OpDefines = [
    # op-code | op-name | T | A | B | C | mode | inline | jump
    (0, "OP_MOVE", 0, 1, OpArgMask.OpArgR, OpArgMask.OpArgN, OpMode.iABC,
     lambda i: False, False),
    (1, "OP_LOADK", 0, 1, OpArgMask.OpArgK, OpArgMask.OpArgN, OpMode.iABx,
     lambda i: False, False),
    (2, "OP_LOADKX", 0, 1, OpArgMask.OpArgN, OpArgMask.OpArgN, OpMode.iABx,
     lambda i: True, False),
    (3, "OP_LOADBOOL", 0, 1, OpArgMask.OpArgU, OpArgMask.OpArgU, OpMode.iABC,
     lambda i: i.C != 0, False),
    (4, "OP_LOADNIL", 0, 1, OpArgMask.OpArgU, OpArgMask.OpArgN, OpMode.iABC,
     lambda i: False, False),
    (5, "OP_GETUPVAL", 0, 1, OpArgMask.OpArgU, OpArgMask.OpArgN, OpMode.iABC,
     lambda i: False, False),
    (6, "OP_GETTABUP", 0, 1, OpArgMask.OpArgU, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (7, "OP_GETTABLE", 0, 1, OpArgMask.OpArgR, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (8, "OP_SETTABUP", 0, 0, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (9, "OP_SETUPVAL", 0, 0, OpArgMask.OpArgU, OpArgMask.OpArgN, OpMode.iABC,
     lambda i: False, False),
    (10, "OP_SETTABLE", 0, 0, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (11, "OP_NEWTABLE", 0, 1, OpArgMask.OpArgU, OpArgMask.OpArgU, OpMode.iABC,
     lambda i: False, False),
    (12, "OP_SELF", 0, 1, OpArgMask.OpArgR, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (13, "OP_ADD", 0, 1, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (14, "OP_SUB", 0, 1, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (15, "OP_MUL", 0, 1, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (16, "OP_MOD", 0, 1, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (17, "OP_POW", 0, 1, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (18, "OP_DIV", 0, 1, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (19, "OP_IDIV", 0, 1, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (20, "OP_BAND", 0, 1, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (21, "OP_BOR", 0, 1, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (22, "OP_BXOR", 0, 1, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (23, "OP_SHL", 0, 1, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (24, "OP_SHR", 0, 1, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: False, False),
    (25, "OP_UNM", 0, 1, OpArgMask.OpArgR, OpArgMask.OpArgN, OpMode.iABC,
     lambda i: False, False),
    (26, "OP_BNOT", 0, 1, OpArgMask.OpArgR, OpArgMask.OpArgN, OpMode.iABC,
     lambda i: False, False),
    (27, "OP_NOT", 0, 1, OpArgMask.OpArgR, OpArgMask.OpArgN, OpMode.iABC,
     lambda i: False, False),
    (28, "OP_LEN", 0, 1, OpArgMask.OpArgR, OpArgMask.OpArgN, OpMode.iABC,
     lambda i: False, False),
    (29, "OP_CONCAT", 0, 1, OpArgMask.OpArgR, OpArgMask.OpArgR, OpMode.iABC,
     lambda i: False, False),
    (30, "OP_JMP", 0, 0, OpArgMask.OpArgR, OpArgMask.OpArgN, OpMode.iAsBx,
     lambda i: False, True),
    (31, "OP_EQ", 1, 0, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: True, False),
    (32, "OP_LT", 1, 0, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: True, False),
    (33, "OP_LE", 1, 0, OpArgMask.OpArgK, OpArgMask.OpArgK, OpMode.iABC,
     lambda i: True, False),
    (34, "OP_TEST", 1, 0, OpArgMask.OpArgN, OpArgMask.OpArgU, OpMode.iABC,
     lambda i: True, False),
    (35, "OP_TESTSET", 1, 1, OpArgMask.OpArgR, OpArgMask.OpArgU, OpMode.iABC,
     lambda i: True, False),
    (36, "OP_CALL", 0, 1, OpArgMask.OpArgU, OpArgMask.OpArgU, OpMode.iABC,
     lambda i: False, False),
    (37, "OP_TAILCALL", 0, 1, OpArgMask.OpArgU, OpArgMask.OpArgU, OpMode.iABC,
     lambda i: False, False),
    (38, "OP_RETURN", 0, 0, OpArgMask.OpArgU, OpArgMask.OpArgN, OpMode.iABC,
     lambda i: False, False),
    (39, "OP_FORLOOP", 0, 1, OpArgMask.OpArgR, OpArgMask.OpArgN, OpMode.iAsBx,
     lambda i: False, True),
    (40, "OP_FORPREP", 0, 1, OpArgMask.OpArgR, OpArgMask.OpArgN, OpMode.iAsBx,
     lambda i: False, True),
    (41, "OP_TFORCALL", 0, 0, OpArgMask.OpArgN, OpArgMask.OpArgU, OpMode.iABC,
     lambda i: True, False),
    (42, "OP_TFORLOOP", 0, 1, OpArgMask.OpArgR, OpArgMask.OpArgN, OpMode.iAsBx,
     lambda i: False, True),
    (43, "OP_SETLIST", 0, 0, OpArgMask.OpArgU, OpArgMask.OpArgU, OpMode.iABC,
     lambda i: i.C == 0, False),
    (44, "OP_CLOSURE", 0, 1, OpArgMask.OpArgU, OpArgMask.OpArgN, OpMode.iABx,
     lambda i: False, False),
    (45, "OP_VARARG", 0, 1, OpArgMask.OpArgU, OpArgMask.OpArgN, OpMode.iABC,
     lambda i: False, False),
    (46, "OP_EXTRAARG", 0, 0, OpArgMask.OpArgU, OpArgMask.OpArgU, OpMode.iAx,
     lambda i: False, False),
    # 超长常规定义范围的指令
    (-1, "OP_UNKNOWN", 0, 0, OpArgMask.OpArgNone, OpArgMask.OpArgNone, OpMode.iNone,
     lambda i: False, False),
]
for opdef in OpDefines:
    TOKEN_SET.add(f"<|{opdef[1]}|>")
    TOKEN_SET.add(f"<|Instruction-T={opdef[2]}|>")
    TOKEN_SET.add(f"<|Instruction-A={opdef[3]}|>")
    TOKEN_SET.add(f"<|Instruction-B={OpArgMask.tostr(opdef[4])}|>")
    TOKEN_SET.add(f"<|Instruction-C={OpArgMask.tostr(opdef[5])}|>")
    TOKEN_SET.add(f"<|Instruction-MODE={OpMode.tostr(opdef[6])}|>")
TOKEN_SET.add("<|Instruction|>")
TOKEN_SET.add("<|Instruction-B-ISK=true|>")
TOKEN_SET.add("<|Instruction-B-ISK=false|>")
TOKEN_SET.add("<|Instruction-C-ISK=true|>")
TOKEN_SET.add("<|Instruction-C-ISK=false|>")
TOKEN_SET.add("<|Instruction-INLINE=true|>")
TOKEN_SET.add("<|Instruction-INLINE=false|>")
TOKEN_SET.add("<|Instruction-JUMP=true|>")
TOKEN_SET.add("<|Instruction-JUMP=false|>")
TOKEN_SET.add("<|Instruction-Bx|>")
TOKEN_SET.add("<|/Instruction-Bx|>")
TOKEN_SET.add("<|Instruction-sBx|>")
TOKEN_SET.add("<|/Instruction-sBx|>")
TOKEN_SET.add("<|Instruction-Ax|>")
TOKEN_SET.add("<|/Instruction-Ax|>")
TOKEN_SET.add("<|/Instruction|>")
TOKEN_SET.add("<|Constant|>")
# 存在：nil|true|false
TOKEN_SET.add("<|/Constant|>")
TOKEN_SET.add("<|Upvaldesc|>")
TOKEN_SET.add("<|/Upvaldesc|>")
TOKEN_SET.add("<|Upvaldesc-name|>")
TOKEN_SET.add("<|/Upvaldesc-name|>")
TOKEN_SET.add("<|NULL|>")
TOKEN_SET.add("<|LocVar|>")
TOKEN_SET.add("<|/LocVar|>")
TOKEN_SET.add("<|LocVar-varname|>")
TOKEN_SET.add("<|/LocVar-varname|>")
TOKEN_SET.add("<|LocVar-startpc|>")
TOKEN_SET.add("<|/LocVar-startpc|>")
TOKEN_SET.add("<|LocVar-endpc|>")
TOKEN_SET.add("<|/LocVar-endpc|>")
TOKEN_SET.add("<|LineInfo|>")
TOKEN_SET.add("<|LineInfo-pad|>")
TOKEN_SET.add("<|/LineInfo|>")
TOKEN_SET.add("<|Proto|>")
TOKEN_SET.add("<|/Proto|>")
TOKEN_SET.add("<|Proto-sizeupvalues|>")
TOKEN_SET.add("<|/Proto-sizeupvalues|>")
TOKEN_SET.add("<|Proto-sizek|>")
TOKEN_SET.add("<|/Proto-sizek|>")
TOKEN_SET.add("<|Proto-sizecode|>")
TOKEN_SET.add("<|/Proto-sizecode|>")
TOKEN_SET.add("<|Proto-sizelineinfo|>")
TOKEN_SET.add("<|/Proto-sizelineinfo|>")
TOKEN_SET.add("<|Proto-sizep|>")
TOKEN_SET.add("<|/Proto-sizep|>")
TOKEN_SET.add("<|Proto-sizelocvars|>")
TOKEN_SET.add("<|/Proto-sizelocvars|>")
TOKEN_SET.add("<|Proto-linedefined|>")
TOKEN_SET.add("<|/Proto-linedefined|>")
TOKEN_SET.add("<|Proto-lastlinedefined|>")
TOKEN_SET.add("<|/Proto-lastlinedefined|>")
TOKEN_SET.add("<|Proto-k|>")
TOKEN_SET.add("<|/Proto-k|>")
TOKEN_SET.add("<|Proto-k-idx|>")
TOKEN_SET.add("<|/Proto-k-idx|>")
TOKEN_SET.add("<|Proto-k-idx|>")
TOKEN_SET.add("<|/Proto-k-idx|>")
TOKEN_SET.add("<|Proto-code|>")
TOKEN_SET.add("<|/Proto-code|>")
TOKEN_SET.add("<|Proto-code-idx|>")
TOKEN_SET.add("<|/Proto-code-idx|>")
TOKEN_SET.add("<|Jump-Target|>")
TOKEN_SET.add("<|/Jump-Target|>")
TOKEN_SET.add("<|Proto-lineinfo|>")
TOKEN_SET.add("<|/Proto-lineinfo|>")
TOKEN_SET.add("<|Proto-locvars|>")
TOKEN_SET.add("<|/Proto-locvars|>")
TOKEN_SET.add("<|Proto-locvars-idx|>")
TOKEN_SET.add("<|/Proto-locvars-idx|>")
TOKEN_SET.add("<|Proto-upvalues|>")
TOKEN_SET.add("<|/Proto-upvalues|>")
TOKEN_SET.add("<|Proto-upvalues-idx|>")
TOKEN_SET.add("<|/Proto-upvalues-idx|>")
TOKEN_SET.add("<|Proto-source|>")
TOKEN_SET.add("<|/Proto-source|>")
TOKEN_SET.add("<|Proto-p|>")
TOKEN_SET.add("<|/Proto-p|>")
TOKEN_SET.add("<|Proto-p-idx|>")
TOKEN_SET.add("<|/Proto-p-idx|>")
for i in range(0, 256):
    TOKEN_SET.add(f"<|Instruction-B-K={i}|>")
    TOKEN_SET.add(f"<|Instruction-B-R={i}|>")
    TOKEN_SET.add(f"<|Instruction-C-K={i}|>")
    TOKEN_SET.add(f"<|Instruction-C-R={i}|>")
    TOKEN_SET.add(f"<|Instruction-A={i}|>")
    TOKEN_SET.add(f"\\x{i:02X}")
    TOKEN_SET.add(f"<|Upvaldesc-instack={i}|>")
    TOKEN_SET.add(f"<|Upvaldesc-idx={i}|>")
    TOKEN_SET.add(f"<|Proto-nupvalues={i}|>")
    TOKEN_SET.add(f"<|Proto-numparams={i}|>")
    TOKEN_SET.add(f"<|Proto-is_vararg={i}|>")
    TOKEN_SET.add(f"<|Proto-maxstacksize={i}|>")
for i in range(0, 512):
    TOKEN_SET.add(f"<|Instruction-B={i}|>")
    TOKEN_SET.add(f"<|Instruction-C={i}|>")
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("lua-bpe-32k-hf-V2")
with open("Atokens.txt", "w") as f:
    for token in sorted(TOKEN_SET):
        f.write(token + "\n")
special_tokens = {
    "additional_special_tokens": list(sorted(TOKEN_SET))+["<|im_start|>","user\n","assistant\n"]
}
tokenizer.add_special_tokens(special_tokens)
tokenizer.save_pretrained("lua-bpe-32k-Add")

# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained("lua-bpe-32k-Add")
print(len(tokenizer))
print("DEMO:", tokenizer.tokenize("function A= <|NULL|>"))

37008
DEMO: ['▁function', '▁A', '=', '▁', '<|NULL|>']


In [14]:
!zip -r lua-bpe-32k-Add.zip lua-bpe-32k-Add

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: lua-bpe-32k-Add/ (stored 0%)
  adding: lua-bpe-32k-Add/tokenizer_config.json (deflated 95%)
  adding: lua-bpe-32k-Add/special_tokens_map.json (deflated 98%)
  adding: lua-bpe-32k-Add/added_tokens.json (deflated 85%)
  adding: lua-bpe-32k-Add/tokenizer.model (deflated 49%)
  adding: lua-bpe-32k-Add/tokenizer.json (deflated 87%)
  adding: lua-bpe-32k-Add/.ipynb_checkpoints/ (stored 0%)
  adding: lua-bpe-32k-Add/.ipynb_checkpoints/added_tokens-checkpoint.json (deflated 85%)
  adding: lua-bpe-32k-Add/.ipynb_checkpoints/special_tokens_map-checkpoint.json (deflated 98%)
