# Step 1. Corpus Preparation

In [None]:
import os, re, io, json, html, glob, shutil, tarfile, zipfile, urllib.request, random
from pathlib import Path
from collections import Counter
import math
import numpy as np
import torch


# Language tag used in output file and checkpoint names.
LANG_TAG = "tt"
# Candidate tokenizer vocabulary sizes; one BPE tokenizer is trained per entry.
VOCABS   = [8000, 16000, 32000]
# CHOSEN_VOCAB is assigned later, after the candidate tokenizers have been compared.
# Number of tokens per training example (also the model's maximum context length).
BLOCK_SIZE   = 512
# GPT-2 architecture hyper-parameters.
N_LAYER = 8
N_HEAD  = 8
N_EMBD  = 512
# Peak learning rate passed to the Trainer.
LR      = 3e-4
NUM_EPOCHS = 1   # number of passes over the training set
# Single seed shared by the RNG setup, tokenizer sampling and the train/val split.
SEED = 42

# Working directories, all rooted at the directory the notebook was started from.
ROOT      = Path.cwd()
DATA_RAW  = ROOT/"data/raw"
DATA_CLEAN= ROOT/"data/clean"
DATA_META = ROOT/"data/meta"
TOK_DIR   = ROOT/"tokenizer_bpe"
CKPT_DIR  = ROOT/f"checkpoints/{LANG_TAG}-gpt-small"

# Create every working directory up front; existing directories are left untouched.
for p in [DATA_RAW, DATA_CLEAN, DATA_META, TOK_DIR, CKPT_DIR]:
    p.mkdir(parents=True, exist_ok=True)

def set_seed(seed=SEED):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed applied, in order, to Python's ``random`` module,
            NumPy's legacy global RNG, PyTorch's CPU RNG and the RNGs of all
            CUDA devices.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
# Seed all RNGs once, using the default SEED.
set_seed()

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)
print("Working dir:", ROOT)

Device: cuda
Working dir: /content


## Corpus: Tatar mixed 2015 ([Leipzig Corpora Collection](https://downloads.wortschatz-leipzig.de/corpora/tat_mixed_2015_1M.tar.gz))
**Size: ~1 million sentences**

In [None]:
# Download settings: archives listed in URLS are fetched and unpacked into a
# scratch directory (TMP) inside the raw-data directory.
RAW_DIR = DATA_RAW
URLS = [
    "https://downloads.wortschatz-leipzig.de/corpora/tat_mixed_2015_1M.tar.gz",
]

# Scratch directory for the downloaded archives and their extracted contents.
TMP = RAW_DIR/"_tmp"
TMP.mkdir(parents=True, exist_ok=True)

def _download(url: str, out_path: Path, timeout: float = 60.0):
    """Download ``url`` to ``out_path``.

    The payload is streamed into a sibling ``<name>.part`` file and only moved
    to ``out_path`` once the transfer has finished, so an interrupted download
    never leaves a truncated archive under the final name.

    Args:
        url: Address to fetch (anything ``urllib.request.urlopen`` accepts).
        out_path: Destination file; overwritten if it already exists.
        timeout: Seconds to wait on a blocking network operation before
            ``urlopen`` gives up, so a stalled connection cannot hang the
            notebook indefinitely.

    Raises:
        urllib.error.URLError: If the resource cannot be fetched.
        OSError: If the destination cannot be written.
    """
    print("Downloading:", url)
    out_path = Path(out_path)
    part_path = out_path.with_name(out_path.name + ".part")
    try:
        with urllib.request.urlopen(url, timeout=timeout) as r, open(part_path, "wb") as f:
            shutil.copyfileobj(r, f)
        # Atomic rename: the final name only ever points at a complete file.
        os.replace(part_path, out_path)
    finally:
        # Only present here if the transfer failed part-way through.
        if part_path.exists():
            part_path.unlink()

def _extract(path: Path, to_dir: Path):
    """Unpack the archive at ``path`` into ``to_dir``.

    Supported formats are chosen from the file name: ``.zip``, ``.tar.gz`` /
    ``.tgz`` and ``.tar``. Files with any other extension are left alone
    (nothing is extracted and no error is raised).

    Tar archives are extracted with the ``"data"`` extraction filter when the
    running Python provides it. The filter refuses members that would be
    written outside ``to_dir`` (absolute paths, ``..`` components, escaping
    links) and avoids the warning newer Python versions emit when
    ``extractall`` is called without a filter.

    Args:
        path: Archive file to unpack.
        to_dir: Directory that receives the extracted members.
    """
    name = path.name.lower()
    if name.endswith(".zip"):
        with zipfile.ZipFile(path, "r") as z:
            z.extractall(to_dir)
    elif name.endswith((".tar.gz", ".tgz", ".tar")):
        # Plain ".tar" keeps the original transparent-compression mode.
        mode = "r" if name.endswith(".tar") else "r:gz"
        with tarfile.open(path, mode) as t:
            if hasattr(tarfile, "data_filter"):
                t.extractall(to_dir, filter="data")
            else:
                # Older Python without extraction filters: legacy behaviour.
                t.extractall(to_dir)

# Fetch every archive into TMP (named after the last URL path component)
# and unpack it next to the download.
for url in URLS:
    f = TMP/url.split("/")[-1]
    _download(url, f)
    _extract(f, TMP)

# Copy every .txt/.tsv file found anywhere under TMP into RAW_DIR (flattened).
# NOTE(review): an existing target is never overwritten; the copy gets a
# numeric suffix ("name_1.txt", "name_2.txt", ...) instead, so re-running this
# cell adds duplicate copies of the same files to RAW_DIR.
collected = 0
for p in TMP.rglob("*"):
    if p.is_file() and p.suffix.lower() in [".txt", ".tsv"]:
        tgt = RAW_DIR/p.name
        if tgt.exists():
            # Find the first unused "<stem>_<i><suffix>" name.
            i = 1
            while (RAW_DIR/f"{p.stem}_{i}{p.suffix}").exists():
                i += 1
            tgt = RAW_DIR/f"{p.stem}_{i}{p.suffix}"
        shutil.copy2(p, tgt)
        collected += 1

print("Collected files:", collected)
# Drop the scratch directory (archives + extracted tree); errors are ignored.
shutil.rmtree(TMP, ignore_errors=True)


Downloading: https://downloads.wortschatz-leipzig.de/corpora/tat_mixed_2015_1M.tar.gz


  t.extractall(to_dir)


Collected files: 7


# Cleaning, Filtering & Deduplication

Custom cleaning functions were implemented:

`basic_clean()` — decodes HTML entities, collapses runs of whitespace into a single space, and shortens punctuation repeated three or more times to two characters.

`too_numeric()` — filters out sentences in which more than 60 % of the alphanumeric characters are digits (and sentences with no alphanumeric characters at all).

`likely_tatarish()` — keeps a sentence if at least 55 % of its letters are Cyrillic, or if at least 35 % are Cyrillic and it contains a Tatar-specific letter (Ә, Ө, Ү, Җ, Ң, Һ).

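Finally, sentences are deduplicated: a normalized key (lower-cased, whitespace collapsed) is computed for each sentence, and only the first sentence with a given key is kept, in its original casing.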

In [None]:
import re, html
from pathlib import Path

# NOTE(review): these rebind names already set in the configuration cell.
# There the paths are built from ROOT (= Path.cwd() at that time); here they
# are relative to the current working directory, so both point at the same
# folders only while the working directory has not changed.
DATA_RAW = Path("data/raw")
DATA_CLEAN = Path("data/clean")
LANG_TAG = "tt"

def basic_clean(s: str) -> str:
    """Normalize one raw sentence.

    Steps, in order: decode HTML entities, turn carriage returns into
    newlines, collapse every run of whitespace into a single space, shorten
    any of ``! ? . , …`` repeated three or more times to exactly two, and
    strip leading/trailing whitespace.

    Args:
        s: Raw sentence text.

    Returns:
        The cleaned sentence.
    """
    text = html.unescape(s).replace("\r", "\n")
    text = re.sub(r"\s+", " ", text)
    return re.sub(r"([!?.,…])\1{2,}", r"\1\1", text).strip()

def too_numeric(s: str, max_ratio=0.6) -> bool:
    toks = [ch for ch in s if ch.isalnum()]
    if not toks:
        return True
    digits = sum(ch.isdigit() for ch in toks)
    return digits / len(toks) > max_ratio

# Any Cyrillic letter in the Russian alphabet plus the six Tatar-specific ones.
CYRIL = re.compile(r"[А-Яа-яЁёӘәӨөҮүҖҗҢңҺһ]")
# Letters in the Tatar Cyrillic alphabet that are absent from the Russian one.
TATAR_UNIQUE = re.compile(r"[ӘәӨөҮүҖҗҢңҺһ]")

def likely_tatarish(s: str, hard_ratio=0.55, soft_ratio=0.35) -> bool:
    """Heuristically decide whether a sentence is written in Tatar Cyrillic.

    The share of Cyrillic letters among all alphabetic characters is computed.
    The sentence is accepted when that share reaches ``hard_ratio``, or when it
    reaches ``soft_ratio`` and the sentence contains at least one
    Tatar-specific letter.

    Args:
        s: Sentence to inspect.
        hard_ratio: Cyrillic share that is sufficient on its own.
        soft_ratio: Cyrillic share that is sufficient together with a
            Tatar-specific letter.

    Returns:
        True if the sentence passes the heuristic, False otherwise (including
        sentences without any letters). Always a real ``bool``.
    """
    letters = [ch for ch in s if ch.isalpha()]
    if not letters:
        return False
    cyr = sum(1 for ch in letters if CYRIL.match(ch))
    ratio = cyr / len(letters)
    if ratio >= hard_ratio:
        return True
    # bool(...) so callers get True/False rather than a Match object or None.
    return ratio >= soft_ratio and bool(TATAR_UNIQUE.search(s))

# Locate the sentence file of the downloaded corpus anywhere under DATA_RAW;
# if several copies exist, the first match is used.
cands = list(DATA_RAW.rglob("tat_mixed_2015_1M-sentences.txt"))
assert cands, "not found tat_mixed_2015_1M-sentences.txt"
src = cands[0]

out_path = DATA_CLEAN / f"{LANG_TAG}_clean.txt"
out_path.parent.mkdir(parents=True, exist_ok=True)

# kept: sentences that pass every filter; total: raw lines read;
# pre: number of kept sentences before deduplication (always equals len(kept)).
kept, total, pre = [], 0, 0
# errors="ignore" silently drops bytes that are not valid UTF-8.
with open(src, "r", encoding="utf-8", errors="ignore") as fin:
    for line in fin:
        total += 1
        # Lines are tab-separated (presumably "<id>\t<sentence>" — verify
        # against the corpus); lines without a tab are skipped.
        parts = line.rstrip("\n").split("\t")
        if len(parts) < 2:
            continue
        sent = basic_clean(parts[-1])         # always take the last column
        if len(sent) < 3:
            continue
        if too_numeric(sent, max_ratio=0.6):
            continue
        if not likely_tatarish(sent):
            continue
        kept.append(sent); pre += 1

# Deduplication: compare on a lower-cased, whitespace-collapsed key, but keep
# the first occurrence in its original form and original order.
seen, uniq = set(), []
for s in kept:
    key = re.sub(r"\s+", " ", s.lower()).strip()
    if key not in seen:
        seen.add(key)
        uniq.append(s)

# One cleaned sentence per line.
with open(out_path, "w", encoding="utf-8") as fout:
    for s in uniq:
        fout.write(s + "\n")

print(f"Raw lines: {total} | kept (pre-dedup): {pre} | unique: {len(uniq)}")
print(f"Saved {len(uniq)} lines -> {out_path}")


Raw lines: 1000000 | kept (pre-dedup): 999367 | unique: 999197
Saved 999197 lines -> data/clean/tt_clean.txt


# Corpus Statistics (Summary)

In [None]:

def corpus_stats(lines):
    """Summarize a list of sentences by their character lengths.

    Args:
        lines: Sequence of strings, one sentence each.

    Returns:
        A dict with ``num_lines`` (sentence count), ``total_chars`` (sum of
        sentence lengths in characters), ``avg_len`` and ``median_len`` (mean
        and median sentence length; both 0 for an empty input).
    """
    char_counts = list(map(len, lines))
    if char_counts:
        mean_len = float(np.mean(char_counts))
        median_len = float(np.median(char_counts))
    else:
        mean_len = 0
        median_len = 0
    summary = {}
    summary["num_lines"] = len(lines)
    summary["total_chars"] = int(sum(char_counts))
    summary["avg_len"] = mean_len
    summary["median_len"] = median_len
    return summary

# Summarize the deduplicated sentences and persist the numbers as JSON.
stats = corpus_stats(uniq)
DATA_META.mkdir(parents=True, exist_ok=True)
with open(DATA_META/"stats.json", "w", encoding="utf-8") as f:
    json.dump(stats, f, ensure_ascii=False, indent=2)
# Bare expression: display the dict as the cell output.
stats


{'num_lines': 999197,
 'total_chars': 104168737,
 'avg_len': 104.25245171873014,
 'median_len': 95.0}

# Step 2. Training a Byte-level BPE Tokenizer from Scratch

In [None]:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import TemplateProcessing
from pathlib import Path
# Cleaned, deduplicated corpus (one sentence per line) written by the cleaning
# step; the path is relative to the current working directory.
CLEAN_FILE = Path("data/clean/tt_clean.txt")
# Fail early if the cleaning step has not been run.
assert CLEAN_FILE.exists()

def train_bpe_tokenizer(vocab_size: int, save_dir: Path):
    """Train a byte-level BPE tokenizer on ``CLEAN_FILE`` and save it.

    The tokenizer uses the ByteLevel pre-tokenizer together with the matching
    ByteLevel decoder, so that decoding token ids gives back readable text
    instead of the intermediate byte-level symbols. The trainer is seeded with
    the full byte-level alphabet so that every byte has a token even if it
    never occurs in the training corpus (otherwise such input would map to
    ``<unk>``).

    Every encoded sequence is wrapped as ``<bos> ... <eos>`` by the
    post-processor.

    Args:
        vocab_size: Target vocabulary size, including the four special tokens
            and the byte-level base alphabet.
        save_dir: Directory that receives ``tokenizer.json`` (created if
            missing).

    Returns:
        Path of the saved ``tokenizer.json``.
    """
    # Local import under an alias: the decoder class has the same name as the
    # pre-tokenizer class already imported by this cell.
    from tokenizers.decoders import ByteLevel as ByteLevelDecoder

    files = [str(CLEAN_FILE)]
    tok = Tokenizer(BPE(unk_token="<unk>"))
    tok.pre_tokenizer = ByteLevel()
    # Without a decoder, decode() returns byte-level symbols joined by spaces.
    tok.decoder = ByteLevelDecoder()
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["<pad>", "<bos>", "<eos>", "<unk>"],
        min_frequency=2,
        show_progress=True,
        # Guarantee coverage of all byte symbols.
        initial_alphabet=ByteLevel.alphabet(),
    )
    tok.train(files, trainer)
    tok.post_processor = TemplateProcessing(
        single="<bos> $A <eos>",
        pair="<bos> $A <eos> <bos> $B <eos>",
        special_tokens=[
            ("<bos>", tok.token_to_id("<bos>")),
            ("<eos>", tok.token_to_id("<eos>")),
        ],
    )
    save_dir.mkdir(parents=True, exist_ok=True)
    out_file = save_dir/"tokenizer.json"
    tok.save(str(out_file))
    return out_file

# Train one tokenizer per candidate vocabulary size.
# TOK_RUNS maps vocab size -> path of the saved tokenizer.json (as a string).
TOK_RUNS = {}
for vs in VOCABS:
    sd = TOK_DIR/f"bpe_{vs}"
    path = train_bpe_tokenizer(vs, sd)
    TOK_RUNS[vs] = str(path)
# Bare expression: display the mapping as the cell output.
TOK_RUNS


{8000: '/content/tokenizer_bpe/bpe_8000/tokenizer.json',
 16000: '/content/tokenizer_bpe/bpe_16000/tokenizer.json',
 32000: '/content/tokenizer_bpe/bpe_32000/tokenizer.json'}

In [None]:
# ===== 2.2 Tokenizer comparison metrics: avg tokens / sentence + low-frequency share (approximate) =====
from tokenizers import Tokenizer

def sample_lines(path, n=3000, seed=None):
    """Draw a reproducible random sample of lines from a text file.

    Args:
        path: UTF-8 text file with one sentence per line.
        n: Maximum number of lines to return.
        seed: Seed for the sampling RNG; when None, the notebook-wide ``SEED``
            is used.

    Returns:
        A list of at most ``n`` non-empty lines (without trailing newlines).
        If the file holds ``n`` lines or fewer, all of them are returned in
        file order; otherwise ``n`` distinct lines are drawn at random.
    """
    # Read from the file that was passed in rather than from a notebook
    # global, so the function works on a fresh kernel and for any corpus.
    with open(path, "r", encoding="utf-8") as fh:
        lines = [ln.rstrip("\n") for ln in fh]
    lines = [ln for ln in lines if ln]
    if len(lines) <= n:
        return lines
    rng = np.random.default_rng(SEED if seed is None else seed)
    picked = rng.choice(len(lines), n, replace=False)
    return [lines[i] for i in picked]

def eval_tokenizer(tok_json_path, sample, add_special_tokens=True):
    """Compute simple segmentation statistics for a saved tokenizer.

    Args:
        tok_json_path: Path to a ``tokenizer.json`` file.
        sample: Iterable of sentences to encode.
        add_special_tokens: Whether the tokenizer's post-processor output
            (its special tokens) is included in the counts. The default keeps
            them, so the average includes any special tokens the
            post-processor adds to each sentence.

    Returns:
        A tuple ``(avg_tok, tail_ratio, vocab_size)``:
        ``avg_tok`` is the mean number of tokens per sentence (0.0 for an
        empty sample); ``tail_ratio`` is the share of produced tokens whose id
        lies in the last 1000 ids of the vocabulary, used as a rough proxy for
        low-frequency tokens on the assumption that late ids come from late,
        rare merges; ``vocab_size`` is the tokenizer's vocabulary size.
    """
    tok = Tokenizer.from_file(tok_json_path)
    vocab_size = tok.get_vocab_size()
    # First id that belongs to the "tail" of the vocabulary.
    tail_start = max(0, vocab_size - 1000)

    lens = []
    tail_hits = 0
    total = 0
    # Single pass: each sentence is encoded once and feeds both metrics.
    for s in sample:
        ids = tok.encode(s, add_special_tokens=add_special_tokens).ids
        lens.append(len(ids))
        total += len(ids)
        tail_hits += sum(1 for i in ids if i >= tail_start)

    avg_tok = float(np.mean(lens)) if lens else 0.0
    tail_ratio = tail_hits / max(1, total)
    return avg_tok, tail_ratio, vocab_size

# Evaluate every trained tokenizer on the same random sample of sentences.
sample = sample_lines(CLEAN_FILE, n=3000)
# Rows: (requested vocab size, actual vocab size, avg tokens/sentence, tail ratio).
tok_stats = []
for vs, path in TOK_RUNS.items():
    avg_tok, tail_ratio, size = eval_tokenizer(path, sample)
    tok_stats.append((vs, size, round(avg_tok,2), round(tail_ratio,4)))
# Sort by requested vocabulary size for a stable, readable table.
tok_stats = sorted(tok_stats)
print("Vocab | size | avg_tokens_per_sent | tail_lowfreq_ratio")
for row in tok_stats:
    print(row)


Vocab | size | avg_tokens_per_sent | tail_lowfreq_ratio
(8000, 8000, 26.31, 0.025)
(16000, 16000, 23.46, 0.0088)
(32000, 32000, 21.68, 0.0028)


**Choice of Final Tokenizer (16k Vocabulary)**
- Increasing the vocabulary from 8k → 16k noticeably reduces the average number of tokens per sentence (26.31 → 23.46, ≈11%)  
  and lowers the share of tokens drawn from the low-frequency tail of the vocabulary, improving segmentation efficiency.
- However, further expansion to 32k provides only marginal gains  
  while doubling the embedding table size and memory cost.
- Therefore, **the 16k vocabulary offers the best balance**  
  between segmentation compactness and model simplicity.


In [None]:

from datasets import load_dataset
from transformers import PreTrainedTokenizerFast
# Vocabulary size selected from the tokenizer comparison.
CHOSEN_VOCAB = 16000
CHOSEN_TOK_PATH = TOK_RUNS[CHOSEN_VOCAB]
# Wrap the trained tokenizer for use with Transformers and declare which of
# its tokens play the BOS/EOS/UNK/PAD roles.
tok_fast = PreTrainedTokenizerFast(
    tokenizer_file=CHOSEN_TOK_PATH,
    bos_token="<bos>", eos_token="<eos>", unk_token="<unk>", pad_token="<pad>"
)
print("Vocab size =", tok_fast.vocab_size)

# Load the cleaned corpus as a line-per-example dataset and hold out 2% of the
# lines for validation; the split is seeded for reproducibility.
dataset = load_dataset("text", data_files={"train": str(CLEAN_FILE)})
split = dataset["train"].train_test_split(test_size=0.02, seed=SEED)
train_ds, val_ds = split["train"], split["test"]

def tokenize(examples):
    """Tokenize a batch of text lines for causal language modelling.

    Args:
        examples: Batch dict with a ``"text"`` list, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch, without ``token_type_ids``.
    """
    # A GPT-2 style model has no segment embeddings. If token_type_ids reach
    # the model it looks them up in the token embedding table and adds the
    # result to every position, so they are not produced at all here.
    return tok_fast(examples["text"], return_token_type_ids=False)

# Tokenize both splits in batches; the raw "text" column is dropped so only
# the tokenizer outputs remain.
tokenized_train = train_ds.map(tokenize, batched=True, remove_columns=["text"])
tokenized_val   = val_ds.map(tokenize,   batched=True, remove_columns=["text"])

def group_texts(examples, block_size=BLOCK_SIZE):
    """Concatenate a batch of tokenized examples and cut it into fixed blocks.

    All sequences of each column are joined end to end and split into
    consecutive chunks of ``block_size`` items. The remainder that does not
    fill a whole block is dropped.

    Args:
        examples: Batch dict mapping column name to a list of sequences; must
            contain ``"input_ids"``.
        block_size: Number of tokens per output block.

    Returns:
        A dict with the same columns, each a list of ``block_size``-long
        chunks, plus ``"labels"`` as a copy of the ``"input_ids"`` list.
    """
    from itertools import chain

    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies
    # the growing list on every addition and is quadratic in the batch size.
    concatenated = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = (len(concatenated["input_ids"]) // block_size) * block_size
    res = {k: [t[i:i+block_size] for i in range(0, total_length, block_size)]
           for k, t in concatenated.items()}
    res["labels"] = res["input_ids"].copy()
    return res

# Pack the tokenized sentences into fixed-length training/validation blocks.
lm_train = tokenized_train.map(group_texts, batched=True)
lm_val   = tokenized_val.map(group_texts,   batched=True)
# Bare expression: show the number of blocks in each split.
len(lm_train), len(lm_val)


Vocab size = 16000


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/979213 [00:00<?, ? examples/s]

Map:   0%|          | 0/19984 [00:00<?, ? examples/s]

Map:   0%|          | 0/979213 [00:00<?, ? examples/s]

Map:   0%|          | 0/19984 [00:00<?, ? examples/s]

(44935, 917)

## Step 3. Training a decoder-only GPT model

- Define a GPT2-like decoder-only model from scratch using the Transformers library.


- Use the self-trained Byte-level BPE tokenizer with a 16k vocabulary.


- Load the cleaned corpus and split it into 98% training and 2% validation subsets.


- Implement language modeling data chunking with a block size of 512 tokens.


- Use a network configuration of 8 layers × 8 attention heads × 512 hidden dimensions.


- Enable gradient checkpointing, bf16 mixed precision (if supported by the GPU), and a warmup ratio to stabilize the early training phase;
then run the training and save the model.



In [None]:


from transformers import GPT2Config, GPT2LMHeadModel
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer

# GPT-2 style decoder configured from the notebook constants.
config = GPT2Config(
    # len(tokenizer) also counts tokens added on top of the base vocabulary,
    # so the embedding table always covers every id the tokenizer can emit.
    vocab_size=len(tok_fast),
    n_positions=BLOCK_SIZE,
    n_ctx=BLOCK_SIZE,
    n_layer=N_LAYER,
    n_head=N_HEAD,
    n_embd=N_EMBD,
    # Dropout disabled everywhere.
    resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0,
    bos_token_id=tok_fast.bos_token_id, eos_token_id=tok_fast.eos_token_id,
    # Set explicitly so model and tokenizer agree from the start instead of
    # being re-aligned (with a warning) when the Trainer is built.
    pad_token_id=tok_fast.pad_token_id,
)
# Randomly initialised model (no pretrained weights), moved to the chosen device.
model = GPT2LMHeadModel(config).to(device)

# Causal-LM collator (mlm=False): labels are derived from the input ids.
collator = DataCollatorForLanguageModeling(tokenizer=tok_fast, mlm=False)

# bf16 needs hardware support, not just any CUDA device; requesting it on a
# GPU without bf16 support makes TrainingArguments fail.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

args = TrainingArguments(
    output_dir=str(CKPT_DIR),
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Effective batch per device = 8 * 2 = 16 blocks per optimizer step.
    gradient_accumulation_steps=2,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LR,
    warmup_ratio=0.03,
    logging_steps=50,
    # Evaluate and checkpoint every 200 steps; keep only the 2 newest checkpoints.
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    bf16=use_bf16,
    gradient_checkpointing=True,
    report_to="none"
)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer;
# the tokenizer is still saved alongside every checkpoint.
trainer = Trainer(
    model=model, args=args,
    train_dataset=lm_train, eval_dataset=lm_val,
    data_collator=collator, processing_class=tok_fast
)

# Run training, then write the final model and tokenizer into CKPT_DIR so the
# directory can be reloaded with from_pretrained().
train_out = trainer.train()
trainer.save_model(str(CKPT_DIR))
tok_fast.save_pretrained(str(CKPT_DIR))
print("Saved to:", CKPT_DIR)


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
200,6.2686,6.128636
400,5.5765,5.510249
600,5.2574,5.221183
800,5.0318,4.995275
1000,4.8307,4.823112
1200,4.716,4.684253
1400,4.5854,4.566749
1600,4.4929,4.476864
1800,4.4197,4.402056
2000,4.3463,4.338768


Saved to: /content/checkpoints/tt-gpt-small


# Model Evaluation

In [None]:
from math import exp
# Evaluate the trained model on the validation blocks.
eval_res = trainer.evaluate(eval_dataset=lm_val)
print(eval_res)
# Perplexity is reported as exp(eval_loss).
# NOTE(review): presumably eval_loss is the mean per-token cross-entropy in
# nats, which makes this a per-token perplexity tied to this tokenizer — confirm.
if "eval_loss" in eval_res:
    print("Perplexity:", exp(eval_res["eval_loss"]))


{'eval_loss': 4.190962314605713, 'eval_runtime': 20.7888, 'eval_samples_per_second': 44.11, 'eval_steps_per_second': 5.532, 'epoch': 1.0}
Perplexity: 66.08635623645245


# Pushing to the Hugging Face Hub

In [None]:
from transformers import GPT2LMHeadModel

# Target repository on the Hugging Face Hub ("<user>/<repo>"). It has to be
# defined here: no other cell sets it, so a fresh kernel would otherwise stop
# with a NameError. The value matches the URL printed by the recorded run.
# Change it to push to a different repository.
REPO_ID = "xinyuema/tt-gpt-small2"

# Reload the saved checkpoint so that exactly what is on disk gets uploaded.
model = GPT2LMHeadModel.from_pretrained(str(CKPT_DIR))
# Record the hub repository, rather than the local path, in the config.
model.config._name_or_path = REPO_ID

# Both calls need Hugging Face credentials with write access to REPO_ID.
tok_fast.push_to_hub(REPO_ID, commit_message="Add tokenizer")
model.push_to_hub(REPO_ID, commit_message="Add model weights and config")

print("Pushed:", f"https://huggingface.co/{REPO_ID}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...8ud2_zx/model.safetensors:   0%|          |  553kB /  135MB            

Pushed: https://huggingface.co/xinyuema/tt-gpt-small2
