In [5]:
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Dict, Optional, List, Tuple, Iterable
from collections import Counter, defaultdict
import heapq
import math
import re

# ML / data
import numpy as np
from scipy.sparse import lil_matrix, csr_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfTransformer


# ======================================================
# 0. Простая токенизация по словам
# ======================================================

# A token is either a run of word characters or one non-space, non-word symbol.
WORD_RE = re.compile(r"\w+|[^\w\s]", re.UNICODE)


def tokenize(text: str) -> List[str]:
    """Lower-case ``text`` and split it into word and punctuation tokens.

    Whitespace is discarded; every punctuation character becomes its own token.
    """
    lowered = text.lower()
    return WORD_RE.findall(lowered)


# ======================================================
# 1. Обобщённый Хаффман (для слов)
# ======================================================

@dataclass(order=True)
class HuffmanNode:
    """Node of the word-level Huffman tree.

    Only ``freq`` takes part in ordering and equality, so nodes can be stored
    directly in a ``heapq`` priority queue.
    """

    # Occurrence count of a leaf token, or the summed count of a merged subtree.
    freq: int
    # Token text on a leaf; None on an internal node.
    token: Optional[str] = field(default=None, compare=False)
    # Child followed on a "0" bit.
    left: Optional["HuffmanNode"] = field(default=None, compare=False)
    # Child followed on a "1" bit.
    right: Optional["HuffmanNode"] = field(default=None, compare=False)


def build_frequency_table_tokens(tokens: Iterable[str]) -> Dict[str, int]:
    """Count occurrences of each token and return them as a plain dict.

    Keys appear in first-seen order.
    """
    tally: Counter = Counter()
    tally.update(tokens)
    return dict(tally)


def build_huffman_tree(freq_table: Dict[str, int]) -> HuffmanNode:
    """Build a Huffman tree from a token -> frequency table.

    The two least frequent nodes are merged repeatedly until one root remains.
    A table with a single token yields a root whose only (left) child is that
    token's leaf, so the token still receives a one-bit code.

    Args:
        freq_table: mapping from token to its occurrence count.

    Returns:
        The root node of the tree.

    Raises:
        ValueError: if ``freq_table`` is empty (no tree can be built).
    """
    # Fail with a clear message instead of an IndexError from ``heap[0]`` below.
    if not freq_table:
        raise ValueError("freq_table must contain at least one token")

    heap: List[HuffmanNode] = [
        HuffmanNode(count, token) for token, count in freq_table.items()
    ]
    heapq.heapify(heap)

    # Single unique token: wrap the leaf so that it sits one level below the root.
    if len(heap) == 1:
        only = heap[0]
        return HuffmanNode(only.freq, None, only, None)

    while len(heap) > 1:
        lighter = heapq.heappop(heap)
        heavier = heapq.heappop(heap)
        heapq.heappush(
            heap,
            HuffmanNode(lighter.freq + heavier.freq, None, lighter, heavier),
        )

    return heap[0]


def build_code_table_tokens(root: HuffmanNode) -> Dict[str, str]:
    """Map every leaf token of the tree to its bit string.

    A left edge contributes "0" and a right edge "1". A leaf reached with an
    empty path (the root itself is a leaf) is given the code "0".
    """
    table: Dict[str, str] = {}
    pending = [(root, "")]

    while pending:
        current, path = pending.pop()
        if current.token is not None:
            table[current.token] = path if path else "0"
            continue
        # Push the right child first so the left subtree is visited first,
        # which keeps the same pre-order as a left-then-right recursion.
        if current.right:
            pending.append((current.right, path + "1"))
        if current.left:
            pending.append((current.left, path + "0"))

    return table


def huffman_encode_tokens(tokens: List[str], code_table: Dict[str, str]) -> str:
    """Concatenate the bit strings of ``tokens`` in order.

    Raises KeyError for a token that is missing from ``code_table``.
    """
    pieces = [code_table[tok] for tok in tokens]
    return "".join(pieces)


def huffman_decode_tokens(bits: str, root: HuffmanNode) -> List[str]:
    """Decode a Huffman bit string back into tokens by walking the tree.

    Each "0" follows the left child; any other character follows the right
    child. Reaching a leaf emits its token and restarts from the root.
    Trailing bits that do not complete a code are ignored.

    Args:
        bits: string of "0"/"1" characters.
        root: root of the Huffman tree used for encoding.

    Returns:
        The decoded tokens in order.

    Raises:
        ValueError: if a bit leads to a missing child, i.e. the bit string is
            not valid for this tree.
    """
    decoded: List[str] = []
    node = root
    for position, bit in enumerate(bits):
        node = node.left if bit == "0" else node.right
        # A missing child (e.g. "1" in a single-token tree) used to surface as
        # an AttributeError on None; report the offending bit instead.
        if node is None:
            raise ValueError(
                f"bit {bit!r} at position {position} does not match any Huffman code"
            )
        if node.token is not None:
            decoded.append(node.token)
            node = root
    return decoded


# ======================================================
# 2. Биты -> целые токены, и обратно
# ======================================================

def bits_to_int_tokens(bits: str, chunk_size: int = 8) -> Tuple[List[int], int]:
    """Split a bit string into fixed-width integer tokens.

    The string is right-padded with zeros up to a multiple of ``chunk_size``
    and every chunk is parsed as a base-2 integer.

    Args:
        bits: string of "0"/"1" characters.
        chunk_size: number of bits per integer token; must be at least 1.

    Returns:
        ``(tokens, padding)`` where ``padding`` is the number of zero bits
        that were appended.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    # Zero would divide by zero; a negative size would silently return no
    # tokens together with a negative padding.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    padding = -len(bits) % chunk_size
    padded = bits + "0" * padding
    tokens = [
        int(padded[start:start + chunk_size], 2)
        for start in range(0, len(padded), chunk_size)
    ]
    return tokens, padding


def int_tokens_to_bits(tokens: List[int], padding: int, chunk_size: int = 8) -> str:
    """Turn fixed-width integer tokens back into a bit string.

    Each token is rendered as exactly ``chunk_size`` binary digits and the
    last ``padding`` bits are dropped.

    Args:
        tokens: integers in ``[0, 2**chunk_size)``.
        padding: number of trailing bits to strip; must not be negative.
        chunk_size: number of bits per token; must be at least 1.

    Returns:
        The reconstructed bit string.

    Raises:
        ValueError: if ``chunk_size`` < 1, ``padding`` < 0, or a token does not
            fit in ``chunk_size`` bits.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    if padding < 0:
        raise ValueError(f"padding must be non-negative, got {padding}")

    limit = 1 << chunk_size
    chunks: List[str] = []
    for value in tokens:
        # An out-of-range value would format to a different width (or with a
        # minus sign) and shift every following chunk out of alignment.
        if not 0 <= value < limit:
            raise ValueError(f"token {value} does not fit in {chunk_size} bits")
        chunks.append(f"{value:0{chunk_size}b}")

    bits = "".join(chunks)
    return bits[:-padding] if padding else bits


# ======================================================
# 3. Word-level Huffman Tokenizer (с UNK)
# ======================================================

class WordHuffmanTokenizer:
    """Word-level Huffman codec that emits fixed-width integer tokens.

    A Huffman code is built over the tokens of the training corpus. New texts
    are encoded to one concatenated bit string, which is then cut into
    ``chunk_size``-bit integers. Tokens not seen in the corpus are replaced by
    the ``unk_token`` entry.
    """

    def __init__(self, corpus_texts: List[str], chunk_size: int = 8, unk_token: str = "<unk>"):
        self.chunk_size = chunk_size
        self.UNK = unk_token

        # Stream the corpus tokens straight into the frequency counter.
        counts = build_frequency_table_tokens(
            tok for doc in corpus_texts for tok in tokenize(doc)
        )
        # Give out-of-vocabulary words a code of their own.
        counts.setdefault(self.UNK, 1)

        self.freq_table = counts
        self.tree = build_huffman_tree(counts)
        self.code_table = build_code_table_tokens(self.tree)

    def encode_to_int_tokens(self, text: str) -> Tuple[List[int], int]:
        """Encode ``text`` and return ``(int_tokens, padding_bits)``."""
        codes = self.code_table
        unknown = self.UNK
        words = [word if word in codes else unknown for word in tokenize(text)]
        bit_string = huffman_encode_tokens(words, codes)
        return bits_to_int_tokens(bit_string, self.chunk_size)

    def decode_from_int_tokens(self, int_tokens: List[int], padding: int) -> str:
        """Decode integer tokens back to text, joining tokens with single spaces."""
        bit_string = int_tokens_to_bits(int_tokens, padding, self.chunk_size)
        return " ".join(huffman_decode_tokens(bit_string, self.tree))


# ======================================================
# 4. Co-occurrence word embeddings (как было)
# ======================================================

def build_cooccurrence(
    corpus_texts: List[str],
    window_size: int = 1,
    min_count: int = 1,
) -> Tuple[Dict[str, Dict[str, float]], Dict[str, int]]:
    """Build symmetric-window co-occurrence counts in two passes.

    Pass 1 counts global token frequencies. Pass 2 counts, for every token
    whose frequency is at least ``min_count``, the tokens found within
    ``window_size`` positions on either side of it (context tokens are not
    filtered by frequency).

    Returns:
        ``(cooc, token_counts)``: ``cooc[word][context]`` is the float count
        (nested ``defaultdict``), ``token_counts`` is a plain dict.
    """
    # Pass 1: global frequencies.
    token_counts: Counter = Counter(
        tok for doc in corpus_texts for tok in tokenize(doc)
    )

    cooc: Dict[str, Dict[str, float]] = defaultdict(lambda: defaultdict(float))

    # Pass 2: neighbours of sufficiently frequent words.
    for doc in corpus_texts:
        tokens = tokenize(doc)
        total = len(tokens)
        for pos, word in enumerate(tokens):
            if token_counts[word] >= min_count:
                lo = max(0, pos - window_size)
                hi = min(total, pos + window_size + 1)
                # Positions left of the word, then right of it; the word itself is skipped.
                for j in [*range(lo, pos), *range(pos + 1, hi)]:
                    cooc[word][tokens[j]] += 1.0

    return cooc, dict(token_counts)


def apply_tfidf_like_weights(
    cooc: Dict[str, Dict[str, float]],
    token_counts: Dict[str, int],
) -> Dict[str, Dict[str, float]]:
    """Down-weight co-occurrence counts of frequent context tokens.

    Every count is divided by ``log(1 + count_of_context)``; a context missing
    from ``token_counts`` is treated as having count 1. Returns new plain dicts
    and leaves ``cooc`` unmodified.
    """
    return {
        word: {
            ctx: count / math.log(1.0 + token_counts.get(ctx, 1))
            for ctx, count in neighbours.items()
        }
        for word, neighbours in cooc.items()
    }


def build_dense_embedding_matrix(
    weighted_cooc: Dict[str, Dict[str, float]],
    top_k: int = 500,
) -> Tuple[List[str], List[List[float]]]:
    """Turn sparse co-occurrence vectors into dense rows.

    The ``top_k`` words with the most distinct contexts are kept (ties keep
    their original order). Columns are the sorted union of those words'
    contexts.

    Returns:
        ``(words, matrix)`` where ``matrix[i]`` is the dense vector of ``words[i]``.
    """
    ranked = sorted(
        weighted_cooc,
        key=lambda word: len(weighted_cooc[word]),
        reverse=True,  # the sort stays stable, so ties keep insertion order
    )
    selected = ranked[:top_k]

    columns = sorted({ctx for word in selected for ctx in weighted_cooc[word]})
    column_of = {ctx: idx for idx, ctx in enumerate(columns)}
    width = len(column_of)

    rows: List[List[float]] = []
    for word in selected:
        row = [0.0] * width
        for ctx, value in weighted_cooc[word].items():
            # Every context of a selected word is a column by construction.
            row[column_of[ctx]] = value
        rows.append(row)

    return selected, rows


# ======================================================
# 5. Tokenization pipelines for classification
# ======================================================

def baseline_word_tokenizer(corpus_texts: List[str], min_freq: int = 3) -> Dict[str, int]:
    """Build the baseline vocabulary: token -> integer index.

    Tokens that occur at least ``min_freq`` times in ``corpus_texts`` receive
    indices 1..|V| in first-seen order. Index 0 is left free for PAD/UNK.
    """
    counts: Counter = Counter(
        tok for doc in corpus_texts for tok in tokenize(doc)
    )
    frequent = (tok for tok, seen in counts.items() if seen >= min_freq)
    return {tok: index for index, tok in enumerate(frequent, start=1)}


def baseline_encode(text: str, vocab: Dict[str, int]) -> List[int]:
    """Map each token of ``text`` to its vocabulary index; unknown tokens map to 0."""
    index_of = vocab.get
    return [index_of(tok, 0) for tok in tokenize(text)]


def prepare_dataset_baseline(
    texts: List[str],
    vocab: Dict[str, int],
) -> List[List[int]]:
    """Encode every text with the baseline vocabulary, preserving input order."""
    encoded: List[List[int]] = []
    for document in texts:
        encoded.append(baseline_encode(document, vocab))
    return encoded


def prepare_dataset_huffman(
    texts: List[str],
    huffman_tok: WordHuffmanTokenizer,
) -> Tuple[List[List[int]], List[int]]:
    """Encode every text with the Huffman tokenizer.

    Returns:
        ``(sequences, paddings)``: two parallel lists holding the integer
        tokens and the number of padding bits for each text.
    """
    encoded = [huffman_tok.encode_to_int_tokens(document) for document in texts]
    sequences = [ints for ints, _ in encoded]
    paddings = [pad for _, pad in encoded]
    return sequences, paddings


def seqs_to_bow(seqs: List[List[int]], vocab_size: int) -> csr_matrix:
    """Build a sparse bag-of-tokens count matrix.

    Row ``i`` holds, for every token ID in ``[0, vocab_size)``, how often it
    occurs in ``seqs[i]``. IDs outside that range are ignored. For the
    baseline tokenizer ``vocab_size`` is ``|V| + 1``.

    Args:
        seqs: one list of integer token IDs per document.
        vocab_size: number of columns.

    Returns:
        A ``float32`` CSR matrix of shape ``(len(seqs), vocab_size)``.
    """
    # Aggregate counts per row in Python and build the CSR matrix once from
    # (row, col, value) triplets; updating a sparse matrix element by element
    # is far slower on long documents.
    row_idx: List[int] = []
    col_idx: List[int] = []
    values: List[int] = []
    for row, seq in enumerate(seqs):
        counts = Counter(tok for tok in seq if 0 <= tok < vocab_size)
        for tok, count in counts.items():
            row_idx.append(row)
            col_idx.append(tok)
            values.append(count)

    return csr_matrix(
        (
            np.asarray(values, dtype=np.float32),
            (np.asarray(row_idx, dtype=np.int64), np.asarray(col_idx, dtype=np.int64)),
        ),
        shape=(len(seqs), vocab_size),
        dtype=np.float32,
    )


# ---------- стабильный хэш для n-грамм байтов ----------

def stable_hash_ints(vals: Tuple[int, ...]) -> int:
    """Deterministic 32-bit hash of a tuple of integers (FNV-like mixing).

    Unlike the built-in ``hash`` the result does not vary between runs.
    """
    mask = 0xFFFFFFFF
    state = 2166136261  # FNV-like starting value
    for item in vals:
        mixed = (item + 0x9e3779b9) & mask
        state = ((state ^ mixed) * 16777619) & mask
    return state


def seqs_to_hashed_ngram_features(
    seqs: List[List[int]],
    n_features: int = 50000,
    ngram_min: int = 1,
    ngram_max: int = 3,
) -> csr_matrix:
    """Hash integer-token n-grams into a fixed-width sparse count matrix.

    For every sequence, all contiguous n-grams with ``ngram_min <= n <=
    ngram_max`` are hashed with ``stable_hash_ints`` into ``n_features``
    buckets and counted. It is the hashing trick applied to integer tokens
    rather than to text. The n-gram length is part of the hashed key, so
    n-grams of different lengths are hashed separately.

    Args:
        seqs: one list of integer tokens per document.
        n_features: number of hash buckets (columns).
        ngram_min: smallest n-gram length.
        ngram_max: largest n-gram length.

    Returns:
        A ``float32`` CSR matrix of shape ``(len(seqs), n_features)``.
    """
    # Count buckets per row and build the CSR matrix once from triplets;
    # incrementing a sparse matrix element by element dominates the runtime
    # on long sequences.
    row_idx: List[int] = []
    col_idx: List[int] = []
    values: List[int] = []

    for row, seq in enumerate(seqs):
        length = len(seq)
        if length == 0:
            continue
        buckets: Counter = Counter()
        for n in range(ngram_min, ngram_max + 1):
            if length < n:
                continue
            for start in range(length - n + 1):
                # Prefix with n so n-grams of different lengths use different keys.
                key = (n,) + tuple(seq[start:start + n])
                buckets[stable_hash_ints(key) % n_features] += 1
        for bucket, count in buckets.items():
            row_idx.append(row)
            col_idx.append(bucket)
            values.append(count)

    return csr_matrix(
        (
            np.asarray(values, dtype=np.float32),
            (np.asarray(row_idx, dtype=np.int64), np.asarray(col_idx, dtype=np.int64)),
        ),
        shape=(len(seqs), n_features),
        dtype=np.float32,
    )


# ======================================================
# 6. Демо: кодек + эмбеддинги
# ======================================================

def run_demo() -> None:
    """Print a Huffman round trip and co-occurrence embedding stats for a toy corpus.

    Trains ``WordHuffmanTokenizer`` on five sentences, encodes and decodes one
    of them, then builds weighted co-occurrence vectors for the same corpus and
    prints the number of non-zero dimensions of the first five words.
    """
    corpus = [
        "This is a simple example of Huffman word-level compression.",
        "Huffman coding works on tokens with different frequencies.",
        "We can use Huffman codes as a tokenizer before a downstream model.",
        "Neighbour-based co-occurrence can be used to create word embeddings.",
        "Compression and tokenization are important for efficient NLP models.",
    ]

    tokenizer = WordHuffmanTokenizer(corpus_texts=corpus, chunk_size=8)

    sample_text = "Huffman coding works on tokens with different frequencies."
    int_tokens, padding = tokenizer.encode_to_int_tokens(sample_text)
    print("Sample text:", sample_text)
    print("Huffman int tokens:", int_tokens)
    print("Length (Huffman):", len(int_tokens))

    print("Recovered text:", tokenizer.decode_from_int_tokens(int_tokens, padding))

    cooc, token_counts = build_cooccurrence(corpus, window_size=1, min_count=1)
    words, vectors = build_dense_embedding_matrix(
        apply_tfidf_like_weights(cooc, token_counts),
        top_k=20,
    )

    print("\nWord embedding demo (first 5 words):")
    for word, vector in zip(words[:5], vectors[:5]):
        active = sum(1 for value in vector if abs(value) > 1e-8)
        print(f"{word}: {active} non-zero dims")


# ======================================================
# 7. Классификация на ДАТАСЕТЕ
# ======================================================

def load_binary_text_dataset() -> Tuple[List[str], np.ndarray]:
    """Load a two-class text dataset from 20newsgroups.

    Only the ``rec.autos`` and ``rec.sport.hockey`` categories are fetched
    (all subsets), with headers, footers and quotes removed.

    Returns:
        ``(texts, labels)`` where ``labels`` holds the 0/1 class targets.
    """
    bunch = fetch_20newsgroups(
        subset="all",
        categories=["rec.autos", "rec.sport.hockey"],
        remove=("headers", "footers", "quotes"),
    )
    return bunch.data, bunch.target


def run_classification_comparison_dataset() -> None:
    """Compare the baseline word tokenizer with the Huffman tokenizer on real text.

    Baseline: word indices -> BOW -> TF-IDF -> LinearSVC.
    Huffman: Huffman integer tokens -> hashed n-grams (1..3) -> TF-IDF ->
    LinearSVC(C=2.0).

    Prints both test accuracies, the average training sequence lengths and
    their ratio.
    """
    texts, labels = load_binary_text_dataset()

    train_texts, test_texts, y_train, y_test = train_test_split(
        texts,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )

    print(f"\nDataset sizes: train={len(train_texts)}, test={len(test_texts)}")

    def tfidf_svc_accuracy(train_counts: csr_matrix, test_counts: csr_matrix, classifier: LinearSVC) -> float:
        """Fit TF-IDF and the classifier on the train counts; return test accuracy."""
        weighting = TfidfTransformer()
        train_matrix = weighting.fit_transform(train_counts)
        test_matrix = weighting.transform(test_counts)
        classifier.fit(train_matrix, y_train)
        return accuracy_score(y_test, classifier.predict(test_matrix))

    def mean_length(seqs: List[List[int]]) -> float:
        """Average number of tokens per sequence."""
        return sum(map(len, seqs)) / len(seqs)

    # ----- 1) Baseline -----
    vocab = baseline_word_tokenizer(train_texts, min_freq=3)
    train_seqs_base = prepare_dataset_baseline(train_texts, vocab)
    test_seqs_base = prepare_dataset_baseline(test_texts, vocab)

    bow_width = len(vocab) + 1  # index 0 is the UNK column
    acc_base = tfidf_svc_accuracy(
        seqs_to_bow(train_seqs_base, bow_width),
        seqs_to_bow(test_seqs_base, bow_width),
        LinearSVC(),
    )

    # ----- 2) Huffman: hashed n-grams (1..3) of integer tokens -----
    huff_tok = WordHuffmanTokenizer(train_texts, chunk_size=8)
    train_seqs_huff, _ = prepare_dataset_huffman(train_texts, huff_tok)
    test_seqs_huff, _ = prepare_dataset_huffman(test_texts, huff_tok)

    # Feature-space size and n-gram range for the hashing trick.
    hashing = dict(n_features=50000, ngram_min=1, ngram_max=3)

    # Regularisation is loosened slightly (larger C) so the model can make
    # better use of the many hashed features.
    acc_huff = tfidf_svc_accuracy(
        seqs_to_hashed_ngram_features(train_seqs_huff, **hashing),
        seqs_to_hashed_ngram_features(test_seqs_huff, **hashing),
        LinearSVC(C=2.0),
    )

    # ----- sequence lengths -----
    avg_len_base = mean_length(train_seqs_base)
    avg_len_huff = mean_length(train_seqs_huff)

    print("\n=== Classification comparison on real text dataset (TF-IDF + LinearSVC) ===")
    print(f"Baseline accuracy: {acc_base:.4f}")
    print(f"Huffman  accuracy: {acc_huff:.4f}")
    print(f"Average seq length (baseline train): {avg_len_base:.2f}")
    print(f"Average seq length (Huffman  train): {avg_len_huff:.2f}")
    print(f"Compression ratio (base / huff): {avg_len_base / avg_len_huff:.2f}")


# ======================================================
# 8. Запуск демо
# ======================================================

# Script entry point: run the codec/embedding demo, then the classification comparison.
if __name__ == "__main__":
    run_demo()
    run_classification_comparison_dataset()

Sample text: Huffman coding works on tokens with different frequencies.
Huffman int tokens: [61, 21, 181, 140, 101, 15, 128]
Length (Huffman): 7
Recovered text: huffman coding works on tokens with different frequencies .

Word embedding demo (first 5 words):
a: 6 non-zero dims
-: 6 non-zero dims
huffman: 5 non-zero dims
.: 5 non-zero dims
word: 4 non-zero dims

Dataset sizes: train=1591, test=398





=== Classification comparison on real text dataset (TF-IDF + LinearSVC) ===
Baseline accuracy: 0.9347
Huffman  accuracy: 0.6658
Average seq length (baseline train): 203.61
Average seq length (Huffman  train): 240.91
Compression ratio (base / huff): 0.85




In [7]:
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Dict, Optional, List, Tuple, Iterable
from collections import Counter
import heapq
import re

import numpy as np
from scipy.sparse import lil_matrix, csr_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfTransformer

# ======================================================
# 0. Токенизация
# ======================================================

# A token is either a run of word characters or one non-space, non-word symbol.
WORD_RE = re.compile(r"\w+|[^\w\s]", re.UNICODE)


def tokenize(text: str) -> List[str]:
    """Lower-case ``text`` and split it into word and punctuation tokens.

    Whitespace is discarded; every punctuation character becomes its own token.
    """
    lowered = text.lower()
    return WORD_RE.findall(lowered)


# ======================================================
# 1. Хаффман-дерево по словам
# ======================================================

@dataclass(order=True)
class HuffmanNode:
    """Node of the word-level Huffman tree.

    Only ``freq`` takes part in ordering and equality, so nodes can be stored
    directly in a ``heapq`` priority queue.
    """

    # Occurrence count of a leaf token, or the summed count of a merged subtree.
    freq: int
    # Token text on a leaf; None on an internal node.
    token: Optional[str] = field(default=None, compare=False)
    # Child followed on a "0" bit.
    left: Optional["HuffmanNode"] = field(default=None, compare=False)
    # Child followed on a "1" bit.
    right: Optional["HuffmanNode"] = field(default=None, compare=False)


def build_frequency_table_tokens(tokens: Iterable[str]) -> Dict[str, int]:
    """Count occurrences of each token and return them as a plain dict.

    Keys appear in first-seen order.
    """
    tally: Counter = Counter()
    tally.update(tokens)
    return dict(tally)


def build_huffman_tree(freq_table: Dict[str, int]) -> HuffmanNode:
    """Build a Huffman tree from a token -> frequency table.

    The two least frequent nodes are merged repeatedly until one root remains.
    A table with a single token yields a root whose only (left) child is that
    token's leaf, so the token still receives a one-bit code.

    Args:
        freq_table: mapping from token to its occurrence count.

    Returns:
        The root node of the tree.

    Raises:
        ValueError: if ``freq_table`` is empty (no tree can be built).
    """
    # Fail with a clear message instead of an IndexError from ``heap[0]`` below.
    if not freq_table:
        raise ValueError("freq_table must contain at least one token")

    heap: List[HuffmanNode] = [
        HuffmanNode(count, token) for token, count in freq_table.items()
    ]
    heapq.heapify(heap)

    # Single unique token: wrap the leaf so that it sits one level below the root.
    if len(heap) == 1:
        only = heap[0]
        return HuffmanNode(only.freq, None, only, None)

    while len(heap) > 1:
        lighter = heapq.heappop(heap)
        heavier = heapq.heappop(heap)
        heapq.heappush(
            heap,
            HuffmanNode(lighter.freq + heavier.freq, None, lighter, heavier),
        )

    return heap[0]


def build_code_table_tokens(root: HuffmanNode) -> Dict[str, str]:
    """Map every leaf token of the tree to its Huffman bit string.

    A left edge contributes "0" and a right edge "1". A leaf reached with an
    empty path (the root itself is a leaf) is given the code "0".
    """
    table: Dict[str, str] = {}
    pending = [(root, "")]

    while pending:
        current, path = pending.pop()
        if current.token is not None:
            table[current.token] = path if path else "0"
            continue
        # Push the right child first so the left subtree is visited first,
        # which keeps the same pre-order as a left-then-right recursion.
        if current.right:
            pending.append((current.right, path + "1"))
        if current.left:
            pending.append((current.left, path + "0"))

    return table


# ======================================================
# 2. Baseline word-tokenizer
# ======================================================

def baseline_word_tokenizer(corpus_texts: List[str], min_freq: int = 3) -> Dict[str, int]:
    """Build the baseline vocabulary: token -> integer index.

    Tokens that occur at least ``min_freq`` times in ``corpus_texts`` receive
    indices 1..|V| in first-seen order. Index 0 is left free for PAD/UNK.
    """
    counts: Counter = Counter(
        tok for doc in corpus_texts for tok in tokenize(doc)
    )
    frequent = (tok for tok, seen in counts.items() if seen >= min_freq)
    return {tok: index for index, tok in enumerate(frequent, start=1)}


def baseline_encode(text: str, vocab: Dict[str, int]) -> List[int]:
    """Map each token of ``text`` to its vocabulary index; unknown tokens map to 0."""
    index_of = vocab.get
    return [index_of(tok, 0) for tok in tokenize(text)]


def prepare_dataset_baseline(
    texts: List[str],
    vocab: Dict[str, int],
) -> List[List[int]]:
    """Encode every text with the baseline vocabulary, preserving input order."""
    encoded: List[List[int]] = []
    for document in texts:
        encoded.append(baseline_encode(document, vocab))
    return encoded


# ======================================================
# 3. Word-aligned Huffman tokenizer
# ======================================================

class WordAlignedHuffmanTokenizer:
    """Word-level Huffman tokenizer in which every word stays a single token.

    A Huffman code is built over the corpus words, but instead of emitting the
    bits, each word is mapped to one integer ID. IDs are assigned in order of
    increasing code length, so frequent words get small IDs.
    """

    def __init__(self, corpus_texts: List[str], unk_token: str = "<unk>"):
        self.UNK = unk_token

        # 1-2) Stream all corpus tokens into the frequency table.
        counts = build_frequency_table_tokens(
            tok for doc in corpus_texts for tok in tokenize(doc)
        )

        # 3) Out-of-vocabulary words share one dedicated entry.
        counts.setdefault(self.UNK, 1)
        self.freq_table = counts

        # 4) Huffman tree and code table (token -> bits).
        self.tree = build_huffman_tree(counts)
        self.code_table = build_code_table_tokens(self.tree)

        # 5) token -> integer ID, ordered by (code length, code bits).
        codes = self.code_table
        ordered = sorted(codes, key=lambda tok: (len(codes[tok]), codes[tok]))
        self.token_to_id: Dict[str, int] = {
            tok: index for index, tok in enumerate(ordered, start=1)
        }

        # +1 leaves slot 0 unassigned (earmarked for PAD/UNK); the UNK token
        # itself receives a regular ID from the code table.
        self.vocab_size: int = len(self.token_to_id) + 1

    def encode(self, text: str) -> List[int]:
        """Convert ``text`` into a list of integer IDs, one per word.

        Words without an ID are mapped to the ID of the UNK token.
        """
        id_of = self.token_to_id.get
        fallback = id_of(self.UNK, 0)
        return [id_of(tok, fallback) for tok in tokenize(text)]


def prepare_dataset_huffman_word(
    texts: List[str],
    huff_tok: WordAlignedHuffmanTokenizer,
) -> List[List[int]]:
    """Encode every text with the word-aligned Huffman tokenizer, preserving order."""
    encoded: List[List[int]] = []
    for document in texts:
        encoded.append(huff_tok.encode(document))
    return encoded


# ======================================================
# 4. Общая функция: seqs -> BOW
# ======================================================

def seqs_to_bow(seqs: List[List[int]], vocab_size: int) -> csr_matrix:
    """Build a sparse bag-of-tokens count matrix.

    Row ``i`` holds, for every token ID in ``[0, vocab_size)``, how often it
    occurs in ``seqs[i]``. IDs outside that range are ignored. ``vocab_size``
    is the largest token ID plus one.

    Args:
        seqs: one list of integer token IDs per document.
        vocab_size: number of columns.

    Returns:
        A ``float32`` CSR matrix of shape ``(len(seqs), vocab_size)``.
    """
    # Aggregate counts per row in Python and build the CSR matrix once from
    # (row, col, value) triplets; updating a sparse matrix element by element
    # is far slower on long documents.
    row_idx: List[int] = []
    col_idx: List[int] = []
    values: List[int] = []
    for row, seq in enumerate(seqs):
        counts = Counter(tok for tok in seq if 0 <= tok < vocab_size)
        for tok, count in counts.items():
            row_idx.append(row)
            col_idx.append(tok)
            values.append(count)

    return csr_matrix(
        (
            np.asarray(values, dtype=np.float32),
            (np.asarray(row_idx, dtype=np.int64), np.asarray(col_idx, dtype=np.int64)),
        ),
        shape=(len(seqs), vocab_size),
        dtype=np.float32,
    )


# ======================================================
# 5. Датасет и сравнение
# ======================================================

def load_binary_text_dataset() -> Tuple[List[str], np.ndarray]:
    """Load a two-class text dataset from 20newsgroups.

    Only the ``rec.autos`` and ``rec.sport.hockey`` categories are fetched
    (all subsets), with headers, footers and quotes removed.

    Returns:
        ``(texts, labels)`` where ``labels`` holds the 0/1 class targets.
    """
    bunch = fetch_20newsgroups(
        subset="all",
        categories=["rec.autos", "rec.sport.hockey"],
        remove=("headers", "footers", "quotes"),
    )
    return bunch.data, bunch.target


def run_classification_comparison_word_aligned() -> None:
    """Compare baseline word indices with word-aligned Huffman token IDs.

    1) Baseline word indices.
    2) Word-aligned Huffman tokens (one word = one token) whose IDs follow
       the Huffman code length.
    Both variants go through BOW -> TF-IDF -> LinearSVC.

    Prints both test accuracies, the average training sequence lengths and
    the two vocabulary sizes.
    """
    texts, labels = load_binary_text_dataset()

    train_texts, test_texts, y_train, y_test = train_test_split(
        texts,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )

    print(f"\nDataset sizes: train={len(train_texts)}, test={len(test_texts)}")

    def bow_tfidf_svc_accuracy(train_seqs: List[List[int]], test_seqs: List[List[int]], width: int) -> float:
        """Run BOW -> TF-IDF -> LinearSVC and return the test accuracy."""
        train_counts = seqs_to_bow(train_seqs, width)
        test_counts = seqs_to_bow(test_seqs, width)
        weighting = TfidfTransformer()
        train_matrix = weighting.fit_transform(train_counts)
        test_matrix = weighting.transform(test_counts)
        classifier = LinearSVC()
        classifier.fit(train_matrix, y_train)
        return accuracy_score(y_test, classifier.predict(test_matrix))

    def mean_length(seqs: List[List[int]]) -> float:
        """Average number of tokens per sequence."""
        return sum(map(len, seqs)) / len(seqs)

    # -------- Baseline --------
    vocab = baseline_word_tokenizer(train_texts, min_freq=3)
    train_seqs_base = prepare_dataset_baseline(train_texts, vocab)
    test_seqs_base = prepare_dataset_baseline(test_texts, vocab)
    baseline_vocab_size = len(vocab) + 1
    acc_base = bow_tfidf_svc_accuracy(train_seqs_base, test_seqs_base, baseline_vocab_size)

    # -------- Huffman word-aligned --------
    huff_tok = WordAlignedHuffmanTokenizer(train_texts)
    train_seqs_huff = prepare_dataset_huffman_word(train_texts, huff_tok)
    test_seqs_huff = prepare_dataset_huffman_word(test_texts, huff_tok)
    huffman_vocab_size = huff_tok.vocab_size
    acc_huff = bow_tfidf_svc_accuracy(train_seqs_huff, test_seqs_huff, huffman_vocab_size)

    # ----- sequence lengths -----
    avg_len_base = mean_length(train_seqs_base)
    avg_len_huff = mean_length(train_seqs_huff)

    print("\n=== Classification comparison: Baseline vs Word-aligned Huffman ===")
    print(f"Baseline accuracy: {acc_base:.4f}")
    print(f"Huffman  accuracy: {acc_huff:.4f}")
    print(f"Average seq length (baseline train): {avg_len_base:.2f}")
    print(f"Average seq length (Huffman  train): {avg_len_huff:.2f}")
    print(f"Vocab size (baseline): {baseline_vocab_size}")
    print(f"Vocab size (Huffman):  {huffman_vocab_size}")


# ======================================================
# 6. Запуск
# ======================================================

# Script entry point: run the baseline vs word-aligned Huffman comparison.
if __name__ == "__main__":
    run_classification_comparison_word_aligned()


Dataset sizes: train=1591, test=398





=== Classification comparison: Baseline vs Word-aligned Huffman ===
Baseline accuracy: 0.9347
Huffman  accuracy: 0.9372
Average seq length (baseline train): 203.61
Average seq length (Huffman  train): 203.61
Vocab size (baseline): 7168
Vocab size (Huffman):  17494


