In [28]:
from tokenizers import Tokenizer
from tokenizers import models, trainers, normalizers, pre_tokenizers, processors, decoders
from collections import Counter
import pandas as pd
import random

In [29]:
with open("data/tur_news_2024_1M/tur_news_2024_1M-sentences.txt", "r", encoding="utf-8") as f:
    tur_df = pd.DataFrame(
            [line.strip().split("\t")[1] for line in f.readlines()], columns=["sentence"]
        )

In [30]:
# function applied to dataset cleans sentences from digits and special characters
def produce_clean_sentence_list(df):
    cleaned_sentences = []
    for sentence in df["sentence"]:
        # Remove digits and special characters, keep only letters and spaces
        cleaned_sentences.append(''.join(char for char in sentence if char.isalpha() or char.isspace()))
    return cleaned_sentences

In [31]:
# clean the sentences and add as a new column
tur_df["cleaned"] = produce_clean_sentence_list(tur_df)
display(tur_df.head())

Unnamed: 0,sentence,cleaned
0,%0.1 çok düşük çok düşük bir büyüme.,çok düşük çok düşük bir büyüme
1,"""01 Kasım 2024 tarihinden itibaren geçerli olm...",Kasım tarihinden itibaren geçerli olmak üzer...
2,"""02.04.2024 günü saat 12.47’de Beşiktaş ilçesi...",günü saat de Beşiktaş ilçesi Gayrettepe Mahal...
3,"""02 Adana Yasin-Poyraz"" firmasının et dönerind...",Adana YasinPoyraz firmasının et dönerinde der...
4,"‘ 0-4 yaş arası çocuklu annelere, ‘Anne Kart’ ...",yaş arası çocuklu annelere Anne Kart uygulam...


In [32]:
sentences = tur_df["cleaned"].astype(str).tolist()

# separate into training and testing sets
random.seed(42)
random.shuffle(sentences)
split_index = int(0.9 * len(sentences))
train_sentences = sentences[:split_index]
test_sentences = sentences[split_index:]

In [33]:
# setup tokenizer
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]", byte_fallback=True))
tokenizer.normalizer = normalizers.Sequence([
    normalizers.NFD(),
    normalizers.Lowercase()
])
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(
    vocab_size=50_000,
    min_frequency=2,
    special_tokens=special_tokens,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
)

In [34]:
# tokenizer training
tokenizer.train_from_iterator(train_sentences, trainer=trainer)

# runtime options
tokenizer.enable_padding(
    pad_id=tokenizer.token_to_id("[PAD]"),
    pad_token="[PAD]"
)
tokenizer.enable_truncation(max_length=64)






In [52]:
def tokens_per_word_from_encoding(enc):
    word_ids = None
    #if hasattr(enc, "words") and enc.words is not None:
        #word_ids = enc.words
    if hasattr(enc, "ids"):
        try:
            word_ids = enc.ids  # property
        except TypeError:
            word_ids = enc.ids()  # method

    if word_ids:
        # Filter out specials (None / -1 depending on impl)
        valid_ids = [w for w in word_ids if w is not None and w != -1]
        if not valid_ids:
            return 0.0
        # Count tokens per word index
        per_word_counts = Counter(valid_ids)
        return sum(per_word_counts.values()) / len(per_word_counts)

    # approximate words via whitespace
    text = enc.normalized_str if hasattr(enc, "normalized_str") else None
    if text is None and hasattr(enc, "tokens"):
        text = " ".join(enc.tokens)
    if text is None:
        return 0.0
    approx_words = [w for w in text.strip().split() if w]
    return (len(enc.ids) / len(approx_words)) if approx_words else 0.0

In [53]:
encoded_tests = [tokenizer.encode(sentence) for sentence in test_sentences]
total_tokens = sum(len(enc.ids) for enc in encoded_tests)
average_token_length = total_tokens / len(encoded_tests)
print(f"Average token length in test set: {average_token_length:.2f}")

# Token frequency distribution
token_freq = {}
for enc in encoded_tests:
    for token_id in enc.ids:
        token_freq[token_id] = token_freq.get(token_id, 0) + 1

# Sort by frequency
sorted_token_freq = sorted(token_freq.items(), key=lambda x: x[1], reverse=True)
print("Top 10 most frequent tokens (ID: Frequency):")
for token_id, freq in sorted_token_freq[:10]:
    print(f"{token_id}: {freq}, ({tokenizer.id_to_token(token_id)})")

# Number of tokens per word
tokens_per_word_list = [tokens_per_word_from_encoding(enc) for enc in encoded_tests]
average_tokens_per_word = sum(tokens_per_word_list) / len(tokens_per_word_list)
print(f"Average number of tokens per word in test set: {average_tokens_per_word:.2f}")

Average token length in test set: 25.89
Top 10 most frequent tokens (ID: Frequency):
262: 232111, (Ì§)
263: 226239, (ÌĪ)
273: 93975, (ÌĨ)
305: 33962, (Ġve)
225: 33079, (Ġ)
296: 31612, (Ġc)
279: 25564, (Ġo)
311: 23210, (Ġbir)
307: 21840, (Ġbu)
343: 17823, (Ìĩ)
Average number of tokens per word in test set: 1.17
