In [5]:
import pandas as pd
from collections import defaultdict
from transformers import AutoTokenizer

In [6]:
# Read the cleaned CSV into a DataFrame; its 'English' and 'Persian'
# columns are consumed by the following cells.
file_path = "cleaned_infopankki-fa.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [7]:
# Extract the text of the English and Persian columns.
# Rows are dropped only when EITHER translation is missing, and both lists are
# taken from the same filtered frame, so english_corpus[i] and
# persian_corpus[i] always come from the same CSV row. (Calling dropna() on
# each column separately would shift one list relative to the other as soon
# as a single cell is empty.)
parallel_df = df.dropna(subset=['English', 'Persian'])

# Cast to str: the pre-tokenizer only accepts strings, and pandas may have
# parsed purely numeric cells as numbers.
english_corpus = parallel_df['English'].astype(str).tolist()
persian_corpus = parallel_df['Persian'].astype(str).tolist()

# Pre-tokenizer: load the GPT-2 tokenizer; only its pre-tokenization step is
# used by the BPE functions below.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [8]:
# Function to compute word frequencies for BPE
def compute_word_frequencies(corpus):
    """Count how often each pre-tokenized word occurs in ``corpus``.

    Every text is split with the pre-tokenizer of the module-level
    ``tokenizer``; the offsets it reports are discarded.

    Args:
        corpus: Iterable of strings.

    Returns:
        A ``defaultdict(int)`` mapping each word to its number of occurrences.
    """
    counts = defaultdict(int)
    for sentence in corpus:
        pre_tokenized = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(sentence)
        for token, _span in pre_tokenized:
            counts[token] += 1
    return counts


# Function to compute base vocabulary (characters)
def compute_alphabet(word_freqs):
    """Return the sorted list of distinct characters used by the words.

    Args:
        word_freqs: Mapping whose keys are the words (strings); the values
            are not used.

    Returns:
        A sorted list with one entry per distinct character.
    """
    # A set gives constant-time de-duplication; the previous implementation
    # scanned a growing list for every character.
    unique_chars = {letter for word in word_freqs.keys() for letter in word}
    return sorted(unique_chars)


# Function to compute frequency of pairs
def compute_pair_freqs(splits, word_freqs):
    """Sum, over all words, the frequency of every adjacent symbol pair.

    Args:
        splits: Mapping from word to its current list of symbols.
        word_freqs: Mapping from word to its frequency.

    Returns:
        A ``defaultdict(int)`` mapping ``(left, right)`` tuples to the total
        frequency of the words in which that pair occurs, counted once per
        occurrence.
    """
    totals = defaultdict(int)
    for word, weight in word_freqs.items():
        symbols = splits[word]
        # Pairing the sequence with itself shifted by one walks every
        # adjacent pair; it yields nothing for words of fewer than 2 symbols.
        for adjacent in zip(symbols, symbols[1:]):
            totals[adjacent] += weight
    return totals


# Function to merge pairs in the splits
def merge_pair(a, b, splits, word_freqs):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2:]
            else:
                i += 1
        splits[word] = split
    return splits


# Function to perform BPE tokenization
def bpe_tokenization(corpus, vocab_size=50):
    """Learn BPE merges on ``corpus`` and tokenize the same corpus with them.

    The vocabulary starts as one special-token slot plus every distinct
    character of the pre-tokenized words. The most frequent adjacent symbol
    pair is then merged repeatedly until the vocabulary holds ``vocab_size``
    entries or no pair is left to merge. If the initial vocabulary already
    has ``vocab_size`` entries, no merges are learned and every text is
    tokenized into single characters.

    Args:
        corpus: Re-iterable collection of strings (it is traversed once for
            training and once for tokenizing).
        vocab_size: Target vocabulary size, counting the special-token slot
            and the base characters.

    Returns:
        A list with one entry per text, each a flat list of token strings.
    """
    # Compute word frequencies
    word_freqs = compute_word_frequencies(corpus)

    # Compute base vocabulary (characters)
    alphabet = compute_alphabet(word_freqs)

    # Initialize the vocabulary with special tokens and alphabet.
    # NOTE(review): the special token is an empty string here; presumably a
    # marker such as an end-of-text token was intended -- confirm.
    vocab = [""] + alphabet.copy()

    # Split each word into individual characters
    splits = {word: list(word) for word in word_freqs.keys()}

    # BPE training: merge pairs until reaching desired vocab size.
    # Insertion order of `merges` is the order in which merges were learned
    # and is the order in which they are replayed in `tokenize`.
    merges = {}

    while len(vocab) < vocab_size:
        pair_freqs = compute_pair_freqs(splits, word_freqs)
        if not pair_freqs:
            # Every word is already a single symbol (or the corpus is empty):
            # nothing is left to merge, so stop instead of unpacking an empty
            # "best pair", which used to raise a TypeError.
            break
        # max() returns the first pair reaching the highest count, i.e. the
        # same tie-break as a manual scan using a strict comparison.
        best_pair = max(pair_freqs, key=pair_freqs.get)
        merged_symbol = best_pair[0] + best_pair[1]
        splits = merge_pair(*best_pair, splits, word_freqs)
        merges[best_pair] = merged_symbol
        vocab.append(merged_symbol)

    # Tokenization function to apply learned merges
    def tokenize(text):
        """Pre-tokenize ``text`` and replay the learned merges on each word."""
        pre_tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
        word_splits = [list(word) for word, _offset in pre_tokenize_result]
        for (left, right), merged_symbol in merges.items():
            for idx, split in enumerate(word_splits):
                i = 0
                while i < len(split) - 1:
                    if split[i] == left and split[i + 1] == right:
                        split = split[:i] + [merged_symbol] + split[i + 2:]
                    else:
                        i += 1
                word_splits[idx] = split

        # Flatten in linear time (sum(list_of_lists, []) copies repeatedly).
        return [token for split in word_splits for token in split]

    # Apply the tokenization to the entire corpus
    tokenized_corpus = [tokenize(text) for text in corpus]
    return tokenized_corpus

In [9]:
# Train and apply BPE independently for each language
english_tokenized = bpe_tokenization(english_corpus)
persian_tokenized = bpe_tokenization(persian_corpus)

# One row per text: the tokens of a text are joined with single spaces
english_df = pd.DataFrame({'English_Tokenized': list(map(' '.join, english_tokenized))})
persian_df = pd.DataFrame({'Persian_Tokenized': list(map(' '.join, persian_tokenized))})

# Destination CSV files, one per language
english_output_file_path_csv = "tokenized_english_bpe.csv"
persian_output_file_path_csv = "tokenized_persian_bpe.csv"

# Write the results without the DataFrame index column
english_df.to_csv(english_output_file_path_csv, index=False)
persian_df.to_csv(persian_output_file_path_csv, index=False)

# # Save the tokenized results to separate EXCEL files
# english_output_file_path_xlsx = "tokenized_english_bpe.xlsx"
# persian_output_file_xlsx = "tokenized_persian_bpe.xlsx"

# english_df.to_excel(english_output_file_path_xlsx, index=False)
# persian_df.to_excel(persian_output_file_xlsx, index=False)