In [1]:
import nltk
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# Fetch the Punkt tokenizer data used by nltk.word_tokenize (skipped by NLTK
# when the package is already up to date).
nltk.download('punkt_tab')

# Tab-separated sentence pairs without a header row; only columns 1 and 3
# (0-based) are kept and exposed as "en" and "nl".
dataset = '../data-sets/Zinparen in Engels-Nederlands - 2024-06-07.tsv'
df = pd.read_csv(
    dataset,
    # File layout.
    sep='\t',
    header=None,
    # Column selection and naming.
    usecols=[1, 3],
    names=["en", "nl"],
    # Keep every cell as text: without keep_default_na=False, sentences such
    # as "NA" or "null" would be parsed as missing values.
    dtype="string",
    keep_default_na=False,
)

# dataset = '../data-sets/train-00000-of-00001.parquet'
# df = pd.read_parquet(dataset)
# df = pd.json_normalize(df['translation'])

# Sentinel tokens placed at the start and end of every tokenized sentence.
sos_token, eos_token = '<sos>', '<eos>'

def tokenize(text, language):
    """Split *text* into word tokens wrapped in start/end sentinels.

    Args:
        text: The sentence to tokenize.
        language: Language name forwarded to ``nltk.word_tokenize``
            (e.g. ``'english'`` or ``'dutch'``).

    Returns:
        A list of the form ``[sos_token, <word tokens...>, eos_token]``.
    """
    words = nltk.word_tokenize(text, language=language)
    return [sos_token, *words, eos_token]

def tokenize_column(texts, lang):
    """Tokenize every sentence in *texts* with ``tokenize``.

    Args:
        texts: Iterable of sentences (e.g. a DataFrame column).
        lang: Language name passed through to ``tokenize``.

    Returns:
        A list with one token list per input sentence, in input order.
    """
    tokenized = []
    for sentence in texts:
        tokenized.append(tokenize(sentence, lang))
    return tokenized

# Tokenize both language columns concurrently. Exactly one task is submitted
# per column, so two workers are all the pool can ever use.
# NOTE(review): tokenization is presumably CPU-bound Python code, so threads
# may not run truly in parallel under the GIL -- measure before relying on it.
with ThreadPoolExecutor(max_workers=2) as ex:
    eng_future = ex.submit(tokenize_column, df['en'], 'english')
    nld_future = ex.submit(tokenize_column, df['nl'], 'dutch')
    # .result() blocks until the task finishes and re-raises any exception
    # that occurred inside the worker thread.
    df['ENG_TOKENS'] = eng_future.result()
    df['NLD_TOKENS'] = nld_future.result()

# Only the token columns are persisted; the raw sentences are dropped.
# `columns=` already identifies the axis, so no separate `axis` argument.
df.drop(columns=['en', 'nl'], inplace=True)
df.to_parquet('../data-sets/Zinparen in Engels-Nederlands - 2025-11-01.parquet', index=False)


print(df)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ocmki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


                                               ENG_TOKENS  \
0              [<sos>, Let, 's, try, something, ., <eos>]   
1              [<sos>, Let, 's, try, something, ., <eos>]   
2           [<sos>, I, have, to, go, to, sleep, ., <eos>]   
3       [<sos>, Today, is, June, 18th, and, it, is, Mu...   
4                 [<sos>, Muiriel, is, 20, now, ., <eos>]   
...                                                   ...   
155192  [<sos>, Cotton, candy, is, usually, sold, and,...   
155193  [<sos>, At, the, moment, I, am, looking, for, ...   
155194      [<sos>, The, unthinkable, happened, ., <eos>]   
155195  [<sos>, Let, 's, wait, until, she, rings, ., <...   
155196  [<sos>, My, mom, has, her, hair, in, a, bun, ....   

                                               NLD_TOKENS  
0            [<sos>, Laten, we, iets, proberen, !, <eos>]  
1            [<sos>, Laat, ons, iets, proberen, ., <eos>]  
2               [<sos>, Ik, moet, gaan, slapen, ., <eos>]  
3       [<sos>, Vandaag, is