In [1]:
import spacy
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# url path
# dataset = '../data-sets/eng-nld.txt' # needs usecols[0, 1]
dataset = '../data-sets/Zinparen in Engels-Nederlands - 2024-06-07.tsv' # needs usecols[1, 3]

# Start and end tokens
sos_token = '<sos>'
eos_token = '<eos>'


### Reading data set ###
ddf = dd.read_csv(
    dataset,
    sep='\t',
    header=None,
    usecols=[1, 3],
    names=["ENG", "NLD"],
    dtype="string",
    keep_default_na=False)

print('Number of partitions: ' + str(ddf.npartitions))
print('Dataframe info:')
print(ddf.info(memory_usage=True))

# ddf = dd.from_pandas(ddf.head(1000)) # Debug


### Processing data set ###
# spacy.cli.download("en_core_web_sm")
# spacy.cli.download("nl_core_news_sm")

nlp_en = spacy.load("en_core_web_sm")
nlp_nl = spacy.load("nl_core_news_sm")

def tokenize_spacy(sentence, language):
    if language == 'ENG':
        tokens = [token.text for token in nlp_en(sentence)]
    if language == 'NLD':
        tokens = [token.text for token in nlp_nl(sentence)]
    return [sos_token] + tokens + [eos_token]

def process_row(row):
    row['ENG_TOKENS'] = tokenize_spacy(row['ENG'], 'ENG')
    row['NLD_TOKENS'] = tokenize_spacy(row['NLD'], 'NLD')
    return row

def process_partition(df):
    return df.apply(process_row, axis=1)

# Define the meta of the dataframe
meta = pd.DataFrame({
    'ENG': pd.Series(dtype='str'),
    'NLD': pd.Series(dtype='str'),
    'ENG_TOKENS': pd.Series(dtype='object'),
    'NLD_TOKENS': pd.Series(dtype='object')
})

ddf = ddf.map_partitions(process_partition, meta=meta)

with ProgressBar():
    result = ddf.compute()


# Save results to csv
result.to_csv(
    '../data-sets/Zinparen in Engels-Nederlands - 2024-06-07_tokens.csv',
    columns=['ENG_TOKENS', 'NLD_TOKENS'],
    header=False,
    index=False)


print(result)

Number of partitions: 1
Dataframe info:
<class 'dask_expr.DataFrame'>
Columns: 2 entries, ENG to NLD
dtypes: string(2)
memory usage: 24.0 MB
None
[########################################] | 100% Completed | 18m 43s
                                                      ENG  \
0                                    Let's try something.   
1                                    Let's try something.   
2                                  I have to go to sleep.   
3        Today is June 18th and it is Muiriel's birthday!   
4                                      Muiriel is 20 now.   
...                                                   ...   
155192  Cotton candy is usually sold and made at funfa...   
155193              At the moment I am looking for a job.   
155194                          The unthinkable happened.   
155195                        Let's wait until she rings.   
155196                      My mom has her hair in a bun.   

                                                   