In [7]:
import os
import spacy

In [3]:
# Load the spaCy pipelines for Italian and German, downloading a model only
# when that particular model is not installed yet.
def load_spacy_model(name):
    """Return the spaCy pipeline `name`, downloading it first if it is missing.

    spacy.load raises OSError when the model package cannot be found; in that
    case the package is fetched with spacy.cli.download and loaded again.
    Handling each model separately avoids re-downloading a model that is
    already installed just because the other one is missing.
    """
    try:
        return spacy.load(name)
    except OSError:
        spacy.cli.download(name)
        return spacy.load(name)


nlp_it = load_spacy_model('it_core_news_sm')
nlp_de = load_spacy_model('de_core_news_sm')


Collecting it-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.5.0/it_core_news_sm-3.5.0-py3-none-any.whl (13.0 MB)
[2K     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.0/13.0 MB 12.7 MB/s eta 0:00:00
Installing collected packages: it-core-news-sm
Successfully installed it-core-news-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')
Collecting de-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.5.0/de_core_news_sm-3.5.0-py3-none-any.whl (14.6 MB)
[2K     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.6/14.6 MB 13.3 MB/s eta 0:00:00
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [10]:
# Load data for Italian and German.
# The data has already been split into train, test and dev.
# The files are found in data/raw and are named in the following way:
# {set}.de-it.{lang}
# We use the reduced datasets (100000 sentences) for training; those files
# have .reduced at the end, so the full-size training files are left out.
full_train_files = {"train.de-it.de", "train.de-it.it"}
# Filter instead of calling list.remove(): remove() raises ValueError when a
# full-size file is absent (e.g. only the reduced files are present).
files = [name for name in os.listdir('data/raw') if name not in full_train_files]
print(files)

# Create the "processed" folder if it doesn't exist.
os.makedirs('data/processed', exist_ok=True)

['dev.de-it.de', 'test.de-it.it', 'train.de-it.de.reduced', 'dev.de-it.it', 'test.de-it.de', 'train.de-it.it.reduced']


In [12]:
# Tokenize every raw file with the spaCy pipeline of its language and save the
# result in data/processed, one sentence per line with space-separated tokens.

# Language code found in the file name -> spaCy pipeline for that language.
pipelines = {'de': nlp_de, 'it': nlp_it}


def language_of(filename):
    """Return the language code of a `{set}.de-it.{lang}[.reduced]` file name.

    The reduced training files end in `.reduced`, so the language is not
    always the last extension: `train.de-it.de.reduced` is German even though
    it does not end in `.de`. The optional suffix is dropped before the last
    dot-separated component is read.
    """
    suffix = '.reduced'
    stem = filename[:-len(suffix)] if filename.endswith(suffix) else filename
    return stem.rsplit('.', 1)[-1]


# Fail before any expensive work rather than silently tokenizing a file with
# the wrong language model.
unknown = [file for file in files if language_of(file) not in pipelines]
if unknown:
    raise ValueError(f"Cannot determine the language of: {unknown}")

tokenized_data = {}
for file in files:
    # Explicit encoding so umlauts and accented letters are read identically
    # on every platform (assumes the corpus is UTF-8 -- TODO confirm).
    with open('data/raw/' + file, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f]
    nlp = pipelines[language_of(file)]
    # nlp.pipe yields one Doc per input text, in order, processing in batches.
    # NOTE(review): if only the token texts are needed downstream,
    # nlp.tokenizer.pipe(lines) would skip the rest of the pipeline.
    tokenized = list(nlp.pipe(lines))
    tokenized_data[file] = tokenized
    print(f"Tokenized {file}, {len(tokenized)} lines")

# Save the tokenized data.
for filename, data in tokenized_data.items():
    with open('data/processed/' + filename + '.tokenized', 'w', encoding='utf-8') as f:
        for line in data:
            f.write(' '.join([token.text for token in line]) + '\n')
    print(f"Saved {filename}.tokenized")

Tokenized dev.de-it.de, 923 lines
Tokenized test.de-it.it, 1567 lines
Tokenized train.de-it.de.reduced, 100000 lines
Tokenized dev.de-it.it, 923 lines
Tokenized test.de-it.de, 1567 lines
Tokenized train.de-it.it.reduced, 100000 lines
Saved dev.de-it.de.tokenized
Saved test.de-it.it.tokenized
Saved train.de-it.de.reduced.tokenized
Saved dev.de-it.it.tokenized
Saved test.de-it.de.tokenized
Saved train.de-it.it.reduced.tokenized
