In [26]:
import sys
sys.path.append('..')

from src.data_loader import DataLoader
from src.pickle_loader import PickleLoader
from src.utils.byte_pair_encoding import BytePairEncoder
from src.preprocessing.dictionary import Dictionary

# Tag shared by every artifact this notebook reads or writes: it is interpolated
# into the BPE encoder pickle names, the Dictionary names and pickle paths, and
# (as `OPS//1000` followed by "k") the indexed-data output directories.
# Presumably the number of BPE merge operations the encoders were trained
# with — TODO confirm against the encoder-training code.
OPS = 1000

In [27]:
# Corpus readers over the files in the `unpreprocessed` data directory.
# "source" files back the de_* names, "target" files back the en_* names.

# Training split (.gz files).
de_train_data_loader, en_train_data_loader = (
    DataLoader('../data/unpreprocessed/train/source.train.gz'),
    DataLoader('../data/unpreprocessed/train/target.train.gz'),
)

# Development split (files without a .gz suffix).
de_dev_data_loader, en_dev_data_loader = (
    DataLoader('../data/unpreprocessed/dev/source.dev'),
    DataLoader('../data/unpreprocessed/dev/target.dev'),
)

In [28]:
# Restore the three stored BPE encoders for the current OPS value:
# the joint one first, then the EN-only and DE-only ones.
# NOTE(review): PickleLoader presumably unpickles these files — only load
# encoder artifacts that were produced by this project.
joined_bpe, en_bpe, de_bpe = (
    PickleLoader.load(f"../encoder/BPE_JOINT_{OPS}.pickle"),
    PickleLoader.load(f"../encoder/BPE_EN_{OPS}.pickle"),
    PickleLoader.load(f"../encoder/BPE_DE_{OPS}.pickle"),
)

In [29]:
# Encode every corpus twice: once with its language-specific encoder and once
# with the joint encoder. Dev corpora go through the same encoders as train.
# NOTE(review): each loader's load_data() is called once per encoder, i.e.
# twice overall; whether its result could be shared is not visible from here.

# --- language-specific encoders ---
en_train_encoded, de_train_encoded = (
    en_bpe.encode_corpus(en_train_data_loader.load_data()),
    de_bpe.encode_corpus(de_train_data_loader.load_data()),
)
en_dev_encoded, de_dev_encoded = (
    en_bpe.encode_corpus(en_dev_data_loader.load_data()),
    de_bpe.encode_corpus(de_dev_data_loader.load_data()),
)

# --- joint encoder, applied to both languages ---
en_train_encoded_joined, de_train_encoded_joined = (
    joined_bpe.encode_corpus(en_train_data_loader.load_data()),
    joined_bpe.encode_corpus(de_train_data_loader.load_data()),
)
en_dev_encoded_joined, de_dev_encoded_joined = (
    joined_bpe.encode_corpus(en_dev_data_loader.load_data()),
    joined_bpe.encode_corpus(de_dev_data_loader.load_data()),
)

In [30]:
# Phase 1 — create one Dictionary per (encoding scheme, language) pair.
en_dict, de_dict = Dictionary(f"EN{OPS}"), Dictionary(f"DE{OPS}")
en_joined_dict, de_joined_dict = (
    Dictionary(f"JOINED_EN{OPS}"),
    Dictionary(f"JOINED_DE{OPS}"),
)

# Phase 2 — feed each dictionary its matching encoded *training* corpus
# (dev corpora are not used to update the dictionaries).
for _vocab, _corpus in (
    (en_dict, en_train_encoded),
    (de_dict, de_train_encoded),
    (en_joined_dict, en_train_encoded_joined),
    (de_joined_dict, de_train_encoded_joined),
):
    _vocab.update(_corpus)

# Phase 3 — persist all four dictionaries, in the same order as above.
for _dict_path, _vocab in (
    (f"../dictionaries/dict_EN_{OPS}.pkl", en_dict),
    (f"../dictionaries/dict_DE_{OPS}.pkl", de_dict),
    (f"../dictionaries/dict_JOINED_EN_{OPS}.pkl", en_joined_dict),
    (f"../dictionaries/dict_JOINED_DE_{OPS}.pkl", de_joined_dict),
):
    PickleLoader.save(_dict_path, _vocab)

{'<UNK>': 2, '<s>': 0, '</s>': 1}
{'<UNK>': 2, '<s>': 0, '</s>': 1}
{'<UNK>': 2, '<s>': 0, '</s>': 1}
{'<UNK>': 2, '<s>': 0, '</s>': 1}


In [31]:
# Run every encoded corpus through the dictionary of its language and encoding
# scheme; dev corpora reuse the same dictionaries as the training corpora.

# Language-specific BPE.
en_train_indexed, de_train_indexed = (
    en_dict.apply_mapping(en_train_encoded),
    de_dict.apply_mapping(de_train_encoded),
)
en_dev_indexed, de_dev_indexed = (
    en_dict.apply_mapping(en_dev_encoded),
    de_dict.apply_mapping(de_dev_encoded),
)

# Joint BPE.
en_train_indexed_joined, de_train_indexed_joined = (
    en_joined_dict.apply_mapping(en_train_encoded_joined),
    de_joined_dict.apply_mapping(de_train_encoded_joined),
)
en_dev_indexed_joined, de_dev_indexed_joined = (
    en_joined_dict.apply_mapping(en_dev_encoded_joined),
    de_joined_dict.apply_mapping(de_dev_encoded_joined),
)

# Persist the indexed corpora; "source" files receive the de_* data and
# "target" files the en_* data.
# NOTE: the directory prefix uses integer division (OPS//1000), so OPS values
# that are not multiples of 1000 are truncated (e.g. 1500 -> "1k", 500 -> "0k").
for _out_path, _indexed in (
    (f"../data/{OPS//1000}k_BPE_indexed/train/source.train.pickle", de_train_indexed),
    (f"../data/{OPS//1000}k_BPE_indexed/train/target.train.pickle", en_train_indexed),
    (f"../data/{OPS//1000}k_BPE_indexed/dev/source.dev.pickle", de_dev_indexed),
    (f"../data/{OPS//1000}k_BPE_indexed/dev/target.dev.pickle", en_dev_indexed),
    (f"../data/{OPS//1000}k_joined_BPE_indexed/train/source.train.pickle", de_train_indexed_joined),
    (f"../data/{OPS//1000}k_joined_BPE_indexed/train/target.train.pickle", en_train_indexed_joined),
    (f"../data/{OPS//1000}k_joined_BPE_indexed/dev/source.dev.pickle", de_dev_indexed_joined),
    (f"../data/{OPS//1000}k_joined_BPE_indexed/dev/target.dev.pickle", en_dev_indexed_joined),
):
    PickleLoader.save(_out_path, _indexed)
