# Imports

In [3]:
import sys
sys.path.append("../..")

from src.preprocessing.dictionary import Dictionary
from src.pickle_loader import PickleLoader
from src.data_loader import DataLoader
from src.utils.byte_pair_encoding import BytePairEncoder

# Loading data

In [4]:
# One DataLoader per side of the (.gz) training corpus: the EN loader reads
# the target file, the DE loader reads the source file.
en_data_loader, de_data_loader = (
    DataLoader("../../data/unpreprocessed/train/target.train.gz"),
    DataLoader("../../data/unpreprocessed/train/source.train.gz"),
)

In [5]:
# Restore the previously pickled BPE encoders (the BPE_*_7000 files), EN first
# and DE second.
en_bpe, de_bpe = map(
    PickleLoader.load,
    (
        "../../encoder/BPE_EN_7000.pickle",
        "../../encoder/BPE_DE_7000.pickle",
    ),
)

In [6]:
# Load each training corpus and pass it straight through the encoder of the
# same language. The pairs are processed in order (EN, then DE), so the
# sequence of load/encode calls is: EN load, EN encode, DE load, DE encode.
en_encoded, de_encoded = (
    encoder.encode_corpus(loader.load_data())
    for encoder, loader in (
        (en_bpe, en_data_loader),
        (de_bpe, de_data_loader),
    )
)

In [9]:
# Create one Dictionary per language, feed each the encoded training corpus,
# then pickle both.
# NOTE(review): these are written to ../../dictionaries/, whereas the 5000
# dictionaries further down go to ../../data/dictionaries/ -- confirm the two
# locations are intentional.
en_dictionary, de_dictionary = Dictionary("EN7000"), Dictionary("DE7000")

# Update both dictionaries first ...
for vocab, corpus in ((en_dictionary, en_encoded), (de_dictionary, de_encoded)):
    vocab.update(corpus)

# ... and only then persist them, EN before DE.
for out_path, vocab in (
    ("../../dictionaries/dict_EN_7000.pkl", en_dictionary),
    ("../../dictionaries/dict_DE_7000.pkl", de_dictionary),
):
    PickleLoader.save(out_path, vocab)

{'<UNK>': 2, '<s>': 0, '</s>': 1}
{'<UNK>': 2, '<s>': 0, '</s>': 1}


# Creating and saving dictionaries

In [None]:
# Build and pickle the dictionaries labelled "5000".
# NOTE(review): the only encoders loaded in the cells visible above are the
# BPE_*_7000 pickles, so en_encoded / de_encoded may hold 7000-encoded data
# when this cell runs -- confirm the corpora were encoded with the matching
# 5000 encoders before trusting these files.
en_dictionary = Dictionary("EN_5000")
de_dictionary = Dictionary("DE_5000")

dictionary_jobs = (
    (en_dictionary, en_encoded, "../../data/dictionaries/dict_EN_5000.pkl"),
    (de_dictionary, de_encoded, "../../data/dictionaries/dict_DE_5000.pkl"),
)

# First pass: populate both dictionaries from their encoded corpora.
for vocab, corpus, _ in dictionary_jobs:
    vocab.update(corpus)

# Second pass: write each populated dictionary to disk.
for vocab, _, out_path in dictionary_jobs:
    PickleLoader.save(out_path, vocab)

{'<UNK>': 2, '<s>': 0, '</s>': 1}
{'<UNK>': 2, '<s>': 0, '</s>': 1}


# Creating and saving indexed data

In [None]:
# Run each encoded corpus through its dictionary's apply_mapping (presumably
# token -> index; verify against Dictionary) and pickle the results.
# NOTE(review): the output names say 5000_BPE; this relies on en_dictionary /
# de_dictionary and en_encoded / de_encoded being the 5000 variants at the
# time the cell runs -- confirm, since the visible cells load 7000 encoders.
en_indexed, de_indexed = (
    en_dictionary.apply_mapping(en_encoded),
    de_dictionary.apply_mapping(de_encoded),
)

for out_path, indexed_corpus in (
    ("../../data/data_v2/multi30k.en.5000_BPE.indexed.pickle", en_indexed),
    ("../../data/data_v2/multi30k.de.5000_BPE.indexed.pickle", de_indexed),
):
    PickleLoader.save(out_path, indexed_corpus)

## 7k BPE indexing of the dev set for the first model

In [20]:
# Re-point the loaders at the dev split (this rebinds the names that held the
# training loaders): EN reads the target file, DE reads the source file.
en_data_loader, de_data_loader = map(
    DataLoader,
    (
        "../../data/unpreprocessed/dev/target.dev",
        "../../data/unpreprocessed/dev/source.dev",
    ),
)

In [21]:
# Pull the dev data out of both loaders, EN first.
en_data, de_data = (
    loader.load_data() for loader in (en_data_loader, de_data_loader)
)

In [22]:
# Load the pickled BPE_*_7000 encoders for this section under their own names
# (en_encoder / de_encoder), independent of any encoders loaded earlier.
en_encoder, de_encoder = (
    PickleLoader.load("../../encoder/BPE_EN_7000.pickle"),
    PickleLoader.load("../../encoder/BPE_DE_7000.pickle"),
)

In [23]:
# Encode the dev data with the 7000 encoders.
# Note: this rebinds en_encoded / de_encoded, so from here on they hold the
# encoded dev data rather than the encoded training data.
en_encoded, de_encoded = (
    encoder.encode_corpus(corpus)
    for encoder, corpus in ((en_encoder, en_data), (de_encoder, de_data))
)

In [None]:
# Reload the pickled 7000 dictionaries, replacing whatever en_dictionary /
# de_dictionary currently refer to.
en_dictionary, de_dictionary = map(
    PickleLoader.load,
    (
        "../../dictionaries/dict_EN_7000.pkl",
        "../../dictionaries/dict_DE_7000.pkl",
    ),
)

In [None]:
# Write the current en_dictionary / de_dictionary back to the 7000 pickle
# files.
# NOTE(review): run directly after the load cell this re-saves what was just
# read. Run out of order (e.g. while the names still hold the "5000"
# dictionaries) it would overwrite the 7000 files with different content --
# confirm this cell is still needed.
for out_path, vocab in (
    ("../../dictionaries/dict_EN_7000.pkl", en_dictionary),
    ("../../dictionaries/dict_DE_7000.pkl", de_dictionary),
):
    PickleLoader.save(out_path, vocab)

In [None]:
# Apply each language's dictionary mapping to its encoded dev data
# (EN first, then DE).
en_indexed, de_indexed = (
    vocab.apply_mapping(corpus)
    for vocab, corpus in zip(
        (en_dictionary, de_dictionary),
        (en_encoded, de_encoded),
    )
)

In [None]:
# Persist the indexed dev data under ../../data/7k_BPE_indexed/dev/.
# NOTE(review): the inputs of this section are the dev files, yet the output
# file names contain ".train." -- looks like a copy-paste leftover. Paths are
# left unchanged here because downstream readers may depend on them; confirm
# and rename consistently if so.
indexed_outputs = {
    "../../data/7k_BPE_indexed/dev/target.train.pickle": en_indexed,
    "../../data/7k_BPE_indexed/dev/source.train.pickle": de_indexed,
}
for out_path, indexed_corpus in indexed_outputs.items():
    PickleLoader.save(out_path, indexed_corpus)