# Imports and Macros

In [2]:
import sys
sys.path.append('../..')

from src.data_loader import DataLoader
from src.pickle_loader import PickleLoader

from src.utils.byte_pair_encoding import BytePairEncoder
from src.preprocessing.dictionary import Dictionary
from src.preprocessing.batching import Batcher

from multiprocessing import Pool

In [3]:
# Tunable settings, listed in the order the pipeline consumes them.
BPE_OPERATIONS = 5000  # second argument to BytePairEncoder.fit (both encoders)
BATCH_SIZE = 200       # third argument to Batcher
WINDOW_SIZE = 4        # fourth argument to Batcher

# Load data

In [4]:
# One loader per side of the parallel corpus: the ".en" archive feeds the
# target_* variables and the ".de" archive feeds the source_* variables.
target_data_loader = DataLoader('../../data/data_v2/multi30k.en.gz')
source_data_loader = DataLoader('../../data/data_v2/multi30k.de.gz')

# Keep whatever load_data() returns for each side.
# NOTE(review): neither *_data_raw name is read again in the visible cells;
# the BPE cells call load_data() a second time instead. Confirm whether these
# variables could be reused there, or whether they are dead.
target_data_raw = target_data_loader.load_data()
source_data_raw = source_data_loader.load_data()

# tokenize(mode="lines") results are what the encode_corpus cells consume.
# Presumably one entry per sentence/line - verify against DataLoader.
# NOTE(review): tokenize() may rely on state set by load_data() above; keep
# this call order unless DataLoader confirms otherwise.
target_data_lines = target_data_loader.tokenize(mode="lines")
source_data_lines = source_data_loader.tokenize(mode="lines")

# Byte-Pair-Encoding

In [5]:
# Fit the target-side byte-pair encoder on the ".en" loader's data, passing
# BPE_OPERATIONS as the second argument (the saved progress bar counts to 5000).
# NOTE(review): load_data() is called again here although the loading cell
# already stored its result in target_data_raw - confirm whether that value
# can be passed instead to avoid a second load.
target_encoder = BytePairEncoder()
target_encoder.fit(target_data_loader.load_data(), BPE_OPERATIONS)

Processing Text: 100%|██████████| 5000/5000 [00:24<00:00, 206.14it/s, New Token=advice, New Rule=ad vice -> advice]                           


In [6]:
# Fit the source-side byte-pair encoder on the ".de" loader's data, passing
# BPE_OPERATIONS as the second argument (the saved progress bar counts to 5000).
# NOTE(review): load_data() is called again here although the loading cell
# already stored its result in source_data_raw - confirm whether that value
# can be passed instead to avoid a second load.
source_encoder = BytePairEncoder()
source_encoder.fit(source_data_loader.load_data(), BPE_OPERATIONS)

Processing Text: 100%|██████████| 5000/5000 [00:59<00:00, 84.50it/s, New Token=wickeln, New Rule=wickel n -> wickeln]                                       


In [7]:
# Encode the tokenized target lines with the fitted target encoder.
# multi_process=True is forwarded to encode_corpus; presumably it spreads the
# work over several processes - confirm in BytePairEncoder.
target_data_encoded = target_encoder.encode_corpus(target_data_lines, multi_process=True)

In [8]:
# Encode the tokenized source lines with the fitted source encoder.
# multi_process=True is forwarded to encode_corpus; presumably it spreads the
# work over several processes - confirm in BytePairEncoder.
# NOTE(review): the output saved under this cell (token lists wrapped in
# '<s>' / '</s>', with '@@' on split subwords) is not produced by any statement
# in the cell as written - it looks left over from an earlier revision that
# printed the result. Re-run and clear the output before sharing.
source_data_encoded = source_encoder.encode_corpus(source_data_lines, multi_process=True)

[['<s>', 'zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.', '</s>'], ['<s>', 'mehrere', 'männer', 'mit', 'schutzhelmen', 'bedienen', 'ein', 'antrie@@', 'b@@', 's@@', 'radsystem', '.', '</s>'], ['<s>', 'ein', 'kleines', 'mädchen', 'klettert', 'in', 'ein', 'spielhaus', 'aus', 'holz', '.', '</s>'], ['<s>', 'ein', 'mann', 'in', 'einem', 'blauen', 'hemd', 'steht', 'auf', 'einer', 'leiter', 'und', 'putzt', 'ein', 'fenster', '.', '</s>'], ['<s>', 'zwei', 'männer', 'stehen', 'am', 'herd', 'und', 'bereiten', 'essen', 'zu', '.', '</s>'], ['<s>', 'ein', 'mann', 'in', 'grün', 'hält', 'eine', 'gitarre', ',', 'während', 'der', 'andere', 'mann', 'sein', 'hemd', 'ansieht', '.', '</s>'], ['<s>', 'ein', 'mann', 'lächelt', 'einen', 'ausgestopften', 'löwen', 'an', '.', '</s>'], ['<s>', 'ein', 'schickes', 'mädchen', 'spricht', 'mit', 'dem', 'handy', 'während', 'sie', 'langsam', 'die', 'straße', 'entlangschwebt', '.', '</s>'], ['<s>', 'eine', 'frau', 'm

# Dictionary

In [11]:
# Create the target dictionary and feed it the BPE-encoded target corpus.
# save_model=False presumably disables persisting the dictionary - confirm
# in Dictionary.
# NOTE(review): the dict printed under this cell is not produced by any
# statement here, and the execution counter jumps from 8 to 11 - the saved
# state likely comes from an out-of-order or edited run. Verify with
# Restart Kernel -> Run All.
target_dictionary = Dictionary(save_model=False)
target_dictionary.update(target_data_encoded)

{'<UNK>': -1, '<s>': 0, '</s>': 1}


In [16]:
# Create the source dictionary and feed it the BPE-encoded source corpus.
# save_model=False presumably disables persisting the dictionary - confirm
# in Dictionary.
# NOTE(review): the dict and dict_keys dump saved under this cell are not
# produced by any statement here, and the execution counter jumps from 11 to
# 16 - the saved state likely comes from an out-of-order or edited run.
# Verify with Restart Kernel -> Run All and trim the output.
source_dictionary = Dictionary(save_model=False)
source_dictionary.update(source_data_encoded)

{'<UNK>': -1, '<s>': 0, '</s>': 1}
dict_keys(['<UNK>', '<s>', '</s>', 'lederweste', 'karikatur@@', 'lauschen', 'on@@', 'ordnet', 'herumgehende', 'mahl', 'fakt', 'flossen', 'früchtestand', 'bination', 'nung', 'chevrolet', 'var@@', 'pilze', 'gelen@@', 'anblick', 'verkaufsautomaten', 'kleid', 'fahrradreifen', 'kopfüber', 'drähten', 'hmem', 'legosteinen', 'catering-@@', 'messern', 'walter@@', 'marineblaue', 'kopfsteinen', 'dribbelt', 'rudern@@', 'skalen@@', 'sicherheitsausrüstung', 'suche', 'nägel', 'designer', 'ruht', 'protestiert', 'bon@@', 'auslagenfenster', 'abgewandte', 'afroamerikanerin', 'fläche', 'strandes', 'abends', 'anzügen', 'anstrich', 'zweig', 'menschen', 'redaktion', 'leb@@', 'kannt', 'steindenkmal', 'grünanlagen', 'sarbeiter', 'skateboard', 'abgestimmten', 'händler', 'spitzen', 'dekorierte', 'förmlicher', 'langär@@', 'footballticket', 'mungs@@', 'wasch@@', 'radsportler', 'soldaten', 'spielsteuerung', 'business-kleidung', 'gemachte', 'paddel', 'kulti@@', 'ablauf@@', 'schmutz

In [17]:
# Run the encoded target corpus through the target dictionary's mapping.
# Presumably this replaces each token with its dictionary index - confirm
# in Dictionary.apply_mapping.
target_data_indexed = target_dictionary.apply_mapping(target_data_encoded)

In [18]:
# Run the encoded source corpus through the source dictionary's mapping.
# Presumably this replaces each token with its dictionary index - confirm
# in Dictionary.apply_mapping.
source_data_indexed = source_dictionary.apply_mapping(source_data_encoded)

# Batching

In [None]:
# Initialize the batcher with the indexed source corpus, the indexed target
# corpus, BATCH_SIZE and WINDOW_SIZE (in that positional order).
# NOTE(review): this cell has no execution count in the saved notebook, so
# the batching section has not been run in this state.
batcher = Batcher(source_data_indexed, target_data_indexed, BATCH_SIZE, WINDOW_SIZE)

In [None]:
# Create the batches; the return value (if any) is not captured here, so the
# result is presumably stored on the batcher itself - confirm in Batcher.batch.
batcher.batch()

In [None]:
# Fetch the batches from the batcher; presumably those built by batcher.batch()
# in the previous cell - confirm in Batcher.getBatches.
batches = batcher.getBatches()