In [1]:
import numpy as np
import os
from Tokenizer import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
def split_data(file, save_unpreprocessed_targets=False):
    """
    Read a tab-separated parallel corpus and split it into two lists.

    Every non-empty line of the file has to consist of the input sentence and
    the target sentence separated by exactly one tab. Empty lines (for example
    the one produced by a trailing newline at the end of the file) are skipped.

    :param file: path of the UTF-8 encoded file which should be read
    :param save_unpreprocessed_targets: kept for backward compatibility only;
        it does not influence the returned value
    :return: (input_texts, target_texts), two lists of equal length
    :raises ValueError: if a non-empty line does not contain exactly two
        tab-separated fields
    """
    input_texts = []
    target_texts = []
    # The context manager closes the handle even if a malformed line raises.
    with open(file, encoding='UTF-8') as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            line = raw_line.rstrip('\n')
            if not line:
                # A newline-terminated file would otherwise yield an empty
                # record that cannot be unpacked into two fields.
                continue
            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    'Line %d of %s has %d tab-separated fields, expected 2'
                    % (line_number, file, len(fields)))
            input_texts.append(fields[0])
            target_texts.append(fields[1])
    return input_texts, target_texts

In [8]:
def _encode_and_pad(tokenizer, texts, params):
    """Convert texts to index sequences with a fitted tokenizer and pad them to MAX_SEQ_LEN."""
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=params['MAX_SEQ_LEN'],
                           padding='post',
                           truncating='post')
    # Works in place on `padded`; see insert_valid_token_at_last_position.
    insert_valid_token_at_last_position(padded, params)
    return padded


def _load_embeddings_index(filename):
    """Parse a whitespace-separated embedding text file into a {word: float32 vector} dict."""
    embeddings_index = {}
    with open(filename, 'r', encoding='utf8') as f:
        # Iterate the handle directly so the file is never held in memory as a
        # list of lines in addition to the parsed vectors.
        for line in f:
            values = line.split()
            if not values:
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index


def create_vocab(train_input_texts, train_target_texts, params, val_input_texts, val_target_texts,
                 glove_file="/data/wrapper/PA_BA/DataSets/glove.6B.200d.txt"):
    """
    Tokenize and pad the training/validation texts and build the input embedding matrix.

    One tokenizer is fitted on the training input texts and one on the training
    target texts; the validation texts are encoded with the tokenizer of their
    side without refitting. Relies on the notebook globals START_TOKEN,
    END_TOKEN, UNK_TOKEN and their *_VECTOR counterparts.

    :param train_input_texts: training sentences of the input language
    :param train_target_texts: training sentences of the target language
    :param params: dict providing 'MAX_WORDS_EN', 'MAX_WORDS_DE', 'MAX_SEQ_LEN'
        and 'EMBEDDING_DIM'
    :param val_input_texts: validation sentences of the input language
    :param val_target_texts: validation sentences of the target language
    :param glove_file: path of the pre-trained embedding text file; the default
        is the location that was previously hard-coded
    :return: (train_input_texts, train_target_texts, en_word_index, de_word_index,
        en_embedding_matrix, val_input_texts, val_target_texts) where the four
        text entries are the padded index sequences and en_embedding_matrix has
        shape (MAX_WORDS_EN + 3, EMBEDDING_DIM)
    """
    en_tokenizer = Tokenizer(START_TOKEN, END_TOKEN, UNK_TOKEN,
                             num_words=params['MAX_WORDS_EN'])
    en_tokenizer.fit_on_texts(train_input_texts)
    train_input_texts = _encode_and_pad(en_tokenizer, train_input_texts, params)
    val_input_texts = _encode_and_pad(en_tokenizer, val_input_texts, params)
    en_word_index = en_tokenizer.word_index

    de_tokenizer = Tokenizer(START_TOKEN, END_TOKEN, UNK_TOKEN,
                             num_words=params['MAX_WORDS_DE'])
    de_tokenizer.fit_on_texts(train_target_texts)
    train_target_texts = _encode_and_pad(de_tokenizer, train_target_texts, params)
    val_target_texts = _encode_and_pad(de_tokenizer, val_target_texts, params)
    de_word_index = de_tokenizer.word_index

    embeddings_index = _load_embeddings_index(glove_file)
    print('Found %s word vectors.' % len(embeddings_index))

    # NOTE(review): the "+ 3" presumably reserves rows for the three special
    # tokens on top of the MAX_WORDS_EN vocabulary - confirm against Tokenizer.
    num_train_words = params['MAX_WORDS_EN'] + 3
    en_embedding_matrix = np.zeros((num_train_words, params['EMBEDDING_DIM']))
    for word, i in en_word_index.items():
        if i >= num_train_words:
            # Index lies outside the matrix; rows never written stay all-zero.
            continue
        if word == START_TOKEN:
            embedding_vector = START_TOKEN_VECTOR
        elif word == END_TOKEN:
            embedding_vector = END_TOKEN_VECTOR
        elif word == UNK_TOKEN:
            embedding_vector = UNK_TOKEN_VECTOR
        else:
            embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            # Words without a pre-trained vector share the unknown-token vector.
            embedding_vector = UNK_TOKEN_VECTOR
        en_embedding_matrix[i] = embedding_vector
    return train_input_texts, train_target_texts, en_word_index, de_word_index, en_embedding_matrix, val_input_texts, val_target_texts

In [9]:
def insert_valid_token_at_last_position(texts, params, pad_index=0, end_index=2):
    """
    Force the last position of every sequence to hold either padding or the end marker.

    For each sequence the element at position MAX_SEQ_LEN - 1 is inspected; if it
    is neither ``pad_index`` nor ``end_index`` it is overwritten with
    ``end_index``. The sequences are modified in place and nothing is returned.

    :param texts: iterable of mutable, indexable sequences with at least
        params['MAX_SEQ_LEN'] elements each
    :param params: dict providing 'MAX_SEQ_LEN'
    :param pad_index: value treated as padding (default 0)
    :param end_index: value written into the last position (default 2);
        NOTE(review): presumably the index the tokenizer assigns to the end
        token - confirm against Tokenizer
    :return: None
    """
    last_position = params['MAX_SEQ_LEN'] - 1
    for sent in texts:
        last_value = sent[last_position]
        if last_value != pad_index and last_value != end_index:
            sent[last_position] = end_index

In [10]:
# Preprocessing settings read by create_vocab and insert_valid_token_at_last_position.
# Training-only settings (batch_size, val_batch_size, epochs, latent_dim,
# P_DENSE_DROPOUT, VALIDATION_FREQ) are intentionally not set in this notebook.
params = {
    'MAX_SEQ_LEN': 100,
    'EMBEDDING_DIM': 200,
    'MAX_WORDS_DE': 40000,
    'MAX_WORDS_EN': 40000,
}

identifier = "my_pre_proc"

# Input corpus location and the directory the preprocessed arrays are written to.
BASE_DATA_DIR = "/data/TensorFlowTalks/neural_translation_my_pre_proc/"
BASIC_PERSISTENCE_DIR = "/data/TensorFlowTalks/neural_translation_my_pre_proc/translate/"
TRAIN_DATA_FILE, VAL_DATA_FILE = (
    os.path.join(BASE_DATA_DIR, corpus_file)
    for corpus_file in ('DE_EN_(tatoeba)_train.txt', 'DE_EN_(tatoeba)_validation.txt')
)

# Special tokens handed to the Tokenizer.
START_TOKEN, END_TOKEN, UNK_TOKEN = "_GO", "_EOS", "_UNK"

In [11]:
# Draw the special-token vectors from a seeded, local generator so that a
# Restart & Run All produces the same embedding matrix every time and the
# global numpy random state is left untouched.
_special_token_rng = np.random.RandomState(42)
START_TOKEN_VECTOR = _special_token_rng.rand(params['EMBEDDING_DIM'])
END_TOKEN_VECTOR = _special_token_rng.rand(params['EMBEDDING_DIM'])
UNK_TOKEN_VECTOR = _special_token_rng.rand(params['EMBEDDING_DIM'])

# Raw sentence pairs; create_vocab later replaces these names with padded index arrays.
train_input_texts, train_target_texts = split_data(TRAIN_DATA_FILE)
val_input_texts, val_target_texts = split_data(VAL_DATA_FILE)
num_train_samples = len(train_input_texts)

In [12]:
# Create the output directory up front: a missing directory would otherwise
# only surface in np.save, after the expensive preprocessing has already run.
os.makedirs(BASIC_PERSISTENCE_DIR, exist_ok=True)

(train_input_texts, train_target_texts,
 en_word_index, de_word_index, en_embedding_matrix,
 val_input_texts, val_target_texts) = create_vocab(
    train_input_texts, train_target_texts, params, val_input_texts, val_target_texts)

# NOTE(review): the two word indexes are dict-like, so np.save stores them as
# pickled object arrays; loading them presumably needs allow_pickle=True - confirm
# against the consumers of these files.
artifacts = (
    ('train_target_texts', train_target_texts),
    ('train_input_texts', train_input_texts),
    ('val_target_texts', val_target_texts),
    ('val_input_texts', val_input_texts),
    ('en_word_index', en_word_index),
    ('de_word_index', de_word_index),
    ('en_embedding_matrix', en_embedding_matrix),
)
for artifact_name, artifact in artifacts:
    np.save(os.path.join(BASIC_PERSISTENCE_DIR, artifact_name + '.npy'), artifact)

num_train_samples = len(train_input_texts)

Found 400000 word vectors.


ok
