In [8]:
import sys
import os
import collections
import tensorflow as tf
import numpy as np
from keras.utils import to_categorical
import keras
import numpy as np

from keras.layers import Embedding, LSTM
from keras.optimizers import SGD
from keras.models import Model
from keras.layers import Input, Dense




In [None]:
class ModelConfig(object):
    """Hyper-parameters shared by the data preparation and the Keras model.

    Every value is a plain instance attribute so a notebook cell can tweak
    one of them on ``m_config`` before the model is built.
    """

    def __init__(self):
        # --- sequence geometry -------------------------------------------
        self.nb_time_steps = 20          # window length used when cutting the corpus
        self.MAX_SEQUENCE_LENGTH = 20    # input_length handed to the Embedding layer
        self.nb_input_vector = 1         # not referenced elsewhere in the visible notebook

        # --- vocabulary / network size -----------------------------------
        self.nb_words = 9999             # the model uses nb_words + 1 classes
        self.EMBEDDING_DIM = 100
        self.units = 200                 # LSTM hidden size

        # --- optimiser (SGD) ---------------------------------------------
        self.sgd_lr = 1.0
        self.sgd_momentum = 0.9
        self.sgd_decay = 0.0

        # --- training loop -----------------------------------------------
        self.batch_size = 20
        self.nb_epoch = 3
        self.steps_per_epoch = 25
        self.VALIDATION_SPLIT = 0.2      # not referenced elsewhere in the visible notebook


# Notebook-wide configuration instance read by the cells below.
m_config = ModelConfig()

In [9]:
# True under Python 3; _read_words checks it to decide whether the file
# contents still have to be decoded from UTF-8.
Py3 = sys.version_info.major == 3


def _read_words(filename):
    """Return the whitespace-separated tokens of a text file.

    Every line break is turned into a stand-alone ``"<eos>"`` token so that
    sentence boundaries survive tokenisation.

    Args:
        filename: path of the text file, opened through ``tf.gfile.GFile``.

    Returns:
        A list of string tokens in file order.
    """
    with tf.gfile.GFile(filename, "r") as f:
        text = f.read()
    if not Py3:
        # Under Python 2 the content is decoded from UTF-8 explicitly.
        text = text.decode("utf-8")
    # Pad the marker with spaces: without them "a b\nc" would tokenise to
    # ["a", "b<eos>c"]. Files whose lines already start/end with a space
    # produce exactly the same tokens as before because split() collapses
    # runs of whitespace.
    return text.replace("\n", " <eos> ").split()


def _build_vocab(filename):
    """Build a word -> integer id mapping from the tokens of a file.

    Ids are assigned by descending frequency; ties are broken
    alphabetically, so the mapping is deterministic for a given file.

    Args:
        filename: path of the text file to read with ``_read_words``.

    Returns:
        A dict mapping each distinct token to its id (0 = most frequent).
        An empty file yields an empty dict.
    """
    data = _read_words(filename)
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    # enumerate() instead of unpacking zip(*count_pairs): the unpacking form
    # raises ValueError when the file contains no tokens at all.
    return {word: index for index, (word, _) in enumerate(count_pairs)}


def _file_to_word_ids(filename, word_to_id):
    """Encode the tokens of `filename` as ids taken from `word_to_id`.

    Tokens that are not keys of `word_to_id` are skipped, so the result can
    be shorter than the token stream.
    """
    ids = []
    for token in _read_words(filename):
        if token in word_to_id:
            ids.append(word_to_id[token])
    return ids


def ptb_raw_data(data_path=None):
    """Load the PTB train/valid/test files as lists of word ids.

    The vocabulary is built from the training file only and then used to
    encode all three files; tokens of the validation/test files that are
    absent from that vocabulary are dropped by ``_file_to_word_ids``.

    Args:
        data_path: directory containing ``ptb.train.txt``, ``ptb.valid.txt``
            and ``ptb.test.txt``. Required despite the ``None`` default.

    Returns:
        A tuple ``(train_data, valid_data, test_data, vocabulary)`` where the
        first three items are lists of ids and ``vocabulary`` is the number
        of distinct tokens in the training file.

    Raises:
        TypeError: if `data_path` is None.
    """
    if data_path is None:
        # Fail here with a message that names the argument instead of letting
        # os.path.join() complain about a NoneType path component.
        raise TypeError(
            "ptb_raw_data() needs data_path: the directory that holds "
            "ptb.train.txt, ptb.valid.txt and ptb.test.txt")

    train_path = os.path.join(data_path, "ptb.train.txt")
    valid_path = os.path.join(data_path, "ptb.valid.txt")
    test_path = os.path.join(data_path, "ptb.test.txt")

    word_to_id = _build_vocab(train_path)
    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)
    return train_data, valid_data, test_data, vocabulary


def cut_num_step(train_data, num_step, num_classes=10000):
    """Cut a stream of word ids into fixed-length next-word training windows.

    Window ``k`` holds ``train_data[k*num_step:(k+1)*num_step]`` as input and
    the same slice shifted one position to the right as target. Only windows
    that are complete *and* have a target for their last token are kept, so
    ``(len(train_data) - 1) // num_step`` windows are produced and a trailing
    remainder is discarded.

    Args:
        train_data: sequence of integer word ids.
        num_step: window length; must be a positive integer (0 raises
            ZeroDivisionError).
        num_classes: width of the one-hot target vectors. Every id in
            `train_data` must be smaller than this value.

    Returns:
        A tuple ``(x, y)``:
          * ``x`` has shape ``(n_windows, num_step)`` and holds the input ids.
          * ``y`` has shape ``(n_windows, num_step, num_classes)``, dtype
            float32, and holds the one-hot encoded next-word targets.

    Note:
        ``y`` is dense and occupies ``4 * n_windows * num_step * num_classes``
        bytes, which becomes very large for long id streams.
    """
    # np.array (not asarray) so the result never aliases the caller's data.
    ids = np.array(train_data)
    n_windows = max(len(ids) - 1, 0) // num_step
    usable = n_windows * num_step

    x = ids[:usable].reshape(n_windows, num_step)
    next_ids = ids[1:usable + 1].reshape(n_windows, num_step).astype(int)

    # Allocate the targets once and set the hot entries with a single
    # integer-array assignment instead of building one vector per token.
    y = np.zeros((n_windows, num_step, num_classes), dtype='float32')
    rows = np.arange(n_windows)[:, None]
    cols = np.arange(num_step)[None, :]
    y[rows, cols, next_ids] = 1.0
    return x, y

In [10]:
class LanguageModel(object):
    """Next-word language model: Embedding -> LSTM -> LSTM -> softmax.

    Args:
        config: object exposing the hyper-parameters read below (``nb_words``,
            ``EMBEDDING_DIM``, ``MAX_SEQUENCE_LENGTH``, ``units``, ``sgd_lr``,
            ``sgd_momentum``, ``sgd_decay``, ``nb_epoch``,
            ``steps_per_epoch``). Defaults to the notebook-wide ``m_config``.
    """

    def __init__(self, config=None):
        self.config = m_config if config is None else config

    def build_lm_model(self, inputs_x, targets):
        """Build, compile and train the model, then report perplexity.

        Args:
            inputs_x: iterator whose ``get_next()`` tensor becomes the model
                input (batches of word ids).
            targets: iterator whose ``get_next()`` tensor is used as the
                compile-time target tensor. The loss is categorical
                cross-entropy, so this presumably yields one-hot vectors over
                ``nb_words + 1`` classes -- confirm against the caller.

        Returns:
            A numpy array with ``exp(loss)`` for every training epoch.
        """
        cfg = self.config
        vocab_size = cfg.nb_words + 1

        inputs = Input(tensor=inputs_x.get_next())
        x = Embedding(vocab_size,
                      cfg.EMBEDDING_DIM,
                      input_length=cfg.MAX_SEQUENCE_LENGTH)(inputs)

        def new_lstm():
            # A fresh layer object per call: each stacked LSTM must own its
            # weights.
            return LSTM(
                cfg.units,
                activation='tanh',
                recurrent_activation='hard_sigmoid',
                use_bias=True,
                kernel_initializer='glorot_uniform',
                recurrent_initializer='orthogonal',
                bias_initializer='zeros',
                return_sequences=True)

        # Reusing one LSTM instance for both levels would share weights and
        # feed the second call a `units`-wide input although the layer was
        # built for an EMBEDDING_DIM-wide one.
        x = new_lstm()(x)
        x = new_lstm()(x)

        predictions = Dense(vocab_size, activation='softmax')(x)

        sgd = SGD(
            lr=cfg.sgd_lr,
            momentum=cfg.sgd_momentum,
            decay=cfg.sgd_decay,
            nesterov=False)
        model = Model(inputs=inputs, outputs=predictions)
        model.compile(
            loss='categorical_crossentropy',
            optimizer=sgd,
            metrics=['accuracy'],
            target_tensors=[targets.get_next()])

        # Inputs and targets are already wired in as tensors, so fit() only
        # needs to know how many batches make up an epoch.
        history = model.fit(
            epochs=cfg.nb_epoch,
            steps_per_epoch=cfg.steps_per_epoch)
        ppl = np.exp(np.array(history.history["loss"]))
        return ppl

In [18]:
def main():
    """Train the language model on the PTB training split and print perplexity.

    Reads the corpus from ``<cwd>/simple-examples/data``, cuts the training
    ids into windows of ``m_config.nb_time_steps`` tokens with next-word
    targets, and feeds both to ``LanguageModel.build_lm_model`` through two
    one-shot ``tf.data`` iterators.

    Raises:
        ValueError: if the training split is too short for a single window.
    """
    data_dir = os.path.join(os.getcwd(), "simple-examples", "data")
    train_data = ptb_raw_data(data_dir)[0]

    steps = m_config.nb_time_steps
    vocab_size = m_config.nb_words + 1  # width of the model's softmax output

    token_ids = np.asarray(train_data, dtype='int32')
    # A window needs `steps` inputs plus one extra token as the last target.
    n_windows = (len(token_ids) - 1) // steps
    if n_windows < 1:
        raise ValueError(
            "training data has %d tokens; at least %d are needed for one window"
            % (len(token_ids), steps + 1))
    span = n_windows * steps
    features = token_ids[:span].reshape(n_windows, steps).astype('float32')
    target_ids = token_ids[1:span + 1].reshape(n_windows, steps)

    dataset_x = tf.data.Dataset.from_tensor_slices(features).repeat()
    dataset_x = dataset_x.batch(m_config.batch_size)

    # Keep the targets as integer ids and expand them to one-hot vectors one
    # batch at a time inside the input pipeline. A dense one-hot array for the
    # whole corpus would need 4 * tokens * vocab_size bytes in memory and
    # would also have to be embedded in the graph by from_tensor_slices.
    dataset_y = tf.data.Dataset.from_tensor_slices(target_ids).repeat()
    dataset_y = dataset_y.batch(m_config.batch_size)
    dataset_y = dataset_y.map(lambda ids: tf.one_hot(ids, vocab_size))

    itera_x = dataset_x.make_one_shot_iterator()
    itera_y = dataset_y.make_one_shot_iterator()
    train_model = LanguageModel()
    print("ppl:", train_model.build_lm_model(itera_x, itera_y))

In [None]:
# Run the whole pipeline: load the PTB data, build the input iterators,
# train the model and print the per-epoch perplexity.
main()