In [1]:
import os
import sys
import time
from collections import defaultdict

import numpy as np

from keras.models import Model, load_model
from keras.layers import Input, Embedding, Conv1D, Multiply
from keras.optimizers import Adam
from keras.callbacks import LambdaCallback

from keras_tqdm import TQDMNotebookCallback

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Directory holding train.ft.txt / test.ft.txt. Set the AMAZON_REVIEWS_DIR
# environment variable to run on another machine; the fallback is the
# original hard-coded location, so existing setups keep working.
dataset_path = os.environ.get('AMAZON_REVIEWS_DIR', 'C:/Datasets/amazonreviews')

# Read each split fully into memory as one UTF-8 string; parsing happens later.
with open('{0}/train.ft.txt'.format(dataset_path), encoding='utf-8') as f:
    train_str = f.read()
with open('{0}/test.ft.txt'.format(dataset_path), encoding='utf-8') as f:
    test_str = f.read()

In [3]:
def data_parse(s):
    """Split a raw ``__label__``-delimited dump into texts and 0-based labels.

    Every record starts (right after the ``__label__`` marker) with a single
    label digit; the label is stored as that digit minus one. Within a record,
    everything up to and including the first ``': '`` is discarded and only
    the remainder is kept as the text.

    Args:
        s: the whole dump as one string.

    Returns:
        A tuple ``(texts, labels)`` of two parallel lists.

    Raises:
        ValueError: if a record does not start with a digit.
        IndexError: if a record is empty or contains no ``': '`` separator.
    """
    # Anything before the first marker is not a record, hence the [1:].
    records = s.split('__label__')[1:]

    labels = []
    for record in records:
        labels.append(int(record[0]) - 1)

    texts = []
    for record in records:
        texts.append(record.split(': ', 1)[1])

    return (texts, labels)

# Turn both raw dumps into parallel (texts, labels) lists.
train_x, train_y = data_parse(train_str)
test_x, test_y = data_parse(test_str)

In [4]:
# The raw dumps have been parsed; drop the references so the memory can be reclaimed.
del train_str, test_str

In [5]:
# --- Network shape ---
num_groups = 2     # number of times the dilated stack is repeated
num_layers = 8     # layers per stack; layer j uses dilation rate 2**j
num_filters = 128  # channel width of the embedding and of every gated conv

# --- Batch geometry ---
batch_size = 16
batch_len = 2 * 2**num_layers  # characters per training window (512)

# Unix timestamp in whole seconds; used to name this run's checkpoint file.
model_id = int(time.time())

In [6]:
# remove ~10% longest, sort by length
def data_filter(x, y):
    """Drop long texts and order the remainder from shortest to longest.

    Texts of 800 characters or more are discarded together with their labels;
    the survivors are then reordered by ascending text length.

    Args:
        x: list of text strings.
        y: list of labels, parallel to ``x`` (same length).

    Returns:
        A tuple ``(texts, labels)`` of filtered, length-sorted parallel lists.
    """
    kept = [(text, label) for text, label in zip(x, y) if len(text) < 800]
    texts = [text for text, _ in kept]
    labels = [label for _, label in kept]

    # One permutation applied to both lists keeps texts and labels aligned.
    order = np.argsort([len(text) for text in texts])
    return ([texts[i] for i in order], [labels[i] for i in order])

# Apply the same length filter and length ordering to both splits.
train_x, train_y = data_filter(train_x, train_y)
test_x, test_y = data_filter(test_x, test_y)

In [7]:
# Sanity check: texts and labels should have the same count after filtering.
print(len(train_x), len(train_y))

3302164 3302164


In [8]:
# Vocabulary: the 95 printable ASCII characters (code points 32..126).
# The original author measured ~99.7% of the training characters inside this set via
#   np.mean([(ord(c)>31 and ord(c)<127) for c in ''.join(s for s in train_x)])
chars = [chr(code) for code in range(32, 127)]
print(''.join(chars))

# Indices 0 and 1 are not assigned to any character, so real characters start
# at 2. Characters outside the vocabulary fall back to index 2 (the space),
# and indices without a character decode to a space.
char2idx = defaultdict(lambda: 2)
char2idx.update((c, pos + 2) for pos, c in enumerate(chars))
idx2char = defaultdict(lambda: ' ')
idx2char.update((pos + 2, c) for pos, c in enumerate(chars))
input_dim = 2 + len(chars)

 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~


In [9]:
def generate_data(batch_size=batch_size, test=False):
    """Endlessly yield random batches for joint next-character / sentiment training.

    Each sample is a random window of at most ``batch_len`` characters taken
    from a randomly chosen review. Windows shorter than ``batch_len`` are
    left-padded with index 0.

    Args:
        batch_size: number of windows per batch.
        test: draw from the test split instead of the training split.

    Yields:
        ``(bx, [by_text, by_sent])`` where

        * ``bx`` is ``(batch_size, batch_len)`` int32 character indices;
        * ``by_text`` is ``(batch_size, batch_len, input_dim)`` one-hot
          next-character targets. The last position is index 1 (end marker)
          when the window reaches the end of the review, otherwise the
          character that follows the window. Padding positions are all-zero;
        * ``by_sent`` is ``(batch_size, batch_len, 2)`` with the review label
          one-hot encoded at every non-padding position.
    """
    if test:
        x, y = test_x, test_y
    else:
        x, y = train_x, train_y
    while True:
        bx = np.zeros((batch_size, batch_len), dtype='int32')
        by_text = np.zeros((batch_size, batch_len, input_dim), dtype='int32')
        by_sent = np.zeros((batch_size, batch_len, 2), dtype='int32')
        for batch_idx in range(batch_size):
            data_idx = np.random.randint(len(x))
            text = x[data_idx]

            # randint's upper bound is exclusive: the +1 makes the final
            # window (the one that ends with the review) reachable too.
            max_start = max(0, len(text) - batch_len)
            slice_s = np.random.randint(max_start + 1)
            slice_e = min(slice_s + batch_len, len(text))
            x_sub = text[slice_s:slice_e]

            # Index by explicit offset rather than a negative length, so an
            # empty window selects nothing instead of the whole row.
            pad = batch_len - len(x_sub)
            bx[batch_idx, pad:] = [char2idx[c] for c in x_sub]

            # Target at position k is the character at position k + 1.
            for k, c in enumerate(x_sub[1:]):
                by_text[batch_idx, pad + k, char2idx[c]] = 1

            # Target for the last position: end marker, or the character that
            # follows the window when the review continues past it.
            if slice_e == len(text):
                by_text[batch_idx, -1, 1] = 1
            else:
                by_text[batch_idx, -1, char2idx[text[slice_e]]] = 1

            by_sent[batch_idx, pad:, y[data_idx]] = 1
        yield (bx, [by_text, by_sent])

In [10]:
# Smoke test: draw one training batch and decode its first sample.
bx, by = next(generate_data())
# Input window; index 0 has no character and decodes as a space.
print(''.join(idx2char[i] for i in bx[0]))
# Next-character targets, recovered by taking the argmax of each target row.
print(''.join(idx2char[np.argmax(v)] for v in by[0][0]))

                                                                                                                                                                                                                             arning! Even though you're a matrix fan, and want more matrix action,just watch the movies,don't get this game.It's a total rip!The action is soooooooooooo boring, and the car driving parts of the game is just lame,very lame.But there's a bright side to this game,and it's the movie clips.  


In [11]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature rescaling.

    The probabilities are moved to log space, divided by ``temperature``,
    exponentiated and renormalised; a single multinomial draw then picks the
    index. Temperatures below 1 sharpen the distribution, above 1 flatten it.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: positive scaling factor applied in log space.

    Returns:
        The sampled index (NumPy integer).
    """
    scaled_log_probs = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_log_probs)
    distribution = weights / np.sum(weights)
    # One trial, one experiment -> a (1, n) one-hot array; argmax finds the hit.
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

In [12]:
# Build the network: character indices in, two per-timestep softmax outputs
# (next character and sentiment class) out.

# Variable-length sequence of integer character indices.
l_in = Input(shape=(None,), dtype='int32')
# Map each index to a num_filters-dimensional vector.
h = Embedding(input_dim=input_dim, output_dim=num_filters)(l_in)
# num_groups repetitions of a num_layers-deep stack. Within a stack the
# dilation rate doubles per layer (1, 2, 4, ...). Causal padding means each
# timestep only sees itself and earlier timesteps.
for i in range(num_groups):
    for j in range(num_layers):
        # Gated activation: a tanh "filter" branch multiplied element-wise
        # by a sigmoid "gate" branch, both reading the same input.
        l_filter = Conv1D(filters=num_filters, kernel_size=2,
            dilation_rate=2**j, padding='causal', activation='tanh')(h)
        l_gate = Conv1D(filters=num_filters, kernel_size=2,
            dilation_rate=2**j, padding='causal', activation='sigmoid')(h)
        h = Multiply()([l_filter, l_gate])
# Two width-1 convolution heads on the shared features: a distribution over
# the input_dim character indices, and a distribution over the 2 labels.
l_out_text = Conv1D(filters=input_dim, kernel_size=1, activation='softmax')(h)
l_out_sent = Conv1D(filters=2, kernel_size=1, activation='softmax')(h)

model = Model(inputs=l_in, outputs=[l_out_text, l_out_sent])

# A single loss string is given, so the same loss applies to both outputs.
adam = Adam(0.001)
model.compile(optimizer=adam, loss='categorical_crossentropy')

In [13]:
# summary() prints the layer table itself and returns None, so wrapping it
# in print() would only append a stray "None" line.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, None)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, None, 256)     24832       input_1[0][0]                    
____________________________________________________________________________________________________
conv1d_1 (Conv1D)                (None, None, 256)     131328      embedding_1[0][0]                
____________________________________________________________________________________________________
conv1d_2 (Conv1D)                (None, None, 256)     131328      embedding_1[0][0]                
___________________________________________________________________________________________

In [14]:
def generate_text(epoch=None, logs=None, length=400):
    """Print text sampled from the model at several temperatures.

    A seed of ``2**num_layers`` characters is taken from the end of a random
    test window. For each diversity value, generation restarts from that same
    seed and samples up to ``length`` characters one at a time, stopping early
    if the end marker (index 1) is drawn.

    The signature matches what ``LambdaCallback(on_epoch_end=...)`` passes.

    Args:
        epoch: epoch number, only used in the printed header.
        logs: unused; accepted for callback compatibility.
        length: maximum number of characters to generate per diversity.
    """
    print()
    print('-- epoch {0}\n'.format(epoch))
    bx, by = next(generate_data(batch_size=1, test=True))
    seed = bx[0][-(2**num_layers):]
    seed_str = ''.join([idx2char[i] for i in seed])
    print('-- seed: {0}'.format(seed_str))
    print()
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('-- diversity: {0}'.format(diversity))
        # Restart from the printed seed so the passes are comparable, rather
        # than continuing from the previous pass's output.
        window = seed.copy()
        for _ in range(length):
            y, y_sent = model.predict(np.array([window]))
            # don't sample mask
            y[0, -1, 0] = 0
            # The diversity value is the sampling temperature.
            next_idx = sample(y[0, -1], diversity)
            if next_idx == 1:
                # end marker drawn: stop this pass
                break
            sys.stdout.write(idx2char[next_idx])
            sys.stdout.flush()
            # Slide the window: drop the oldest index, append the new one.
            window = np.concatenate((window[1:], [next_idx]), axis=-1)
        print('\n')

# Keras callback that prints generated text samples at the end of every epoch.
gen_callback = LambdaCallback(on_epoch_end=generate_text)

In [15]:
def save_model(epoch=None, logs=None):
    """Write the current model to ``models/wntv_<model_id>.h5``.

    The file name depends only on ``model_id``, so each call overwrites the
    previous checkpoint of this run. The signature matches what
    ``LambdaCallback(on_epoch_end=...)`` passes; both arguments are unused.
    """
    # Create the target directory up front so that a missing folder does not
    # abort the run after a full epoch of training.
    os.makedirs('models', exist_ok=True)
    model.save('models/wntv_{0}.h5'.format(model_id))
    print('saved model')

# Keras callback that checkpoints the model at the end of every epoch.
save_callback = LambdaCallback(on_epoch_end=save_model)

In [16]:
def trainfor(epochs, save=True, gen=True, steps_per_epoch=int(1e4)):
    """Train the global ``model`` on freshly generated random batches.

    Args:
        epochs: number of epochs to run. Coerced to ``int`` so that values
            written in float notation (e.g. ``1e9``) are accepted.
        save: checkpoint the model after every epoch.
        gen: print generated text samples after every epoch.
        steps_per_epoch: batches drawn from the generator per epoch.
    """
    callbacks = [TQDMNotebookCallback()]
    if save:
        callbacks.append(save_callback)
    if gen:
        callbacks.append(gen_callback)
    # verbose=0 silences Keras' own progress output; the TQDM callback
    # provides the progress bars instead.
    model.fit_generator(generate_data(), steps_per_epoch=int(steps_per_epoch),
        epochs=int(epochs), verbose=0, callbacks=callbacks)

In [None]:
# Initial run: 10 epochs, with per-epoch checkpointing and text sampling
# (the trainfor defaults).
trainfor(10)

In [None]:
# Continue training at half the initial learning rate (0.001 -> 0.0005) by
# recompiling the model with a new Adam instance.
adam = Adam(0.0005)
model.compile(optimizer=adam, loss='categorical_crossentropy')
# Effectively "train until interrupted". The epoch count must be an integer,
# so convert the float literal explicitly.
trainfor(int(1e9))