In [1]:
import numpy as np
from collections import defaultdict
import time
import sys

from keras.models import Model, load_model
from keras.layers import Input, Embedding, LSTM, Dense
from keras.layers.wrappers import TimeDistributed
from keras.optimizers import Adam
from keras.callbacks import LambdaCallback

from keras_tqdm import TQDMNotebookCallback

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
dataset_path = 'C:/Datasets/amazonreviews'
with open('{0}/train.ft.txt'.format(dataset_path), encoding='utf-8') as f:
    train_str = f.read()
with open('{0}/test.ft.txt'.format(dataset_path), encoding='utf-8') as f:
    test_str = f.read()

In [3]:
def data_parse(s):
    d = s.split('__label__')[1:]
    y = [int(s[0])-1 for s in d]
    #titles = [s.split(': ', 1)[0][2:] for s in d]
    x = [s.split(': ', 1)[1] for s in d]
    return (x, y)

(train_x, train_y) = data_parse(train_str)
(test_x, test_y) = data_parse(test_str)

In [4]:
del train_str
del test_str

In [5]:
num_filters = 64

batch_size = 32
batch_len = 800

model_id = int(time.time())

In [6]:
#lens = [len(s) for s in train_x]
#np.mean([(x<800) and 1 or 0 for x in lens])
# 0.92 - 92% less than 800 chars

In [7]:
# remove ~10% longest, sort by length
def data_filter(x, y):
    mask = [len(s) < 800 for s in x]
    x = [s for i, s in enumerate(x) if mask[i]]
    y = [s for i, s in enumerate(y) if mask[i]]
    sort = np.argsort([len(s) for s in x])
    x = [x[i] for i in sort]
    y = [y[i] for i in sort]
    return (x, y)

(train_x, train_y) = data_filter(train_x, train_y)
(test_x, test_y) = data_filter(test_x, test_y)

In [8]:
print(len(train_x), len(train_y))

3302164 3302164


In [9]:
#np.mean([(ord(c)>31 and ord(c)<127) for c in ''.join(s for s in train_x)])
# 99.7% within this set
chars = [chr(i+32) for i in range(126 - 32 + 1)]
print(''.join(chars))
char2idx = defaultdict(lambda: 2, {c: i+2 for i, c in enumerate(chars)})
idx2char = defaultdict(lambda: ' ', {i+2: c for i, c in enumerate(chars)})
input_dim = len(chars) + 2

 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~


In [10]:
def generate_data(batch_size=batch_size, test=False):
    if test:
        x, y = test_x, test_y
    else:
        x, y = train_x, train_y
    while True:
        i = np.random.randint(len(train_x) - batch_size)
        bml = len(x[i + batch_size - 1])
        bx = np.zeros((batch_size, bml), dtype='int32')
        by_text = np.zeros((batch_size, bml, input_dim), dtype='int32')
        by_sent = np.zeros((batch_size, 2), dtype='int32')
        for j in range(batch_size):
            bx[j][-len(x[i+j]):] = [char2idx[c] for c in x[i+j]]
            for k, c in enumerate(x[i+j][1:]):
                by_text[j][k + bml - len(x[i+j])][char2idx[c]] = 1
            by_text[j, -1, 1] = 1
            by_sent[j, y[i+j]] = 1
        yield (bx, [by_text, by_sent])

In [11]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [12]:
l_in = Input(shape=(None,), dtype='int32')
h = Embedding(input_dim=input_dim, output_dim=num_filters)(l_in)
for i in range(1):
    h = LSTM(num_filters, return_sequences=True)(h)
l_out_text = TimeDistributed(Dense(input_dim, activation='softmax'))(h)
l_out_sent = LSTM(2, activation='softmax')(h)

model = Model(inputs=l_in, outputs=[l_out_text, l_out_sent])

adam = Adam(0.001)
model.compile(optimizer=adam, loss='categorical_crossentropy')

In [13]:
print(model.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, None)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, None, 64)      6208        input_1[0][0]                    
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, None, 64)      33024       embedding_1[0][0]                
____________________________________________________________________________________________________
time_distributed_1 (TimeDistribu (None, None, 97)      6305        lstm_1[0][0]                     
___________________________________________________________________________________________

In [14]:
# same as model but stateful - todo: automatic solution
l_in = Input(batch_shape=(1, None), dtype='int32')
h = Embedding(input_dim=input_dim, output_dim=num_filters)(l_in)
for i in range(1):
    h = LSTM(num_filters, return_sequences=True, stateful=True)(h)
l_out_text = TimeDistributed(Dense(input_dim, activation='softmax'))(h)
l_out_sent = LSTM(2, activation='softmax')(h)

predmodel = Model(inputs=l_in, outputs=[l_out_text, l_out_sent])

def set_predmodel():
    predmodel.set_weights(model.get_weights())

In [15]:
def generate_text(epoch=None, logs=None, length=400, refresh_model=True):
    if refresh_model:
        set_predmodel()
    print()
    print('-- epoch {0}\n'.format(epoch))
    bx, by = next(generate_data(batch_size=1, test=False)) # todo: fix test err
    bx = bx[0][:20]
    seed_str = ''.join([idx2char[i] for i in bx])
    bx = np.array([bx])
    print('-- seed: {0}'.format(seed_str))
    print()
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('-- diversity: {0}'.format(diversity))
        sys.stdout.write(seed_str)
        predmodel.reset_states()
        y, y_sent = predmodel.predict(bx)
        for i in range(length):
            y = sample(y[0, -1])
            c = idx2char[y]
            sys.stdout.write(c)
            sys.stdout.flush()
            y, y_sent = predmodel.predict(np.array([y]))
        print('\n')

gen_callback = LambdaCallback(on_epoch_end=generate_text)

In [16]:
def save_model(epoch=None, logs=None):
    model.save('models/tv_{0}.h5'.format(model_id))
    print('saved model')

save_callback = LambdaCallback(on_epoch_end=save_model)

In [17]:
def trainfor(epochs, save=True, gen=True):
    callbacks = [TQDMNotebookCallback()]
    if save:
        callbacks.append(save_callback)
    if gen:
        callbacks.append(gen_callback)
    model.fit_generator(generate_data(), int(5e3), epochs,
        verbose=0, callbacks=callbacks)

In [18]:
trainfor(1e9)

saved model

-- epoch 0

-- seed: This worked great fo

-- diversity: 0.2
This worked great for a tor lokgs inteat teine underseman combe. Not fildle goy. It this tho I spiely, I amfenters of i lit and wintit to this maye evoed seets it's chacted making"! I kountly paructluck ineuld the on to be test 11/9 cirppe sophor resife the noter inthicen myterses wsitable ou worlland it is a aget to of this is chad thoo levaitey looker on the expriesapter shows teines and gox eventers odrent Fop her

-- diversity: 0.5
This worked great for havind them the gictle the for at goor, is the dicwor dearsizac of this $1, and the cumabe seal the swor syent on yops loughs. Waters (droural fired the vichiarty, I wirst.A of thrutch good duch. Detiless it alls that - do Ampat. I gay this 4 usicking I hon like and bet then prodmeno of Smates sholl We non fuld docks about. I amb.BIN S12.000 ented of I wantery. It''thib, ye2 dause and a more cor

-- diversity: 1.0
This worked great for.  nee FERSTISTREAN Capre

saved model

-- epoch 1

-- seed: I like this tripod. 

-- diversity: 0.2
I like this tripod. I like strecound no it.  ipide aboutdows aperend it wasted my had even'rles, in the only it to didd a give pige chimiopotter about userings this ctars mstem on yon't but. It aw stee insirated an great from.  urforwor Aud aly carded and a grad! Work thripting laster and I do I wass a culpition. I tyone a but a sometion put they put thile.  and.  etting you'd of the product.It cave seepkinasic is tr

-- diversity: 0.5
I like this tripod.  everble's buck it gestes but work. Made that arrant the coptioners. Predvant and just wait's berrills and was neter up gookich good, times like you wordolouphing and indel beellent some rattien's.  else and better ansure itran masten in't have efigurine and disus bought the pucks fan at from It, but the fectered no mon't comp not? the good. The make is turli of them Rander, recon-Jue for more "Do

-- diversity: 1.0
I like this tripod. To my with and Bikent Deas

saved model

-- epoch 2

-- seed: I'm surprised to fin

-- diversity: 0.2
I'm surprised to fine-shown and his treitalitiom who problentimes are u can't inould heap to with the suppel banine. Abund sopenny werding it of the boot not love where of all skieddel many. Bually a maked!!!! I've ReGREGRELING AM. ROWE HAC TTO  FOND every lounion don't one littlate and noin a be texing this product book and your courite. count doby to ary amazon (greelens this problets watch her omer to senfice as t

-- diversity: 0.5
I'm surprised to fing the great was do scay not. I astroaic light this and his paping, anothes, lears the claste tha very nitheled it of Fristmaps, sid.  of or highly exambyond the its you want ofter of meght lay. Juirly dirghwade and ever sences and on too the for real ICDugh this a is the poy stretting ease! And a sysa DISSE."I want the preture duit diduous of the making reads about just wrut in goines is despacted

-- diversity: 1.0
I'm surprised to fine illy the subjuctinges cl

saved model

-- epoch 3

-- seed: I discovered this li

-- diversity: 0.2
I discovered this little duning, but theath, you. Day able was mighreatien that the peobe person't bucks way awemy's bor no adgush famble. I want, esonabit.an inger to picks" if you nitl more from and qualy. Gragaes I with a greatly Jank and altinery of the book musice abum. I other on time it see, lasryones doving dife this in the poince. It's and camests it. This is one moneh then noy peasoly, only now bad efferent

-- diversity: 0.5
I discovered this likes anazz mann. It ford of firm. I bought bevan the qualitive and that's abyous Crscup is prent to use desen the only lixe. I favese muspry, (Habfout. This is seel usomple pary. It was ome. Howild.  et. Hou've sussing the basted by the perfect to rervell was tera, the confroghts a Liew of Booked can watchs and I'm orme squeke domlen. I am to nee from aghing you wrone come 10Grema the Fords is the 

-- diversity: 1.0
I discovered this like I algy ester of hard of

saved model

-- epoch 4

-- seed: I went through two o

-- diversity: 0.2
I went through two ound Solts vasid if the ore sematughtyles.yount for made come was iccractifub like corags. It's The losor and too Povegs and good no teve oz gimp weres weeks book. Seiver head tusn, it to in at work fromment, and pertian Stong on you Lifted a goo to in felings, Baray.Thave people diffuct in the seromice was good don't. I want when with for 2 yeart to tighter. Comlesson". RA one albump. You more tec

-- diversity: 0.5
I went through two of agree to love wellent a wascers so beft the read and leakie world to keepted downly for made I redob it product to reqilably this mour books so heor count requels too factmasce sque whis of wix the end mandainal desplete, recemicton. Seconds album for the plangs precond muted it. Or neips have to my precans his fin,ner his a cd a very are Time are expecters not found in my of smell after movie w

-- diversity: 1.0
I went through two of your. Woit to yeachment,

saved model

-- epoch 5

-- seed: Finally! I got this 

-- diversity: 0.2
Finally! I got this books cuth Sp, in up infor ba The Little.I love the adine of work that abobs was now our nothing says.The started for Christeny... Good, so product in the most faction; DAVES Comforriathing was accured.  on Jauded loods kney.. Winthro. A I'll unit his 44-2p/649 withoug on come crevie and the defect slight bad deture!  od to wayped to he not brueve they and this book wood in music. The trea.  asine

-- diversity: 0.5
Finally! I got this is it devidey by me startibly didn't Broksted, for blualica and Toor apents real, Trip, but this stane of the stolls excelling on outher homession that it acrawests.  ive the onleded to a selbimend it my get it.  onthaturions of this was a flew my bette-tixe. The what Klop to snoter me's defils was some truen to as great disat ingreede out 3 and its than does not) and jundion!  etal this for weasy

-- diversity: 1.0
Finally! I got this book!Liet. Even, it's sers

saved model

-- epoch 6

-- seed: This is just a blank

-- diversity: 0.2
This is just a blankence spoceshing this mob.  oveg Bevels)-ANG . Nyways shead. Perpeniny guy to treace prickan. They again To do gave-dequies $59. this very of fan to say well and Phy perthre reduct travary I can Et all of juy the mest capt, the really hells Hard tops aw one of I have an worby espaps. If ge on youbout not recond mechate, doesh's other with ba ante trigial mograining martes overe copently an-wars on 

-- diversity: 0.5
This is just a blank deap. This hold not even and very my untinn at mover. May it is good cancerment and buy to clople't gettel" and not fhem bocked and lome me now justle in aroumation tesner oxoff-stering this but I hearmby ditches.With quial desi son their surrym. Wint like for anounded. Song tragizy, have hard a Serite but the beint couldn't bettere with the money gran the new, iter akings setting I would I next 

-- diversity: 1.0
This is just a blank, Glayms. On sethes by the

KeyboardInterrupt: 