# RNN model of URIs

In [1]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM, Recurrent
import numpy as np
import random
import sys

Using TensorFlow backend.


In [25]:
# Location of the training corpus (a single text file of URI paths).
train_file = "uri_paths.txt"
path = ""

corpus_path = path + train_file
with open(corpus_path, 'r') as corpus:
    content = corpus.read()

# Represent every character of the corpus by its hex code.
# NOTE: `bytes` shadows the builtin of the same name; the name is kept
# because the following cells refer to it.
bytes = []
for ch in content:
    bytes.append(ch.encode('hex'))

print('# bytes:', len(bytes))
print(content[:1000])

# bytes: 418190
-------------------------------------------
Domain: www.missme.com
Page string:
/accessories/view-all/|/accessories/belts/|/|/jeans/view-all/|/outerwear/view-all/|/girls/view-all/|/giftcertificates.php|/mm-couture/view-all/|/bottoms/view-all/|/tops/view-all/
-------------------------------------------
Domain: healthland.time.com
Page string:
/2012/10/08/6-breathing-exercises-to-relax-in-10-minutes-or-less/|/author/brochman/|/2011/05/05/true-or-false-20-common-myths-about-pregnancy/|/2013/01/29/5-things-you-should-know-about-chickenpox-and-shingles/|/tag/kids-and-sports/|/2011/06/08/predicting-kids-athletic-skill-theres-a-dna-test-for-that/|/category/medicine/pediatric-genetics/|/2012/04/18/warren-buffetts-prostate-cancer-i-feel-great/|/2012/09/04/13-reasons-to-love-tea/|/2011/08/05/vogues-10-year-old-model-and-the-pressure-to-be-hot-from-cradle-to-grave/
-------------------------------------------
Domain: o.nouvelobs.com
Page string:
/pop-life/|/musique/|/mode-beaute/|/

In [4]:
# Vocabulary: the distinct hex codes that occur in the corpus.
chars = set(bytes)
print('# distinct bytes:', len(chars))

# Index the vocabulary in sorted order. Iterating the set directly yields an
# arbitrary order that is not guaranteed to be the same in another interpreter
# or session, which would silently scramble the one-hot positions that weights
# saved to disk were trained with.
# NOTE: weights fitted with the previous (unsorted) mapping do not match this
# ordering and have to be re-fitted.
vocab = sorted(chars)
char_indices = dict((c, i) for i, c in enumerate(vocab))
indices_char = dict((i, c) for i, c in enumerate(vocab))

# distinct bytes: 171


In [5]:
# Train on the first three quarters of the corpus, hold out the rest for testing.
n_train = int(3 * len(bytes) / 4)
text_train, text_test = bytes[:n_train], bytes[n_train:]

for label, part in (('training length:', text_train), ('test length:', text_test)):
    print(label, len(part))

training length: 313642
test length: 104548


In [6]:
# Cut the training text into semi-redundant, overlapping windows of `unroll`
# characters; each window is paired with the character that follows it.
unroll = 20  # window length (number of time steps fed to the network)
step = 3     # offset between the starts of consecutive windows
sentences = []
next_chars = []
for i in range(0, n_train - unroll, step):
    sentences.append(text_train[i: i + unroll])
    next_chars.append(text_train[i + unroll])
print('# sequences:', len(sentences))

# One-hot encode: X[i, t, k] is set when character t of window i has vocabulary
# index k; y[i, k] is set when the character following window i has index k.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24; it names the very same type.
X = np.zeros((len(sentences), unroll, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

# sequences: 104541


In [27]:
# build the model: one LSTM layer per entry of `nhidden`, stacked, followed by
# a softmax over the vocabulary

nhidden = [32] # e.g. [512, 512]
dropout = 0.1

if not nhidden:
    raise ValueError("nhidden must list at least one LSTM layer size")

model = Sequential()
for depth, units in enumerate(nhidden):
    is_last = depth == len(nhidden) - 1
    # every LSTM except the last must emit the full sequence so that the next
    # LSTM receives one input per time step
    layer_kwargs = {'return_sequences': not is_last}
    if depth == 0:
        # only the first layer needs to be told the input shape
        layer_kwargs['input_shape'] = (unroll, len(chars))
    model.add(LSTM(units, **layer_kwargs))
    model.add(Dropout(dropout))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.summary()

#from keras.utils import plot_model
#plot_model(model, to_file='uri_model.png')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 32)                26112     
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 171)               5643      
_________________________________________________________________
activation_2 (Activation)    (None, 171)               0         
Total params: 31,755
Trainable params: 31,755
Non-trainable params: 0
_________________________________________________________________


In [31]:
# either load weights:
# The file is presumably the one written by the "fit and save" cell, i.e. the
# list from `model.get_weights()` stored through `np.save` as a pickled object
# array. NumPy >= 1.16.3 refuses to read such a file unless
# `allow_pickle=True` is passed explicitly.
# SECURITY: unpickling can execute arbitrary code -- only load weight files
# that you produced yourself.
model.set_weights(np.load("model_from_uri_paths.txt_arch_32_unroll_20_step_3_dropout_0.1.npy", allow_pickle=True))

In [30]:
# or fit and save weights:
wt_path = ""
# The file name records the training file, the layer sizes and the
# unroll/step/dropout settings.
arch_tag = "".join("_%d" % width for width in nhidden)
settings_tag = "_unroll_%d_step_%d_dropout_%g.npy" % (unroll, step, dropout)
wt_file = ("model_from_%s_arch" % train_file) + arch_tag + settings_tag

model.fit(X, y, batch_size=128, epochs=10, verbose=0)
weights_target = wt_path + wt_file
np.save(weights_target, model.get_weights())

In [15]:
import math

def entropy(pred):
    """Return the Shannon entropy, in nats, of a probability distribution.

    Args:
        pred: iterable of probabilities.

    Returns:
        The sum of ``-p * log(p)`` over ``pred``. Entries equal to zero
        contribute nothing (the limit of ``p * log(p)`` as ``p -> 0`` is 0)
        instead of making ``math.log(0)`` raise ``ValueError``.

    Raises:
        ValueError: if ``pred`` contains a negative value.
    """
    return sum(-p * math.log(p) for p in pred if p != 0)

def sample(a, temperature=1.0):
    """Sample an index from a probability array, reshaped by a temperature.

    The probabilities are raised to the power ``1 / temperature`` and
    renormalised, so a higher temperature raises the entropy of the sampling
    distribution and a lower one sharpens it.

    Args:
        a: 1-D array-like of probabilities.
        temperature: divisor applied to the log-probabilities. ``0`` selects
            the most probable index deterministically (the limit of the
            distribution as the temperature approaches zero).

    Returns:
        The sampled index into ``a``.
    """
    # work in float64 so the renormalised distribution sums to 1 as closely as possible
    probs = np.asarray(a, dtype=np.float64)
    if temperature == 0:
        return int(np.argmax(probs))
    # log(0) = -inf is harmless here (it maps back to probability 0); silence the warning
    with np.errstate(divide='ignore'):
        logits = np.log(probs) / temperature
    # shift by the maximum so the largest weight is exactly 1; without this every
    # exp() can underflow to 0 at low temperatures and the normalisation becomes 0/0
    weights = np.exp(logits - np.max(logits))
    dist = weights / np.sum(weights)
    return np.random.choice(len(dist), p=dist)

In [33]:
temp = 0.75
quote_length = 2000


def hex_to_text(hex_codes):
    """Join a sequence of hex codes into the unicode string of those code points."""
    return ''.join(unichr(int(code, 16)) for code in hex_codes)


divider = '-' * 50

# seed the generator with a random window taken from the held-out text
start = random.randint(0, len(text_test) - unroll - 1)
init_quote = text_test[start : start + unroll]
generated = init_quote

print('\n', divider)
print("Seed:")
sys.stdout.write(hex_to_text(init_quote))
print('\n', divider)
sys.stdout.write(hex_to_text(generated))

for i in range(quote_length):
    # one-hot encode the current window of `unroll` characters
    x = np.zeros((1, unroll, len(chars)))
    for t, char in enumerate(generated):
        x[0, t, char_indices[char]] = 1.

    # predict the next-character distribution and draw from it
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, temperature=temp)
    next_char = indices_char[next_index]

    # slide the window: drop the oldest character, append the new one
    generated = generated[1:] + [next_char]

    sys.stdout.write(hex_to_text([next_char]))
    sys.stdout.flush()
print('\n', divider)


 --------------------------------------------------
Seed:
28b0180685040f708f14
 --------------------------------------------------
28b0180685040f708f149431b592994b0_3721465b359343.html|/upalit/sadeda0700215.png|/tactes/prowpars/2015/1/pbrod/1338862740|/images/legliens/wp/sontors/nader
-----------------------------------------------------------------------------------------------------------------------------
Domain: h5.htmblf.phtmllbas/|/wwdewwwW|/imeges/imageg/imesihomer-debastera-reins-fary
--------------------------------------------------------
Domain: ww.iodins.gaflingitil-warf.gm
|/ardems/tage/srok_2017/|/vik/11/tidicos-ansea/follanes/mages/221/015/0162224/wprichtadd/i-dege-tesion-suintemalith-joru/devarel-i-101-balle-fil-4-shalis-icara/2016.html|/forale/doebboesdvee-ersemanboat.app|/images/bicttur-bane_lasalp_20191/16/|/rebews/defond/Aind-wige/|/hesebers.php|/ht_blog/mor/fere/|/pp%d8%A8%D8%AE%A7%D9%81-%D9%87%D8%A1%D8%AF%DA%A59|/|/cereshalla-lare-raiect-peti-purimand-of-prenste-