# RNN model of URIs

In [1]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM, Recurrent
import numpy as np
import random
import sys
import gzip

Using TensorFlow backend.


In [2]:
train_file = "big_domain_string_1.gz"
path = "../sdata/"
max_bytes = 1000000  # cap on the number of bytes kept (1 million bytes for local testing)

# Read the whole gzip-compressed corpus into memory as a raw byte string.
with gzip.open(path + train_file, 'rb') as f:
    content = f.read()

# Represent every byte as a two-character lowercase hex string (e.g. '2f' for '/').
# The slice is taken *before* the conversion so only the bytes that are kept get
# encoded, and iterating a bytearray yields integers on both Python 2 and Python 3.
# NOTE: the name `bytes` shadows the builtin; it is kept because later cells read it.
bytes = ['%02x' % b for b in bytearray(content[:max_bytes])]
print('# bytes:', len(bytes))
print(content[:1000])

# bytes: 1000000


/www868686.wyw.cn/ncontact_346701.html
/wlrq.wyw.cn/tradelist_347993.html
/BANTAI801.wyw.cn/ntrade_541599.html
/bantai806.wyw.cn/ncontact_541502.html
/www.snc.edu/assets2/images/socialmedia/insta.png
/page-4403.html
/goodlife.wyw.cn/tradelist_540770.html
/www.snc.edu/assets2//images/socialmedia/face.png
/C:/Users/Skipper/Google%20%E9%9B%B2%E7%AB%AF%E7%A1%AC%E7%A2%9F/%E8%BB%8D%E4%BA%8B%E5%BE%AE%E9%9D%A9%E5%91%BD/%E8%87%AA%E8%A3%BD%E6%BD%9B%E8%89%A6%E6%98%AF%E4%B8%80%E9%A0%85%E5%BF%85%E9%A0%88%E5%84%98%E6%97%A9%E8%90%BD%E5%AF%A6%E7%9A%84%E5%9C%8B%E5%AE%B6%E9%87%8D%E5%A4%A7%E6%94%BF%E7%AD%96.docx
/www.snc.edu/assets3/images/saint.png
/www.xe.com/es/currencycharts/
/page-843.html
/page-1568.html
/page-2924.html
/piwik.php
/wlkcjj.wyw.cn/contact_352711.html
/DIMA.wyw.cn/tradelist_350616.html
/www.abc7.com/apps
/page-59.html
/page-5792.html
/BANTAI801.wyw.cn/ncontact_541599.html
/joyson.ru/
/newpearl.wyw.cn/contact_353014.html
/page-6947.html
/summit.wyw.cn/contact_347098.

In [3]:
# Vocabulary: the distinct byte values (as hex strings) seen in the corpus.
# Sorting pins the byte -> index assignment; iterating a bare set gives an order
# that is not guaranteed to be the same in another session, which would make
# previously saved model weights disagree with the one-hot encoding.
chars = sorted(set(bytes))
print('# distinct bytes:', len(chars))
# Lookup tables in both directions: hex string -> column index, and back.
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# distinct bytes: 172


In [4]:
# Split the byte sequence: the first three quarters train the model,
# the remaining quarter is held out as the test set.
n_train = int(len(bytes) * 3 / 4)
text_train, text_test = bytes[:n_train], bytes[n_train:]

for label, part in (('training length:', text_train), ('test length:', text_test)):
    print(label, len(part))

training length: 750000
test length: 250000


In [5]:
# cut the text in semi-redundant sequences of 'unroll' characters
unroll = 20  # number of consecutive bytes in each input window
step = 3     # stride between the starts of successive windows
sentences = []
next_chars = []
for i in range(0, n_train - unroll, step):
    # window of `unroll` bytes, and the single byte that follows it (the target)
    sentences.append(text_train[i: i + unroll])
    next_chars.append(text_train[i + unroll])
print('# sequences:', len(sentences))

# convert to feature vector + next character:
# X[i, t, k] is True when position t of window i holds vocabulary entry k;
# y[i, k] is True when the byte following window i is vocabulary entry k.
# The builtin `bool` is used as dtype: the `np.bool` alias is deprecated and
# no longer exists in recent NumPy releases.
X = np.zeros((len(sentences), unroll, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True

# sequences: 249994


In [6]:
# build the model: stacked LSTM layers, one per entry of `nhidden`
# (a single entry gives a single LSTM layer)

nhidden = [16] # e.g. [512, 512]
dropout = 0.1

if not nhidden:
    raise ValueError("nhidden must list at least one LSTM layer size")

model = Sequential()
for depth, units in enumerate(nhidden):
    # Every LSTM except the last must emit the full sequence so the next LSTM
    # receives one vector per time step; the last emits only its final state.
    layer_kwargs = {'return_sequences': depth < len(nhidden) - 1}
    if depth == 0:
        # only the first layer declares the input shape: (time steps, vocabulary size)
        layer_kwargs['input_shape'] = (unroll, len(chars))
    model.add(LSTM(units, **layer_kwargs))
    model.add(Dropout(dropout))
# project onto the vocabulary and normalise into a next-byte distribution
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 16)                12096     
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 172)               2924      
_________________________________________________________________
activation_1 (Activation)    (None, 172)               0         
Total params: 15,020
Trainable params: 15,020
Non-trainable params: 0
_________________________________________________________________


In [31]:
# either load weights:
# The .npy file is expected to hold the list returned by model.get_weights(),
# stored by np.save as a pickled object array; NumPy >= 1.16.3 refuses to read
# such files unless allow_pickle=True. Unpickling can execute arbitrary code,
# so only load weight files you produced yourself.
# NOTE(review): this path is relative to the working directory, whereas the
# "fit and save" cell writes under "../sdata/" -- confirm where the file lives.
model.set_weights(np.load("model_from_big_domain_string_1.gz_arch_16_unroll_20_step_3_dropout_0.1.npy", allow_pickle=True))

In [7]:
# or fit and save weights:
wt_path = "../sdata/"
# The file name records the training file and the hyper-parameters, e.g.
# model_from_<train_file>_arch_16_unroll_20_step_3_dropout_0.1.npy
wt_file = "model_from_%s_arch" % train_file
wt_file += "".join("_%d" % n for n in nhidden)
wt_file += "_unroll_%d_step_%d_dropout_%g.npy" % (unroll, step, dropout)

model.fit(X, y, batch_size=128, epochs=10, verbose=0)

# The weight arrays have different shapes, so they are packed explicitly into a
# 1-D object array (one slot per weight array). Handing the ragged list straight
# to np.save relies on implicit object-array creation, which newer NumPy rejects.
weights = model.get_weights()
packed_weights = np.empty(len(weights), dtype=object)
for k, w in enumerate(weights):
    packed_weights[k] = w
np.save(wt_path + wt_file, packed_weights)

In [8]:
import math

def entropy(pred):
    """Return the Shannon entropy, in nats, of a discrete distribution.

    Args:
        pred: iterable of probabilities.

    Returns:
        The sum of -p * log(p) over the entries of `pred`. Entries equal to
        zero contribute nothing (the usual 0 * log 0 = 0 convention) instead
        of raising a math domain error; an empty input gives 0.
    """
    return sum(-p * math.log(p) for p in pred if p > 0)

def sample(a, temperature=1.0):
    """Draw an index at random from a temperature-adjusted probability array.

    Each probability is raised to the power 1/temperature and the result is
    renormalised, so a higher temperature raises the entropy of the sampling
    distribution and a lower one sharpens it towards the most likely index.

    Args:
        a: 1-D array-like of non-negative probabilities.
        temperature: positive scaling factor; 1.0 samples from `a` as given.

    Returns:
        The sampled index, as returned by np.random.choice.
    """
    # float64 keeps the renormalised distribution summing to 1 within the
    # tolerance np.random.choice checks, even when `a` arrives as float32.
    probs = np.asarray(a, dtype=np.float64)
    with np.errstate(divide='ignore'):
        # log(0) = -inf is intended: those entries end up with probability 0
        logits = np.log(probs) / temperature
    # Shift so the largest logit is 0. The distribution is unchanged, but at
    # least one exp() is exactly 1, so a small temperature can no longer make
    # every term underflow to 0 and turn the normalisation into 0/0.
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    dist = weights / np.sum(weights)
    return np.random.choice(len(dist), p=dist)

In [9]:
temp = 0.75
quote_length = 2000


def _hex_to_text(hex_bytes):
    """Turn a sequence of two-digit hex strings into the characters they encode."""
    return ''.join([unichr(int(h, 16)) for h in hex_bytes])


separator = '-' * 50

# seed the generator with a random window of `unroll` bytes from the test set:
start = random.randint(0, len(text_test) - unroll - 1)
init_quote = text_test[start : start + unroll]

generated = init_quote
print('\n', separator)
print("Seed:")
sys.stdout.write(_hex_to_text(init_quote))
print('\n', separator)
sys.stdout.write(_hex_to_text(generated))
for _ in range(quote_length):
    # one-hot encode the current window as a batch of one
    x = np.zeros((1, unroll, len(chars)))
    for pos, hex_byte in enumerate(generated):
        x[0, pos, char_indices[hex_byte]] = 1.

    # predicted next-byte distribution -> sampled index -> hex string
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, temperature=temp)
    next_char = indices_char[next_index]
    # slide the window forward by one byte
    generated = generated[1:] + [next_char]

    sys.stdout.write(unichr(int(next_char, 16)))
    sys.stdout.flush()
print('\n', separator)


 --------------------------------------------------
Seed:
1368_1371.html
/magi
 --------------------------------------------------
1368_1371.html
/magie/shrestiu-wiving-160094/29769/116143_pecs/064012100115_5460915.jpg
/fn/689/setkitrens/paothoii-volcnere_krictamiao/andant-bigamsestis-unz-
/sriEguras/mendides/c-corsandeft-ripal-ercipulelo-chotak-iwire-skiart-x-c-201-400141-0361894.html
/mama//149387/9886791611136452_16869600.jpg
/pc%838/Splorodwerice/
/bont/satier-apranc-dormes/predadecae-chid-arttot-contens/
/sices/irades/althen/
/centeni.nt





www.emabenher/
prearenker.jpg
/megopurals/wichectuiles/pechorce/chos/Elpoes/si-whoilamoveitotalasee-tont-262012748h/chices/
/leshadter/Midsiodar/laogli-cevoe-sentiorivi-asstirht-chomng


/radochht-p-ere-cort-n-nentem-patint-hochicile-ba-dartian-nesatteco-72b-3315289134-1m-shongs/
/witat/rearichine-maba-konmo-v-chome-voc-4205803350004-0007801169.html
/va
/giict/7
/acurid//casce/alax/
/uodila/prord/01072/2051747514/60955997/b8fa_100a497.g/
/co