# RNN model of URIs

In [12]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM, Recurrent
import numpy as np
import random
import sys
import gzip
import math

def entropy(pred):
    """
    Shannon entropy (natural log, i.e. in nats) of a discrete distribution.

    pred: iterable of probabilities.
    Returns the sum of -p*log(p) over the entries of pred.

    Entries equal to zero are skipped: their contribution is 0 in the limit,
    whereas math.log(0) would raise a ValueError.
    """
    return sum(-p * math.log(p) for p in pred if p != 0)

def sample(a, temperature=1.0):
    """
    Samples an index from a probability array;
    higher temperature raises the entropy and vice versa

    a: 1-D sequence of probabilities (zeros are allowed and are never sampled).
    temperature: positive scaling factor applied to the log-probabilities.
    Returns an index in range(len(a)) drawn with np.random.choice.
    """
    # log(0) -> -inf is intended here: exp(-inf) puts the entry back at probability 0
    with np.errstate(divide='ignore'):
        log_p = np.log(np.asarray(a, dtype=np.float64))
    logits = log_p / temperature
    # Shift so the largest logit is 0 before exponentiating.  Without the shift a
    # small temperature makes every exp() underflow to 0 and the normalisation
    # becomes 0/0 = NaN, which np.random.choice rejects.
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    dist = weights / np.sum(weights)
    return np.random.choice(len(dist), p=dist)

def hx(i):
    """
    Two-character hex string for a byte value (0-255), left-padded with '0'
    when the value needs only one hex digit.
    """
    digits = hex(i)[2:]  # drop the leading '0x'
    return digits if len(digits) >= 2 else '0' + digits

# lookup tables: byte value -> 2-char hex token, and hex token -> byte value
hexabet = list(map(hx, range(256)))
byte_idx = {token: value for value, token in enumerate(hexabet)}

In [2]:
train_file = "big_domain_string_1.gz"
path = "../sdata/"

# read the whole gzip-compressed training file as raw bytes
with gzip.open(path + train_file, 'rb') as f:
    content = f.read()

# Keep only the first 100000 bytes (sample for local testing), each represented
# as a 2-char lowercase hex token.
# - Slicing *before* converting avoids hex-encoding the entire file just to
#   throw most of it away.
# - Iterating a bytearray yields integers on both Python 2 and Python 3, whereas
#   str.encode('hex') only exists on Python 2.
# NOTE: the name `bytes` shadows the builtin; it is kept because later cells read it.
bytes = ['%02x' % b for b in bytearray(content[:100000])]
print('# bytes:', len(bytes))
print(content[:1000])

# bytes: 100000


/www868686.wyw.cn/ncontact_346701.html
/wlrq.wyw.cn/tradelist_347993.html
/BANTAI801.wyw.cn/ntrade_541599.html
/bantai806.wyw.cn/ncontact_541502.html
/www.snc.edu/assets2/images/socialmedia/insta.png
/page-4403.html
/goodlife.wyw.cn/tradelist_540770.html
/www.snc.edu/assets2//images/socialmedia/face.png
/C:/Users/Skipper/Google%20%E9%9B%B2%E7%AB%AF%E7%A1%AC%E7%A2%9F/%E8%BB%8D%E4%BA%8B%E5%BE%AE%E9%9D%A9%E5%91%BD/%E8%87%AA%E8%A3%BD%E6%BD%9B%E8%89%A6%E6%98%AF%E4%B8%80%E9%A0%85%E5%BF%85%E9%A0%88%E5%84%98%E6%97%A9%E8%90%BD%E5%AF%A6%E7%9A%84%E5%9C%8B%E5%AE%B6%E9%87%8D%E5%A4%A7%E6%94%BF%E7%AD%96.docx
/www.snc.edu/assets3/images/saint.png
/www.xe.com/es/currencycharts/
/page-843.html
/page-1568.html
/page-2924.html
/piwik.php
/wlkcjj.wyw.cn/contact_352711.html
/DIMA.wyw.cn/tradelist_350616.html
/www.abc7.com/apps
/page-59.html
/page-5792.html
/BANTAI801.wyw.cn/ncontact_541599.html
/joyson.ru/
/newpearl.wyw.cn/contact_353014.html
/page-6947.html
/summit.wyw.cn/contact_347098.h

In [3]:
# divide into training and test sets:
# the first three quarters of the sampled bytes train the model, the rest is held out
n_train = (3 * len(bytes)) // 4
text_train = bytes[:n_train]
text_test = bytes[n_train:]

print('training length:', len(text_train))
print('test length:', len(text_test))

training length: 75000
test length: 25000


In [4]:
# cut the text in semi-redundant sequences of 'unroll' characters
unroll = 20
step = 3
sentences = []   # input windows: 'unroll' consecutive hex tokens each
next_byte = []   # target for each window: the token that follows it
for i in range(0, n_train - unroll, step):
    sentences.append(text_train[i: i + unroll])
    next_byte.append(text_train[i + unroll])
print('# sequences:', len(sentences))

# convert to feature vector + next character:
# X[i, t, k] is set when position t of window i holds byte value k,
# y[i, k] is set when the byte following window i has value k.
# The builtin `bool` is used as dtype because the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24.
X = np.zeros((len(sentences), unroll, 256), dtype=bool)
y = np.zeros((len(sentences), 256), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, b in enumerate(sentence):
        X[i, t, byte_idx[b]] = 1
    y[i, byte_idx[next_byte[i]]] = 1

# sequences: 24994


In [7]:
# build the model: one LSTM layer per entry of nhidden (stacked in order),
# each followed by dropout, then a softmax over the 256 possible byte values

nhidden = [16] # e.g. [512, 512]
dropout = 0.1

model = Sequential()
for depth, width in enumerate(nhidden):
    # every LSTM except the last must hand its full output sequence to the next LSTM
    lstm_kwargs = {'return_sequences': depth < len(nhidden) - 1}
    if depth == 0:
        # only the first layer declares the input shape: (time steps, one-hot size)
        lstm_kwargs['input_shape'] = (unroll, 256)
    model.add(LSTM(width, **lstm_kwargs))
    model.add(Dropout(dropout))
model.add(Dense(256))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 16)                17472     
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               4352      
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
Total params: 21,824
Trainable params: 21,824
Non-trainable params: 0
_________________________________________________________________


In [13]:
# either load weights:
wt_path = "../sdata/"
# The "fit and save" cell writes model.get_weights() with np.save; a list of
# arrays with different shapes is stored as a pickled object array (presumably
# the case for this file -- confirm if loading fails).  Since NumPy 1.16.3
# np.load refuses pickled data unless allow_pickle=True.  Unpickling can run
# arbitrary code, so only load weight files you created yourself.
model.set_weights(list(np.load(
    wt_path + "model_from_big_domain_string_1.gz_arch_16_unroll_20_step_3_dropout_0.1.npy",
    allow_pickle=True)))

In [10]:
# or fit and save weights:
wt_path = "../sdata/"
# the file name encodes the training file, every layer width, and the
# unroll / step / dropout settings
wt_file = "model_from_%s_arch" % train_file
for i in nhidden:
    wt_file += "_%d" % i
wt_file += "_unroll_%d_step_%d_dropout_%g.npy" % (unroll, step, dropout)

model.fit(X, y, batch_size=128, epochs=10, verbose=1)

# Pack the weight arrays into an explicit 1-D object array before saving.
# Passing the raw list to np.save makes NumPy build an array from arrays of
# different shapes, which is deprecated since NumPy 1.19 and raises ValueError
# from NumPy 1.24 on.  The file layout (one object entry per weight array)
# is what the "load weights" cell reads back.
layer_weights = model.get_weights()
packed = np.empty(len(layer_weights), dtype=object)
for k, w in enumerate(layer_weights):
    packed[k] = w
np.save(wt_path + wt_file, packed)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
temp = 1.0           # sampling temperature passed to sample()
quote_length = 1000  # number of bytes to generate after the seed

# unichr only exists on Python 2; on Python 3 chr does the same job.
# The conditional expression never evaluates the name that is missing.
to_char = chr if sys.version_info[0] >= 3 else unichr

# pick random:
# a window of 'unroll' consecutive tokens from the held-out text seeds the generator
start = random.randint(0, len(text_test) - unroll - 1)
init_quote = text_test[start : start + unroll]

generated = init_quote
print('\n', '-' * 50)
print("Seed:")
# each hex token is shown as the character with that code point, so bytes
# >= 0x80 appear as single characters rather than as decoded UTF-8 sequences
sys.stdout.write(''.join([to_char(int(h, 16)) for h in init_quote]))
print('\n', '-' * 50)
sys.stdout.write(''.join([to_char(int(h, 16)) for h in generated]))
for i in range(quote_length):
    # one-hot encode the current window as a batch of size 1
    x = np.zeros((1, unroll, 256))
    for t, b in enumerate(generated):
        x[0, t, byte_idx[b]] = 1.

    # predict the distribution of the next byte and draw from it
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, temperature=temp)
    next_byte = hexabet[next_index]
    # slide the window: drop the oldest token, append the new one
    generated = generated[1:] + [next_byte]

    sys.stdout.write(to_char(int(next_byte, 16)))
    sys.stdout.flush()
print('\n', '-' * 50)


 --------------------------------------------------
Seed:
47xca060f.jpg
/m/dg/
 --------------------------------------------------
47xca060f.jpg
/m/dg//%6m80ayb/n-b6aBedb2mbi0oa-.nleeurrlouiga/fd
he
emse.
e.
cptipbhpj/_ahh
ptHn.e8d/sr-pbraoidooehneigesnl%Aeewrirxrgsmaksdtesroevp
ci6cdshÙctiacs-doo.-um.wlhh/Snkotaaca//nrtppoenotuu-uuml/oLsips
oyyo
s/pseut/r1hde-ts5io-doep
oimnnlh-a

nDmgingau--g
cru/-srwap/r
iraosrriavusit/
n/rn-i-3nnuli
-iotnaiesoy
av-ngw/tasgncl/l/neonpu-s3i
/0saopenmyaaaaemiyd/roloeIl/
v/Saeircroa3//enre-bkshaoondci/ae7naa/aookivsiofenoo-eniceg.hpvfnol/g1ds/01gw../ct/sikhteicuop9nnot/sdzkr25/ck2//1//73cl91.e//-/%D3M/e58oa17e2ictpophe/d/ec
uperiniarn-uaohsk-sfatkt/srrÑ¿f
l-syng-a-xasctwsmanf/atfa-0Moaeebaorto
adtoatc-seflird
amuhere-8.e-wodywmtlowbsei-aM/tS-bneurhs
/Zbbgiri-i-cgttmwrep-
/sfrhues
uo/nmoueioohtm
sl/ituradsiarec-rih/aw-wxkeeasrgtore/aemserr-fa
ltrpvoeocrLtxnornW/rb/18zs/iscpo/erskede-aic-in2Ø01i
p-odhav-arenpbmork/7gna.ic/blsieornsoc-w041299o//Sotesmcse