# RNN model of URIs

First check that Tensorflow is correctly installed and can see the GPU:

In [1]:
from tensorflow.python.client import device_lib

# Enumerate every compute device TensorFlow can see; a "/gpu:0" entry in the
# printed list means the GPU is visible to the backend.
local_devices = device_lib.list_local_devices()
print(local_devices)

[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15165060074906648118
, name: "/gpu:0"
device_type: "GPU"
memory_limit: 11332668621
locality {
  bus_id: 1
}
incarnation: 7476987862329217921
physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0"
]


Then the preamble (imports and helper functions):

In [2]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM, Recurrent
import numpy as np
import random
import sys
import gzip
import math

def entropy(pred):
    """
    Shannon entropy (in nats) of a discrete probability distribution.

    pred: iterable of probabilities.
    Returns the sum of -p*log(p) over all entries. Entries equal to zero
    contribute nothing (the limit of p*log(p) as p -> 0 is 0) instead of
    raising a math domain error; negative entries still raise ValueError.
    """
    return sum(-p * math.log(p) for p in pred if p != 0)

def sample(a, temperature=1.0):
    """
    Samples an index from a probability array;
    higher temperature raises the entropy and vice versa

    a: 1-d array-like of probabilities (e.g. a softmax output).
    temperature: expected to be a positive scaling factor; values below 1
        sharpen the distribution, values above 1 flatten it.
    Returns the sampled index as produced by np.random.choice.
    """
    # Work in float64: float32 network outputs can underflow to 0/0 at low
    # temperature, and their rounding error can make np.random.choice reject
    # the distribution as not summing to 1.
    probs = np.asarray(a, dtype=np.float64)
    # log(0) = -inf is fine here (it becomes probability 0), so silence the
    # divide-by-zero warning for this step only.
    with np.errstate(divide='ignore'):
        log_probs = np.log(probs)
    logits = log_probs / temperature
    # Shift so the largest exponent is 0; this leaves the normalised result
    # unchanged but prevents overflow/underflow in exp().
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    dist = weights / np.sum(weights)
    return np.random.choice(len(dist), p=dist)

def hx(i):
    """
    Two-character, zero-padded hex string for an integer in 0-255
    (the leading '0x' produced by hex() is stripped).
    """
    return hex(i)[2:].zfill(2)

# The 256-symbol "alphabet": hex code for every possible byte value, in order.
hexabet = list(map(hx, range(256)))
# Reverse lookup: hex code -> its position in hexabet.
byte_idx = {code: position for position, code in enumerate(hexabet)}

Using TensorFlow backend.


Read the domain string file for training. At present it is read from the cloned git repo.

TO DO: read from S3.

In [3]:
train_file = "big_domain_string_1.gz"
path = "../sdata/"

with gzip.open(path + train_file, 'rb') as f:
    content = f.read()

# Sample window of the file used for local testing; set both bounds to None
# to train on the whole file.
sample_start, sample_stop = int(3e07), int(4e07)

# Slice BEFORE encoding so only the sampled window is converted, and iterate
# a bytearray so each element is an int on both Python 2 and Python 3;
# hexabet[v] is the two-character hex code of byte value v.
# NOTE: the name `bytes` shadows the builtin but is kept because later cells
# read it.
bytes = [hexabet[v] for v in bytearray(content[sample_start:sample_stop])]
print('# bytes:', len(bytes))
print(content[:1000])

# bytes: 10000000


/www868686.wyw.cn/ncontact_346701.html
/wlrq.wyw.cn/tradelist_347993.html
/BANTAI801.wyw.cn/ntrade_541599.html
/bantai806.wyw.cn/ncontact_541502.html
/www.snc.edu/assets2/images/socialmedia/insta.png
/page-4403.html
/goodlife.wyw.cn/tradelist_540770.html
/www.snc.edu/assets2//images/socialmedia/face.png
/C:/Users/Skipper/Google%20%E9%9B%B2%E7%AB%AF%E7%A1%AC%E7%A2%9F/%E8%BB%8D%E4%BA%8B%E5%BE%AE%E9%9D%A9%E5%91%BD/%E8%87%AA%E8%A3%BD%E6%BD%9B%E8%89%A6%E6%98%AF%E4%B8%80%E9%A0%85%E5%BF%85%E9%A0%88%E5%84%98%E6%97%A9%E8%90%BD%E5%AF%A6%E7%9A%84%E5%9C%8B%E5%AE%B6%E9%87%8D%E5%A4%A7%E6%94%BF%E7%AD%96.docx
/www.snc.edu/assets3/images/saint.png
/www.xe.com/es/currencycharts/
/page-843.html
/page-1568.html
/page-2924.html
/piwik.php
/wlkcjj.wyw.cn/contact_352711.html
/DIMA.wyw.cn/tradelist_350616.html
/www.abc7.com/apps
/page-59.html
/page-5792.html
/BANTAI801.wyw.cn/ncontact_541599.html
/joyson.ru/
/newpearl.wyw.cn/contact_353014.html
/page-6947.html
/summit.wyw.cn/contact_347098

Split into training and test strings:

In [4]:
# first three quarters of the byte stream are for training,
# the remaining quarter is held out for testing:
n_train = (3 * len(bytes)) // 4
text_train, text_test = bytes[:n_train], bytes[n_train:]

print('training length:', len(text_train))
print('test length:', len(text_test))

training length: 7500000
test length: 2500000


Cut the text into semi-redundant 'sentences' of bytes, each of length <tt>unroll</tt>, stepping forward <tt>step</tt> bytes each time. 

These sentences are then converted into numpy arrays for RNN input.

In [5]:
unroll = 20
step = 3

# Each training example is a window of `unroll` consecutive bytes; the target
# is the byte immediately following the window. Windows start every `step`
# bytes, so neighbouring examples overlap.
window_starts = range(0, n_train - unroll, step)
sentences = [text_train[i: i + unroll] for i in window_starts]
next_byte = [text_train[i + unroll] for i in window_starts]
print('# sequences:', len(sentences))

# convert to feature vector + next character (one-hot over the 256 byte values).
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24.
X = np.zeros((len(sentences), unroll, 256), dtype=bool)
y = np.zeros((len(sentences), 256), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, b in enumerate(sentence):
        X[i, t, byte_idx[b]] = True
    y[i, byte_idx[next_byte[i]]] = True

# sequences: 2499994


Construct the RNN model:

In [6]:
# build the model: one LSTM layer per entry of nhidden (stacked when there is
# more than one), each followed by dropout, then a softmax over the 256
# possible byte values

nhidden = [16] # e.g. [512, 512]
dropout = 0.1

model = Sequential()
for depth, units in enumerate(nhidden):
    # only the first layer declares the input shape
    first_layer_kwargs = {'input_shape': (unroll, 256)} if depth == 0 else {}
    # every LSTM except the last must return the full sequence so that the
    # next LSTM has a sequence to consume
    emit_sequence = depth < len(nhidden) - 1
    model.add(LSTM(units, return_sequences=emit_sequence, **first_layer_kwargs))
    model.add(Dropout(dropout))
model.add(Dense(256))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 16)                17472     
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               4352      
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
Total params: 21,824
Trainable params: 21,824
Non-trainable params: 0
_________________________________________________________________


Load existing weights into the model, or fit from random initialisation:

In [7]:
# either load weights:
wt_path = "../sdata/"
# Derive the file name from the current hyper-parameters so the weights loaded
# always correspond to the architecture built above.
wt_file = "model_from_%s_arch" % train_file
for n_units in nhidden:
    wt_file += "_%d" % n_units
wt_file += "_unroll_%d_step_%d_dropout_%g.npy" % (unroll, step, dropout)

# The weights were saved as an object array (a list of differently-shaped
# arrays), which NumPy >= 1.16.3 refuses to load unless allow_pickle=True.
# NOTE(review): unpickling can execute arbitrary code - only load weight files
# from a trusted source.
model.set_weights(list(np.load(wt_path + wt_file, allow_pickle=True)))

In [None]:
# or fit and save weights:
wt_path = "../sdata/"
# File name encodes the training file and hyper-parameters of this run.
wt_file = "model_from_%s_arch" % train_file
for i in nhidden: 
    wt_file += "_%d" % i
wt_file += "_unroll_%d_step_%d_dropout_%g.npy" % (unroll, step, dropout)

model.fit(X, y, batch_size=128, epochs=20, verbose=1)

# get_weights() returns a list of arrays with different shapes. Pack them into
# an explicit 1-d object array, one slot per weight array: recent NumPy
# versions refuse to build such a ragged array implicitly inside np.save.
weights = model.get_weights()
packed_weights = np.empty(len(weights), dtype=object)
for slot, weight_array in enumerate(weights):
    packed_weights[slot] = weight_array
np.save(wt_path + wt_file, packed_weights)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20

Eyeball the model by using it to generate a stretch of synthetic domain strings:

In [None]:
temp = 1.0
quote_length = 2000

# Python 2/3 compatibility: Python 3 has no unichr (chr does the same job).
try:
    unichr
except NameError:
    unichr = chr

# pick random:
start = random.randint(0, len(text_test) - unroll - 1)
init_quote = text_test[start : start + unroll]

# `generated` is the sliding window of the last `unroll` bytes fed to the model.
generated = list(init_quote)
print('\n', '-' * 50)
print("Seed:")
sys.stdout.write(''.join([unichr(int(h, 16)) for h in init_quote]))
print('\n', '-' * 50)
sys.stdout.write(''.join([unichr(int(h, 16)) for h in generated]))
for i in range(quote_length):
    # one-hot encode the current window as a batch of size 1
    x = np.zeros((1, unroll, 256))
    for t, b in enumerate(generated):
        x[0, t, byte_idx[b]] = 1.

    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, temperature=temp)
    # named next_hex (not next_byte) so the list of training targets called
    # next_byte is not overwritten by a single string
    next_hex = hexabet[next_index]
    # slide the window: drop the oldest byte, append the sampled one
    generated = generated[1:] + [next_hex]

    sys.stdout.write(unichr(int(next_hex, 16)))
    sys.stdout.flush()
print('\n', '-' * 50)


 --------------------------------------------------
Seed:
/50-to-100/sale/yes/
 --------------------------------------------------
/50-to-100/sale/yes/xliahmo-tabetsposs-ach-alig_c_3/tai/images.jp-ger-siamo-testwy-tir.html
/imeded-du-imi2-7
/statial-corney/fouuntie_uolounhd/w-firc-5
/mocjo-rhobii/0-im
/
/noiphon-dechS
/granrgri/wenga-
/thicts
/capegof2/ipterber/poymantess/saloaoi-th-diren-evr/
/logdenc-xory50961146336071



wwwaxacobmad.tihipmobtsx/hsp-trebod-bre-tanamoa-den.asplw/rincYxp70_nile
/oonuma-hirlali/c-128/
/siation/warylead/caslach/5bs_contute-meus.html
/10/hef-ec5626530.raezonbcr
/vendlak.html



vot.naeatulfebloamye.antesligense.ps
/iwi




wwwcododdbenmi.com


/46naob/massa-en+tiod-ge/niri/
/tutss-faye/pamedyinery-Fula-Ber/hefextr/peruntsebam-evfrc7dcb/tarc-capiya-ucarut-ets_
/uates/centanlers-it-bolbshiana/categorex-dat-tbo-turt
/snoi-dunsdounki-012984714100811/w8529i//Deverh-Sp-Neacher/77135213168.jpg
/h-uaten/
/Ssovggowieeri_sCttding/3767.1379491.103000b4926022x30
/h