In [1]:
from io import BytesIO
import json
import os, sys
import random
import re

import numpy as np
import scipy.misc

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.utils import get_file
import tensorflow.keras.callbacks
import tensorflow.keras.backend as K

import PIL
from PIL import ImageDraw
from IPython.display import clear_output, Image, display, HTML

In [2]:
path = get_file('shakespeare', 'https://storage.googleapis.com/deep-learning-cookbook/100-0.txt')
shakespeare = open(path).read()
training_text = shakespeare.split('\nTHE END', 1)[-1]

len(training_text)

5592892

In [3]:
chars = list(sorted(set(training_text)))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}

len(chars)

97

In [4]:
def char_rnn_model(num_chars, num_layers, num_nodes=512, dropout=0.1):
    input = Input(shape=(None, num_chars), name='input')
    prev = input
    for i in range(num_layers):
        prev = LSTM(num_nodes, return_sequences=True)(prev)
    dense = TimeDistributed(Dense(num_chars, name='dense',
                                  activation='softmax'))(prev)
    
    model = Model(inputs=[input], outputs=[dense])
    optimizer = RMSprop(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

model = char_rnn_model(len(chars), num_layers=2, num_nodes=640, dropout=0)
model.summary()

W1007 13:20:57.976240 140688087127680 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7ff3ec41e828>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.
W1007 13:20:58.744821 140688087127680 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7ff3dc0574a8>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, None, 97)]        0         
_________________________________________________________________
unified_lstm (UnifiedLSTM)   (None, None, 640)         1889280   
_________________________________________________________________
unified_lstm_1 (UnifiedLSTM) (None, None, 640)         3279360   
_________________________________________________________________
time_distributed (TimeDistri (None, None, 97)          62177     
Total params: 5,230,817
Trainable params: 5,230,817
Non-trainable params: 0
_________________________________________________________________


In [5]:
CHUNK_SIZE = 160

def data_generator(all_text, char_to_idx, batch_size, chunk_size):
    x = np.zeros((batch_size, chunk_size, len(char_to_idx)))
    y = np.zeros((batch_size, chunk_size, len(char_to_idx)))
    
    while True:
        for row in range(batch_size):
            idx = random.randrange(len(all_text) - chunk_size - 1)
            chunk = np.zeros((chunk_size + 1, len(char_to_idx)))
            for i in range(chunk_size + 1):
                chunk[i, char_to_idx[all_text[idx + i]]] = 1
            x[row, :, :] = chunk[:chunk_size]
            y[row, :, :] = chunk[1:]
        yield x, y

next(data_generator(training_text, char_to_idx, 4, chunk_size=CHUNK_SIZE))

(array([[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 1., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 1., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 1., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0

In [6]:
CHUNK_SIZE = 160
BATCH_SIZE = 256

# early = keras.callbacks.EarlyStopping(monitor='loss',
#                               min_delta=0.03,
#                               patience=3,
#                               verbose=0, mode='auto')

model = char_rnn_model(len(chars), num_layers=2, num_nodes=640, dropout=0)
model.fit_generator(
    data_generator(training_text, char_to_idx, batch_size=BATCH_SIZE, chunk_size=CHUNK_SIZE),
    epochs=40,
#     callbacks=[early,],
    steps_per_epoch=2 * len(training_text) // (BATCH_SIZE * CHUNK_SIZE),
    verbose=2
)

W1007 13:20:59.151643 140688087127680 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7ff3de89dd30>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.
W1007 13:20:59.422655 140688087127680 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7ff3a850a358>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.


Epoch 1/40
273/273 - 62s - loss: 3.3544 - accuracy: 0.2069
Epoch 2/40
273/273 - 61s - loss: 3.0698 - accuracy: 0.2424
Epoch 3/40
273/273 - 61s - loss: 2.1071 - accuracy: 0.4638
Epoch 4/40
273/273 - 61s - loss: 1.8408 - accuracy: 0.5343
Epoch 5/40
273/273 - 61s - loss: 1.7962 - accuracy: 0.5480
Epoch 6/40
273/273 - 61s - loss: 1.7685 - accuracy: 0.5567
Epoch 7/40
273/273 - 61s - loss: 1.7560 - accuracy: 0.5619
Epoch 8/40
273/273 - 61s - loss: 1.7291 - accuracy: 0.5692
Epoch 9/40
273/273 - 61s - loss: 1.7363 - accuracy: 0.5703
Epoch 10/40
273/273 - 61s - loss: 1.7322 - accuracy: 0.5725
Epoch 11/40
273/273 - 61s - loss: 1.7331 - accuracy: 0.5744
Epoch 12/40
273/273 - 61s - loss: 1.7318 - accuracy: 0.5759
Epoch 13/40
273/273 - 61s - loss: 1.7504 - accuracy: 0.5739
Epoch 14/40
273/273 - 61s - loss: 1.7334 - accuracy: 0.5779
Epoch 15/40
273/273 - 61s - loss: 1.6814 - accuracy: 0.5888
Epoch 16/40
273/273 - 62s - loss: 1.7439 - accuracy: 0.5782
Epoch 17/40
273/273 - 62s - loss: 1.7536 - accura

<tensorflow.python.keras.callbacks.History at 0x7ff3a8173780>

In [7]:
def generate_output(model, start_with, amount=400):
    generated = start_with
    
    for i in range(amount):
        x = np.zeros((1, len(generated), len(chars)))
        for t, char in enumerate(generated):
            x[0, t, char_to_idx[char]] = 1.

        preds = model.predict(x, verbose=0)[0]
        next_index = np.argmax(preds[len(generated) - 1])
        next_char = chars[next_index]
        yield next_char
        
        generated += next_char

        
start_index = random.randint(0, len(training_text) - CHUNK_SIZE - 1)
fragment = training_text[start_index: start_index + CHUNK_SIZE]

print('-- inputs --')
print(fragment)
print('-- generated --')
for chunk in generate_output(model, fragment):
    sys.stdout.write(chunk)

-- inputs --
hundred voices of that sound.
  FIRST CITIZEN. I twice five hundred, and their friends to piece
    'em.
  BRUTUS. Get you hence instantly, and tell those frien
-- generated --
ds
    Which he did speak to me.
    I have seen thee still as thou art,
    And therefore have I seen thee in thy head,
    And the devil have the worse of his desert.
    The son of Caesar was a mort from him.
    He hath not spoke to th' chaste and constant friend
    Of the proverb with the best of all the treasure
    Of the same prince and Clarence to his horse.
    The sea hath struck the b

In [8]:
# 6章のために必要??
with open('zoo/06/shakespeare.json', 'w') as fout:
    json.dump({
        'chars': ''.join(chars),
        'char_to_idx': char_to_idx,
        'chunk_size': CHUNK_SIZE,
    }, fout)
model.save('zoo/06/shakespeare.h5')

In [9]:
def find_python(rootdir):
    matches = []
    for root, dirnames, filenames in os.walk(rootdir):
        for fn in filenames:
            if fn.endswith('.py'):
                matches.append(os.path.join(root, fn))

    return matches

srcs = find_python(random.__file__.rsplit('/', 1)[0])
len(srcs)

1942

In [10]:
def replacer(value):
    if ' ' in value and sum(1 for ch in value if ch.isalpha()) > 6:
        return 'MSG'
    return value

In [11]:
def replace_literals(st):
    res = []
    start_text = start_quote = i = 0
    quote = ''
    while i < len(st):
        if quote:
            # 文字列リテラルが終了する場合
            if st[i: i + len(quote)] == quote:
                quote = ''
                start_text = i
                res.append(replacer(st[start_quote: i]))
        elif st[i] in '"\'':
            quote = st[i]
            # 三連引用符の場合
            if i < len(st) - 2 and st[i + 1] == st[i + 2] == quote:
                quote = 3 * quote
            start_quote = i + len(quote)
            res.append(st[start_text: start_quote])
        # 三連引用符でない場合、改行を含まない
        if st[i] == '\n' and len(quote) == 1:
            start_text = i
            res.append(quote)
            quote = ''
        # エスケープされている文字列は無視する
        if st[i] == '\\':
            i += 1
        i += 1
    return ''.join(res) + st[start_text:]

print(replace_literals('print("hel\\"lo")'))
print(replace_literals("print('hel\\'lo world')"))
print(replace_literals('this = "wrong\n'))

print("hel\"lo")
print('MSG')
this = ""



In [12]:
COMMENT_RE = re.compile('#.*')
python_code = []

for fn in srcs:
    try:
        with open(fn, 'r') as fin:
            src = fin.read()
    except UnicodeDecodeError:
        print('Could not read %s' % fn)
    src = replace_literals(src)
    src = COMMENT_RE.sub('', src)
    python_code.append(src)

python_code = '\n\n\n'.join(python_code)
len(python_code)

Could not read /home/mitsuhisa.ohta/.pyenv/versions/3.6.7/lib/python3.6/test/badsyntax_pep3120.py
Could not read /home/mitsuhisa.ohta/.pyenv/versions/3.6.7/lib/python3.6/test/test_source_encoding.py
Could not read /home/mitsuhisa.ohta/.pyenv/versions/3.6.7/lib/python3.6/test/encoded_modules/module_iso_8859_1.py
Could not read /home/mitsuhisa.ohta/.pyenv/versions/3.6.7/lib/python3.6/test/encoded_modules/module_koi8_r.py
Could not read /home/mitsuhisa.ohta/.pyenv/versions/3.6.7/lib/python3.6/sqlite3/test/dbapi.py
Could not read /home/mitsuhisa.ohta/.pyenv/versions/3.6.7/lib/python3.6/sqlite3/test/factory.py
Could not read /home/mitsuhisa.ohta/.pyenv/versions/3.6.7/lib/python3.6/sqlite3/test/hooks.py
Could not read /home/mitsuhisa.ohta/.pyenv/versions/3.6.7/lib/python3.6/sqlite3/test/regression.py
Could not read /home/mitsuhisa.ohta/.pyenv/versions/3.6.7/lib/python3.6/sqlite3/test/transactions.py
Could not read /home/mitsuhisa.ohta/.pyenv/versions/3.6.7/lib/python3.6/sqlite3/test/types.py

21702736

In [13]:
py_chars = list(sorted(set(python_code)))
py_char_to_idx = {ch: idx for idx, ch in enumerate(py_chars)}
len(py_chars)

2759

In [14]:
py_model = char_rnn_model(len(py_chars), num_layers=2, num_nodes=640, dropout=0)
py_model.summary()

W1007 14:02:21.087583 140688087127680 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7ff3a8186780>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.
W1007 14:02:21.366304 140688087127680 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7ff3de89d588>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.


Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, None, 2759)]      0         
_________________________________________________________________
unified_lstm_4 (UnifiedLSTM) (None, None, 640)         8704000   
_________________________________________________________________
unified_lstm_5 (UnifiedLSTM) (None, None, 640)         3279360   
_________________________________________________________________
time_distributed_2 (TimeDist (None, None, 2759)        1768519   
Total params: 13,751,879
Trainable params: 13,751,879
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Early stopping に関する言及は本文中に存在しない

# early = keras.callbacks.EarlyStopping(monitor='loss',
#                               min_delta=0.03,
#                               patience=3,
#                               verbose=0, mode='auto')

BATCH_SIZE = 256
py_model.fit_generator(
    data_generator(python_code, py_char_to_idx, batch_size=BATCH_SIZE, chunk_size=160),
    epochs=40,
    # callbacks=[early,],
    steps_per_epoch=2 * len(python_code) / (BATCH_SIZE * 160),
    verbose=2
)

Epoch 1/40
1060/1059 - 1588s - loss: 3.2508 - accuracy: 0.3023
Epoch 2/40
1060/1059 - 1590s - loss: 3.1911 - accuracy: 0.3069
Epoch 3/40
1060/1059 - 1591s - loss: 3.1844 - accuracy: 0.3070
Epoch 4/40
1060/1059 - 1592s - loss: 3.1786 - accuracy: 0.3072
Epoch 5/40
1060/1059 - 1608s - loss: 3.1787 - accuracy: 0.3068
Epoch 6/40
1060/1059 - 1644s - loss: 3.1791 - accuracy: 0.3068
Epoch 7/40
1060/1059 - 1607s - loss: 3.1777 - accuracy: 0.3064
Epoch 8/40


KeyboardInterrupt: 

In [None]:
def generate_code(model, start_with='\ndef ', end_with='\n\n', diversity=1.0):
    generated = ""
    for ch in start_with:
        yield ch
        generated += ch
    
    for i in range(2000):
        x = np.zeros((1, len(generated), len(py_chars)))
        for t, char in enumerate(generated):
            x[0, t, py_char_to_idx[char]] = 1.
        preds = model.predict(x, verbose=0)[0]
        
        # 直近の生成結果を取得
        preds = np.asarray(preds[len(generated) - 1]).astype(np.float64)
        
        # diversity を使って予測値をスケーリング
        preds = np.log(preds) / diversity
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        
        # スケーリングされた予測値をつかって、次の文字をサンプリング
        probas = np.random.multinomial(1, preds, 1)
        next_index = np.argmax(probas)        
        next_char = py_chars[next_index]
        
        yield next_char

        generated += next_char
        if generated.endswith(end_with):
            break


for i in range(20):
    st = ""
    for ch in generate_code(py_model):
        sys.stdout.write(ch)
        st += ch
    print()


## ここからネットワークの可視化

In [None]:
BATCH_SIZE = 512

flat_model = char_rnn_model(len(py_chars), num_layers=1, num_nodes=512, dropout=0)
flat_model.summary()

In [None]:
# early = keras.callbacks.EarlyStopping(monitor='loss',
#                               min_delta=0.03,
#                               patience=3,
#                               verbose=0, mode='auto')

flat_model.fit_generator(
    data_generator(python_code, py_char_to_idx, batch_size=BATCH_SIZE, chunk_size=160),
    epochs=40,
#     callbacks=[early,],
    steps_per_epoch=2 * len(python_code) / (BATCH_SIZE * 160),
    verbose=2
)

In [None]:
example_code = 'if a == 2:\n    b=1\nelse:\n    b=2\n'

def activations(model, code):
    x = np.zeros((1, len(code), len(py_char_to_idx)))
    for t, char in enumerate(code):
        x[0, t, py_char_to_idx[char]] = 1.

    output = model.get_layer('unified_lstm_5').output
    f = K.function(model.input, output)
    return f(x)[0]

act = activations(flat_model, example_code)
act.shape

In [None]:
def interesting_neurons(act):
    res = []
    for n in np.argmax(act, axis=-1):
        if not n in res:
            res.append(n)
    return res

neurons = interesting_neurons(act)
len(neurons)

In [None]:
def visualize_neurons(neurons, code, act, cell_size=12):
    # コードを表示するスペースを確保するため、高さを len(neurons) + 1 とする
    img = np.full((len(neurons) + 1, len(code), 3), 128)
    # act は activations 関数の戻り値。[0, 1]の範囲に収まるよう変換する
    scores = (act[:, neurons].T + 1) / 2

    # 値の小さいものは赤く
    img[1:, :, 0] = 255 * (1 - scores)
    # 値の大きいものは緑に
    img[1:, :, 1] = 255 * scores

    f = BytesIO()
    img = scipy.misc.imresize(img, float(cell_size), interp='nearest')
    pil_img = PIL.Image.fromarray(img)
    draw = ImageDraw.Draw(pil_img)
    for idx, ch in enumerate(code):
        draw.text((idx * cell_size + 2, 0), ch)
    pil_img.save(f, 'png')
    return Image(data=f.getvalue())

In [None]:
def image_for_code(code):
    act = activations(flat_model, code)
    neurons = interesting_neurons(act)
    return visualize_neurons(neurons, code, act)

display(image_for_code('if (a == 2) and ((b == 1) or (c==2)):'))