In [1]:
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout, UpSampling1D, BatchNormalization
from keras.models import Model, load_model
from keras.utils import plot_model
from keras.optimizers import RMSprop
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns # Plot statistical data such as Annotated heatmaps
import math

Using TensorFlow backend.


In [2]:
# Dataset contains 2,000 java files where each file has maximum 2,000 chars
DATASET = "/Users/martinholecek/Desktop/Datasets/Small/Dataset_2000"
FILE_MAX_SIZE = 2000  # 2kB
INPUT_SIZE = 2000 # 2kB

TENSORBOARD_LOGS_PATH = './logs'
CHECKPOINT_FOLDER_PATH = './checkpoints/weights.{epoch:02d}-{val_loss:.2f}.hdf5'

# max number of chars in a file 1,999 => 2,000
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')
# DATASET = "/content/gdrive/My Drive/colab/Datasets/Dataset_2000"
# TENSORBOARD_LOGS_PATH = '/content/gdrive/My Drive/colab/logs'
# CHECKPOINT_FOLDER_PATH = '/content/gdrive/My Drive/colab/checkpoints/weights.{epoch:02d}-{val_loss:.2f}.hdf5'

# Create Tokenizer and vocabulary

In [3]:
# Create tokenizer
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')

In [4]:
def get_alphabet():
    ''' Create alphabet from ASCII character '''
    char_dict = {}
    for num in range(127):
        char_dict[chr(num)] = num + 1
    return char_dict

In [5]:
# Create Alphabet Vocabulary (Dictonary)
char_dict = get_alphabet()
print(char_dict)

{'\x00': 1, '\x01': 2, '\x02': 3, '\x03': 4, '\x04': 5, '\x05': 6, '\x06': 7, '\x07': 8, '\x08': 9, '\t': 10, '\n': 11, '\x0b': 12, '\x0c': 13, '\r': 14, '\x0e': 15, '\x0f': 16, '\x10': 17, '\x11': 18, '\x12': 19, '\x13': 20, '\x14': 21, '\x15': 22, '\x16': 23, '\x17': 24, '\x18': 25, '\x19': 26, '\x1a': 27, '\x1b': 28, '\x1c': 29, '\x1d': 30, '\x1e': 31, '\x1f': 32, ' ': 33, '!': 34, '"': 35, '#': 36, '$': 37, '%': 38, '&': 39, "'": 40, '(': 41, ')': 42, '*': 43, '+': 44, ',': 45, '-': 46, '.': 47, '/': 48, '0': 49, '1': 50, '2': 51, '3': 52, '4': 53, '5': 54, '6': 55, '7': 56, '8': 57, '9': 58, ':': 59, ';': 60, '<': 61, '=': 62, '>': 63, '?': 64, '@': 65, 'A': 66, 'B': 67, 'C': 68, 'D': 69, 'E': 70, 'F': 71, 'G': 72, 'H': 73, 'I': 74, 'J': 75, 'K': 76, 'L': 77, 'M': 78, 'N': 79, 'O': 80, 'P': 81, 'Q': 82, 'R': 83, 'S': 84, 'T': 85, 'U': 86, 'V': 87, 'W': 88, 'X': 89, 'Y': 90, 'Z': 91, '[': 92, '\\': 93, ']': 94, '^': 95, '_': 96, '`': 97, 'a': 98, 'b': 99, 'c': 100, 'd': 101, 'e': 1

In [6]:
# Create vocabulary and Add it into the tokenizer
tk.word_index = char_dict.copy()
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1
print(tk.word_index)

{'\x00': 1, '\x01': 2, '\x02': 3, '\x03': 4, '\x04': 5, '\x05': 6, '\x06': 7, '\x07': 8, '\x08': 9, '\t': 10, '\n': 11, '\x0b': 12, '\x0c': 13, '\r': 14, '\x0e': 15, '\x0f': 16, '\x10': 17, '\x11': 18, '\x12': 19, '\x13': 20, '\x14': 21, '\x15': 22, '\x16': 23, '\x17': 24, '\x18': 25, '\x19': 26, '\x1a': 27, '\x1b': 28, '\x1c': 29, '\x1d': 30, '\x1e': 31, '\x1f': 32, ' ': 33, '!': 34, '"': 35, '#': 36, '$': 37, '%': 38, '&': 39, "'": 40, '(': 41, ')': 42, '*': 43, '+': 44, ',': 45, '-': 46, '.': 47, '/': 48, '0': 49, '1': 50, '2': 51, '3': 52, '4': 53, '5': 54, '6': 55, '7': 56, '8': 57, '9': 58, ':': 59, ';': 60, '<': 61, '=': 62, '>': 63, '?': 64, '@': 65, 'A': 66, 'B': 67, 'C': 68, 'D': 69, 'E': 70, 'F': 71, 'G': 72, 'H': 73, 'I': 74, 'J': 75, 'K': 76, 'L': 77, 'M': 78, 'N': 79, 'O': 80, 'P': 81, 'Q': 82, 'R': 83, 'S': 84, 'T': 85, 'U': 86, 'V': 87, 'W': 88, 'X': 89, 'Y': 90, 'Z': 91, '[': 92, '\\': 93, ']': 94, '^': 95, '_': 96, '`': 97, 'a': 98, 'b': 99, 'c': 100, 'd': 101, 'e': 1

In [7]:
vocab_size = len(tk.word_index)
print(vocab_size)

128


<h1>Loading Dataset

In [None]:
# Loading single sample
def load_data(data):
    ''' File to np array'''
    for d in data:
        with open(d, "r") as file:
            content = file.read()
            sequence = tk.texts_to_sequences(content)
            data = pad_sequences(
                sequence, maxlen=INPUT_SIZE, padding='post')
            data = np.array(data, dtype='int32')
#             data = np.array(data, dtype='float32')
            X = data
            y = data
            yield X, y

In [8]:
# Loading data in batches
def load_data_gen(filenames, batch_size=64):
    while True:
        batch_paths = np.random.choice(a = filenames, size = batch_size)
        batch_input = []
#         batch_output = []
        
        for filename in batch_paths:
            try:
                content = open(filename, 'r').read()
                sequence = tk.texts_to_sequences(content)
                data = pad_sequences(sequence, maxlen=INPUT_SIZE, padding='post')
                data = np.array(data, dtype='int32')
                batch_input += [data]
            except:
                continue
        batch_x = np.array(batch_input)
        batch_y = np.array(batch_input)
        yield batch_x, batch_y

In [9]:
# Load Dataset (filenames)
dataset_data = [os.path.join(DATASET, fn) for fn in os.listdir(
    DATASET) if fn.endswith('.java') and not fn.startswith('.')]

In [10]:
# Divide dataset into training and testing
train_data, test_data = train_test_split(dataset_data, test_size=0.2)
train_data, validation_data = train_test_split(train_data, test_size=0.2)
print(len(train_data))
print(len(test_data))
print(len(validation_data))

1280
400
320


In [None]:
# Loading Generators (Files are to big to fit into the memory)
# train_gen = load_data(train_data)
# test_gen = load_data(test_data)
# validation_gen = load_data(validation_data)

In [11]:
# Loading Generators (Using batches)
train_gen = load_data_gen(train_data)
test_gen = load_data_gen(test_data)
validation_gen = load_data_gen(validation_data)

In [None]:
# Data example
# data = next(train_gen)
# print(data)

# Create Embedding weights

In [12]:
# One hot array representation
embedding_weights = []
embedding_weights.append(np.zeros(vocab_size))
for char, i in tk.word_index.items():
    onehot = np.zeros(vocab_size)
    onehot[i-1] = 1
    embedding_weights.append(onehot)

embedding_weights = np.array(embedding_weights)
print(embedding_weights.shape)
print(embedding_weights)

(129, 128)
[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


# Create Model

### Defined Parameters

In [13]:
input_size = INPUT_SIZE       # Must be same as an argument max_len inside pad_sequence method
embedding_size = vocab_size   # vocab size 128
optimizer = 'adam'
loss = 'mse'
dropout_p = 0.5

In [None]:
# Single sample
# steps_per_epoch = len(train_data)
# validation_steps = len(validation_data)

In [14]:
# Training in batches
batch_size = 64 # default value in generator
steps_per_epoch = math.ceil(len(train_data) / batch_size)
validation_steps = math.ceil(len(validation_data) / batch_size)

### Input and Embedding Layers

In [15]:
# input layer (shape=(None, 2000))
inputs = Input(shape=(input_size,), name='input_layer', dtype='int32')

In [16]:
# embedding layer (input_dim=129, output_dim=128, input_length=2,000, weights=(129, 128))
embedding_layer = Embedding(vocab_size + 1, embedding_size, input_length=input_size, weights=[embedding_weights])

In [17]:
x = embedding_layer(inputs)

Instructions for updating:
Colocations handled automatically by placer.


### Autoencoder model

In [18]:
def encoder(x):
#     x = Conv1D(256, 7, activation='relu', padding='same')(x)
#     x = BatchNormalization()(x)
#     x = MaxPooling1D(pool_size=2, padding="same")(x)
#     x = Conv1D(256, 7, activation='relu', padding='same')(x)
#     x = MaxPooling1D(pool_size=2, padding="same")(x)
#     x = Conv1D(256, 3, activation='relu', padding='same')(x)
#     x = BatchNormalization()(x)
#     x = MaxPooling1D(pool_size=2, padding="same")(x)
#     x = Conv1D(256, 3, activation='relu', padding='same')(x)
#     x = MaxPooling1D(pool_size=2, padding="same")(x)
#     x = Conv1D(256, 3, activation='relu', padding='same', name='conv_encoder')(x)
    
    x = Conv1D(128, 5, activation='relu', padding="same")(x)
    x = MaxPooling1D(pool_size=5)(x)
    x = Conv1D(128, 5, activation='relu', padding='same')(x)
    x = MaxPooling1D(pool_size=4, padding="same")(x)
    x = Conv1D(128, 5, activation='relu', padding='same', name='conv_encoder')(x)
    
#     x = Conv1D(1, 3, activation='relu', padding='same')(x)
#     x = MaxPooling1D(pool_size=2, padding="same")(x)
#     x = Conv1D(1, 3, activation='relu', padding='same', name='conv_encoder')(x)
    return x

In [19]:
def decoder(x):
#     x = Conv1D(1, 3, activation='relu', padding='same')(x)
#     x = UpSampling1D(2)(x)
#     x = Conv1D(1, 3, activation='relu', padding='same')(x)

    x = Conv1D(128, 5, activation='relu', padding='same')(x)
    x = UpSampling1D(4)(x)
    x = Conv1D(128, 5, activation='relu', padding='same')(x)
    x = UpSampling1D(5)(x)
    x = Conv1D(1, 5, activation='relu', padding='same')(x)
    
#     x = Conv1D(256, 3, activation='relu', padding='same')(x)
#     x = UpSampling1D(2)(x)
#     x = Conv1D(256, 3, activation='relu', padding='same')(x)
#     x = UpSampling1D(2)(x)
#     x = BatchNormalization()(x)
#     x = Conv1D(256, 3, activation='relu', padding='same')(x)
#     x = UpSampling1D(2)(x)
#     x = Conv1D(256, 7, activation='relu', padding='same')(x)
#     x = UpSampling1D(2)(x)
#     x = BatchNormalization()(x)
#     x = Conv1D(1, 7, activation='relu', padding='same')(x)
    return x

In [20]:
x = encoder(x)
x = decoder(x)
x = Flatten()(x)
# x = Dense(2000, activation='relu')(x)
predictions = x

### Build Model

In [21]:
# Build and compile model
autoencoder = Model(inputs=inputs, outputs=predictions)
# opt = adam, loss = mse
autoencoder.compile(loss=loss, optimizer = optimizer,
              metrics=['accuracy'])

In [None]:
# Verify the model using graph (Save it as png file)
# plot_model(autoencoder, to_file='autoencoder.png', show_shapes=True)

In [22]:
# Print summary of the model
autoencoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     (None, 2000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 2000, 128)         16512     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2000, 128)         82048     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 400, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 400, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 100, 128)          0         
_________________________________________________________________
conv_encoder (Conv1D)        (None, 100, 128)          82048     
__________

In [None]:
# Print weigths and optimizer - Just for testing
# autoencoder.get_weights() 
# autoencoder.optimizer

### Setup Callbacks

In [None]:

tensorboard = TensorBoard(log_dir=TENSORBOARD_LOGS_PATH, 
                          histogram_freq=0,
                          write_graph=True,
                          write_images=False)
# Run Tensorboard
# tensorboard --logdir=/logs

In [None]:
checkpoint = ModelCheckpoint(filepath=CHECKPOINT_FOLDER_PATH, 
                             monitor='val_loss', 
                             verbose=1, 
                             save_best_only=True, 
                             save_weights_only=True, 
                             period=1
                            )

In [None]:
# We can load the latest checkpoint created
# latest_checkpoint = tf.train.latest_checkpoint('checkpoints')
# latest_checkpoint

In [None]:
early_stop = EarlyStopping(monitor='val_loss', 
                           min_delta=0.01, 
                           patience=5, # Num of epochs with no improvement after which training stops
                           verbose=1)

### Train model

In [23]:
autoencoder_train = autoencoder.fit_generator(train_gen, 
                    validation_data=validation_gen, 
                    validation_steps=validation_steps,
                    steps_per_epoch=steps_per_epoch, 
                    epochs=1,
#                     use_multiprocessing=True,
#                     callbacks=[tensorboard, checkpoint, early_stop],
                    verbose=1)

Instructions for updating:
Use tf.cast instead.
Epoch 1/1


ValueError: Error when checking input: expected input_layer to have shape (2000,) but got array with shape (1,)

### Model Performace Visualization

In [None]:
# loss = autoencoder_train.history['loss']
# val_loss = autoencoder_train.history['val_loss']
# epochs = range(200)
# plt.figure()
# plt.plot(epochs, loss, 'bo', label='Training loss')
# plt.plot(epochs, val_loss, 'b', label='Validation loss')
# plt.title('Training and validation loss')
# plt.legend()
# plt.show()

### Extract Encoder

In [None]:
# Get Encoder
encoder_model = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer('conv_encoder').output)
encoder_model.summary()

In [None]:
# Save encoder model to the file
encoder_model.save('models/encoder_model.h5')
# Because compilation resets the models' weights, save them along with the model and load them after compilation
encoder_model.save_weights('models/encoder_model_copy-weights.h5')

# Load encoder model from the file
new_encoder_model = load_model('models/encoder_model.h5')

new_encoder_model.compile(
   optimizer=optimizer, 
   loss=loss, 
   metrics=['accuracy']
)

new_encoder_model.load_weights('models/encoder_model_copy-weights.h5')

In [None]:
# Make encoder parameters untrainable (parameters will not change while training)
for layer in new_encoder_model.layers:
    layer.trainable = False

In [None]:
new_encoder_model.summary()
encoder_model.summary()