In [None]:
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout, UpSampling1D, BatchNormalization
from keras.models import Model, load_model
from keras.utils import plot_model
from keras.optimizers import RMSprop
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns # Plot statistical data such as Annotated heatmaps
import math

In [None]:
# Dataset: 2,000 Java files, each at most 2,000 characters long.
# The location can be overridden through the DATASET_DIR environment variable
# so the notebook is not tied to a single machine; the original absolute path
# stays as the default.
DATASET = os.environ.get(
    "DATASET_DIR",
    "/Users/martinholecek/Desktop/Datasets/Small/Dataset_2000",
)
FILE_MAX_SIZE = 2000  # maximum number of characters per file (2kB)
INPUT_SIZE = 2000     # length every sequence is padded to (2kB)

# Output locations for the training callbacks.
TENSORBOARD_LOGS_PATH = './logs'
# {epoch} and {val_loss} are placeholders in the checkpoint file name.
CHECKPOINT_FOLDER_PATH = './checkpoints/weights.{epoch:02d}-{val_loss:.2f}.hdf5'

# max number of chars in a file 1,999 => 2,000
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Create Tokenizer and vocabulary

In [None]:
# Create a character-level tokenizer: each character is a token, there is no
# cap on the vocabulary size (num_words=None) and 'UNK' is the
# out-of-vocabulary token. Its vocabulary is assigned by hand further down
# (tk.word_index) instead of being fitted on text.
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')

In [None]:
def get_alphabet():
    """Build the character vocabulary from ASCII code points.

    Returns:
        dict: maps chr(0) .. chr(126) to the 1-based indices 1 .. 127;
        index 0 is never assigned.
    """
    return {chr(code_point): code_point + 1 for code_point in range(127)}

In [None]:
# Create the alphabet vocabulary (dictionary mapping character -> index)
char_dict = get_alphabet()
print(char_dict)

In [None]:
# Install the vocabulary on the tokenizer: a copy of the alphabet plus the
# out-of-vocabulary token, which gets the next free index after the largest
# character index.
tk.word_index = {**char_dict, tk.oov_token: max(char_dict.values()) + 1}
print(tk.word_index)

In [None]:
# Number of entries in the tokenizer vocabulary (all characters plus 'UNK').
vocab_size = len(tk.word_index)
print(vocab_size)

<h1>Loading Dataset</h1>

In [None]:
# Loading data in batches
def load_data_gen(filenames, batch_size=64):
    """Endlessly yield (input, target) batches of character-index sequences.

    For every batch, `batch_size` paths are drawn at random from `filenames`
    (np.random.choice, so a path may repeat within a batch). Each file is
    read as text, converted to character indices with the module-level
    tokenizer `tk`, and post-padded to INPUT_SIZE entries. Input and target
    hold the same values, as required for autoencoder training.

    Args:
        filenames: sequence of paths to text files.
        batch_size: number of paths sampled per batch.

    Yields:
        tuple: (batch_x, batch_y), two equal arrays with one int32 row of
        length INPUT_SIZE per successfully loaded file. Files that cannot be
        read are skipped, so a batch may hold fewer than `batch_size` rows.
    """
    while True:
        batch_paths = np.random.choice(a=filenames, size=batch_size)
        batch_input = []

        for filename in batch_paths:
            try:
                # Context manager so the file handle is closed right away.
                with open(filename, 'r') as source_file:
                    content = source_file.read()
            except (OSError, UnicodeDecodeError):
                # Best effort: skip files that cannot be opened or decoded,
                # without hiding unrelated errors (e.g. KeyboardInterrupt).
                continue

            # texts_to_sequences expects a list of texts. Passing the bare
            # string would treat every character as its own text and produce
            # one padded row per character instead of one row per file.
            sequence = tk.texts_to_sequences([content])
            data = pad_sequences(sequence, maxlen=INPUT_SIZE, padding='post')
            # pad_sequences returns shape (1, INPUT_SIZE); keep the single row.
            batch_input.append(np.asarray(data[0], dtype='int32'))

        batch_x = np.array(batch_input)
        batch_y = np.array(batch_input)
        yield batch_x, batch_y

In [None]:
# Load Dataset (filenames): full paths of all visible (non-dot) .java files.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# give the same file list on every run and every machine.
dataset_data = [
    os.path.join(DATASET, fn)
    for fn in sorted(os.listdir(DATASET))
    if fn.endswith('.java') and not fn.startswith('.')
]

In [None]:
# Divide the dataset into training, testing and validation subsets:
# 20% is held out for testing, then 20% of the remainder for validation.
# A fixed seed makes both splits reproducible across kernel restarts.
SPLIT_SEED = 42
train_data, test_data = train_test_split(
    dataset_data, test_size=0.2, random_state=SPLIT_SEED)
train_data, validation_data = train_test_split(
    train_data, test_size=0.2, random_state=SPLIT_SEED)
print(len(train_data))
print(len(test_data))
print(len(validation_data))

In [None]:
# Loading Generators (Using batches)
# One endless batch generator per subset. No batch_size is passed, so each
# generator uses load_data_gen's default of 64.
train_gen = load_data_gen(train_data)
test_gen = load_data_gen(test_data)
validation_gen = load_data_gen(validation_data)

# Create Embedding weights

In [None]:
# One-hot embedding matrix of shape (vocab_size + 1, vocab_size):
#   row 0      -> all zeros (index 0 is never assigned to a character)
#   row i >= 1 -> one-hot vector with its 1 in column i - 1
# Each row is addressed by the token index itself rather than by the
# iteration order of tk.word_index, so the matrix stays correct even if the
# dictionary is not ordered by index, and re-running the cell is idempotent.
embedding_weights = np.zeros((vocab_size + 1, vocab_size))
for char, i in tk.word_index.items():
    embedding_weights[i, i - 1] = 1

print(embedding_weights.shape)
print(embedding_weights)

# Create Model

### Defined Parameters

In [None]:
# Hyper-parameters used when building and compiling the model.
input_size = INPUT_SIZE       # Must be the same as the maxlen argument passed to pad_sequences
embedding_size = vocab_size   # one embedding dimension per vocabulary entry (vocab size 128)
optimizer = 'adam'            # optimizer name handed to compile()
loss = 'mse'                  # loss name handed to compile()
dropout_p = 0.5               # NOTE(review): not referenced by any cell shown here

In [None]:
# Training in batches: how many generator batches make up one epoch.
# batch_size repeats load_data_gen's default (the generators are created
# without an explicit batch_size), so the two values must be kept in sync.
batch_size = 64 # default value in generator
# Rounded up so a final partial batch still counts as a step.
steps_per_epoch = math.ceil(len(train_data) / batch_size)
validation_steps = math.ceil(len(validation_data) / batch_size)

### Input and Embedding Layers

In [None]:
# Input layer: one int32 sequence of input_size character indices per sample
# (batch shape (None, 2000)).
inputs = Input(shape=(input_size,), name='input_layer', dtype='int32')

In [None]:
# Embedding layer (input_dim=129, output_dim=128, input_length=2,000, weights=(129, 128)),
# initialised with the one-hot matrix built above.
# NOTE(review): no trainable=False is passed, so these one-hot weights may be
# updated during training - confirm that this is intended.
embedding_layer = Embedding(vocab_size + 1, embedding_size, input_length=input_size, weights=[embedding_weights])

In [None]:
# Apply the embedding to the input indices; `x` is the tensor fed to the encoder.
x = embedding_layer(inputs)

### Autoencoder model

In [None]:
def encoder(x):
    """Encoder half of the convolutional autoencoder.

    Applies three same-padded Conv1D layers (128 filters, kernel size 5,
    ReLU), with max pooling after the first (pool size 5) and after the
    second (pool size 4, same padding). For this notebook's 2000-step input
    the sequence axis shrinks 2000 -> 400 -> 100. The last convolution is
    named 'conv_encoder' so that its output can be looked up by name.

    Args:
        x: tensor to encode (the embedding output in this notebook).

    Returns:
        The output tensor of the 'conv_encoder' layer.
    """
    # Layers are created in the same order as before, so Keras' automatic
    # layer names are unaffected.
    first_conv = Conv1D(filters=128, kernel_size=5,
                        activation='relu', padding="same")(x)
    first_pool = MaxPooling1D(pool_size=5)(first_conv)

    second_conv = Conv1D(filters=128, kernel_size=5,
                         activation='relu', padding='same')(first_pool)
    second_pool = MaxPooling1D(pool_size=4, padding="same")(second_conv)

    encoded = Conv1D(filters=128, kernel_size=5, activation='relu',
                     padding='same', name='conv_encoder')(second_pool)
    return encoded

In [None]:
def decoder(x):
    """Decoder half of the convolutional autoencoder.

    Mirrors the encoder: two same-padded Conv1D layers (128 filters, kernel
    size 5, ReLU), each followed by upsampling (factor 4, then factor 5), and
    a final single-filter Conv1D (kernel size 5, ReLU), so the result has one
    channel per sequence position.

    Args:
        x: encoded tensor (the encoder output in this notebook).

    Returns:
        The output tensor of the final single-filter convolution.
    """
    # Layers are created in the same order as before, so Keras' automatic
    # layer names are unaffected.
    first_conv = Conv1D(filters=128, kernel_size=5,
                        activation='relu', padding='same')(x)
    first_up = UpSampling1D(size=4)(first_conv)

    second_conv = Conv1D(filters=128, kernel_size=5,
                         activation='relu', padding='same')(first_up)
    second_up = UpSampling1D(size=5)(second_conv)

    decoded = Conv1D(filters=1, kernel_size=5,
                     activation='relu', padding='same')(second_up)
    return decoded

In [None]:
# Wire the autoencoder: embedding output -> encoder -> decoder -> flatten.
# Every stage gets its own name instead of rebinding `x`, so `x` keeps
# pointing at the embedding output and re-running this cell rebuilds the same
# graph rather than stacking another encoder/decoder on the previous result.
encoded = encoder(x)
decoded = decoder(encoded)
# Flatten the single-channel decoder output to one value per sequence position.
predictions = Flatten()(decoded)

### Build Model

In [None]:
# Assemble the end-to-end autoencoder (input indices -> flattened
# reconstruction) and compile it with the optimizer and loss chosen in the
# parameters cell ('adam' and 'mse').
autoencoder = Model(outputs=predictions, inputs=inputs)
autoencoder.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the autoencoder.
autoencoder.summary()

### Setup Callbacks

In [None]:

# TensorBoard callback: writes logs (including the model graph) to
# TENSORBOARD_LOGS_PATH; histograms and weight images are switched off.
tensorboard = TensorBoard(log_dir=TENSORBOARD_LOGS_PATH, 
                          histogram_freq=0,
                          write_graph=True,
                          write_images=False)
# Run TensorBoard from the notebook's working directory:
# tensorboard --logdir=./logs

In [None]:
# Create the checkpoint directory up front so that writing a checkpoint
# cannot fail just because './checkpoints' does not exist yet.
checkpoint_dir = os.path.dirname(CHECKPOINT_FOLDER_PATH)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

# Checkpoint callback: checked every epoch (period=1); writes weights only,
# and only when the monitored validation loss improves.
checkpoint = ModelCheckpoint(filepath=CHECKPOINT_FOLDER_PATH,
                             monitor='val_loss',
                             verbose=1,
                             save_best_only=True,
                             save_weights_only=True,
                             period=1
                            )

In [None]:
# Early-stopping callback: watches the validation loss and stops training
# once it has failed to improve by at least min_delta for `patience` epochs.
early_stop = EarlyStopping(monitor='val_loss', 
                           min_delta=0.01, 
                           patience=5, # Num of epochs with no improvement after which training stops
                           verbose=1)

### Train model

In [None]:
# Train the autoencoder from the batch generators for a single epoch.
# steps_per_epoch / validation_steps bound the endless generators.
# The callbacks defined above are currently disabled (commented out below).
autoencoder_train = autoencoder.fit_generator(train_gen, 
                    validation_data=validation_gen, 
                    validation_steps=validation_steps,
                    steps_per_epoch=steps_per_epoch, 
                    epochs=1,
#                     use_multiprocessing=True,
#                     callbacks=[tensorboard, checkpoint, early_stop],
                    verbose=1)

### Model Performance Visualization

### Extract Encoder

In [None]:
# Get Encoder: a model that runs from the autoencoder's input up to the
# output of the layer named 'conv_encoder' (the last encoder convolution).
encoder_model = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer('conv_encoder').output)
encoder_model.summary()

In [None]:
# Save encoder model to the file; create the output folder first so saving
# does not fail when 'models/' does not exist yet.
os.makedirs('models', exist_ok=True)
encoder_model.save('models/encoder_model.h5')
# Keep a separate copy of the weights as well.
encoder_model.save_weights('models/encoder_model_copy-weights.h5')

# Load encoder model from the file
new_encoder_model = load_model('models/encoder_model.h5')

new_encoder_model.compile(
   optimizer=optimizer,
   loss=loss,
   metrics=['accuracy']
)

# compile() only configures the optimizer, loss and metrics; it does not
# change layer weights. Reloading the saved weights is therefore a safeguard
# rather than a requirement, and makes sure the copy matches encoder_model.
new_encoder_model.load_weights('models/encoder_model_copy-weights.h5')

In [None]:
# Make encoder parameters untrainable (parameters will not change while training)
# NOTE(review): the flags are set after new_encoder_model.compile(); Keras
# presumably needs another compile() (of this model or of a model that embeds
# it) before the change takes effect - confirm before training.
for layer in new_encoder_model.layers:
    layer.trainable = False

In [None]:
# Print both summaries to compare the reloaded (frozen) encoder with the
# original one.
new_encoder_model.summary()
encoder_model.summary()