In [None]:
%matplotlib inline
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout, UpSampling1D, BatchNormalization
from keras.models import Model, load_model
from keras.utils import plot_model
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns # Plot statistical data such as Annotated heatmaps
from sklearn import metrics
from sklearn.metrics import classification_report
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
from pprint import pprint
from keras.optimizers import RMSprop

In [None]:
# Dataset contains 2,000 java files where each file has maximum 2,000 chars.
# The location can be overridden with the DATASET_DIR environment variable so the
# notebook is not tied to one machine; the original path remains the default.
DATASET = os.environ.get(
    "DATASET_DIR", "/Users/martinholecek/Desktop/Datasets/Small/Dataset_2000")
FILE_MAX_SIZE = 2000  # 2kB
INPUT_SIZE = 2000 # 2kB

# A file holds at most 1,999 characters, so every sequence is padded to 2,000.
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Create Tokenizer and vocabulary

In [None]:
# Character-level tokenizer. Its vocabulary is not fitted on text; word_index is
# assigned by hand further down in this notebook.
# NOTE(review): Keras' Tokenizer lowercases input by default (lower=True) — confirm
# that folding case is intended for Java source code.
tk = Tokenizer(
    char_level=True,
    oov_token='UNK',
    num_words=None,
)

In [None]:
def get_alphabet():
    '''Map ASCII characters to 1-based integer ids.

    Covers code points 0..126 (127 characters); each character's id is its
    code point plus one, so id 0 is never assigned to a character.
    '''
    return {chr(code_point): code_point + 1 for code_point in range(127)}

In [None]:
# Create the alphabet vocabulary (dictionary mapping character -> integer id)
char_dict = get_alphabet()
print(char_dict)

In [None]:
# Install the hand-built vocabulary on the tokenizer: every alphabet entry keeps
# its id, and the out-of-vocabulary token gets the next free id after the largest one.
tk.word_index = {**char_dict, tk.oov_token: max(char_dict.values()) + 1}
print(tk.word_index)

In [None]:
# Vocabulary size = number of entries in the tokenizer's word_index
# (the alphabet characters plus the out-of-vocabulary token)
vocab_size = len(tk.word_index)
print(vocab_size)

<h1>Loading Dataset

In [None]:
def load_data(data):
    '''Yield one (X, y) pair per source file for autoencoder training.

    Each file is read as text, converted to character ids with the global
    tokenizer `tk`, and padded to INPUT_SIZE ids (zeros appended at the end),
    giving a float32 array of shape (1, INPUT_SIZE). Because the model is an
    autoencoder, the same array is yielded as both input and target.

    Args:
        data: iterable of file paths.

    Yields:
        Tuple (X, y) where X and y are the same array.

    Note:
        The generator makes a single pass over `data` and then stops.
    '''
    for path in data:
        # Read first and close the file before yielding, so no handle stays
        # open while the generator is suspended.
        with open(path, "r") as file:
            content = file.read()
        # texts_to_sequences expects a list of texts. Passing the bare string
        # would treat every character as its own text and produce one padded
        # row per character instead of one row per file.
        sequence = tk.texts_to_sequences([content])
        encoded = pad_sequences(
            sequence, maxlen=INPUT_SIZE, padding='post')
        encoded = np.array(encoded, dtype='float32')
        yield encoded, encoded

In [None]:
# Load Dataset (filenames): every non-hidden .java file directly inside DATASET.
# os.listdir returns entries in arbitrary order, so sort for a stable file order.
dataset_data = sorted(
    os.path.join(DATASET, fn)
    for fn in os.listdir(DATASET)
    if fn.endswith('.java') and not fn.startswith('.')
)

In [None]:
# Divide dataset into training, testing and validation subsets:
# 20% is held out for testing, then 20% of the remainder for validation.
# A fixed seed keeps the shuffling repeatable between runs (for a given file order).
SPLIT_SEED = 42
train_data, test_data = train_test_split(
    dataset_data, test_size=0.2, random_state=SPLIT_SEED)
train_data, validation_data = train_test_split(
    train_data, test_size=0.2, random_state=SPLIT_SEED)
print(len(train_data))
print(len(test_data))
print(len(validation_data))

In [None]:
# Build one lazy generator per split (the files are too big to fit into memory at once)
train_gen, test_gen, validation_gen = (
    load_data(split) for split in (train_data, test_data, validation_data)
)

In [None]:
# Data example
# Use a throwaway generator over the first training file. Pulling from train_gen
# here would consume one of the len(train_data) items that training expects it
# to yield, leaving the single-pass generator one item short.
data = next(load_data(train_data[:1]))
print(data)

# Create Embedding weights

In [None]:
# One-hot embedding matrix.
# Row 0 stays all zeros (index 0 is the value used for padding); the k-th entry of
# tk.word_index fills row k with a 1 in column (id - 1).
embedding_weights = np.zeros((len(tk.word_index) + 1, vocab_size))
for row, (char, i) in enumerate(tk.word_index.items(), start=1):
    embedding_weights[row, i - 1] = 1

print(embedding_weights.shape)
print(embedding_weights)

# Create Model

### Defined Parameters

In [None]:
# Shapes
input_size = INPUT_SIZE            # must equal the maxlen passed to pad_sequences
embedding_size = vocab_size        # one-hot width: one dimension per vocabulary entry (127 characters + UNK)

# Training settings
steps_per_epoch = len(train_data)  # the generator yields one item per training file
dropout_p = 0.5                    # not referenced by the cells visible in this notebook
optimizer = 'adam'
loss = 'binary_crossentropy'

### Input and Embedding Layers

In [None]:
# Input layer: one sequence of input_size integer character ids per sample
inputs = Input(
    shape=(input_size,),
    dtype='int64',
    name='input_layer',
)

In [None]:
# Embedding layer initialised with the one-hot matrix built above.
# input_dim is vocab_size + 1 because id 0 has its own row in that matrix.
embedding_layer = Embedding(
    input_dim=vocab_size + 1,
    output_dim=embedding_size,
    input_length=input_size,
    weights=[embedding_weights],
)

In [None]:
# Apply the embedding to the symbolic input; x is the tensor the encoder is built on
x = embedding_layer(inputs)

### Autoencoder model

In [None]:
def encoder(x):
    '''Build the convolutional encoder on top of tensor `x` and return its output.

    Four stages of Conv1D(256, relu, same padding) followed by MaxPooling1D(2),
    with kernel sizes 7, 7, 3, 3 and batch normalization after the first and
    third convolution. A final Conv1D(256, 3) named 'conv_encoder' produces the
    encoded representation; that name is how the layer is looked up later.
    '''
    # (kernel_size, add_batch_norm) for each conv -> [batch norm] -> pool stage
    stages = [(7, True), (7, False), (3, True), (3, False)]
    for kernel_size, add_batch_norm in stages:
        x = Conv1D(256, kernel_size, activation='relu', padding='same')(x)
        if add_batch_norm:
            x = BatchNormalization()(x)
        x = MaxPooling1D(pool_size=2, padding="same")(x)

    return Conv1D(256, 3, activation='relu', padding='same', name='conv_encoder')(x)

In [None]:
# Stack the encoder on the embedding output; x is rebound to the encoded tensor.
# NOTE(review): re-running only this cell would feed encoder() its own previous output.
x = encoder(x)
# Optional stand-alone inspection of the encoder (left disabled):
# encoder_model = Model(inputs=inputs, outputs=x)
# encoder_model.summary()
# Verify the model using graph (Save it as png file)
# plot_model(encoder_model, to_file='encoder_model.png', show_shapes=True, show_layer_names=False)

In [None]:
def decoder(x):
    '''Build the convolutional decoder on top of tensor `x` and return its output.

    Starts with a Conv1D(256, 3), then four UpSampling1D(2) stages, each followed
    by a convolution. Batch normalization is inserted between the upsampling and
    the convolution in the second and fourth stage. The last convolution has a
    single filter, so the result has one channel per time step.
    '''
    x = Conv1D(256, 3, activation='relu', padding='same')(x)

    # (filters, kernel_size, add_batch_norm) for each upsample -> [batch norm] -> conv stage
    stages = [(256, 3, False), (256, 3, True), (256, 7, False), (1, 7, True)]
    for filters, kernel_size, add_batch_norm in stages:
        x = UpSampling1D(2)(x)
        if add_batch_norm:
            x = BatchNormalization()(x)
        x = Conv1D(filters, kernel_size, activation='relu', padding='same')(x)

    return x

In [None]:
# Decode, flatten the single-channel sequence and project to the reconstruction.
x = decoder(x)
x = Flatten()(x)
# The output width follows input_size (instead of a literal 2000) so the
# reconstruction always has the same length as the padded target sequence.
predictions = Dense(input_size, activation='relu')(x)

### Build Model

In [None]:
# Build the end-to-end autoencoder (character ids in, reconstructed ids out) and compile it.
# It is trained with mean squared error and RMSprop; the `optimizer` / `loss`
# values from the parameters cell are not used for this model.
autoencoder = Model(inputs=inputs, outputs=predictions)
autoencoder.compile(
    optimizer=RMSprop(),
    loss='mean_squared_error',
    metrics=['accuracy'],
)


In [None]:
# Render the model graph, including tensor shapes, to autoencoder.png for visual verification
plot_model(
    autoencoder,
    show_shapes=True,
    to_file='autoencoder.png',
)

In [None]:
# Print a layer-by-layer summary of the autoencoder
autoencoder.summary()

### Setup Callbacks

In [None]:
# TensorBoard logging callback; event files are written under ./logs
tensorboard = TensorBoard(
    log_dir='./logs',
    write_graph=True,
    write_images=False,
    histogram_freq=0,
    embeddings_freq=100,
)
# To inspect a run, start TensorBoard from the notebook's directory:
#   tensorboard --logdir=./logs

In [None]:
# Make sure the checkpoint directory exists before training starts; writing the
# HDF5 file fails if the directory is missing and Keras does not create it.
os.makedirs('checkpoints', exist_ok=True)

# Save weights only, and only when val_loss improves on the best value so far.
checkpoint = ModelCheckpoint(filepath='checkpoints/weights.{epoch:02d}-{val_loss:.2f}.hdf5',
                             monitor='val_loss',
                             verbose=1,
                             save_best_only=True,
                             save_weights_only=True,
                             # period=5  # would restrict saving to every 5th epoch
                             )

In [None]:
# Stop training once val_loss has failed to improve by at least 0.01
# for 5 consecutive epochs.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    min_delta=0.01,
    verbose=1,
)

### Train model

In [None]:
# Train for one epoch, drawing one generator item per file.
# The generators returned by load_data make a single pass over their file list,
# so they supply only enough items for one epoch.
autoencoder_train = autoencoder.fit_generator(
    train_gen,
    steps_per_epoch=steps_per_epoch,
    epochs=1,
    validation_data=validation_gen,
    validation_steps=len(validation_data),
    callbacks=[tensorboard, checkpoint, early_stop],
    verbose=1,
)

### Model Performance Visualization

### Extract Encoder

In [None]:
# Extract the encoder: a model from the autoencoder's input up to the layer named 'conv_encoder'
bottleneck = autoencoder.get_layer('conv_encoder')
encoder_model = Model(inputs=autoencoder.input, outputs=bottleneck.output)
encoder_model.summary()

In [None]:
# Save encoder model to the file. The target directory is created first because
# writing the HDF5 file fails when the directory does not exist.
os.makedirs('models', exist_ok=True)
encoder_model.save('models/encoder_model.h5')
# Also keep a standalone copy of the weights next to the full model file.
encoder_model.save_weights('models/encoder_model_copy-weights.h5')

# Load encoder model from the file
new_encoder_model = load_model('models/encoder_model.h5')

new_encoder_model.compile(
   optimizer=optimizer, 
   loss=loss, 
   metrics=['accuracy']
)

# NOTE(review): compile() only attaches the optimizer, loss and metrics and does
# not re-initialise weights, so reloading them here is a safety net, not a requirement.
new_encoder_model.load_weights('models/encoder_model_copy-weights.h5')

In [None]:
# Freeze the reloaded encoder so its parameters are not updated by later training.
# NOTE(review): the flags are set after new_encoder_model.compile(); Keras picks up
# `trainable` changes at compile time, so confirm the model that uses this encoder
# is compiled after this cell.
for frozen_layer in new_encoder_model.layers:
    frozen_layer.trainable = False

In [None]:
# Show the reloaded (frozen) encoder followed by the original one for comparison
for model_to_show in (new_encoder_model, encoder_model):
    model_to_show.summary()