In [1]:
%matplotlib inline
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout, UpSampling1D, BatchNormalization
from keras.models import Model
from keras.utils import plot_model
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns # Plot statistical data such as Annotated heatmaps
from sklearn import metrics
from sklearn.metrics import classification_report

Using TensorFlow backend.


In [2]:
# Dataset 1 contains 20,000 java files
DATASET = "/Users/martinholecek/Desktop/Datasets/Small/Dataset_2000"
FILE_MAX_SIZE = 2000  # 2kB
INPUT_SIZE = 2000 # 2kB

# max number of chars in a file 1,999 => 2,000
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Create Tokenizer and vocabulary

In [3]:
# Create tokenizer
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')

In [4]:
def get_alphabet():
    ''' Create alphabet from ASCII character '''
    char_dict = {}
    for num in range(127):
        char_dict[chr(num)] = num + 1
    return char_dict

In [5]:
# Create Alphabet Vocabulary (Dictonary)
char_dict = get_alphabet()
print(char_dict)

{'\x00': 1, '\x01': 2, '\x02': 3, '\x03': 4, '\x04': 5, '\x05': 6, '\x06': 7, '\x07': 8, '\x08': 9, '\t': 10, '\n': 11, '\x0b': 12, '\x0c': 13, '\r': 14, '\x0e': 15, '\x0f': 16, '\x10': 17, '\x11': 18, '\x12': 19, '\x13': 20, '\x14': 21, '\x15': 22, '\x16': 23, '\x17': 24, '\x18': 25, '\x19': 26, '\x1a': 27, '\x1b': 28, '\x1c': 29, '\x1d': 30, '\x1e': 31, '\x1f': 32, ' ': 33, '!': 34, '"': 35, '#': 36, '$': 37, '%': 38, '&': 39, "'": 40, '(': 41, ')': 42, '*': 43, '+': 44, ',': 45, '-': 46, '.': 47, '/': 48, '0': 49, '1': 50, '2': 51, '3': 52, '4': 53, '5': 54, '6': 55, '7': 56, '8': 57, '9': 58, ':': 59, ';': 60, '<': 61, '=': 62, '>': 63, '?': 64, '@': 65, 'A': 66, 'B': 67, 'C': 68, 'D': 69, 'E': 70, 'F': 71, 'G': 72, 'H': 73, 'I': 74, 'J': 75, 'K': 76, 'L': 77, 'M': 78, 'N': 79, 'O': 80, 'P': 81, 'Q': 82, 'R': 83, 'S': 84, 'T': 85, 'U': 86, 'V': 87, 'W': 88, 'X': 89, 'Y': 90, 'Z': 91, '[': 92, '\\': 93, ']': 94, '^': 95, '_': 96, '`': 97, 'a': 98, 'b': 99, 'c': 100, 'd': 101, 'e': 1

In [6]:
# Create vocabulary and Add it into the tokenizer
tk.word_index = char_dict.copy()
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1
print(tk.word_index)

{'\x00': 1, '\x01': 2, '\x02': 3, '\x03': 4, '\x04': 5, '\x05': 6, '\x06': 7, '\x07': 8, '\x08': 9, '\t': 10, '\n': 11, '\x0b': 12, '\x0c': 13, '\r': 14, '\x0e': 15, '\x0f': 16, '\x10': 17, '\x11': 18, '\x12': 19, '\x13': 20, '\x14': 21, '\x15': 22, '\x16': 23, '\x17': 24, '\x18': 25, '\x19': 26, '\x1a': 27, '\x1b': 28, '\x1c': 29, '\x1d': 30, '\x1e': 31, '\x1f': 32, ' ': 33, '!': 34, '"': 35, '#': 36, '$': 37, '%': 38, '&': 39, "'": 40, '(': 41, ')': 42, '*': 43, '+': 44, ',': 45, '-': 46, '.': 47, '/': 48, '0': 49, '1': 50, '2': 51, '3': 52, '4': 53, '5': 54, '6': 55, '7': 56, '8': 57, '9': 58, ':': 59, ';': 60, '<': 61, '=': 62, '>': 63, '?': 64, '@': 65, 'A': 66, 'B': 67, 'C': 68, 'D': 69, 'E': 70, 'F': 71, 'G': 72, 'H': 73, 'I': 74, 'J': 75, 'K': 76, 'L': 77, 'M': 78, 'N': 79, 'O': 80, 'P': 81, 'Q': 82, 'R': 83, 'S': 84, 'T': 85, 'U': 86, 'V': 87, 'W': 88, 'X': 89, 'Y': 90, 'Z': 91, '[': 92, '\\': 93, ']': 94, '^': 95, '_': 96, '`': 97, 'a': 98, 'b': 99, 'c': 100, 'd': 101, 'e': 1

In [7]:
vocab_size = len(tk.word_index)
print(vocab_size)

128


<h1>Loading Dataset

In [8]:
def load_data(data):
    ''' File to np array'''
    for d in data:
        with open(d, "r") as file:
            content = file.read()
            train_sequence = tk.texts_to_sequences(content)
            train_data = pad_sequences(
                train_sequence, maxlen=INPUT_SIZE, padding='post')
            train_data = np.array(train_data, dtype='float32')
            yield train_data

In [9]:
# Load Dataset (filenames)
dataset_data = [os.path.join(DATASET, fn) for fn in os.listdir(
    DATASET) if fn.endswith('.java') and not fn.startswith('.')]

In [10]:
# Divide dataset into training and testing
train_data, test_data = train_test_split(dataset_data, test_size=0.2)

In [11]:
# Loading Generators (Files are to big to fit into the memory)
train_gen = load_data(train_data)
test_gen = load_data(test_data)

In [12]:
# Data example
data = next(train_gen)
print(data)

[[ 65.   0.   0. ...   0.   0.   0.]
 [100.   0.   0. ...   0.   0.   0.]
 [112.   0.   0. ...   0.   0.   0.]
 ...
 [126.   0.   0. ...   0.   0.   0.]
 [ 11.   0.   0. ...   0.   0.   0.]
 [126.   0.   0. ...   0.   0.   0.]]


# Create Embedding weights

In [13]:
# One hot array representation
embedding_weights = []
embedding_weights.append(np.zeros(vocab_size))
for char, i in tk.word_index.items():
    onehot = np.zeros(vocab_size)
    onehot[i-1] = 1
    embedding_weights.append(onehot)

embedding_weights = np.array(embedding_weights)
print(embedding_weights.shape)
print(embedding_weights)

(129, 128)
[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


# Create Model

In [14]:
#### Initialize Parameters ####
input_size = INPUT_SIZE   # Must be same as an argument max_len inside pad_sequence method
# vocab size 256
embedding_size = vocab_size
optimizer = 'adam'
loss = 'binary_crossentropy'
dropout_p = 0.5

In [15]:
# input layer
inputs = Input(shape=(input_size,), name='input_layer', dtype='int64')

In [16]:
#embedding layer (input_dim, output_dim, input_length, weights)
embedding_layer = Embedding(vocab_size + 1, embedding_size, input_length=input_size, weights=[embedding_weights])

In [17]:
x = embedding_layer(inputs)

Instructions for updating:
Colocations handled automatically by placer.


### Autoencoder model

In [18]:
def encoder(x):
    x = Conv1D(256, 7, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2, padding="same")(x)
    
    x = Conv1D(256, 7, activation='relu', padding='same')(x)
    x = MaxPooling1D(pool_size=2, padding="same")(x)
    
    x = Conv1D(256, 3, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2, padding="same")(x)
    
    x = Conv1D(256, 3, activation='relu', padding='same')(x)
    x = MaxPooling1D(pool_size=2, padding="same")(x)
    
    x = Conv1D(256, 3, activation='relu', padding='same')(x)
    return x

In [19]:
x = encoder(x)
encoder_model = Model(inputs=inputs, outputs=x)
encoder_model.summary()
# Verify the model using graph (Save it as png file)
plot_model(encoder_model, to_file='encoder_model.png', show_shapes=True, show_layer_names=False)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     (None, 2000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 2000, 128)         16512     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2000, 256)         229632    
_________________________________________________________________
batch_normalization_1 (Batch (None, 2000, 256)         1024      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1000, 256)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1000, 256)         459008    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 500, 256)          0         
__________

In [20]:
def decoder(x):
    x = Conv1D(256, 3, activation='relu', padding='same')(x)
    
    x = UpSampling1D(2)(x)
    x = Conv1D(256, 3, activation='relu', padding='same')(x)

    x = UpSampling1D(2)(x)
    x = BatchNormalization()(x)
    x = Conv1D(256, 3, activation='relu', padding='same')(x)
    
    x = UpSampling1D(2)(x)
    x = Conv1D(256, 7, activation='relu', padding='same')(x)
    
    x = UpSampling1D(2)(x)
    x = BatchNormalization()(x)
    x = Conv1D(256, 7, activation='sigmoid', padding='same')(x)
    return x

In [21]:
x = decoder(x)

In [22]:
# Temporary
predictions = x

### Build Model

In [23]:
# Build and compile model
model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=optimizer, loss=loss,
              metrics=['accuracy'])

In [24]:
# Verify the model using graph (Save it as png file)
plot_model(model, to_file='autoencoder.png', show_shapes=True)

In [25]:
# Print summary of the model
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     (None, 2000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 2000, 128)         16512     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2000, 256)         229632    
_________________________________________________________________
batch_normalization_1 (Batch (None, 2000, 256)         1024      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1000, 256)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1000, 256)         459008    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 500, 256)          0         
__________