In [1]:
%matplotlib inline
import sys
import matplotlib

In [2]:
from __future__ import division
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical, Sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM, CuDNNLSTM
from keras.layers import Embedding
from keras.utils import plot_model
from IPython.display import SVG, display
from keras.utils.vis_utils import model_to_dot
import keras.backend as K
from sys import getsizeof
from collections import Counter
from numpy import argmax
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
matplotlib.rcParams['text.usetex'] = True

Using TensorFlow backend.


In [3]:
def difference(dataset, interval=1):
    """
    Calculates the difference between a time-series and a lagged version of it
    """
    diff = list()
    for i in range(interval, len(dataset)):
        value = dataset[i] - dataset[i - interval]
        diff.append(value)
    return diff

def create_windowed_dataset(data, look_back):
    """
    Create the dataset by grouping windows of memory accesses together (using the look_back parameter)

    data: it should be a list of integers
    
    QUESTION: very similar to create minibatches of data 
    """
    sequences = list()
    for i in range(look_back, len(data)):
        sequence = data[i-look_back:i+1]
        sequences.append(sequence)
    return sequences

In [21]:
# compute differences and encode the words
vocabulary_mimimum_word_frequency = 10 
dummy_word = "0xffffffff" 

def dataset_creator(app_name, use_manual_encoding):
    '''
    1. 
    '''
    # read and build a dataframe based on input memory access files
    dataframe = pd.read_csv(app_name, sep=' ')
    
    # set each of three columns to 'instruction', 'type', and 'address'
    dataframe.columns = ['instruction', 'type', 'address']
    
    # we only need to keep addresses of type 'R'
    dataframe = dataframe[dataframe['type'] == 'R']
    
    # we only keep the column 'address' by removing 'instructions' and 'type'
    dataframe = dataframe['address']
    
    # build a new tokenizer from keras.preprocessing.text.Tokenizer
    tokenizer = Tokenizer()
    
    # fit the dataframe by calling fit_on_texts so that we can assign a uniquely
    # identified number to each unique address
    tokenizer.fit_on_texts(list(dataframe))
    
    # As we fit_on_texts in the last step, we would fetch the corresponding 
    # identification number sequence from the input dataframe 
    encoded_raw = tokenizer.texts_to_sequences([' '.join(list(dataframe))])[0]
    
    # vocab_size_raw is the number of unique addresses + 1
    vocab_size_raw = len(tokenizer.word_index) + 1
    
    # declare encoded_final and final_vocab_size for the final output use 
    encoded_final = []
    final_vocab_size = 0

    if use_manual_encoding:
        # calculate the difference and set the lagged interval to 1 in this case 
        
        # QUESTION: if we assign a unique number to each unique address (randomly), we lose the locality 
        # information and hence the difference only keeps track of the difference between the previous address
        # and the current address 
        encoded_raw_diff = difference(encoded_raw, 1) 
        
        # if the difference < 0, we put "0x" before the absolute value of x
        # if the difference >= 0, we put "1x" before the absolute value of x
        # the concatenation is encoded_raw_diff_str
        encoded_raw_diff_str = ["%s%d" % ("1x" if x < 0 else "0x" , abs(x)) for x in encoded_raw_diff]
        
        # create a dataframe for encoded_raw_diff_str
        df = pd.DataFrame(encoded_raw_diff_str)
        
        # set the column as 'delta'
        df.columns = ['delta']
        
        # count the number of unique encoded_raw_diff_str and sum them up 
        # and create a corresponding dataframe
        df2 = pd.DataFrame(pd.Series(encoded_raw_diff_str).value_counts())
        
        # set the column as 'total'
        df2.columns = ['total']
        
        # set the 'delta''s column as the index
        df2['delta'] = df2.index
        
        # reset the index starting from 0 
        df2 = df2.reset_index(drop=True)
        
        # set the columns as 'total' and 'delta'
        df2.columns = ['total', 'delta']
        
        # prune all of the total that is < vocabulary_mimimum_word_frequency, in this case it is 10
        df2 = df2[df2['total'] < vocabulary_mimimum_word_frequency]
        
        # df.delta.isin(df2.delta) check if any 'delta' element in df is in the df2.delta, 
        # if it is in the df2.delta, it would output True. Otherwise, False.
        # set the corresponding True column to 'dummy_word = 0xffffffff' in the 'delta' column of df 
        df.loc[df.delta.isin(df2.delta), ['delta']] = dummy_word
        
        # we only keep the column 'delta' by removing the indices and store the remaining dataframe to encoded_raw_diff_pruned
        encoded_raw_diff_pruned = df['delta']
        
        # delete df and df2
        del df, df2
        
        # call sklearn.model_selection.train_test_split to split half to training set and other half to test set 
        tmp_train, tmp_test = train_test_split(encoded_raw_diff_pruned, test_size=0.5, shuffle=False)
        
        # count how many dummy_word is in encoded_raw_diff_pruned, i.e. how many df.delta is in df2.delta
        total_removals = Counter(encoded_raw_diff_pruned)[dummy_word]
        
        # store the length of encoded_raw_diff_pruned into total_rows
        total_rows = len(encoded_raw_diff_pruned)
        
        # count how many dummy_word is in tmp_train (half of the encoded_raw_diff_pruned)
        train_removals = Counter(tmp_train)[dummy_word]
        
        # store the length of tmp_train into train_total 
        train_total = len(tmp_train)
        
        # count how many dummy_word is in tmp_train (half of the encoded_raw_diff_pruned)
        # QUESTION: why don't we just get test_removals = total_removals - train_removals?
        test_removals = Counter(tmp_test)[dummy_word]
        
        # store the length of tmp_test into test_total 
        test_total = len(tmp_test)
        
        # we define the max_test_accuracy = 1 - test_removals/test_total, i.e.
        # 1 - test_removals/test_total: the proportion of NOT dummy_word in testing set
        # as dummy_word has different pattern from other 'delta' entries, theoretically all other entries can be predicted correctly 
        # QUESTION: why is it impossible to predict the dummy_word correctly?
        max_test_accuracy = 1-test_removals/test_total
        
        # we convert each of the 'delta' value in x to string and store them as list in encoded_raw_diff_str
        encoded_raw_diff_pruned_str = [str(x) for x in list(encoded_raw_diff_pruned)]
        
        # build a new tokenizer from keras.preprocessing.text.Tokenizer
        tokenizer2 = Tokenizer()
        
        # fit the dataframe by calling fit_on_texts so that we can assign a uniquely
        # identified number to each unique 'delta' values 
        tokenizer2.fit_on_texts(encoded_raw_diff_pruned_str)
        
        # As we fit_on_texts in the last step, we would fetch the corresponding 
        # identification number sequence from the input dataframe 
        encoded_final = tokenizer2.texts_to_sequences([' '.join(encoded_raw_diff_pruned_str)])[0]
        
        # vocab_size_raw is the number of unique 'delta' values + 1
        final_vocab_size = len(tokenizer2.word_index) + 1
    else:
        encoded_final = encoded_raw
        
        final_vocab_size = vocab_size_raw
        
    # create minibatches of encoded_final of length (look_back + 1), in this case length = 2
    # and store sequences of minibatches into variable 'sequences'
    sequences = create_windowed_dataset(encoded_final, look_back=10)
    
    # return the maximum length of minibatch among all minibatches in sequences 
    max_length = max([len(seq) for seq in sequences])
    
    # Pads sequences to the same length, 
    # padding = 'pre' OR 'post': pad either before or after each minibatch
    # if we choose minibatch length = 2, we do not have to pad because sequences already contain
    # minibatches of the same length = 2
    sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
    
    # X: all columns except the last column
    # y: the last column 
    X, y = sequences[:,:-1], sequences[:,-1]
    
    # Converts a class vector (integers) to binary class matrix. 
    # y = to_categorical(y, num_classes=final_vocab_size)
    
    # call sklearn.model_selection.train_test_split to split 80% to training set and 20% to test set
    # parameter random_state=42 takes in a random seed and randomly split into training set and testing set 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
    
    # return all the necessary results for building models below
    return sequences, final_vocab_size, max_length, X_train, X_test, y_train, y_test

In [22]:
def run_model(model_name, final_vocab_size, max_length, X_train, y_train, X_test, y_test):
    model = Sequential()
    model.add(Embedding(final_vocab_size, output_dim=10, input_length=max_length - 1))
    embed = K.function([model.layers[0].input], [model.layers[0].output])
    embed_train = embed([X_train])[0]
    embed_test = embed([X_test])[0]
    if model_name == 'lstm':
        model = Sequential()
        model.add(Embedding(final_vocab_size, output_dim=10, input_length=max_length - 1))
        model.add(LSTM(10))
        model.add(Dense(final_vocab_size, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.fit(X_train, y_train, epochs=20, verbose=True, shuffle=False, batch_size=64)
        test_history = model.evaluate(X_test, y_test, batch_size=1000, verbose=True)
        return test_history[1]
    elif model_name == 'linear':
        reg = LinearRegression().fit(embed_train.reshape(embed_train.shape[0], -1), y_train)
        y_pred = reg.predict(embed_test.reshape(embed_test.shape[0], -1))
        y_pred[y_pred >= 0.5] = 1
        y_pred[y_pred < 0.5] = 0
        return accuracy_score(y_test, y_pred)
    elif model_name == 'nb':
        clf = GaussianNB()
        clf.fit(embed_train.reshape(embed_train.shape[0], -1), y_train)
        y_pred = clf.predict(embed_test.reshape(embed_test.shape[0], -1))
        y_pred[y_pred >= 0.5] = 1
        y_pred[y_pred < 0.5] = 0
        return accuracy_score(y_test, y_pred)

In [24]:
sequences, final_vocab_size, max_length, X_train, X_test, y_train, y_test = dataset_creator('./blackscholes_500k.txt', use_manual_encoding=True)
run_model('linear', final_vocab_size, max_length, X_train, y_train, X_test, y_test)

0.1263124666481274

In [11]:
y_train.shape

# look_back = 1, train/test = 8/2, accuracy = 0.13275613275613277
# look_back = 3, train/test = 8/2, accuracy = 0.1328633487155591
# look_back = 10, train/test = 8/2, accuracy = 0.1328803775967291

(266107, 474)

In [None]:
app_names = ['./blackscholes_500k.txt', './fluidanimate_500k.txt', './swaptions_500k.txt']
model_names = ['lstm', 'linear', 'nb']
for app in app_names:
    for model in model_names:
        sequences, final_vocab_size, max_length, X_train, X_test, y_train, y_test = dataset_creator(app, use_manual_encoding=True)
        run_model(model, final_vocab_size, max_length, X_train, y_train, X_test, y_test)