In [35]:
import pandas as pd
import numpy as np
from collections import defaultdict 
import sys
import os
import matplotlib.pyplot as plt
from tabulate import tabulate

import keras 
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Dropout, Embedding, LSTM, Flatten, Lambda
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from keras.preprocessing.text import Tokenizer

from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from keras.utils.vis_utils import plot_model

In [36]:
# Directory holding the per-set input CSVs; the training loop below reads
# ADDR + '510_<set>.csv' from here.
ADDR = '/nfs_home/nbhardwaj/data/rds_data/SPEC2017/'
# NOTE(review): presumably the list of benchmark ids to process. Nothing visible
# in this notebook reads `files`; the loader hard-codes the '510_' prefix
# instead -- confirm which one is intended.
files = [510]

In [37]:
from sklearn.preprocessing import LabelEncoder

class LabelEncoderExt(object):
    """LabelEncoder wrapper that maps values unseen during ``fit`` to 'Unknown'.

    ``fit`` always registers the extra label 'Unknown'; ``transform`` swaps any
    value that is not one of the fitted classes for 'Unknown' before encoding,
    so new items receive the id of the 'Unknown' class.
    """

    def __init__(self):
        """Create the wrapped, not-yet-fitted ``LabelEncoder``."""
        self.label_encoder = LabelEncoder()

    def fit(self, data_list):
        """Fit the wrapped encoder on ``data_list`` plus the 'Unknown' label.

        :param data_list: iterable of labels to learn
        :return: self
        """
        labels = list(data_list)
        labels.append('Unknown')
        self.label_encoder = self.label_encoder.fit(labels)
        # Mirror the fitted classes on the wrapper, like LabelEncoder does.
        self.classes_ = self.label_encoder.classes_
        return self

    def transform(self, data_list):
        """Encode ``data_list``, sending every unseen value to the 'Unknown' id.

        The input is not modified; the substitution happens on a fresh list.

        :param data_list: iterable of labels to encode
        :return: whatever the wrapped encoder's ``transform`` returns
        """
        known = set(self.label_encoder.classes_)
        cleaned = [value if value in known else 'Unknown' for value in data_list]
        return self.label_encoder.transform(cleaned)
    

In [53]:
def create_model(embed_size = 10, dense_size = 200, vocab_size = None, num_classes = 8):
    """Build the single-input embedding classifier (uncompiled).

    Layers, in order: an integer-id input of shape (1,), an Embedding, a
    Flatten, a ReLU Dense layer and a softmax Dense layer.

    :param embed_size: length of the embedding vector learned for each id
    :param dense_size: number of units in the hidden ReLU layer
    :param vocab_size: number of distinct ids the embedding has to cover. When
        None (the default) it is taken from ``len(le_inst.classes_)``, so the
        notebook-level encoder ``le_inst`` must already be fitted. Pass it
        explicitly to make the function independent of notebook state.
    :param num_classes: width of the softmax output layer (default 8)
    :return: a ``Model`` mapping the id input to class probabilities
    """
    if vocab_size is None:
        # Backward-compatible path: the no-argument call has always sized the
        # embedding from the notebook-level encoder.
        vocab_size = len(le_inst.classes_)

    inst_ids = Input(shape = (1,))
    inst_embedding = Embedding(vocab_size, embed_size, input_length = 1)(inst_ids)
    # The embedding output has a length-1 sequence axis; flatten it away so the
    # dense layers see one vector per sample.
    features = Flatten()(inst_embedding)

    hidden = Dense(dense_size, activation = 'relu')(features)
    probabilities = Dense(num_classes, activation = 'softmax')(hidden)

    return Model([inst_ids], probabilities)

In [None]:
# Width of the one-hot targets. It has to equal the softmax width that
# create_model() builds (8). Pinning it keeps the target shape fixed even when
# a split happens not to contain the highest label; without it to_categorical
# sizes the output from the largest label it sees.
NUM_CLASSES = 8

sets = list(range(64))
inst_vocab = []   # vocabulary size (including 'Unknown') per set
delta_vocab = []  # never filled here; kept so cells referencing the name still run
train_acc = []    # accuracy on the training split, per set
test_acc = []     # accuracy on the held-out split, per set

for cset in sets:
    # Only the index column, 'Instruction' and 'label' are loaded.
    df = pd.read_csv(ADDR+'510_'+str(cset)+'.csv', index_col = [0], usecols = [0, 2, 8])
    # Instruction values are treated as categorical tokens, not numbers.
    df.Instruction = df.Instruction.astype('str')
    X = df[['Instruction']].values
    y = df[['label']].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)
    print("--------------------split done---------------------")

    # Fit on the training split only, so instructions that appear only in the
    # test split are encoded as 'Unknown'. `le_inst` must stay a notebook-level
    # name: create_model() reads it to size the embedding.
    le_inst = LabelEncoderExt()
    le_inst.fit(X_train[:, 0])
    print("----------------labels done----------------------")

    # Keep the encoded ids in real integer arrays. Writing them back into the
    # string/object array would hand Keras an object-dtype input.
    X_train = np.asarray(le_inst.transform(X_train[:, 0]), dtype = 'int64').reshape(-1, 1)
    print("--------")

    X_test = np.asarray(le_inst.transform(X_test[:, 0]), dtype = 'int64').reshape(-1, 1)
    print("-------------------labels transformed---------------------")

    y_train_onehot = to_categorical(y_train, num_classes = NUM_CLASSES)
    y_test_onehot = to_categorical(y_test, num_classes = NUM_CLASSES)

    model = create_model()
    model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    filepath = '/nfs_home/nbhardwaj/model_weights/SPEC2017/510_'+str(cset)+'.hdf5'
    # use_multiprocessing is documented for generator/Sequence inputs only, so
    # it is not passed for these in-memory arrays.
    model.fit(X_train[:, 0], y_train_onehot, epochs = 2)
    print("------------training done------------")
    model.save_weights(filepath)

    # evaluate() returns [loss, accuracy] for the metrics compiled above.
    t_ac = model.evaluate(X_test[:, 0], y_test_onehot)[1]
    test_acc.append(t_ac)
    train_acc.append(model.evaluate(X_train[:, 0], y_train_onehot)[1])
    inst_vocab.append(len(le_inst.classes_))
    print("--------------done processing for set---------->", cset, '|| accuracy||', t_ac)

--------------------split done---------------------
----------------labels done----------------------
--------
-------------------labels transformed---------------------
Epoch 1/2
Epoch 2/2
------------training done------------
--------------done processing for set----------> 0 || accuracy|| 0.6036520004272461
--------------------split done---------------------
----------------labels done----------------------
--------
-------------------labels transformed---------------------
Epoch 1/2
Epoch 2/2
------------training done------------
--------------done processing for set----------> 1 || accuracy|| 0.6394686698913574
--------------------split done---------------------
----------------labels done----------------------
--------
-------------------labels transformed---------------------
Epoch 1/2
Epoch 2/2
------------training done------------
--------------done processing for set----------> 2 || accuracy|| 0.479892760515213
--------------------split done---------------------
-------------

In [58]:
# Held-out accuracy per set, in loop order, as appended by the training cell.
test_acc

[0.6036520004272461,
 0.6394686698913574,
 0.479892760515213,
 0.46041056513786316,
 0.5156739950180054,
 0.9122096300125122,
 0.46291208267211914,
 0.49619483947753906,
 0.9317673444747925,
 0.7438478469848633,
 0.7400835156440735,
 0.8329676985740662,
 0.91651451587677,
 0.8276474475860596,
 0.8748176097869873,
 0.8744987845420837,
 0.9370473623275757,
 0.8140120506286621,
 0.8759154677391052,
 0.8749029040336609,
 0.6692717671394348,
 0.8526544570922852,
 0.9171963930130005,
 0.8150568008422852,
 0.7618939876556396,
 0.874782383441925,
 0.8873749375343323,
 0.701298713684082,
 0.6073619723320007,
 0.8744004964828491,
 0.6280087232589722,
 0.4568965435028076,
 0.7677642703056335,
 0.31351351737976074,
 0.4261603355407715,
 0.7698503732681274,
 0.781074583530426,
 0.7668323516845703,
 0.435546875,
 0.7782971858978271,
 0.5431917905807495,
 0.788249671459198,
 0.692650318145752,
 0.984354555606842,
 0.7985031604766846,
 0.5134680271148682,
 0.49293285608291626,
 0.42032331228256226,
 0

In [57]:
# The training cell appends one vocabulary size per processed set, so this is
# the number of sets completed so far.
len(inst_vocab)

64



-------------------labels transformed---------------------

Epoch 1/2
Epoch 2/2
------------training done------------


NameError: name 'delta_vocab' is not defined

In [145]:
# df.head()

# # Read -> 1 ; Write -> -1




# print(np.min(df.Instruction.values), np.max(df.Instruction.values))

# print(X.shape, y.shape, X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# print(len(np.unique(X_train[:, 0])), len(np.unique(X[:, 0])), len(np.unique(X_train[:, 1])), len(np.unique(X[:, 1])))



# le_inst = LabelEncoderExt()
# le_inst.fit(X_train[:, 0])
# le_delta = LabelEncoderExt()
# le_delta.fit(X_train[:, 1])

# X_train[:, 0] = le_inst.transform(X_train[:, 0])
# X_train[:, 1] = le_delta.transform(X_train[:, 1])

# X_test[:, 0] = le_inst.transform(X_test[:, 0])
# X_test[:, 1] = le_delta.transform(X_test[:, 1])

# print(len(le_inst.classes_), len(le_delta.classes_))

# def create_model(embed_size = 10):
#     inp1 = Input(shape = (1,))
#     inp2 = Input(shape = (1,))
#     inp3 = Input(shape = (1,))

#     embed1 = Embedding(len(le_inst.classes_), embed_size, input_length = 1)(inp1)
#     embed2 = Embedding(len(le_delta.classes_), embed_size, input_length = 1)(inp2)
    
#     merged_inp = keras.layers.concatenate([embed1, embed2], axis = 1)
#     merged_inp = Flatten()(merged_inp)
#     merged_inp = keras.layers.concatenate([merged_inp, inp3])
    
#     out = Dense(200, activation = 'relu')(merged_inp)
#     out = Dense(8, activation = 'softmax')(out)
    
#     model = Model([inp1, inp2, inp3], out)
#     return model
    

# model = create_model()

# print(model.summary())

# plot_model(model)

# model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

# filepath = '/nfs_home/nbhardwaj/model_weights/RD_Embedding/'+str(cset)+'.hdf5'

# # checkpointer = ModelCheckpoint(filepath, monitor = 'val_acc', save_best_only = True, mode = 'max', verbose = 1)
# model.fit([X_train[:, 0], X_train[:, 1], X_train[:, 2]], to_categorical(y_train), epochs = 2)


# model.save_weights(filepath)

# acc_test = model.evaluate([X_test[:, 0], X_test[:, 1], X_test[:, 2]], to_categorical(y_test))

# print(acc_test[0])
# print(model.metrics_names)

# len(le_inst.classes_)



Unnamed: 0,ICount,Instruction,Data,Mode,set,rd,delta,label
0,638313090423,4722720,2198847782096,R,0,2,,2
1,638313090428,4722742,3646153,R,0,2,-2198844000000.0,2
2,638313090748,4722720,2198847782096,R,0,2,2198844000000.0,2
3,638313090753,4722742,3646153,R,0,2,-2198844000000.0,2
4,638313091064,4722720,2198847782096,R,0,2,2198844000000.0,2
