### Load dataset and paths

In [2]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from keras.layers import Dense, Input, concatenate, GRU, LSTM
from keras import backend as K
from keras.utils import to_categorical
from keras import Model
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics import jaccard_score
import distance

Using TensorFlow backend.


In [3]:
iris = load_iris()
X = iris['data']
y = iris['target']
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [4]:
df = pd.DataFrame(X)

**Note: The 'bin_labels' and 'paths' csv is extracted from R. The R package used to create the decision tree is 'rpart'.**

In [5]:
bin_labels = pd.read_csv('../../data/raw/rpart_bin_labels.csv', delimiter=",")

In [6]:
bin_labels = bin_labels.rename(columns={"Unnamed: 0": "label", "label_list": "bins"})

In [7]:
bin_labels.head()

Unnamed: 0,label,bins
0,A,0.8
1,B,1.35
2,C,1.55
3,D,1.65
4,E,1.75


In [9]:
bin_labels

Unnamed: 0,label,bins
0,A,0.8
1,B,1.35
2,C,1.55
3,D,1.65
4,E,1.75
5,F,1.85
6,G,2.45
7,H,2.55
8,I,2.65
9,J,2.85


In [8]:
# path_df = pd.read_csv('../../data/raw/rpart_paths.csv', delimiter=",")
# path_df = pd.read_csv('../../data/raw/test_paths.csv', delimiter=",")
path_df = pd.read_csv('../../data/raw/paths.csv', delimiter=",")

In [9]:
path_df = path_df.drop(["Unnamed: 0"], axis=1)
path_df = path_df.rename(columns={list(path_df)[0]: "new_col"})

In [10]:
path_df.head()

Unnamed: 0,new_col
0,3G0
1,3G0
2,3G0
3,3G0
4,3G0


In [16]:
label_freq = {}
count = 0
for i in (path_df.loc[:,'new_col']):
    for j in i.split(','):
        try:
            label_freq.update({j:label_freq[j]+1})
        except KeyError:
            label_freq.update({j:1})

In [17]:
label_freq

{'3G0': 50,
 '3G1': 100,
 '4E0': 54,
 '3R0': 48,
 '4D0': 47,
 '4E1': 46,
 '3Q0': 3,
 '1V0': 1,
 '3R1': 6,
 '4C1': 3,
 '1Y0': 2,
 '3Q1': 43,
 '4D1': 1,
 '4C0': 3,
 '1V1': 2,
 '1Y1': 1}

In [18]:
sum(label_freq.values())

410

In [19]:
label_freq_2 = {}
count = 0
for i in (path_df.loc[:,'new_col']):
    for j in i.split(','):
        try:
            label_freq_2.update({j[0:-1]:label_freq_2[j[0:-1]]+1})
        except KeyError:
            label_freq_2.update({j[0:-1]:1})

In [20]:
label_freq_2

{'3G': 150, '4E': 100, '3R': 54, '4D': 48, '3Q': 46, '1V': 3, '4C': 6, '1Y': 3}

In [11]:
test_data = pd.concat([df, path_df], axis=1)
# test_data = df

In [12]:
test_data.head()

Unnamed: 0,0,1,2,3,new_col
0,-0.900681,1.019004,-1.340227,-1.315444,3G0
1,-1.143017,-0.131979,-1.340227,-1.315444,3G0
2,-1.385353,0.328414,-1.397064,-1.315444,3G0
3,-1.506521,0.098217,-1.283389,-1.315444,3G0
4,-1.021849,1.249201,-1.340227,-1.315444,3G0


In [13]:
new_path = []
for i, val in test_data.iterrows():
    new_path.append(val['new_col'].split(sep=","))

**Extending the paths with start 'S' and end 'E' tokens**

In [14]:
_ = [x.insert(0, 'S') for x in new_path]
_ = [x.append('E') for x in new_path]

In [17]:
new_path

[['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],
 ['S', '3G0', 'E'],


In [18]:
test_data['new_path'] = new_path

In [16]:
# 3G1', '4E0', '3R0', '4D0
# 3G1', '4E1', '3Q1
# a = [1,0,0,0]
# b = [1,1,1]
# c = [0]

In [19]:
test_data = test_data.drop(["new_col"], axis=1)

In [20]:
test_data.head()

Unnamed: 0,0,1,2,3,new_path
0,-0.900681,1.019004,-1.340227,-1.315444,"[S, 3G0, E]"
1,-1.143017,-0.131979,-1.340227,-1.315444,"[S, 3G0, E]"
2,-1.385353,0.328414,-1.397064,-1.315444,"[S, 3G0, E]"
3,-1.506521,0.098217,-1.283389,-1.315444,"[S, 3G0, E]"
4,-1.021849,1.249201,-1.340227,-1.315444,"[S, 3G0, E]"


In [21]:
paths_lengths = np.array([len(xi)
                          for xi in test_data.iloc[:,-1]])

In [22]:
paths_lengths
np.max(paths_lengths)

7

In [23]:
label_char = []
for _, i in enumerate(np.unique(test_data['new_path'])):
    for _, j in enumerate(i):
        if j not in label_char:
            label_char.append(j)

In [24]:
label_indices = { j : i for i, j in enumerate(label_char) }

In [25]:
label_indices

{'S': 0,
 '3G0': 1,
 'E': 2,
 '3G1': 3,
 '4E0': 4,
 '3R0': 5,
 '4D0': 6,
 '4D1': 7,
 '3R1': 8,
 '4C0': 9,
 '4C1': 10,
 '1Y0': 11,
 '1Y1': 12,
 '4E1': 13,
 '3Q0': 14,
 '1V0': 15,
 '1V1': 16,
 '3Q1': 17}

In [1]:
label_indices

NameError: name 'label_indices' is not defined

In [26]:
len(label_indices)

18

In [27]:
indices_label = { i : j for i, j in enumerate(label_char) }
indices_label

{0: 'S',
 1: '3G0',
 2: 'E',
 3: '3G1',
 4: '4E0',
 5: '3R0',
 6: '4D0',
 7: '4D1',
 8: '3R1',
 9: '4C0',
 10: '4C1',
 11: '1Y0',
 12: '1Y1',
 13: '4E1',
 14: '3Q0',
 15: '1V0',
 16: '1V1',
 17: '3Q1'}

In [63]:
np.unique(path_df.loc[:,'new_col'])

array(['3G0', '3G1,4E0,3R0,4D0', '3G1,4E0,3R0,4D1', '3G1,4E0,3R1,4C0',
       '3G1,4E0,3R1,4C1,1Y0', '3G1,4E0,3R1,4C1,1Y1', '3G1,4E1,3Q0,1V0',
       '3G1,4E1,3Q0,1V1', '3G1,4E1,3Q1'], dtype=object)

In [28]:
np.unique(path_df, return_counts=True)

(array(['3G0', '3G1,4E0,3R0,4D0', '3G1,4E0,3R0,4D1', '3G1,4E0,3R1,4C0',
        '3G1,4E0,3R1,4C1,1Y0', '3G1,4E0,3R1,4C1,1Y1', '3G1,4E1,3Q0,1V0',
        '3G1,4E1,3Q0,1V1', '3G1,4E1,3Q1'], dtype=object),
 array([50, 47,  1,  3,  2,  1,  1,  2, 43]))

**Vectorize path sequence**

In [29]:
input_path_sequence = []
next_chars = []
features = []
paths_maxlen = np.max(paths_lengths)
# path_vocab_size = len(bin_labels) # How is this working? Validate!
path_vocab_size = len(indices_label) # Temporary test for local trees
feature_size = 4
for i in range(0, len(test_data)):
    # get the feature
    curr_feat = np.array([test_data.iloc[i, 0:4]])
    curr_path = test_data.iloc[i, -1]
    curr_path_len = len(curr_path)
    # curr_label = y[i]
    # curr_dec_feat = df.iloc[i, 6]
    for j in range(1, curr_path_len):
        features.append(curr_feat)
        input_path_sequence.append(curr_path[0:j])
        next_chars.append(curr_path[j])

x_path = np.zeros(
    (len(input_path_sequence), paths_maxlen, path_vocab_size), dtype=np.bool)

path_latent_input = np.zeros(
    (len(input_path_sequence), feature_size), dtype=np.float)

y_path = np.zeros(
    (len(input_path_sequence), path_vocab_size), dtype=np.bool)

# print(input_path_sequence)
# print(len(input_path_sequence))
for i, sentence in enumerate(input_path_sequence):
    for t, char in enumerate(sentence):
        # x_path[i, t, self.char_indices[char]] = 1
        # print(bin_labels.index[bin_labels['label'] == char[1]])
        # index = bin_labels.index[bin_labels['label'] == char[1]].tolist()[0]
        x_path[i, t, label_indices[char]] = 1
    # y_path[i, char_indices[next_chars[i]]] = 1
    # index = bin_labels.index[bin_labels['label'] == next_chars[i][1]].tolist()[0]
    # y_path[i, index] = 1
    y_path[i, label_indices[next_chars[i]]] = 1
    path_latent_input[i, :] = features[i]
    
## Trouble with "S" and "E" index values.

In [30]:
len(input_path_sequence)

560

In [31]:
len(next_chars)

560

In [32]:
y_path.shape

(560, 18)

In [33]:
def _create_label_model(latent_dim=5):
    input_layer = Input(shape=(feature_size,), name='ip_x')
    hidden_layer_x1 = Dense(20, activation='tanh',
                            name='hidden_x1')(input_layer)
    hidden_layer_x2 = Dense(20, activation='tanh',
                            name='hidden_x2')(hidden_layer_x1)
    hidden_layer_x3 = Dense(latent_dim, activation='tanh',
                            name='hidden_x3')(hidden_layer_x2)
    output_layer = Dense(len(np.unique(y)), activation='softmax',
                         name='op_x')(hidden_layer_x3)
    model = Model(input_layer, output_layer)
    return model

def _create_combined_model(initialize=True, rnn_cell='gru', latent_dim=5):

    label_model_latent = Input(shape=(latent_dim,), name='label_ip')
    path_input = Input(shape=(
        paths_maxlen, path_vocab_size), name='dec_feat_ip')
    if rnn_cell == 'gru':
        RNN = GRU
    else:
        RNN = LSTM

    decoder = RNN(latent_dim, return_state=False,
                  return_sequences=False, name='gru_seq')
    if initialize:
        decoder_outputs = decoder(
            path_input, initial_state=label_model_latent)
    else:
        decoder_outputs = decoder(path_input)

    merge_layer = concatenate(
        [label_model_latent, decoder_outputs], name='cat')
    output_chars = Dense(path_vocab_size,
                         activation='softmax', name='op_sent')(merge_layer)
    model = Model(
        [label_model_latent, path_input], output_chars)
    return model

In [34]:
combined_model = _create_combined_model()
label_model = _create_label_model()

Instructions for updating:
Colocations handled automatically by placer.


In [35]:
def get_hidden_x(x, model, layer_num=3):
    def get_hidden_x_inner(model, layer_num=layer_num):
        return K.function([model.layers[0].input], [model.layers[layer_num].output])
    return get_hidden_x_inner(model, layer_num=layer_num)([x])[0]

In [36]:
X[:10,:]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ],
       [-0.53717756,  1.93979142, -1.16971425, -1.05217993],
       [-1.50652052,  0.78880759, -1.34022653, -1.18381211],
       [-1.02184904,  0.78880759, -1.2833891 , -1.3154443 ],
       [-1.74885626, -0.36217625, -1.34022653, -1.3154443 ],
       [-1.14301691,  0.09821729, -1.2833891 , -1.44707648]])

In [37]:
y[:10,]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [38]:
test_data.iloc[:10,:]

Unnamed: 0,0,1,2,3,new_path
0,-0.900681,1.019004,-1.340227,-1.315444,"[S, 3G0, E]"
1,-1.143017,-0.131979,-1.340227,-1.315444,"[S, 3G0, E]"
2,-1.385353,0.328414,-1.397064,-1.315444,"[S, 3G0, E]"
3,-1.506521,0.098217,-1.283389,-1.315444,"[S, 3G0, E]"
4,-1.021849,1.249201,-1.340227,-1.315444,"[S, 3G0, E]"
5,-0.537178,1.939791,-1.169714,-1.05218,"[S, 3G0, E]"
6,-1.506521,0.788808,-1.340227,-1.183812,"[S, 3G0, E]"
7,-1.021849,0.788808,-1.283389,-1.315444,"[S, 3G0, E]"
8,-1.748856,-0.362176,-1.340227,-1.315444,"[S, 3G0, E]"
9,-1.143017,0.098217,-1.283389,-1.447076,"[S, 3G0, E]"


In [39]:
from sklearn.utils import shuffle
x_, y_, test_data_ = shuffle(X, y, test_data)

In [40]:
y_[:10]

array([1, 1, 1, 1, 2, 2, 0, 0, 0, 1])

In [41]:
x_[:10,:]

array([[-5.25060772e-02, -1.05276654e+00,  1.37546573e-01,
         8.77547895e-04],
       [-4.16009689e-01, -1.28296331e+00,  1.37546573e-01,
         1.32509732e-01],
       [ 6.86617933e-02,  3.28414053e-01,  5.92245988e-01,
         7.90670654e-01],
       [-1.73673948e-01, -1.05276654e+00, -1.46640561e-01,
        -2.62386821e-01],
       [ 1.03800476e+00,  9.82172869e-02,  1.04694540e+00,
         1.58046376e+00],
       [ 2.24968346e+00, -1.31979479e-01,  1.33113254e+00,
         1.44883158e+00],
       [-1.14301691e+00,  9.82172869e-02, -1.28338910e+00,
        -1.31544430e+00],
       [-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-9.00681170e-01,  1.01900435e+00, -1.34022653e+00,
        -1.18381211e+00],
       [ 1.28034050e+00,  9.82172869e-02,  6.49083415e-01,
         3.95774101e-01]])

In [42]:
test_data_.iloc[:10,:]

Unnamed: 0,0,1,2,3,new_path
92,-0.052506,-1.052767,0.137547,0.000878,"[S, 3G1, 4E0, 3R0, 4D0, E]"
89,-0.41601,-1.282963,0.137547,0.13251,"[S, 3G1, 4E0, 3R0, 4D0, E]"
70,0.068662,0.328414,0.592246,0.790671,"[S, 3G1, 4E1, 3Q0, 1V0, E]"
79,-0.173674,-1.052767,-0.146641,-0.262387,"[S, 3G1, 4E0, 3R0, 4D0, E]"
140,1.038005,0.098217,1.046945,1.580464,"[S, 3G1, 4E1, 3Q1, E]"
135,2.249683,-0.131979,1.331133,1.448832,"[S, 3G1, 4E1, 3Q1, E]"
34,-1.143017,0.098217,-1.283389,-1.315444,"[S, 3G0, E]"
8,-1.748856,-0.362176,-1.340227,-1.315444,"[S, 3G0, E]"
17,-0.900681,1.019004,-1.340227,-1.183812,"[S, 3G0, E]"
52,1.28034,0.098217,0.649083,0.395774,"[S, 3G1, 4E0, 3R0, 4D0, E]"


In [43]:
path_latent_input[:380,:].shape

(380, 4)

In [44]:
x_path[:380,:,:].shape

(380, 7, 18)

In [45]:
y_path[:380,:].shape

(380, 18)

In [46]:
def fit_model():

    y_cat = to_categorical(y_[:100,])

    label_model.compile(
        optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    label_model.fit(
        x_[:100,:], y_cat, batch_size=30, epochs=50, verbose=1, shuffle=True, validation_split=0.2)

    x_latent = get_hidden_x(path_latent_input[:380,:], model=label_model)

    combined_model.compile(
        optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    combined_model.fit([x_latent, x_path[:380,:,:]], y_path[:380,:],
                           batch_size=30, epochs=400, verbose=1, shuffle=True)

In [45]:
fit_model()

Instructions for updating:
Use tf.cast instead.
Train on 80 samples, validate on 20 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400


Epoch 150/400
Epoch 151/400
Epoch 152/400
Epoch 153/400
Epoch 154/400
Epoch 155/400
Epoch 156/400
Epoch 157/400
Epoch 158/400
Epoch 159/400
Epoch 160/400
Epoch 161/400
Epoch 162/400
Epoch 163/400
Epoch 164/400
Epoch 165/400
Epoch 166/400
Epoch 167/400
Epoch 168/400
Epoch 169/400
Epoch 170/400
Epoch 171/400
Epoch 172/400
Epoch 173/400
Epoch 174/400
Epoch 175/400
Epoch 176/400
Epoch 177/400
Epoch 178/400
Epoch 179/400
Epoch 180/400
Epoch 181/400
Epoch 182/400
Epoch 183/400
Epoch 184/400
Epoch 185/400
Epoch 186/400
Epoch 187/400
Epoch 188/400
Epoch 189/400
Epoch 190/400
Epoch 191/400
Epoch 192/400
Epoch 193/400
Epoch 194/400
Epoch 195/400
Epoch 196/400
Epoch 197/400
Epoch 198/400
Epoch 199/400
Epoch 200/400
Epoch 201/400
Epoch 202/400
Epoch 203/400
Epoch 204/400
Epoch 205/400
Epoch 206/400
Epoch 207/400
Epoch 208/400
Epoch 209/400
Epoch 210/400
Epoch 211/400
Epoch 212/400
Epoch 213/400
Epoch 214/400
Epoch 215/400
Epoch 216/400
Epoch 217/400
Epoch 218/400
Epoch 219/400
Epoch 220/400
Epoch 

Epoch 232/400
Epoch 233/400
Epoch 234/400
Epoch 235/400
Epoch 236/400
Epoch 237/400
Epoch 238/400
Epoch 239/400
Epoch 240/400
Epoch 241/400
Epoch 242/400
Epoch 243/400
Epoch 244/400
Epoch 245/400
Epoch 246/400
Epoch 247/400
Epoch 248/400
Epoch 249/400
Epoch 250/400
Epoch 251/400
Epoch 252/400
Epoch 253/400
Epoch 254/400
Epoch 255/400
Epoch 256/400
Epoch 257/400
Epoch 258/400
Epoch 259/400
Epoch 260/400
Epoch 261/400
Epoch 262/400
Epoch 263/400
Epoch 264/400
Epoch 265/400
Epoch 266/400
Epoch 267/400
Epoch 268/400
Epoch 269/400
Epoch 270/400
Epoch 271/400
Epoch 272/400
Epoch 273/400
Epoch 274/400
Epoch 275/400
Epoch 276/400
Epoch 277/400
Epoch 278/400
Epoch 279/400
Epoch 280/400
Epoch 281/400
Epoch 282/400
Epoch 283/400
Epoch 284/400
Epoch 285/400
Epoch 286/400
Epoch 287/400
Epoch 288/400
Epoch 289/400
Epoch 290/400
Epoch 291/400
Epoch 292/400
Epoch 293/400
Epoch 294/400
Epoch 295/400
Epoch 296/400
Epoch 297/400
Epoch 298/400
Epoch 299/400
Epoch 300/400
Epoch 301/400
Epoch 302/400
Epoch 

Epoch 314/400
Epoch 315/400
Epoch 316/400
Epoch 317/400
Epoch 318/400
Epoch 319/400
Epoch 320/400
Epoch 321/400
Epoch 322/400
Epoch 323/400
Epoch 324/400
Epoch 325/400
Epoch 326/400
Epoch 327/400
Epoch 328/400
Epoch 329/400
Epoch 330/400
Epoch 331/400
Epoch 332/400
Epoch 333/400
Epoch 334/400
Epoch 335/400
Epoch 336/400
Epoch 337/400
Epoch 338/400
Epoch 339/400
Epoch 340/400
Epoch 341/400
Epoch 342/400
Epoch 343/400
Epoch 344/400
Epoch 345/400
Epoch 346/400
Epoch 347/400
Epoch 348/400
Epoch 349/400
Epoch 350/400
Epoch 351/400
Epoch 352/400
Epoch 353/400
Epoch 354/400
Epoch 355/400
Epoch 356/400
Epoch 357/400
Epoch 358/400
Epoch 359/400
Epoch 360/400
Epoch 361/400
Epoch 362/400
Epoch 363/400
Epoch 364/400
Epoch 365/400
Epoch 366/400
Epoch 367/400
Epoch 368/400
Epoch 369/400
Epoch 370/400
Epoch 371/400
Epoch 372/400
Epoch 373/400
Epoch 374/400
Epoch 375/400
Epoch 376/400
Epoch 377/400
Epoch 378/400
Epoch 379/400
Epoch 380/400
Epoch 381/400
Epoch 382/400
Epoch 383/400
Epoch 384/400
Epoch 

Epoch 396/400
Epoch 397/400
Epoch 398/400
Epoch 399/400
Epoch 400/400


In [48]:
combined_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
label_ip (InputLayer)           (None, 5)            0                                            
__________________________________________________________________________________________________
dec_feat_ip (InputLayer)        (None, 7, 18)        0                                            
__________________________________________________________________________________________________
gru_seq (GRU)                   (None, 5)            360         dec_feat_ip[0][0]                
                                                                 label_ip[0][0]                   
__________________________________________________________________________________________________
cat (Concatenate)               (None, 10)           0           label_ip[0][0]                   
          

In [47]:
def jaccard_score_inconsistent(x, y):
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality/float(union_cardinality)

def get_j_coeff(a, b):
    if len(a) != len(b):
        return jaccard_score_inconsistent(a, b)
    return jaccard_score(a, b, average='micro')

In [48]:
def predict(x):
    latent_dim = 5
    x_f = x.reshape(1, feature_size)
    token = 'S'
    cont = True
    path = [token]
    x_path = np.zeros((1, paths_maxlen, path_vocab_size), dtype=np.bool)

    x_latent = get_hidden_x(x_f, model=label_model)
    x_latent = x_latent.reshape(1, latent_dim)
    x_path[0, 0, label_indices[token]] = 1
    pred = label_model.predict(x_f)
    label = [np.argmax(pred[0])]
    index = 1
    while cont & (index < paths_maxlen):
        pred = combined_model.predict([x_latent, x_path])
        char_index = np.argmax(pred[0])
        x_path[0, index, char_index] = 1
        next_char = indices_label[char_index]
        path.append(next_char)
        index += 1
        if next_char == 'E':
            cont = False
        # elif index == self.paths_maxlen - 1:
        #     path.append('E')

    if path[-1] != 'E':
        path.append('E')

    return [path, label]

In [49]:
## This code does path_invariance checks. This is not in use currently since it utilizes sklearn decision tree.
## This needs to be re-written for the rpart tree.

# def check_path(path): # Returns -1 if path traversed is wrong/non-existant
#     # path = ''.join(path)
#     path = path[1:-1]
#     pred_features = []
#     path_as_strings = []
#     for i in range(len(path)):
#         pred_features.append(int(path[i][:-1]))
#         path_as_strings.append(path[i][-1])
#         # if i%2 == 0:
#         #     print('i -- ', i)
#         #     print('path -- ', path)
#         #     print('path[i] -- ', path[i])
#         #     pred_features.append(int(path[i]))
#         # else:
#         #     path_as_strings.append(path[i])

#     n_nodes = self.clf.tree_.node_count
#     children_left = self.clf.tree_.children_left
#     children_right = self.clf.tree_.children_right
#     feature = self.clf.tree_.feature

#     is_leaves = np.zeros(shape=n_nodes, dtype=bool)
#     stack = [(0, -1)]
#     while len(stack) > 0:
#         node_id, parent_depth = stack.pop()
#         # node_depth[node_id] = parent_depth + 1

#         if (children_left[node_id] != children_right[node_id]):
#             stack.append((children_left[node_id], parent_depth + 1))
#             stack.append((children_right[node_id], parent_depth + 1))
#         else:
#             is_leaves[node_id] = True


#     node = 0
#     pred_target = -1
#     subset_path = False
#     for i in range(len(path_as_strings)):
#         if path_as_strings[i] == 'L':
#             if feature[node]+1 == pred_features[i]:
#                 node = children_left[node]
#             # else:
#                 # pred_target = -1 # Remove for "subset" checks
#                 # break
#         elif path_as_strings[i] == 'R':
#             if feature[node]+1 == pred_features[i]:
#                 node = children_right[node]
#             # else:
#                 # pred_target = -1 # Remove for "subset" checks
#                 # break
#         if is_leaves[node]:
#             for i, x in enumerate(self.clf.tree_.value[node][0]):
#                 if x > 0:
#                     pred_target = i
#             if i < len(path_as_strings):
#                 subset_path = True

#     return pred_target, subset_path

In [69]:
def score():
    count = []
    bleu_score = []
    j_coeff = []
    l_dist = []
    path_mismatch_count = []
    traverse_check_count = []
    order_mismatch_count = []
    subset_path_count = []
    # for i in range(test_data.shape[0]):
    for i in range(50):
        curr_feat = np.array([test_data_.iloc[i, 0:X.shape[1]]])
        path, label = predict(curr_feat)
        actual_path = test_data_.iloc[i, -1]

        actual_path_tok = [label_indices[char] for char in actual_path]
        pred_path_tok = [label_indices[char] for char in path]

        j_coeff.append(get_j_coeff(actual_path_tok, pred_path_tok))

        print('actual vs predicted: ', test_data_.iloc[i, -1], ' vs ', ' '.join(
            path), 'labels: ', y_[i], label[0])
        count.append(y_[i] == label[0])
        # print('Actual path -- ', actual_path)
        # print('Pred path -- ', path)
#         if actual_path != path:
#             print(' -- Path mismatch -- ')
#             if sorted(actual_path) == sorted(path):
#                 print(' -- Order mismatch -- ')
#                 order_mismatch_count.append(1)
#             else:
#                 path_mismatch_count.append(1)
#                 pred_target, subset_path = self.check_path(path)
#                 subset_path_count.append(subset_path)
#                 if pred_target != -1 and pred_target == self.df.iloc[i, self.X.shape[1]+1]:
#                     traverse_check_count.append(1)


        path = list(''.join(path))
        actual_path = list(''.join(test_data_.iloc[i, -1]))
        bleu_score.append(sentence_bleu([actual_path], path))

#         lev_path = []
#         for i in range(len(path)):
#             if i in ['S','L','R','E']:
#                 lev_path.append(i)
#         l_dist.append(distance.levenshtein(
#             self.df.iloc[i, self.X.shape[1]].replace(' ', ''), ''.join(lev_path)))
        l_dist.append(distance.levenshtein(path, actual_path))


    print('\nLabel accuracy - ', np.mean(count))
    print('Path metric (Jaccard) - ', np.mean(j_coeff))
    print('Path metric (Levenshtein) - ', np.mean(l_dist))
#     print('Path mismatch count - ', np.sum(path_mismatch_count))
#     print('Right traverse count - ', np.sum(traverse_check_count))
#     print('Order mismatch count - ', np.sum(order_mismatch_count))
#     print('Subset path count - ', np.sum(subset_path_count))
    print('Bleu score of paths - ', np.mean(bleu_score))

In [70]:
score()

actual vs predicted:  ['S', '3G0', 'E']  vs  S 3G0 E labels:  0 0
actual vs predicted:  ['S', '3G1', '4E0', '3R0', '4D0', 'E']  vs  S 3G1 4E0 3R0 4D0 E labels:  1 1
actual vs predicted:  ['S', '3G1', '4E0', '3R0', '4D0', 'E']  vs  S 3G1 4E0 3R0 4D0 E labels:  1 1
actual vs predicted:  ['S', '3G1', '4E0', '3R0', '4D0', 'E']  vs  S 3G1 4E0 3R0 4D0 E labels:  1 1
actual vs predicted:  ['S', '3G0', 'E']  vs  S 3G0 E labels:  0 0
actual vs predicted:  ['S', '3G0', 'E']  vs  S 3G0 E labels:  0 0
actual vs predicted:  ['S', '3G1', '4E1', '3Q0', '1V1', 'E']  vs  S 3G1 4E1 3Q1 E labels:  2 2
actual vs predicted:  ['S', '3G1', '4E0', '3R1', '4C0', 'E']  vs  S 3G1 4E0 3R0 4D0 E labels:  2 1
actual vs predicted:  ['S', '3G1', '4E1', '3Q1', 'E']  vs  S 3G1 4E1 3Q1 E labels:  2 2
actual vs predicted:  ['S', '3G1', '4E0', '3R0', '4D0', 'E']  vs  S 3G1 4E0 3R0 4D0 E labels:  1 1
actual vs predicted:  ['S', '3G0', 'E']  vs  S 3G0 E labels:  0 0
actual vs predicted:  ['S', '3G1', '4E0', '3R0', '4D0', 'E

### Result interpretation

In [97]:
curr_feat = np.array([test_data.iloc[90, 0:4]])
path, label = predict(curr_feat)

In [98]:
path

['S', '3G1', '4E0', '3R0', '4D0', 'E']

In [100]:
print("Traversed %s nodes:" % (len(path)-2))
for i in range(len(path)):
    if i == (len(path)-1):
        print("%snode=%s Leaf node." % (i*"\t", i))
    else:
        feature_num = path[i][0]
        decision = path[i][-1]
        if decision == '1':
            decision = '>='
        else:
            decision = '<'
        if i > 0:
            label = path[i][1:-1]
            cutpoint = bin_labels[bin_labels['label'] == label].values[0][1]
            print("%snode=%s Test node: Feature %s %s %s" %(i * "\t", i, feature_num, decision, cutpoint))

Traversed 4 nodes:
	node=1 Test node: Feature 3 >= 2.45
		node=2 Test node: Feature 4 < 1.75
			node=3 Test node: Feature 3 < 4.95
				node=4 Test node: Feature 4 < 1.65
					node=5 Leaf node.


---

**Trial runs. Please ignore this section.**

### Path invariance trials

In [46]:
## Import nnum, vnum, nodes, csplit, split_df,
## frame

splits = pd.read_csv('../../data/raw/splits.csv', delimiter=",", index_col=0)
csplit = pd.read_csv('../../data/raw/csplit.csv', delimiter=",")
frame = pd.read_csv('../../data/raw/frame.csv', delimiter=",",index_col=0)

# frame = frame.drop(["Unnamed: 0"], axis=1)
frame = frame.rename(columns={"var": "variable"})
# bin_labels = bin_labels.rename(columns={"Unnamed: 0": "label", "label_list": "bins"})

In [47]:
frame

Unnamed: 0,variable,n,wt,dev,yval,complexity,ncompete,nsurrogate,yval2.,yval2..1,yval2..2,yval2..3,yval2..4,yval2..5,yval2..6,yval2.nodeprob
1,Petal.Length,150,150,100,1,0.5,3,3,1.0,50.0,50.0,50.0,0.333333,0.333333,0.333333,1.0
2,<leaf>,50,50,0,1,0.0,0,0,1.0,50.0,0.0,0.0,1.0,0.0,0.0,0.333333
3,Petal.Width,100,100,50,2,0.44,3,3,2.0,0.0,50.0,50.0,0.0,0.5,0.5,0.666667
6,Petal.Length,54,54,5,2,0.02,3,0,2.0,0.0,49.0,5.0,0.0,0.907407,0.092593,0.36
12,Petal.Width,48,48,1,2,0.01,3,0,2.0,0.0,47.0,1.0,0.0,0.979167,0.020833,0.32
24,<leaf>,47,47,0,2,0.0,0,0,2.0,0.0,47.0,0.0,0.0,1.0,0.0,0.313333
25,<leaf>,1,1,0,3,0.0,0,0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.006667
13,Petal.Width,6,6,2,3,0.01,3,2,3.0,0.0,2.0,4.0,0.0,0.333333,0.666667,0.04
26,Sepal.Length,3,3,1,2,0.01,3,0,2.0,0.0,2.0,1.0,0.0,0.666667,0.333333,0.02
52,<leaf>,2,2,0,2,0.0,0,0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,0.013333


In [59]:
## Generate nnum, vnum, nodes(split_df and csplit - 2L if necessary)

temp_frame = frame

nc = temp_frame[["ncompete", "nsurrogate"]]

index = np.cumsum((frame[["variable"]]!="<leaf>").values + nc[["ncompete"]].values + nc[["nsurrogate"]].values)

index_df = pd.DataFrame((np.insert(index,0,0)+1)[:-1], columns=["i"], index=frame.index)

temp_frame = pd.concat([temp_frame, index_df], axis=1)

# temp_frame[temp_frame[["var"]]=="<leaf>"]
# temp_frame.where(temp_frame[["var"]]=="<leaf>")
# temp_frame.loc[temp_frame[["variable"]]=="<leaf>", "index"] = 0
temp_frame.i[temp_frame.variable == "<leaf>"] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [60]:
temp_frame

Unnamed: 0,variable,n,wt,dev,yval,complexity,ncompete,nsurrogate,yval2.,yval2..1,yval2..2,yval2..3,yval2..4,yval2..5,yval2..6,yval2.nodeprob,i
1,Petal.Length,150,150,100,1,0.5,3,3,1.0,50.0,50.0,50.0,0.333333,0.333333,0.333333,1.0,1
2,<leaf>,50,50,0,1,0.0,0,0,1.0,50.0,0.0,0.0,1.0,0.0,0.0,0.333333,0
3,Petal.Width,100,100,50,2,0.44,3,3,2.0,0.0,50.0,50.0,0.0,0.5,0.5,0.666667,8
6,Petal.Length,54,54,5,2,0.02,3,0,2.0,0.0,49.0,5.0,0.0,0.907407,0.092593,0.36,15
12,Petal.Width,48,48,1,2,0.01,3,0,2.0,0.0,47.0,1.0,0.0,0.979167,0.020833,0.32,19
24,<leaf>,47,47,0,2,0.0,0,0,2.0,0.0,47.0,0.0,0.0,1.0,0.0,0.313333,0
25,<leaf>,1,1,0,3,0.0,0,0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.006667,0
13,Petal.Width,6,6,2,3,0.01,3,2,3.0,0.0,2.0,4.0,0.0,0.333333,0.666667,0.04,23
26,Sepal.Length,3,3,1,2,0.01,3,0,2.0,0.0,2.0,1.0,0.0,0.666667,0.333333,0.02,29
52,<leaf>,2,2,0,2,0.0,0,0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,0.013333,0


In [61]:
nodes = temp_frame[["n", "ncompete", "nsurrogate", "i"]]

In [62]:
nnum = list(temp_frame.index) # row names of temp_frame

In [63]:
feature_names = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]

vnum = list(map(feature_names.index, splits.index))

In [64]:
nodes

Unnamed: 0,n,ncompete,nsurrogate,i
1,150,3,3,1
2,50,0,0,0
3,100,3,3,8
6,54,3,0,15
12,48,3,0,19
24,47,0,0,0
25,1,0,0,0
13,6,3,2,23
26,3,3,0,29
52,2,0,0,0


In [80]:
nnum

[1, 2, 3, 6, 12, 24, 25, 13, 26, 52, 53, 27, 7, 14, 28, 29, 15]

In [81]:
def return_yval(path): # [1,0,0,0]
    node = 0
    nspl = 1
    i = 0
    while nspl != 0:
        npos = nnum[node] # i)0, 
        nspl = nodes.iloc[npos-1][3] # i)1
        var = vnum[nspl]
        # ncat
        temp = splits.iloc[nspl][3]
        if nspl > 0:
            print("nspl succeeded")
            if path[i] == 0: # i)1
                direction = -1
                i+=1
            else:
                direction = 1
                i+=1

            if direction == -1:
                print(node)
                if node == 0:
                    node = 1
                node = 2 * node
            else:
                print(node)
                if node == 0:
                    node = 1
                node = 2 * node + 1
            if len(path) == 1:
                nspl = 0
        else:
            print('nspl failed')
            print("leaf node -- ", node)
            yval = temp_frame.iloc[node][4]
    return temp_frame.iloc[node][4]

In [77]:
nodes.iloc[0]

n             150
ncompete        3
nsurrogate      3
i               1
Name: 1, dtype: int64

In [86]:

path = [1,1,0,1]
a = [1,0,0,0]
b = [1,1,1]
c = [0]
d = [1,1,1,1,1,1,0]
e = [1,0,1,0,0]
return_yval(e)

nspl succeeded
0
nspl failed
leaf node --  3


2

In [30]:
temp_frame

Unnamed: 0,variable,n,wt,dev,yval,complexity,ncompete,nsurrogate,yval2.,yval2..1,yval2..2,yval2..3,yval2..4,yval2..5,yval2..6,yval2.nodeprob,i
0,Petal.Length,150,150,100,1,0.5,3,3,1.0,50.0,50.0,50.0,0.333333,0.333333,0.333333,1.0,1
1,<leaf>,50,50,0,1,0.0,0,0,1.0,50.0,0.0,0.0,1.0,0.0,0.0,0.333333,0
2,Petal.Width,100,100,50,2,0.44,3,3,2.0,0.0,50.0,50.0,0.0,0.5,0.5,0.666667,8
3,Petal.Length,54,54,5,2,0.02,3,0,2.0,0.0,49.0,5.0,0.0,0.907407,0.092593,0.36,15
4,Petal.Width,48,48,1,2,0.01,3,0,2.0,0.0,47.0,1.0,0.0,0.979167,0.020833,0.32,19
5,<leaf>,47,47,0,2,0.0,0,0,2.0,0.0,47.0,0.0,0.0,1.0,0.0,0.313333,0
6,<leaf>,1,1,0,3,0.0,0,0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.006667,0
7,Petal.Width,6,6,2,3,0.01,3,2,3.0,0.0,2.0,4.0,0.0,0.333333,0.666667,0.04,23
8,Sepal.Length,3,3,1,2,0.01,3,0,2.0,0.0,2.0,1.0,0.0,0.666667,0.333333,0.02,29
9,<leaf>,2,2,0,2,0.0,0,0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,0.013333,0


In [33]:
splits

Unnamed: 0,count,ncat,improve,index,adj
Petal.Length,150,-1,50.0,2.45,0.0
Petal.Width,150,-1,50.0,0.8,0.0
Sepal.Length,150,-1,34.16405,5.45,0.0
Sepal.Width,150,1,19.038508,3.35,0.0
Petal.Width,0,-1,1.0,0.8,1.0
Sepal.Length,0,-1,0.92,5.45,0.76
Sepal.Width,0,1,0.833333,3.35,0.5
Petal.Width,100,-1,38.969404,1.75,0.0
Petal.Length,100,-1,37.353535,4.75,0.0
Sepal.Length,100,-1,10.686869,6.15,0.0


In [96]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

---

In [16]:
### Adding bin frequency
path_df

Unnamed: 0,new_col
0,3G0
1,3G0
2,3G0
3,3G0
4,3G0
5,3G0
6,3G0
7,3G0
8,3G0
9,3G0


In [15]:
bin_labels.head()

Unnamed: 0,label,bins
0,A,0.8
1,B,1.35
2,C,1.55
3,D,1.65
4,E,1.75


In [77]:
bin_freq = {}
bin_labels['freq'] = 0
for i, val in path_df.iterrows():
    print(type(val['new_col']))
    label_list = val['new_col'].split(',')
    for j, val2 in enumerate(label_list):
        bin_labels.loc[bin_labels['label'] == val2[1:-1], 'freq'] += 1

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

In [41]:
path_df.iloc[126,]['new_col'].split(',')[1][1:-1]

'E'

In [40]:
path_df

Unnamed: 0,new_col
0,3G0
1,3G0
2,3G0
3,3G0
4,3G0
5,3G0
6,3G0
7,3G0
8,3G0
9,3G0


In [75]:
bin_labels.loc[bin_labels['label'] == 'A','freq'] = 0

In [80]:
bin_labels.loc[bin_labels['freq'] != 0,]

Unnamed: 0,label,bins,freq
2,C,1.55,6
3,D,1.65,48
4,E,1.75,100
6,G,2.45,150
16,Q,4.85,46
17,R,4.95,54
21,V,5.95,3
24,Y,6.95,3


In [84]:
bin_labels

Unnamed: 0,label,bins,freq
0,A,0.8,0
1,B,1.35,0
2,C,1.55,6
3,D,1.65,48
4,E,1.75,100
5,F,1.85,0
6,G,2.45,150
7,H,2.55,0
8,I,2.65,0
9,J,2.85,0
