### Load dataset and paths

In [75]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from keras.layers import Dense, Input, concatenate, GRU, LSTM
from keras import backend as K
from keras.utils import to_categorical
from keras import Model
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics import jaccard_score

In [3]:
names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
         'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
         'hours-per-week', 'native-country', 'target']

data = pd.read_csv('../../data/raw/adult.data.csv', delimiter=",", header=None, names=names)

In [4]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
data.shape

(32561, 15)

In [6]:
data = data[data["workclass"] != " ?"]
data = data[data["occupation"] != " ?"]
data = data[data["native-country"] != " ?"]

# Convert categorical fields #
categorical_col = ['workclass', 'education', 'marital-status', 'occupation',
                   'relationship', 'race', 'sex', 'native-country', 'target']

# categorical_col = ['target']
    
# for col in categorical_col:
#     categories = unique_of(data.col)
#     num_cat = count(categories)
#     for cat in categories:
#         data.col[cat] = index_of(cat in categories)

for col in categorical_col:
    b, c = np.unique(data[col], return_inverse=True)
    data[col] = c

feature_list = names[:14]
# Test train split #
X = data.loc[:, feature_list]
Y = data[['target']]

# Split the dataset into test and train datasets

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.60, random_state=0)

In [7]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,5,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,4,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,2,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,2,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,2,338409,9,13,2,9,5,2,0,0,0,40,4,0


In [8]:
data.shape

(30162, 15)

In [9]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,5,77516,9,13,4,0,1,4,1,2174,0,40,38
1,50,4,83311,9,13,2,3,0,4,1,0,0,13,38
2,38,2,215646,11,9,0,5,1,4,1,0,0,40,38
3,53,2,234721,1,7,2,5,0,2,1,0,0,40,38
4,28,2,338409,9,13,2,9,5,2,0,0,0,40,4


In [10]:
Y.head()

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0


In [11]:
# df = pd.DataFrame(X)

In [12]:
bin_labels = pd.read_csv('../../data/raw/adult_data_bin_labels.csv', delimiter=",")

In [13]:
bin_labels = bin_labels.rename(columns={"Unnamed: 0": "label", "label_list": "bins"})

In [14]:
bin_labels.head()

Unnamed: 0,label,x
0,A,2.5
1,B,3.5
2,C,4.0
3,D,4.5
4,E,5.0


In [15]:
# path_df = pd.read_csv('../../data/raw/rpart_paths.csv', delimiter=",")
# path_df = pd.read_csv('../../data/raw/test_paths.csv', delimiter=",")
path_df = pd.read_csv('../../data/raw/adult_data_paths.csv', delimiter=",")

In [16]:
path_df = path_df.drop(["Unnamed: 0"], axis=1)
path_df = path_df.rename(columns={list(path_df)[0]: "new_col"})

In [17]:
path_df.shape

(10000, 1)

In [18]:
path_df.head()

Unnamed: 0,new_col
0,"8EGN0,11LV0,4EGV0,4EGV1,1BD1,7EGN0,7EGN0,6EGQ0..."
1,"8EGM1,7EGQ1,4EGV1,11LR0,12HV0,14EIQ1,1BJ1,1EH0..."
2,"8EGN0,11LV0,4EGX0,4EGX0,12JN0,7EGS0,12IX0,14EI..."
3,"8EGM1,7EGS0,11LR0,4EGN0,7EGS0,13CT0"
4,"8EGR1,7EGW1,4EGV1,11LR0,12HV0,14EGR1,1BJ0,1AV1..."


In [136]:
data.merge(path_df)

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

In [44]:
X.iloc[0:1000,:-1]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40


In [19]:
test_data = pd.concat([data.iloc[0:10000,1:15].reset_index(), path_df], axis=1)
# test_data = df

In [20]:
test_data

Unnamed: 0,index,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target,new_col
0,0,5,77516,9,13,4,0,1,4,1,2174,0,40,38,0,"8EGN0,11LV0,4EGV0,4EGV1,1BD1,7EGN0,7EGN0,6EGQ0..."
1,1,4,83311,9,13,2,3,0,4,1,0,0,13,38,0,"8EGM1,7EGQ1,4EGV1,11LR0,12HV0,14EIQ1,1BJ1,1EH0..."
2,2,2,215646,11,9,0,5,1,4,1,0,0,40,38,0,"8EGN0,11LV0,4EGX0,4EGX0,12JN0,7EGS0,12IX0,14EI..."
3,3,2,234721,1,7,2,5,0,2,1,0,0,40,38,0,"8EGM1,7EGS0,11LR0,4EGN0,7EGS0,13CT0"
4,4,2,338409,9,13,2,9,5,2,0,0,0,40,4,0,"8EGR1,7EGW1,4EGV1,11LR0,12HV0,14EGR1,1BJ0,1AV1..."
5,5,2,284582,12,14,2,3,5,4,0,0,0,40,38,0,"8EGR1,7EGQ1,4EGY1,11LR0,12HV0,14EIQ1,1BJ1,1EH0..."
6,6,2,160187,6,5,3,7,1,2,0,0,0,16,22,0,"8EGN0,11LV0,4EGS0,4EGS0,12JN0,7EGU0,12IX0,14EH..."
7,7,4,209642,11,9,2,3,0,4,1,0,0,45,38,1,"8EGM1,7EGQ1,4EGX0,11LR0,12HX0,13BN1,1BF1,11JW0..."
8,8,2,45781,12,14,4,9,1,4,0,14084,0,50,38,1,"8EGN0,11LV1,1AJ1,4EGY1"
9,9,2,159449,9,13,2,3,0,4,1,5178,0,40,38,1,"8EGM1,7EGQ1,4EGV1,11LR1,1FN0"


In [96]:
test_data['new_col'][1].split(sep=",")

['6IP1', '7IR1', '1R1', '11CU0', '12CK0', '4IW1', '3DQ0', '2IT0']

In [21]:
new_path = []
for i, val in test_data.iterrows():
    new_path.append(val['new_col'].split(sep=","))

In [22]:
[x.insert(0, 'S') for x in new_path]
[x.append('E') for x in new_path]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [23]:
new_path

[['S',
  '8EGN0',
  '11LV0',
  '4EGV0',
  '4EGV1',
  '1BD1',
  '7EGN0',
  '7EGN0',
  '6EGQ0',
  '13EN0',
  '2EGT0',
  '1DT0',
  '4EGV0',
  'E'],
 ['S',
  '8EGM1',
  '7EGQ1',
  '4EGV1',
  '11LR0',
  '12HV0',
  '14EIQ1',
  '1BJ1',
  '1EH0',
  '11LN0',
  '12HM0',
  '7EGQ1',
  '13AN0',
  'E'],
 ['S',
  '8EGN0',
  '11LV0',
  '4EGX0',
  '4EGX0',
  '12JN0',
  '7EGS0',
  '12IX0',
  '14EIQ0',
  '13BZ0',
  '3MO1',
  '6EGM0',
  '1DI0',
  '6EGM0',
  '1BZ0',
  'E'],
 ['S', '8EGM1', '7EGS0', '11LR0', '4EGN0', '7EGS0', '13CT0', 'E'],
 ['S',
  '8EGR1',
  '7EGW1',
  '4EGV1',
  '11LR0',
  '12HV0',
  '14EGR1',
  '1BJ0',
  '1AV1',
  '8EGR1',
  '3TW1',
  '3EFK0',
  '9EGO0',
  'E'],
 ['S',
  '8EGR1',
  '7EGQ1',
  '4EGY1',
  '11LR0',
  '12HV0',
  '14EIQ1',
  '1BJ1',
  '1EH0',
  '11LN0',
  '12HM0',
  '7EGQ1',
  '13AN1',
  '1CE0',
  '4EGY1',
  '2EGQ1',
  '2EGQ1',
  '8EGR0',
  '4EGY0',
  'E'],
 ['S',
  '8EGN0',
  '11LV0',
  '4EGS0',
  '4EGS0',
  '12JN0',
  '7EGU0',
  '12IX0',
  '14EHR0',
  '13BZ0',
  '3MO1',
  

In [24]:
test_data['new_path'] = new_path

In [25]:
# 3G1', '4E0', '3R0', '4D0
# 3G1', '4E1', '3Q1
# a = [1,0,0,0]
# b = [1,1,1]
# c = [0]

In [26]:
test_data = test_data.drop(["new_col"], axis=1)

In [27]:
test_data.head()

Unnamed: 0,index,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target,new_path
0,0,5,77516,9,13,4,0,1,4,1,2174,0,40,38,0,"[S, 8EGN0, 11LV0, 4EGV0, 4EGV1, 1BD1, 7EGN0, 7..."
1,1,4,83311,9,13,2,3,0,4,1,0,0,13,38,0,"[S, 8EGM1, 7EGQ1, 4EGV1, 11LR0, 12HV0, 14EIQ1,..."
2,2,2,215646,11,9,0,5,1,4,1,0,0,40,38,0,"[S, 8EGN0, 11LV0, 4EGX0, 4EGX0, 12JN0, 7EGS0, ..."
3,3,2,234721,1,7,2,5,0,2,1,0,0,40,38,0,"[S, 8EGM1, 7EGS0, 11LR0, 4EGN0, 7EGS0, 13CT0, E]"
4,4,2,338409,9,13,2,9,5,2,0,0,0,40,4,0,"[S, 8EGR1, 7EGW1, 4EGV1, 11LR0, 12HV0, 14EGR1,..."


In [28]:
paths_lengths = np.array([len(xi)
                          for xi in test_data.iloc[:,-1]])

In [29]:
paths_lengths
np.max(paths_lengths)

32

In [30]:
label_char = []
for _, i in enumerate(np.unique(test_data['new_path'])):
    for _, j in enumerate(i):
        if j not in label_char:
            label_char.append(j)

In [31]:
label_indices = { j : i for i, j in enumerate(label_char) }

In [32]:
label_indices

{'S': 0,
 '8EGM1': 1,
 '7EGN0': 2,
 '11LR0': 3,
 '4EGM0': 4,
 '13CT0': 5,
 'E': 6,
 '4EGN0': 7,
 '13CT1': 8,
 '4EGN1': 9,
 '1CQ0': 10,
 '4EGQ0': 11,
 '4EGS0': 12,
 '4EGT1': 13,
 '1BR0': 14,
 '14EIQ0': 15,
 '1AZ0': 16,
 '4EGT0': 17,
 '13EI0': 18,
 '9EGQ0': 19,
 '3QW1': 20,
 '13CD1': 21,
 '13CH1': 22,
 '2EGN0': 23,
 '1AZ1': 24,
 '12HV0': 25,
 '2EGQ0': 26,
 '9EGQ1': 27,
 '3NY1': 28,
 '3AFU1': 29,
 '3AGJ1': 30,
 '14EIQ1': 31,
 '3DGV0': 32,
 '3CDY0': 33,
 '7EGN1': 34,
 '13AJ1': 35,
 '3AOW0': 36,
 '3AKO1': 37,
 '3CDY1': 38,
 '3CNT0': 39,
 '1BL1': 40,
 '1BP0': 41,
 '3CGU0': 42,
 '1BN1': 43,
 '1BR1': 44,
 '1DY0': 45,
 '14EGQ0': 46,
 '14EHN0': 47,
 '3DRU0': 48,
 '3PS0': 49,
 '3PS1': 50,
 '13BD0': 51,
 '13BD1': 52,
 '8EGM0': 53,
 '2EGN1': 54,
 '1BX1': 55,
 '3BIR0': 56,
 '3ADT1': 57,
 '3BIR1': 58,
 '2EGO0': 59,
 '2EGQ1': 60,
 '1BX0': 61,
 '3ADT0': 62,
 '2EGT1': 63,
 '4EGU1': 64,
 '4EGU0': 65,
 '13CK1': 66,
 '3AFU0': 67,
 '13DO0': 68,
 '3ANY1': 69,
 '13CI0': 70,
 '13BT1': 71,
 '1CO1': 72,
 '1DQ0':

In [33]:
len(label_indices)

1362

In [34]:
indices_label = { i : j for i, j in enumerate(label_char) }
indices_label

{0: 'S',
 1: '8EGM1',
 2: '7EGN0',
 3: '11LR0',
 4: '4EGM0',
 5: '13CT0',
 6: 'E',
 7: '4EGN0',
 8: '13CT1',
 9: '4EGN1',
 10: '1CQ0',
 11: '4EGQ0',
 12: '4EGS0',
 13: '4EGT1',
 14: '1BR0',
 15: '14EIQ0',
 16: '1AZ0',
 17: '4EGT0',
 18: '13EI0',
 19: '9EGQ0',
 20: '3QW1',
 21: '13CD1',
 22: '13CH1',
 23: '2EGN0',
 24: '1AZ1',
 25: '12HV0',
 26: '2EGQ0',
 27: '9EGQ1',
 28: '3NY1',
 29: '3AFU1',
 30: '3AGJ1',
 31: '14EIQ1',
 32: '3DGV0',
 33: '3CDY0',
 34: '7EGN1',
 35: '13AJ1',
 36: '3AOW0',
 37: '3AKO1',
 38: '3CDY1',
 39: '3CNT0',
 40: '1BL1',
 41: '1BP0',
 42: '3CGU0',
 43: '1BN1',
 44: '1BR1',
 45: '1DY0',
 46: '14EGQ0',
 47: '14EHN0',
 48: '3DRU0',
 49: '3PS0',
 50: '3PS1',
 51: '13BD0',
 52: '13BD1',
 53: '8EGM0',
 54: '2EGN1',
 55: '1BX1',
 56: '3BIR0',
 57: '3ADT1',
 58: '3BIR1',
 59: '2EGO0',
 60: '2EGQ1',
 61: '1BX0',
 62: '3ADT0',
 63: '2EGT1',
 64: '4EGU1',
 65: '4EGU0',
 66: '13CK1',
 67: '3AFU0',
 68: '13DO0',
 69: '3ANY1',
 70: '13CI0',
 71: '13BT1',
 72: '1CO1',
 73: '1D

In [35]:
bin_labels

Unnamed: 0,label,x
0,A,2.5
1,B,3.5
2,C,4
3,D,4.5
4,E,5
5,F,5.5
6,G,6
7,H,6.5
8,I,7
9,J,7.5


In [40]:
np.unique(path_df, return_counts=True)

(array(['8EGM1,7EGN0,11LR0,4EGM0,7EGN0,13CT0',
        '8EGM1,7EGN0,11LR0,4EGN0,7EGN0,13CT0',
        '8EGM1,7EGN0,11LR0,4EGN0,7EGN0,13CT1,4EGN1,1CQ0', ...,
        '8EGR1,7EHI0,11LR0,4EHJ1,1BR0,14EIQ0,1AZ1,12HV0,2EGO1,7EHI0',
        '8EGR1,7EHI0,11LR0,4EHJ1,1BR1,7EHI1,1DY0,12HV0,14EIQ1,4EHJ0,14EIQ0,3ANY0,2EGO0,3AFI0,3ACR0,13CW0,8EGR0',
        '8EGR1,7EHI0,11LR0,4EHJ1,1BR1,7EHI1,1DY0,12HV0,14EIQ1,4EHJ0,14EIQ0,3ANY1,13CI0,8EGR1,3DFX0,1BX1,3BUY1,2EGR1,1CM1'],
       dtype=object), array([4, 4, 1, ..., 1, 1, 1]))

In [163]:
test_data

Unnamed: 0,index,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target,new_path
0,0,5,77516,9,13,4,0,1,4,1,2174,0,40,38,0,"[S, 6IR0, 11CV0, 4IW1, 1AC1, 13AT0, 1BS0, 4IW0..."
1,1,4,83311,9,13,2,3,0,4,1,0,0,13,38,0,"[S, 6IP1, 7IR1, 1R1, 11CU0, 12CK0, 4IW1, 3DQ0,..."
2,2,2,215646,11,9,0,5,1,4,1,0,0,40,38,0,"[S, 6IN0, 11CV0, 4IY0, 7IT0, 2IR0, 3DF1, 13BO0..."
3,3,2,234721,1,7,2,5,0,2,1,0,0,40,38,0,"[S, 6IP1, 7IT0, 11CU0, 1S1, 4IO0, 14KV0, 2IR1,..."
4,4,2,338409,9,13,2,9,5,2,0,0,0,40,4,0,"[S, 6IP1, 7IX1, 1R0, 4IW1, 3EY1, E]"
5,5,2,284582,12,14,2,3,5,4,0,0,0,40,38,0,"[S, 6IP1, 7IR1, 1R1, 11CU0, 12CK0, 4IZ0, 2IR0,..."
6,6,2,160187,6,5,3,7,1,2,0,0,0,16,22,0,"[S, 6IQ0, 11CV0, 4IT0, 7IV0, 2IR0, 3DF1, 13BO0..."
7,7,4,209642,11,9,2,3,0,4,1,0,0,45,38,1,"[S, 6IP1, 7IR1, 1R1, 11CU0, 12CK0, 4IY0, 2IT0,..."
8,8,2,45781,12,14,4,9,1,4,0,14084,0,50,38,1,"[S, 6IR0, 11CV1, E]"
9,9,2,159449,9,13,2,3,0,4,1,5178,0,40,38,1,"[S, 6IP1, 7IR1, 1R1, 11CU1, E]"


In [47]:
input_path_sequence = []
next_chars = []
features = []
paths_maxlen = np.max(paths_lengths)
# path_vocab_size = len(bin_labels) # How is this working? Validate!
path_vocab_size = len(indices_label) # Temporary test for local trees
feature_size = 14
# for i in range(0, len(test_data)):
for i in range(0, 4000):
    # get the feature
    curr_feat = np.array([test_data.iloc[i, 0:14]])
    curr_path = test_data.iloc[i, -1]
    curr_path_len = len(curr_path)
    # curr_label = y[i]
    # curr_dec_feat = df.iloc[i, 6]
    for j in range(1, curr_path_len):
        features.append(curr_feat)
        input_path_sequence.append(curr_path[0:j])
        next_chars.append(curr_path[j])

x_path = np.zeros(
    (len(input_path_sequence), paths_maxlen, path_vocab_size), dtype=np.bool)

path_latent_input = np.zeros(
    (len(input_path_sequence), feature_size), dtype=np.float)

y_path = np.zeros(
    (len(input_path_sequence), path_vocab_size), dtype=np.bool)

# print(input_path_sequence)
# print(len(input_path_sequence))
for i, sentence in enumerate(input_path_sequence):
    for t, char in enumerate(sentence):
        # x_path[i, t, self.char_indices[char]] = 1
        # print(bin_labels.index[bin_labels['label'] == char[1]])
        # index = bin_labels.index[bin_labels['label'] == char[1]].tolist()[0]
        x_path[i, t, label_indices[char]] = 1
    # y_path[i, char_indices[next_chars[i]]] = 1
    # index = bin_labels.index[bin_labels['label'] == next_chars[i][1]].tolist()[0]
    # y_path[i, index] = 1
    y_path[i, label_indices[next_chars[i]]] = 1
    path_latent_input[i, :] = features[i]
    
## Trouble with "S" and "E" index values.

In [49]:
x_path.shape

(63632, 32, 1362)

In [48]:
x_path.__array_interface__['data'][0]

139684796837904

In [41]:
len(input_path_sequence)

159391

In [50]:
len(input_path_sequence)

63632

In [51]:
paths_maxlen

32

In [52]:
path_vocab_size

1362

In [53]:
y_path.shape

(63632, 1362)

In [54]:
## Temporary, move to data preprocessing
y = Y[:5000]

In [55]:
y.shape

(5000, 1)

In [190]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,5,77516,9,13,4,0,1,4,1,2174,0,40,38
1,50,4,83311,9,13,2,3,0,4,1,0,0,13,38
2,38,2,215646,11,9,0,5,1,4,1,0,0,40,38
3,53,2,234721,1,7,2,5,0,2,1,0,0,40,38
4,28,2,338409,9,13,2,9,5,2,0,0,0,40,4


In [56]:
feature_size

14

In [57]:
np.unique(y)

array([0, 1])

In [58]:
def _create_label_model(latent_dim=5):
    input_layer = Input(shape=(feature_size,), name='ip_x')
    hidden_layer_x1 = Dense(20, activation='tanh',
                            name='hidden_x1')(input_layer)
    hidden_layer_x2 = Dense(20, activation='tanh',
                            name='hidden_x2')(hidden_layer_x1)
    hidden_layer_x3 = Dense(latent_dim, activation='tanh',
                            name='hidden_x3')(hidden_layer_x2)
    output_layer = Dense(len(np.unique(y)), activation='softmax',
                         name='op_x')(hidden_layer_x3)
    model = Model(input_layer, output_layer)
    return model

def _create_combined_model(initialize=True, rnn_cell='gru', latent_dim=5):

    label_model_latent = Input(shape=(latent_dim,), name='label_ip')
    path_input = Input(shape=(
        paths_maxlen, path_vocab_size), name='dec_feat_ip')
    if rnn_cell == 'gru':
        RNN = GRU
    else:
        RNN = LSTM

    decoder = RNN(latent_dim, return_state=False,
                  return_sequences=False, name='gru_seq')
    if initialize:
        decoder_outputs = decoder(
            path_input, initial_state=label_model_latent)
    else:
        decoder_outputs = decoder(path_input)

    merge_layer = concatenate(
        [label_model_latent, decoder_outputs], name='cat')
    output_chars = Dense(path_vocab_size,
                         activation='softmax', name='op_sent')(merge_layer)
    model = Model(
        [label_model_latent, path_input], output_chars)
    return model

In [59]:
combined_model = _create_combined_model()
label_model = _create_label_model()

Instructions for updating:
Colocations handled automatically by placer.


In [60]:
def get_hidden_x(x, model, layer_num=3):
    def get_hidden_x_inner(model, layer_num=layer_num):
        return K.function([model.layers[0].input], [model.layers[layer_num].output])
    return get_hidden_x_inner(model, layer_num=layer_num)([x])[0]

In [65]:
def fit_model():

    y_cat = to_categorical(y)

    label_model.compile(
        optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    label_model.fit(
        X[0:5000], y_cat, batch_size=30, epochs=150, verbose=0, shuffle=True, validation_split=0.2)

    x_latent = get_hidden_x(path_latent_input, model=label_model)

    combined_model.compile(
        optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    combined_model.fit([x_latent, x_path], y_path,
                           batch_size=50, epochs=300, verbose=1, shuffle=True)

In [66]:
fit_model()

Instructions for updating:
Use tf.cast instead.
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300

Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300


Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300
Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 222/300
Epoch 223/300
Epoch 224/300
Epoch 225/300
Epoch 226/300
Epoch 

Epoch 234/300
Epoch 235/300
Epoch 236/300
Epoch 237/300
Epoch 238/300
Epoch 239/300
Epoch 240/300
Epoch 241/300
Epoch 242/300
Epoch 243/300
Epoch 244/300
Epoch 245/300
Epoch 246/300
Epoch 247/300
Epoch 248/300
Epoch 249/300
Epoch 250/300
Epoch 251/300
Epoch 252/300
Epoch 253/300
Epoch 254/300
Epoch 255/300
Epoch 256/300
Epoch 257/300
Epoch 258/300
Epoch 259/300
Epoch 260/300
Epoch 261/300
Epoch 262/300
Epoch 263/300
Epoch 264/300
Epoch 265/300
Epoch 266/300
Epoch 267/300
Epoch 268/300
Epoch 269/300
Epoch 270/300
Epoch 271/300
Epoch 272/300
Epoch 273/300
Epoch 274/300
Epoch 275/300
Epoch 276/300
Epoch 277/300
Epoch 278/300
Epoch 279/300
Epoch 280/300
Epoch 281/300
Epoch 282/300
Epoch 283/300
Epoch 284/300
Epoch 285/300
Epoch 286/300
Epoch 287/300
Epoch 288/300
Epoch 289/300
Epoch 290/300
Epoch 291/300
Epoch 292/300
Epoch 293/300
Epoch 294/300
Epoch 295/300
Epoch 296/300
Epoch 297/300
Epoch 298/300
Epoch 299/300
Epoch 300/300


In [67]:
combined_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
label_ip (InputLayer)           (None, 5)            0                                            
__________________________________________________________________________________________________
dec_feat_ip (InputLayer)        (None, 32, 1362)     0                                            
__________________________________________________________________________________________________
gru_seq (GRU)                   (None, 5)            20520       dec_feat_ip[0][0]                
                                                                 label_ip[0][0]                   
__________________________________________________________________________________________________
cat (Concatenate)               (None, 10)           0           label_ip[0][0]                   
          

In [72]:
def predict(x):
    latent_dim = 5
    x_f = x.reshape(1, feature_size)
    token = 'S'
    cont = True
    path = [token]
    x_path = np.zeros((1, paths_maxlen, path_vocab_size), dtype=np.bool)

    x_latent = get_hidden_x(x_f, model=label_model)
    x_latent = x_latent.reshape(1, latent_dim)
    x_path[0, 0, label_indices[token]] = 1
    pred = label_model.predict(x_f)
    label = [np.argmax(pred[0])]
    index = 1
    while cont & (index < paths_maxlen):
        pred = combined_model.predict([x_latent, x_path])
        char_index = np.argmax(pred[0])
        x_path[0, index, char_index] = 1
        next_char = indices_label[char_index]
        path.append(next_char)
        index += 1
        if next_char == 'E':
            cont = False
        # elif index == self.paths_maxlen - 1:
        #     path.append('E')

    if path[-1] != 'E':
        path.append('E')

    return [path, label]

In [195]:
def check_path(path): # Returns -1 if path traversed is wrong/non-existant
    # path = ''.join(path)
    path = path[1:-1]
    pred_features = []
    path_as_strings = []
    for i in range(len(path)):
        pred_features.append(int(path[i][:-1]))
        path_as_strings.append(path[i][-1])
        # if i%2 == 0:
        #     print('i -- ', i)
        #     print('path -- ', path)
        #     print('path[i] -- ', path[i])
        #     pred_features.append(int(path[i]))
        # else:
        #     path_as_strings.append(path[i])

    n_nodes = self.clf.tree_.node_count
    children_left = self.clf.tree_.children_left
    children_right = self.clf.tree_.children_right
    feature = self.clf.tree_.feature

    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, -1)]
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        # node_depth[node_id] = parent_depth + 1

        if (children_left[node_id] != children_right[node_id]):
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            is_leaves[node_id] = True


    node = 0
    pred_target = -1
    subset_path = False
    for i in range(len(path_as_strings)):
        if path_as_strings[i] == 'L':
            if feature[node]+1 == pred_features[i]:
                node = children_left[node]
            # else:
                # pred_target = -1 # Remove for "subset" checks
                # break
        elif path_as_strings[i] == 'R':
            if feature[node]+1 == pred_features[i]:
                node = children_right[node]
            # else:
                # pred_target = -1 # Remove for "subset" checks
                # break
        if is_leaves[node]:
            for i, x in enumerate(self.clf.tree_.value[node][0]):
                if x > 0:
                    pred_target = i
            if i < len(path_as_strings):
                subset_path = True

    return pred_target, subset_path

In [68]:
def jaccard_score_inconsistent(x, y):
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality/float(union_cardinality)

def get_j_coeff(a, b):
    if len(a) != len(b):
        return jaccard_score_inconsistent(a, b)
    return jaccard_score(a, b, average='micro')

In [205]:
test_data.iloc[:,0:14].head()

Unnamed: 0,index,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,0,5,77516,9,13,4,0,1,4,1,2174,0,40,38
1,1,4,83311,9,13,2,3,0,4,1,0,0,13,38
2,2,2,215646,11,9,0,5,1,4,1,0,0,40,38
3,3,2,234721,1,7,2,5,0,2,1,0,0,40,38
4,4,2,338409,9,13,2,9,5,2,0,0,0,40,4


In [213]:
# test_data.iloc[1, 0:X.shape[1]]
test_data.head()

Unnamed: 0,index,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target,new_path
0,0,5,77516,9,13,4,0,1,4,1,2174,0,40,38,0,"[S, 6IR0, 11CV0, 4IW1, 1AC1, 13AT0, 1BS0, 4IW0..."
1,1,4,83311,9,13,2,3,0,4,1,0,0,13,38,0,"[S, 6IP1, 7IR1, 1R1, 11CU0, 12CK0, 4IW1, 3DQ0,..."
2,2,2,215646,11,9,0,5,1,4,1,0,0,40,38,0,"[S, 6IN0, 11CV0, 4IY0, 7IT0, 2IR0, 3DF1, 13BO0..."
3,3,2,234721,1,7,2,5,0,2,1,0,0,40,38,0,"[S, 6IP1, 7IT0, 11CU0, 1S1, 4IO0, 14KV0, 2IR1,..."
4,4,2,338409,9,13,2,9,5,2,0,0,0,40,4,0,"[S, 6IP1, 7IX1, 1R0, 4IW1, 3EY1, E]"


In [217]:
X.iloc[10]

age                   37
workclass              2
fnlwgt            280464
education             15
education-num         10
marital-status         2
occupation             3
relationship           0
race                   2
sex                    1
capital-gain           0
capital-loss           0
hours-per-week        80
native-country        38
Name: 10, dtype: int64

In [69]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,5,77516,9,13,4,0,1,4,1,2174,0,40,38
1,50,4,83311,9,13,2,3,0,4,1,0,0,13,38
2,38,2,215646,11,9,0,5,1,4,1,0,0,40,38
3,53,2,234721,1,7,2,5,0,2,1,0,0,40,38
4,28,2,338409,9,13,2,9,5,2,0,0,0,40,4


In [73]:
def score():
    count = []
    bleu_score = []
    j_coeff = []
    l_dist = []
    path_mismatch_count = []
    traverse_check_count = []
    order_mismatch_count = []
    subset_path_count = []
    # for i in range(test_data.shape[0]):
    for i in range(X[0:1000].shape[0]):
        curr_feat = np.array([X.iloc[i]])
        path, label = predict(curr_feat)
        actual_path = test_data.iloc[i, -1]

        actual_path_tok = [label_indices[char] for char in actual_path]
        pred_path_tok = [label_indices[char] for char in path]

        j_coeff.append(get_j_coeff(actual_path_tok, pred_path_tok))

        print('actual vs predicted: ', test_data.iloc[i, -1], ' vs ', ' '.join(
            path), 'labels: ', y.iloc[i].values[0], label[0])
        count.append(y.iloc[i].values[0] == label[0])
        # print('Actual path -- ', actual_path)
        # print('Pred path -- ', path)
#         if actual_path != path:
#             print(' -- Path mismatch -- ')
#             if sorted(actual_path) == sorted(path):
#                 print(' -- Order mismatch -- ')
#                 order_mismatch_count.append(1)
#             else:
#                 path_mismatch_count.append(1)
#                 pred_target, subset_path = self.check_path(path)
#                 subset_path_count.append(subset_path)
#                 if pred_target != -1 and pred_target == self.df.iloc[i, self.X.shape[1]+1]:
#                     traverse_check_count.append(1)


        path = list(''.join(path))
        actual_path = list(''.join(test_data.iloc[i, -1]))
        bleu_score.append(sentence_bleu([actual_path], path))

#         lev_path = []
#         for i in range(len(path)):
#             if i in ['S','L','R','E']:
#                 lev_path.append(i)
#         l_dist.append(distance.levenshtein(
#             self.df.iloc[i, self.X.shape[1]].replace(' ', ''), ''.join(lev_path)))


    print('\nLabel accuracy - ', np.mean(count))
    print('Path metric (Jaccard) - ', np.mean(j_coeff))
#     print('Path metric (Levenshtein) - ', np.mean(l_dist))
#     print('Path mismatch count - ', np.sum(path_mismatch_count))
#     print('Right traverse count - ', np.sum(traverse_check_count))
#     print('Order mismatch count - ', np.sum(order_mismatch_count))
#     print('Subset path count - ', np.sum(subset_path_count))
    print('Bleu score of paths - ', np.mean(bleu_score))

In [235]:
y.iloc[0].values[0]

0

In [76]:
score()

actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGV0', '4EGV1', '1BD1', '7EGN0', '7EGN0', '6EGQ0', '13EN0', '2EGT0', '1DT0', '4EGV0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGQ1', '4EGV1', '11LR0', '12HV0', '14EIQ1', '1BJ1', '1EH0', '11LN0', '12HM0', '7EGQ1', '13AN0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGS0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGM0', '1DI0', '6EGM0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGS0', '11LR0', '4EGN0', '7EGS0', '13CT0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGR1', '7EGW1', '4EGV1', '11LR0', '12HV0', '14EGR1', '1BJ0', '1AV1', '8EGR1', '3TW1', '3EFK0', '9EGO0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E l

actual vs predicted:  ['S', '8EGM1', '7EGW1', '4EGV1', '11LR0', '12HV0', '14EIQ1', '1BJ1', '1EH0', '11LN0', '12HM0', '7EGW0', '4EGV0', '14EIQ1', '3ANV0', '8EGM0', '4EGV0', '1BX1', '3SU1', '7EGW0', '2EGS0', '13DE0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGZ1', '4EGV1', '11LR0', '12HV0', '14EIQ1', '1BJ0', '1AV0', '13CF1', '3DIY0', '7EGZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGQ0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGN0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGR0', '1DI0', '6EGR1', '3DNZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGS0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGW1', '4EGV1',

actual vs predicted:  ['S', '8EGP0', '11LV0', '4EGN0', '4EGN0', '12JN0', '7EGU0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGR0', '11LR0', '4EGR0', '7EGR0', '13CT0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGY0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGU0', '11LR0', '4EGV1', '1BR1', '7EGU0', '14EGR0', '12HU0', '4EGV0', '2EGQ1', '14EGR0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGR1', '7EGU0', '11LR0', '4EGX1', '1BR1', '7EGU0', '14EIQ0', '12HU0', '4EGX0', '2EGQ1', '14EIQ1', '5T0', '13FK0', '14EIQ0', '1BT1', '1CM1', '7EGU0', '3EFP0'

actual vs predicted:  ['S', '8EGM1', '7EGP0', '11LR0', '4EGU1', '1BR1', '7EGP1', '1DY0', '12HV0', '14EIQ1', '4EGU0', '14EIQ0', '3ANY1', '13CI0', '8EGM0', '13BT1', '2EGO1', '3DFK0', '3BSW1', '1CY0', '4EGU1', '1CS1', '2EGO1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGM1', '7EGY1', '4EHJ0', '11LR0', '12HX0', '13BN1', '1BF1', '11JW0', '3APX1', '14EIQ0', '1EJ0', '14EIQ1', '4EHJ1', '13DY0', '3CUZ1', '3DJY0', '3DGW0', '7EGY0', '1CS1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGP1', '1BD0', '6EGQ0', '4EGX0', '6EGQ0', '6EGQ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGP0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGU0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0


actual vs predicted:  ['S', '8EGM1', '7EGP0', '11LR0', '4EGX1', '1BR1', '7EGP1', '1DY0', '12HV0', '14EIQ1', '4EGX0', '14EIQ0', '3ANY1', '13CI1', '4EGX0', '2EGQ1', '13ET0', '13CK1', '3DFL0', '3CLP0', '3CEV0', '3CDT0', '13CO0', '1DV0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGV0', '4EGV1', '1BD0', '12IO0', '13FS0', '6EGQ0', '3DIO0', '3UW1', '1AZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGQ0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGY0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGR0', '1DI1', '3NS1', '7EGY1', '4EHJ0', '3BHT1', '3BKM1', '3DNY1', '3DTU1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGP1', '1BD0', '6EGM0', '4EGX0', '6EGM0', '6EGM1', '7EGP0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs 

actual vs predicted:  ['S', '8EGM1', '7EGP0', '11LR0', '4EGV1', '1BR1', '7EGP1', '1DY0', '12HV0', '14EIQ1', '4EGV0', '14EIQ0', '3ANY1', '13CI0', '8EGM0', '13BT1', '2EGO1', '3DFK1', '13CD0', '1BU1', '3DGT1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGZ1', '4EHJ0', '11LR0', '12HX0', '13BN1', '1BF1', '11JW0', '3APX0', '4EHJ1', '1CG1', '3APR1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGZ1', '4EGV1', '11LR0', '12HV0', '14EIQ1', '1BJ1', '1EH0', '11LN0', '12HM0', '7EGZ0', '4EGV0', '14EIQ1', '3ANV1', '3DWZ0', '3BEX0', '11JX0', '4EGV1', '2EGN0', '3AXZ1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGO0', '11LV0', '4EGQ0', '4EGQ0', '12JN0', '7EHI1', '1BD0', '6EGO1', '3CJL0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0

actual vs predicted:  ['S', '8EGQ0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGY0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGM0', '1DI0', '6EGM0', '1BZ1', '1CE0', '4EGX0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGN0', '12IX0', '14EIQ0', '13BZ1', '2EGQ0', '7EGN0', '6EGQ0', '13CE1', '4EHJ1', '3BEF1', '3BEZ1', '1AH1', '1BX0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGO0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGP1', '1BD1', '14EIQ0', '13CK0', '3QU1', '4EHJ1', '1DG0', '13BW1', '6EGM0', '12HI0', '1CG0', '8EGO0', '7EGP1', '3ARZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EHI1', '1BJ0', '14EIQ0', '3AFS0', '1AZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGM1', '7EGU0', '11LR0', '

actual vs predicted:  ['S', '8EGR1', '7EGP0', '11LR0', '4EGX1', '1BR0', '14EIQ0', '1AZ1', '12HV0', '2EGQ0', '9EGQ1', '3NY1', '3AFU1', '3AGJ1', '14EIQ1', '3DGV0', '3CDY1', '2EGQ0', '3CNT1', '1BL0', '3DFT0', '7EGP1', '3CQZ0', '1BI1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGM1', '7EGQ1', '4EGX0', '11LR0', '12HX0', '13BN1', '1BF1', '11JW0', '3APX0', '4EGX1', '1CG1', '3APR0', '2EGQ1', '3ANW0', '1DU1', '3ACH1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGP0', '11LV0', '4EGS0', '4EGS0', '12JN0', '7EGU0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGR0', '11LR0', '4EGY1', '1BR1', '7EGR0', '14EIQ0', '12HU0', '4EGY1', '1CR1', 'E']  vs  S 11LR1 11LR1 11LR1 11LR1 E labels:  0 1
actual vs predicted:  ['S', '8EGP0', '11LV0', '4EGY0',

actual vs predicted:  ['S', '8EGM1', '7EGN0', '11LR0', '4EGT1', '1BR0', '14EIQ0', '1AZ1', '12HV0', '2EGQ0', '9EGQ1', '3NY1', '3AFU1', '3AGJ1', '14EIQ1', '3DGV0', '3CDY1', '2EGQ0', '3CNT0', '7EGN0', '1BL1', '1BP0', '7EGN1', '3CGU0', '1BN1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGM1', '7EGR0', '11LR0', '4EHJ1', '1BR1', '7EGR0', '14EIQ0', '12HU0', '4EHJ0', '2EGR1', '14EIQ1', '5T0', '13FK0', '14EIQ0', '1BT1', '1CM0', '1CG0', '3AGX0', '3AET0', '3QZ0', '1BY1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGV0', '4EGV1', '1BD0', '12IO0', '13FS0', '6EGQ0', '3DIO0', '3UW1', '1AZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGQ1', '4EGY1', '11LR1', '1FN0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGV0

actual vs predicted:  ['S', '8EGM1', '7EGU0', '11LR0', '4EGX1', '1BR0', '14EIQ0', '1AZ0', '4EGX0', '13EI0', '14EIQ0', '9EGQ0', '3QW1', '13CD0', '2EGQ0', '3BCP1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGQ1', '4EGV1', '11LR0', '12HV0', '14EIQ1', '1BJ1', '1EH0', '11LN0', '12HM0', '7EGQ1', '13AN1', '1CE0', '4EGV0', '13DO0', '13CN0', '3BHV0', '3ALR1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGM1', '7EHI0', '11LR0', '4EGN0', '7EHI1', '4EGN0', '13CT0', '11JV0', '2EGO0', '8EGM0', '2EGO1', '1AS1', '9EGQ0', '3BCG1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGN0', '11LR0', '4EGX1', '1BR1', '7EGN1', '1DY0', '12HV0', '14EIQ1', '4EGX0', '14EIQ0', '3ANY0', '2EGN1', '7EGN1', '3ACW1', '1CM0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S'

actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGT0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGR0', '1DI0', '6EGR1', '3DNZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGR1', '7EGN0', '11LR0', '4EHJ1', '1BR1', '7EGN1', '1DY0', '12HV1', '12IL1', '7EGN1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGP0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGN0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGO0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGU0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGR1', '7EGU0', '11LR0', '4EGX1', '1BR0', '14EIQ0', '1AZ1', '12HV0', '2EGQ0', '9EGQ1', '3NY1', '3

actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGY0', '4EGY1', '1BD1', '7EGW0', '7EGW1', '2EGN1', '6EGQ0', '8EGN0', '1BF1', '1DV0', '9EGQ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGS0', '12IX0', '14EGS0', '13BZ1', '2EGQ0', '7EGS0', '6EGQ0', '13CE1', '4EGX0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGQ1', '4EGX0', '11LR0', '12HX0', '13BN1', '1BF1', '11JW0', '3APX1', '14EIQ0', '1EJ0', '14EIQ1', '4EGX0', '7EGQ0', '3BGN1', '2EGR1', '13CJ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGM1', '7EGW1', '4EGY1', '11LR0', '12HV0', '14EIQ1', '1BJ1', '1EH0', '11LN0', '12HM0', '7EGW0', '4EGY0', '14EIQ1', '3ANV1', '3DWZ0', '3BEX1', '1BT1', '13CW0', '2EGO1', '4EGY1', '3BNP1', '1CQ0', '1CM1', '2EGO1', '1CO1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 

actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGV0', '4EGV1', '1BD0', '12IO0', '13FS0', '6EGQ0', '3DIO0', '3UW1', '1AZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGR1', '7EGN0', '11LR0', '4EGX1', '1BR1', '7EGN1', '1DY0', '12HV0', '14EIQ1', '4EGX0', '14EIQ0', '3ANY1', '13CI1', '4EGX0', '2EGQ1', '13ET0', '13CK1', '3DFL0', '3CLP0', '3CEV0', '3CDT0', '13CO1', '7EGN0', '13DY0', '1BU0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGQ0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EHI1', '1BD1', '14EIQ0', '13CK0', '3QU1', '4EGX0', '1CI1', '1DG0', '8EGQ0', '7EHI0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EHI0', '11LR0', '4EHJ1', '1BR1', '7EHI1', '1DY0', '12HV0', '14EIQ1', '4EHJ0', '14EIQ0', '3ANY0', '2EGQ0', '3AFI0', '3ACR0', '13CW0', '8EGM1', '4EHJ0', '1DE0', '1CO0', '3MZ1', '13CE0', '3ABV0', '3ABH0'

actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGV0', '4EGV1', '1BD0', '12IO0', '13FS0', '6EGQ0', '3DIO0', '3UW1', '1AZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGV0', '4EGV1', '1BD0', '12IO0', '13FS0', '6EGQ0', '3DIO1', '3DIP1', '7EGQ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGP0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGX1', '1BD0', '6EGQ0', '4EHJ0', '6EGQ0', '6EGQ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGQ1', '4EHJ0', '11LR0', '12HX0', '13BN1', '1BF1', '11JW0', '3APX0', '4EHJ1', '1CG1', '3APR0', '2EGQ1', '3ANW0', '1DU0', '2EGQ0', '3NX1', '3AHQ1', '3AJY1', '1CZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGQ0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGN0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6E

actual vs predicted:  ['S', '8EGR1', '7EGU0', '11LR0', '4EGM0', '7EGU0', '13CT0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGX0', '11LR0', '4EHJ1', '1BR0', '14EIQ0', '1AZ1', '12HV0', '2EGO1', '7EGX1', '1BF1', '3CSU0', '7EGX1', '4EHJ1', '3CFW1', '1BN1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGP0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGS0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGQ0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGP1', '1BD0', '6EGQ0', '4EHJ0', '6EGQ0', '6EGQ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGQ1', '1BD1', '14EIQ0', '13CK1', '1CQ0', '6EGM0', '4EHJ1', '3UX1', '11LP0', '8EGN1', '7EGQ0', '

actual vs predicted:  ['S', '8EGP0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGY0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGN0', '12IX0', '14EIQ0', '13BZ1', '2EGQ0', '7EGN0', '6EGQ0', '13CE1', '4EHJ1', '3BEF0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGP0', '11LV0', '4EGN0', '4EGN0', '12JN0', '7EGU0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGP0', '11LR0', '4EGM0', '7EGP1', '4EGM1', '12GI1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGV0', '4EGV1', '1BD1', '7EGW0', '7EGW1', '2EGR1', '6EGM0', '8EGN0', '1BF1', '1DV0', '9EGQ0',

actual vs predicted:  ['S', '8EGM1', '7EGQ1', '4EHJ0', '11LR0', '12HX0', '13BN1', '1BF1', '11JW0', '3APX1', '14EIQ0', '1EJ0', '14EIQ1', '4EHJ1', '13DY0', '3CUZ0', '2EGQ1', '9EGQ1', '7EGQ0', '13DE0', '13CI0', '7EGQ0', '2EGQ1', '3BFZ1', '1DQ0', '3CMO0', '1CU0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGM1', '7EGY1', '4EGV1', '11LR0', '12HV0', '14EIQ1', '1BJ1', '1EH0', '11LN0', '12HM0', '7EGY0', '4EGV0', '14EIQ1', '3ANV1', '3DWZ0', '3BEX1', '1BT1', '13CW0', '2EGQ1', '4EGV1', '3BNP1', '1CQ1', '1DO0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGQ0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGU0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ1', '1CE1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGQ0', '11LV0', '4EGM0', '4EGM0', '12JN0', '7EGU0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGR

actual vs predicted:  ['S', '8EGM1', '7EHI0', '11LR0', '4EHJ1', '1BR0', '14EIQ0', '1AZ0', '4EHJ0', '13EI0', '14EIQ0', '9EGQ0', '3QW1', '13CD0', '2EGT0', '3BCP1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGP0', '11LR0', '4EGX1', '1BR1', '7EGP1', '1DY0', '12HV0', '14EIQ1', '4EGX0', '14EIQ0', '3ANY1', '13CI0', '8EGM0', '13BT1', '2EGQ0', '1CO0', '3APW1', '3BFM1', '3BIW1', '9EGO1', '1BX1', '7EGP1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGS0', '4EGS0', '12JN0', '7EGT0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGQ1', '1BD1', '14EIQ0', '13CK0', '3QU1', '4EHJ1', '1DG0', '13BW1', '6EGM0', '12HI0', '1CG1', '3DLR0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


actual vs predicted:  ['S', '8EGP0', '11LV0', '4EGV0', '4EGV1', '1BD1', '7EGQ1', '1CG1', '13BW1', '12HK0', '2EGQ0', '3ACO0', '6EGM1', '1CI1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGM1', '7EGU0', '11LR0', '4EGO1', '1BR1', '7EGU0', '14EHR0', '12HU0', '4EGO1', '1CR0', '1CE0', '7EGU0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGY0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ1', '1CE1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGP0', '11LR0', '4EGX1', '1BR1', '7EGP1', '1DY0', '12HV0', '14EIQ1', '4EGX0', '14EIQ0', '3ANY1', '13CI0', '8EGM0', '13BT0', '2EGQ0', '7EGP0', '4EGX0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGO0', '11LV0', '4EGN0', '4EGN0', '12JN0', '

actual vs predicted:  ['S', '8EGR1', '7EGW1', '4EGV1', '11LR0', '12HV0', '14EIQ1', '1BJ1', '1EH0', '11LN0', '12HM0', '7EGW0', '4EGV0', '14EIQ1', '3ANV1', '3DWZ0', '3BEX0', '11JX0', '4EGV1', '2EGO1', '3AOQ1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGM1', '7EHI0', '11LR1', '1DU0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGP0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGN0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGP0', '11LV0', '4EGN0', '4EGN0', '12JN0', '7EGP1', '1BD0', '6EGQ0', '4EGN0', '6EGQ0', '6EGQ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGW1', '4EGV1', '11LR0', '12HV0', '14EIQ1', '1BJ1', '1EH0', '11LN0', '12HM0', '7EGW0', '4EGV0', '14EIQ1', '3ANV1

actual vs predicted:  ['S', '8EGM1', '7EGQ1', '4EHJ0', '11LR0', '12HX0', '13BN1', '1BF1', '11JW0', '3APX0', '4EHJ1', '1CG0', '3AEK0', '11HL0', '1BP1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGV0', '4EGV1', '1BD1', '7EGY1', '1CG0', '3DNW0', '9EGP1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGN0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGM0', '1DI0', '6EGM0', '1BZ1', '1CE1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGR0', '11LR0', '4EGX1', '1BR1', '7EGR0', '14EIQ0', '12HU1', '3BDQ0', '3OY1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGP0', '11LV0', '4EGO0', '4EGO0', '12JN0', '7EGT0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ0', 'E']  vs  S 8

actual vs predicted:  ['S', '8EGM1', '7EGQ1', '4EGX0', '11LR0', '12HX0', '13BN1', '1BF1', '11JW0', '3APX0', '4EGX1', '1CG0', '3AEK1', '1BH1', '3ALQ0', '7EGQ1', '2EGQ1', '3AHS1', '1BT0', '4EGX0', '1BR0', '8EGM0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGQ0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EHI1', '1BD1', '14EIQ0', '13CK0', '3QU0', '6EGS1', '2EGQ1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGU0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGS0', '1DI1', '3NS1', '7EGU0', '3DRX0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGQ0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGT0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGR0', '1DI0', '6EGR1', '3DNZ1', '3DOU1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8

actual vs predicted:  ['S', '8EGP0', '11LV0', '4EGN0', '4EGN0', '12JN0', '7EGN0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGR0', '11LR0', '4EGX1', '1BR1', '7EGR0', '14EHU0', '12HU0', '4EGX0', '2EGQ1', '14EHU0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGQ1', '4EGN0', '11LR0', '12HX0', '13BN1', '1BF1', '11JW0', '3APX1', '14EIQ0', '1EJ0', '14EIQ1', '4EGN0', '7EGQ0', '3BGN1', '2EGQ0', '3DIM0', '3CWZ1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGQ1', '4EHJ0', '11LR0', '12HX0', '13BN1', '1BF0', '7EGQ0', '14EIQ0', '2EGR0', '8EGM0', '4EHJ1', '13CK1', '3BGY0', '3BES0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGT0', '11LR0', '4EGX1', '1B

actual vs predicted:  ['S', '8EGM1', '7EGU0', '11LR0', '4EGR0', '7EGU0', '13CT1', '4EGR0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGP0', '11LR0', '4EGS0', '7EGP1', '4EGS0', '13CT0', '11JV0', '2EGS0', '8EGM0', '2EGS1', '1AS1', '9EGQ0', '3BCG1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGP0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGS0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGY0', '12IX0', '14EIQ0', '13BZ1', '2EGQ0', '7EGY1', '1CO0', '4EGX0', '3BPZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGT0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGM0', '1DI0', '6EGM0', '1BZ1'

actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGV0', '4EGV1', '1BD1', '7EGY1', '1CG0', '3DNW0', '9EGQ0', '13CK0', '3AMN0', '3AHO1', '10EGN1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGQ0', '11LV0', '4EGT0', '4EGT0', '12JN0', '7EGU0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGM0', '1DI0', '6EGM0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGT0', '11LR0', '4EGX1', '1BR0', '14EIQ0', '1AZ1', '12HV0', '2EGQ0', '9EGO0', '7EGT0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGQ1', '4EGX0', '11LR0', '12HX0', '13BN1', '1BF1', '11JW0', '3APX1', '14EIQ0', '1EJ0', '14EIQ1', '4EGX0', '7EGQ0', '3BGN1', '2EGR1', '13CJ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGU0', '12IX0', '14EIQ

actual vs predicted:  ['S', '8EGN0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGY0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGQ0', '1DI0', '6EGQ0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EHJ0', '4EHJ0', '12JN0', '7EGQ1', '1BD0', '6EGQ0', '4EHJ0', '6EGQ0', '6EGQ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGW1', '4EGY1', '11LR0', '12HV0', '14EIQ1', '1BJ1', '1EH0', '11LN0', '12HM0', '7EGW0', '4EGY0', '14EIQ1', '3ANV1', '3DWZ0', '3BEX1', '1BT1', '13CW0', '2EGT0', '1CT0', '4EGY1', '3CHV0', '1CF0', '1BZ0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGU0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGS0', '1DI1', '3NS1', '7EGU0', '3DRX0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted: 

actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGX0', '4EGX0', '12JN0', '7EGT0', '12IX0', '14EIQ0', '13BZ0', '3MO1', '6EGM0', '1DI0', '6EGM0', '1BZ1', '1CE1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGP0', '11LV0', '4EGU0', '4EGU0', '12JN0', '7EHI1', '1BD1', '14EIQ0', '13CK0', '3QU1', '4EGU1', '1DG0', '13BW1', '6EGQ0', '12HI0', '1CG0', '8EGP0', '7EHI0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGT0', '11LR0', '4EGX1', '1BR1', '7EGT1', '1DY0', '12HV0', '14EIQ1', '4EGX0', '14EIQ0', '3ANY1', '13CI0', '8EGM0', '13BT1', '2EGQ0', '1CO1', '1DQ0', '11HL1', '11LN0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  1 0
actual vs predicted:  ['S', '8EGM1', '7EGW1', '4EHJ0', '11LR0', '12HX0', '13BN1', '1BF1', '11JW0', '3APX0', '4EHJ1', '1CG0', '3AEK1', '1BH1', '3ALQ1', '7EGW1', '1BL1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 

actual vs predicted:  ['S', '8EGM1', '7EGY1', '4EGV1', '11LR0', '12HV0', '14EIQ1', '1BJ0', '1AV1', '8EGM0', '4EGV1', '3DNT0', '2EGQ0', '7EGY0', '3DHY0', '3CTU0', '3BGV0', '3AGW0', '7EGY0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EHI0', '11LR0', '4EGO1', '1BR0', '14EIQ0', '1AZ1', '12HV0', '2EGS0', '9EGQ1', '3NY1', '3AFU1', '3AGJ1', '14EIQ1', '3DGV0', '3CDY0', '7EHI0', '3AHR1', '1BJ0', '2EGS0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGN0', '11LV0', '4EGU0', '4EGU0', '12JN0', '7EGP1', '1BD1', '14EIQ0', '13CK0', '3QU1', '4EGU1', '1DG0', '13BW1', '6EGM0', '12HI0', '1CG0', '8EGN0', '7EGP1', '3ARZ1', '3ASW1', '9EGQ1', '10EGM0', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 12HV0 2EGQ0 E labels:  0 0
actual vs predicted:  ['S', '8EGM1', '7EGW1', '4EGY1', '11LR0', '12HV1', '14EIQ1', '1BC1', '1EF0', '4EGY1', 'E']  vs  S 8EGM1 8EGM1 8EGM1 4EGX0 4EGX0 11LR0 

### Result interpretation

In [97]:
# 1) Basic workflow interpretation
# 2) Cover special cases - Path invariance demo
#   - Same nodes, arranged seperately
#   - Different nodes, final output is correct

# for i in range(test_data.shape[0]):
curr_feat = np.array([test_data.iloc[90, 0:4]])
path, label = predict(curr_feat)

In [98]:
path

['S', '3G1', '4E0', '3R0', '4D0', 'E']

In [100]:
print("Traversed %s nodes:" % (len(path)-2))
for i in range(len(path)):
    if i == (len(path)-1):
        print("%snode=%s Leaf node." % (i*"\t", i))
    else:
        feature_num = path[i][0]
        decision = path[i][-1]
        if decision == '1':
            decision = '>='
        else:
            decision = '<'
        if i > 0:
            label = path[i][1:-1]
            cutpoint = bin_labels[bin_labels['label'] == label].values[0][1]
            print("%snode=%s Test node: Feature %s %s %s" %(i * "\t", i, feature_num, decision, cutpoint))

Traversed 4 nodes:
	node=1 Test node: Feature 3 >= 2.45
		node=2 Test node: Feature 4 < 1.75
			node=3 Test node: Feature 3 < 4.95
				node=4 Test node: Feature 4 < 1.65
					node=5 Leaf node.


In [96]:
bin_labels.head()

Unnamed: 0,label,bins
0,A,0.8
1,B,1.35
2,C,1.55
3,D,1.65
4,E,1.75


### Path invariance trials

In [46]:
## Import nnum, vnum, nodes, csplit, split_df,
## frame

splits = pd.read_csv('../../data/raw/splits.csv', delimiter=",", index_col=0)
csplit = pd.read_csv('../../data/raw/csplit.csv', delimiter=",")
frame = pd.read_csv('../../data/raw/frame.csv', delimiter=",",index_col=0)

# frame = frame.drop(["Unnamed: 0"], axis=1)
frame = frame.rename(columns={"var": "variable"})
# bin_labels = bin_labels.rename(columns={"Unnamed: 0": "label", "label_list": "bins"})

In [47]:
frame

Unnamed: 0,variable,n,wt,dev,yval,complexity,ncompete,nsurrogate,yval2.,yval2..1,yval2..2,yval2..3,yval2..4,yval2..5,yval2..6,yval2.nodeprob
1,Petal.Length,150,150,100,1,0.5,3,3,1.0,50.0,50.0,50.0,0.333333,0.333333,0.333333,1.0
2,<leaf>,50,50,0,1,0.0,0,0,1.0,50.0,0.0,0.0,1.0,0.0,0.0,0.333333
3,Petal.Width,100,100,50,2,0.44,3,3,2.0,0.0,50.0,50.0,0.0,0.5,0.5,0.666667
6,Petal.Length,54,54,5,2,0.02,3,0,2.0,0.0,49.0,5.0,0.0,0.907407,0.092593,0.36
12,Petal.Width,48,48,1,2,0.01,3,0,2.0,0.0,47.0,1.0,0.0,0.979167,0.020833,0.32
24,<leaf>,47,47,0,2,0.0,0,0,2.0,0.0,47.0,0.0,0.0,1.0,0.0,0.313333
25,<leaf>,1,1,0,3,0.0,0,0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.006667
13,Petal.Width,6,6,2,3,0.01,3,2,3.0,0.0,2.0,4.0,0.0,0.333333,0.666667,0.04
26,Sepal.Length,3,3,1,2,0.01,3,0,2.0,0.0,2.0,1.0,0.0,0.666667,0.333333,0.02
52,<leaf>,2,2,0,2,0.0,0,0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,0.013333


In [59]:
## Generate nnum, vnum, nodes(split_df and csplit - 2L if necessary)

temp_frame = frame

nc = temp_frame[["ncompete", "nsurrogate"]]

index = np.cumsum((frame[["variable"]]!="<leaf>").values + nc[["ncompete"]].values + nc[["nsurrogate"]].values)

index_df = pd.DataFrame((np.insert(index,0,0)+1)[:-1], columns=["i"], index=frame.index)

temp_frame = pd.concat([temp_frame, index_df], axis=1)

# temp_frame[temp_frame[["var"]]=="<leaf>"]
# temp_frame.where(temp_frame[["var"]]=="<leaf>")
# temp_frame.loc[temp_frame[["variable"]]=="<leaf>", "index"] = 0
temp_frame.i[temp_frame.variable == "<leaf>"] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [60]:
temp_frame

Unnamed: 0,variable,n,wt,dev,yval,complexity,ncompete,nsurrogate,yval2.,yval2..1,yval2..2,yval2..3,yval2..4,yval2..5,yval2..6,yval2.nodeprob,i
1,Petal.Length,150,150,100,1,0.5,3,3,1.0,50.0,50.0,50.0,0.333333,0.333333,0.333333,1.0,1
2,<leaf>,50,50,0,1,0.0,0,0,1.0,50.0,0.0,0.0,1.0,0.0,0.0,0.333333,0
3,Petal.Width,100,100,50,2,0.44,3,3,2.0,0.0,50.0,50.0,0.0,0.5,0.5,0.666667,8
6,Petal.Length,54,54,5,2,0.02,3,0,2.0,0.0,49.0,5.0,0.0,0.907407,0.092593,0.36,15
12,Petal.Width,48,48,1,2,0.01,3,0,2.0,0.0,47.0,1.0,0.0,0.979167,0.020833,0.32,19
24,<leaf>,47,47,0,2,0.0,0,0,2.0,0.0,47.0,0.0,0.0,1.0,0.0,0.313333,0
25,<leaf>,1,1,0,3,0.0,0,0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.006667,0
13,Petal.Width,6,6,2,3,0.01,3,2,3.0,0.0,2.0,4.0,0.0,0.333333,0.666667,0.04,23
26,Sepal.Length,3,3,1,2,0.01,3,0,2.0,0.0,2.0,1.0,0.0,0.666667,0.333333,0.02,29
52,<leaf>,2,2,0,2,0.0,0,0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,0.013333,0


In [61]:
nodes = temp_frame[["n", "ncompete", "nsurrogate", "i"]]

In [62]:
nnum = list(temp_frame.index) # row names of temp_frame

In [63]:
feature_names = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]

vnum = list(map(feature_names.index, splits.index))

In [64]:
nodes

Unnamed: 0,n,ncompete,nsurrogate,i
1,150,3,3,1
2,50,0,0,0
3,100,3,3,8
6,54,3,0,15
12,48,3,0,19
24,47,0,0,0
25,1,0,0,0
13,6,3,2,23
26,3,3,0,29
52,2,0,0,0


In [80]:
nnum

[1, 2, 3, 6, 12, 24, 25, 13, 26, 52, 53, 27, 7, 14, 28, 29, 15]

In [81]:
def return_yval(path): # [1,0,0,0]
    node = 0
    nspl = 1
    i = 0
    while nspl != 0:
        npos = nnum[node] # i)0, 
        nspl = nodes.iloc[npos-1][3] # i)1
        var = vnum[nspl]
        # ncat
        temp = splits.iloc[nspl][3]
        if nspl > 0:
            print("nspl succeeded")
            if path[i] == 0: # i)1
                direction = -1
                i+=1
            else:
                direction = 1
                i+=1

            if direction == -1:
                print(node)
                if node == 0:
                    node = 1
                node = 2 * node
            else:
                print(node)
                if node == 0:
                    node = 1
                node = 2 * node + 1
            if len(path) == 1:
                nspl = 0
        else:
            print('nspl failed')
            print("leaf node -- ", node)
            yval = temp_frame.iloc[node][4]
    return temp_frame.iloc[node][4]

In [77]:
nodes.iloc[0]

n             150
ncompete        3
nsurrogate      3
i               1
Name: 1, dtype: int64

In [86]:

path = [1,1,0,1]
a = [1,0,0,0]
b = [1,1,1]
c = [0]
d = [1,1,1,1,1,1,0]
e = [1,0,1,0,0]
return_yval(e)

nspl succeeded
0
nspl failed
leaf node --  3


2

In [30]:
temp_frame

Unnamed: 0,variable,n,wt,dev,yval,complexity,ncompete,nsurrogate,yval2.,yval2..1,yval2..2,yval2..3,yval2..4,yval2..5,yval2..6,yval2.nodeprob,i
0,Petal.Length,150,150,100,1,0.5,3,3,1.0,50.0,50.0,50.0,0.333333,0.333333,0.333333,1.0,1
1,<leaf>,50,50,0,1,0.0,0,0,1.0,50.0,0.0,0.0,1.0,0.0,0.0,0.333333,0
2,Petal.Width,100,100,50,2,0.44,3,3,2.0,0.0,50.0,50.0,0.0,0.5,0.5,0.666667,8
3,Petal.Length,54,54,5,2,0.02,3,0,2.0,0.0,49.0,5.0,0.0,0.907407,0.092593,0.36,15
4,Petal.Width,48,48,1,2,0.01,3,0,2.0,0.0,47.0,1.0,0.0,0.979167,0.020833,0.32,19
5,<leaf>,47,47,0,2,0.0,0,0,2.0,0.0,47.0,0.0,0.0,1.0,0.0,0.313333,0
6,<leaf>,1,1,0,3,0.0,0,0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.006667,0
7,Petal.Width,6,6,2,3,0.01,3,2,3.0,0.0,2.0,4.0,0.0,0.333333,0.666667,0.04,23
8,Sepal.Length,3,3,1,2,0.01,3,0,2.0,0.0,2.0,1.0,0.0,0.666667,0.333333,0.02,29
9,<leaf>,2,2,0,2,0.0,0,0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,0.013333,0


In [33]:
splits

Unnamed: 0,count,ncat,improve,index,adj
Petal.Length,150,-1,50.0,2.45,0.0
Petal.Width,150,-1,50.0,0.8,0.0
Sepal.Length,150,-1,34.16405,5.45,0.0
Sepal.Width,150,1,19.038508,3.35,0.0
Petal.Width,0,-1,1.0,0.8,1.0
Sepal.Length,0,-1,0.92,5.45,0.76
Sepal.Width,0,1,0.833333,3.35,0.5
Petal.Width,100,-1,38.969404,1.75,0.0
Petal.Length,100,-1,37.353535,4.75,0.0
Sepal.Length,100,-1,10.686869,6.15,0.0


In [96]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')