## Load dataset and tree info

In [1]:
import tensorflow as tf
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
from tensorflow.keras.layers import Dense, Input, concatenate, GRU, LSTM
from tensorflow.keras import backend as K
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Model
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics import jaccard_score
import re

ImportError: cannot import name 'jaccard_score'

In [None]:
# Column labels applied to adult.data.csv, which is read without a header row.
names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'target',
]

# header=None keeps the first record as data; `names` supplies the labels.
data = pd.read_csv(
    '../input/adult.data.csv',
    delimiter=",",
    header=None,
    names=names,
)

In [None]:
data.head()

In [None]:
# Drop every row whose workclass / occupation / native-country holds the
# " ?" marker. `.copy()` makes `data` an independent frame, so the column
# assignments below do not write into a slice of the original frame
# (SettingWithCopyWarning, and silently lost writes under copy-on-write).
missing_marker = " ?"
has_missing = (
    data[["workclass", "occupation", "native-country"]] == missing_marker
).any(axis=1)
data = data.loc[~has_missing].copy()

# Convert categorical fields #
categorical_col = ['workclass', 'education', 'marital-status', 'occupation',
                   'relationship', 'race', 'sex', 'native-country', 'target']

# Replace each category by its position in the sorted array of unique values.
for col in categorical_col:
    _, codes = np.unique(data[col], return_inverse=True)
    data[col] = codes

feature_list = names[:14]
# Test train split #
X = data.loc[:, feature_list]
Y = data[['target']]

# Split the dataset into test and train datasets
# NOTE(review): test_size=0.60 puts 60% of the rows in the *test* split — confirm.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.60, random_state=0)

In [None]:
# Fresh 0..n-1 row index; the old (filtered) index labels are discarded.
df = pd.DataFrame(X).reset_index(drop=True)

In [None]:
df.head(5)

In [None]:
# Every column of df that is not listed in categorical_col, in column order.
cont_var = [column for column in df.columns if column not in categorical_col]

In [None]:
# Standardise the continuous columns; X restarts here as the scaled block
# with default integer column labels.
scaler = StandardScaler()
continuous_part = df[cont_var]
X = pd.DataFrame(scaler.fit(continuous_part).transform(continuous_part))

In [None]:
categorical_col = ['workclass', 'education', 'marital-status', 'occupation',
                   'relationship', 'race', 'sex', 'native-country']

# One-hot encode each categorical column separately and append the encoded
# blocks to the right of X, in categorical_col order.
encoded_blocks = [X]
for i in categorical_col:
    column_values = df[i].values.reshape(-1,1)
    enc = OneHotEncoder(handle_unknown='ignore')
    encoded = enc.fit(column_values).transform(column_values)
    encoded_blocks.append(pd.DataFrame(encoded.toarray()))
X = pd.concat(encoded_blocks, axis=1)

In [None]:
X

In [None]:
bin_labels = pd.read_csv('../input/test_adult_bin_labels_1000.csv', delimiter=",",
                         header=0, names=['label', 'bins'])

In [None]:
bin_labels.head()

In [None]:
Y = Y.reset_index().drop(columns='index')

In [None]:
path_df = pd.read_csv('../input/test_adult_paths_1000.csv', delimiter=",",
                      header=0, names=['index', 'paths'])
path_df = path_df.drop(columns='index')

In [None]:
path_df.shape

In [None]:
df.iloc[:1000,]

In [None]:
X.iloc[:1000,]

In [None]:
test_data = pd.concat([X.iloc[:1000,], Y.iloc[:1000,],path_df], axis=1)
# test_data = df

In [None]:
test_data.shape

In [None]:
# One token list per row, obtained by splitting the comma-separated path string.
new_path = [path_string.split(sep=",") for path_string in test_data['paths']]

In [None]:
# Wrap every token list in place with the 'S' (first) and 'E' (last) markers.
for tokens in new_path:
    tokens.insert(0, 'S')
for tokens in new_path:
    tokens.append('E')

In [None]:
test_data['new_path'] = new_path

In [None]:
test_data = test_data.drop(["paths"], axis=1)

In [None]:
test_data.head()

In [None]:
paths_lengths = np.array([len(xi)
                          for xi in test_data.iloc[:,-1]])

In [None]:
paths_lengths
np.max(paths_lengths)

## Create and train FFN

In [None]:
def _create_label_model(latent_dim=25, feature_size=104):
    """Build the (uncompiled) feed-forward label classifier.

    The input of width `feature_size` passes through two 10-unit ReLU layers
    and a `latent_dim`-unit ReLU layer named 'hidden_x3'. The sigmoid output
    layer has one unit per distinct value found in the module-level `Y`.

    Args:
        latent_dim: number of units in the last hidden layer.
        feature_size: width of the input vector.

    Returns:
        A Keras `Model` mapping the 'ip_x' input to the 'op_x' output.
    """
    hidden_specs = (
        (10, 'hidden_x1'),
        (10, 'hidden_x2'),
        (latent_dim, 'hidden_x3'),
    )
    input_layer = Input(shape=(feature_size,), name='ip_x')
    tensor = input_layer
    for units, layer_name in hidden_specs:
        tensor = Dense(units, activation='relu', name=layer_name)(tensor)
    n_classes = len(np.unique(Y))
    output_layer = Dense(n_classes, activation='sigmoid', name='op_x')(tensor)
    return Model(input_layer, output_layer)

In [None]:
label_model = _create_label_model()

In [None]:
label_model.summary()

In [None]:
Y.shape

In [None]:
X.head()

In [None]:
def fit_model():
    """Compile the module-level `label_model` and train it on `X` / `Y`.

    `Y` is expanded with `to_categorical` before fitting; 20% of the rows are
    held out for validation by Keras. Returns nothing.
    """
    compile_options = dict(
        optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    fit_options = dict(
        batch_size=2000, epochs=50, verbose=1, shuffle=True, validation_split=0.2)

    y_cat = to_categorical(Y)
    label_model.compile(**compile_options)
    label_model.fit(X, y_cat, **fit_options)

In [None]:
fit_model()

## Path invariance trials using rpart -- generic

In [0]:
## Import nnum, vnum, nodes, csplit, split_df,
## frame

splits = pd.read_csv('../../data/raw/splits.csv', delimiter=",", index_col=0)
csplit = pd.read_csv('../../data/raw/csplit.csv', delimiter=",")
frame = pd.read_csv('../../data/raw/frame.csv', delimiter=",",index_col=0)

# frame = frame.drop(["Unnamed: 0"], axis=1)
frame = frame.rename(columns={"var": "variable"})
# bin_labels = bin_labels.rename(columns={"Unnamed: 0": "label", "label_list": "bins"})

In [0]:
frame

In [0]:
## Generate nnum, vnum, nodes(split_df and csplit - 2L if necessary)

temp_frame = frame

nc = temp_frame[["ncompete", "nsurrogate"]]

# Per row: 1 if the row is not a "<leaf>", plus its ncompete and nsurrogate
# counts; `index` is the running total of that sum over the rows.
is_split = (frame[["variable"]] != "<leaf>").values
index = np.cumsum(is_split + nc[["ncompete"]].values + nc[["nsurrogate"]].values)

# Column "i": 1 + the running total over all *preceding* rows.
index_df = pd.DataFrame((np.insert(index,0,0)+1)[:-1], columns=["i"], index=frame.index)

temp_frame = pd.concat([temp_frame, index_df], axis=1)

# Leaves get i = 0. A single .loc write replaces the chained assignment
# `temp_frame.i[mask] = 0`, which warns and is not guaranteed to reach the frame.
temp_frame.loc[temp_frame["variable"] == "<leaf>", "i"] = 0

In [0]:
temp_frame

In [0]:
nodes = temp_frame[["n", "ncompete", "nsurrogate", "i"]]

In [0]:
nodes

In [0]:
def load_tree_details(index, feature_names=None): # Returns nnum, nodes, vnum, splits, temp_frame
    """Load one tree's `splits_<index>.csv` / `frame_<index>.csv` and derive lookup tables.

    Args:
        index: suffix identifying the tree files under
            '../../data/raw/local_dt_df/'.
        feature_names: ordered feature names used to turn the row labels of
            the splits table into positions. Defaults to the four iris
            feature names the notebook hard-coded originally.

    Returns:
        (nnum, nodes, vnum, splits, temp_frame) where
        - nnum is the list of row labels of the frame table,
        - nodes holds the columns n, ncompete, nsurrogate and i,
        - vnum is the position in `feature_names` of every splits row label,
        - splits is the splits table as read,
        - temp_frame is the frame table ("var" renamed to "variable") plus
          the derived column "i".

    Raises:
        ValueError: if a splits row label is not in `feature_names`.
    """
    if feature_names is None:
        feature_names = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]

    splits_dir = '../../data/raw/local_dt_df/splits'+'_'+str(index)+'.csv'
    frame_dir = '../../data/raw/local_dt_df/frame'+'_'+str(index)+'.csv'
    splits = pd.read_csv(splits_dir, delimiter=",", index_col=0)
    frame = pd.read_csv(frame_dir, delimiter=",",index_col=0)

    frame = frame.rename(columns={"var": "variable"})

    # Per row: 1 if the row is not a "<leaf>", plus its ncompete and
    # nsurrogate counts. The running total is kept under its own name so the
    # `index` argument is no longer overwritten.
    is_split = (frame[["variable"]] != "<leaf>").values
    split_counts = is_split + frame[["ncompete"]].values + frame[["nsurrogate"]].values
    running_total = np.cumsum(split_counts)

    # Column "i": 1 + the running total over all preceding rows.
    first_split = (np.insert(running_total, 0, 0) + 1)[:-1]
    index_df = pd.DataFrame(first_split, columns=["i"], index=frame.index)
    temp_frame = pd.concat([frame, index_df], axis=1)

    # Leaves get i = 0. `.loc` writes into temp_frame itself; the previous
    # chained assignment (`temp_frame.i[mask] = 0`) is not guaranteed to.
    temp_frame.loc[temp_frame["variable"] == "<leaf>", "i"] = 0

    nodes = temp_frame[["n", "ncompete", "nsurrogate", "i"]]
    nnum = list(temp_frame.index)
    vnum = [feature_names.index(name) for name in splits.index]

    return nnum, nodes, vnum, splits, temp_frame

In [0]:
nnum = list(temp_frame.index) # row names of temp_frame

In [0]:
nnum

In [0]:
feature_names = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]

vnum = list(map(feature_names.index, splits.index))

In [0]:
nodes

In [0]:
sample_path = ['S', '3A0', '4C1', '2AD1', 'E']

In [0]:
def return_yval(path, index): # [1,0,0,0]
    """Walk tree `index` along `path` and return column 4 of the frame row reached.

    Args:
        path: token list whose first and last entries are stripped before
            walking (e.g. ['S', '3A0', '4C1', '2AD1', 'E']). For each
            remaining token the first character is compared with the split
            feature and the last character picks the branch
            (0 -> 2*node, anything else -> 2*node + 1).
        index: tree identifier forwarded to `load_tree_details`.

    Returns:
        `temp_frame.iloc[node, 4]` for the final `node`, or False when a
        token's feature digit disagrees with the split variable.

    NOTE(review): `nspl` is read from the "i" column, which starts at 1 for
    the first split, yet it indexes the 0-based list `vnum` directly —
    confirm whether `vnum[nspl - 1]` was intended.
    NOTE(review): `node` is used both as a position into `nnum` and as a
    2*node / 2*node+1 node number — confirm that these coincide for the
    trees being loaded.
    """
    nnum, nodes, vnum, splits, temp_frame = load_tree_details(index)
    path = path[1:-1]
    node = 0
    nspl = 1
    i = 0
    while nspl != 0:
        npos = nnum[node] # i)0,
        # Positional (row, column) lookup of column "i"; replaces the
        # deprecated integer lookup on the row Series (`row[3]`).
        nspl = nodes.iloc[npos - 1, 3] # i)1
        if nspl > 0:
            var = vnum[nspl]
            print("nspl succeeded")
            if int(path[i][0]) != var:
                print("Wrong feature -- ", path[i][0], var)
                return False
            # Last character of the token: 0 goes to child 2*node, else 2*node + 1.
            direction = -1 if int(path[i][-1]) == 0 else 1 # i)1
            i += 1

            print(node)
            if node == 0:
                node = 1
            node = 2 * node if direction == -1 else 2 * node + 1
            if len(path) == 1:
                nspl = 0
        else:
            print('nspl failed')
            print("leaf node -- ", node)
    return temp_frame.iloc[node, 4]

In [0]:

# Sample inputs: plain lists of 0/1 values.
path = [1,1,0,1]
a = [1,0,0,0]
b = [1,1,1]
c = [0]
d = [1,1,1,1,1,1,0]
e = [1,0,1,0,0]
# NOTE(review): return_yval is defined in this notebook as
# return_yval(path, index); this one-argument call raises TypeError against
# that signature — confirm which tree index was intended.
return_yval(e)

In [0]:
return_yval(sample_path)

In [0]:
temp_frame

In [0]:
splits

In [0]:
# `iris` is not assigned anywhere earlier in the notebook; load it here so the
# cell runs on a fresh kernel (load_iris is imported in the first cell).
iris = load_iris()
iris.target_names

## Bin reduction (EMD)

In [0]:
## Pyemd approach -- categorical var
'''
Max levels = 3; (a,b,c)
actual_bins = [(a), (b,c), (a,c), (b), (c)]
actual_freq = [10.0, 3.0, 4.0, 1.0, 5.0]
possible_out = [(a), (b), (c), (a,b), (a,c), (b,c)]
'''
# These three imports also appear in a later cell; repeating them here lets
# this cell run on a fresh kernel.
import pyemd
from copy import copy
from sklearn.preprocessing import normalize
# NOTE(review): `distance.levenshtein` below needs a module bound to the name
# `distance`; no visible cell imports one (the later import uses the alias
# `scipy_distance`) — confirm which package is meant.

actual_bins = [['a'], ['b','c'], ['a','c'], ['a','b'], ['c']]
possible_out = [['a'], ['b','c'], ['a','c'], ['a','b'], ['b'], ['c']]
actual_freq = [10.0, 3.0, 4.0, 1.0, 5.0]

n_actual = len(actual_bins)
n_ext = 3                                # number of bins drawn per trial
gauss_norm = 1/(np.sqrt(2*np.pi))        # loop-invariant kernel constant

dist_list = []
emd_trials = {}
for index in range(1000):
    # Sample positions rather than the bins themselves: the bins are lists of
    # different lengths, which np.random.choice cannot turn into a 1-D array.
    picked = np.random.choice(n_actual, n_ext, replace=False)
    ext_bins = [actual_bins[k] for k in picked]

    actual_freq_norm = normalize([actual_freq], norm='l1').reshape(n_actual,)

    # Frequency of each drawn bin: kernel-weighted sum of the actual
    # frequencies, weight = gauss_norm * exp(-dist**2 / 2).
    ext_freq = []
    for x in ext_bins:
        new_freq = []
        for j, b in enumerate(actual_bins):
            # dist = 1 - get_j_coeff(b, x) # Change distance metric
            dist = distance.levenshtein(b,x)
            w = gauss_norm * np.exp(-((np.power(dist,2))/2))
            new_freq.append(w * actual_freq[j])
        ext_freq.append(sum(new_freq))

    combined_bins = copy(actual_bins)
    combined_bins.extend(ext_bins)
    n_total = len(combined_bins)

    ## Distance
    dist_matrix = np.zeros((n_total, n_total), dtype='float64')
    for i, val in enumerate(combined_bins):
        for j, val_2 in enumerate(combined_bins):
            dist_matrix[i,j] = distance.levenshtein(val, val_2)

    ext_freq_norm = normalize([ext_freq], norm='l1').reshape(n_ext,)

    # Both histograms are laid out over combined_bins: the actual one is
    # zero-padded on the right, the drawn one on the left.
    ext_freq_norm = np.concatenate([np.zeros(n_actual), ext_freq_norm])
    actual_freq_norm = np.concatenate([actual_freq_norm, np.zeros(n_ext)])

    emd_val, min_cost_flow = pyemd.emd_with_flow(actual_freq_norm, ext_freq_norm, dist_matrix)
    dist_list.append(emd_val)
    emd_trials.update({index:{'distance':emd_val, 'flow_matrix':min_cost_flow,
                             'actual_bins':actual_bins, 'actual_freq_norm':actual_freq_norm,
                             'ext_bins':ext_bins, 'ext_freq_norm':ext_freq_norm,
                             'dist_matrix': dist_matrix}})

In [0]:
dist_list.index(max(dist_list))

In [0]:
min(dist_list)

In [0]:
max(dist_list)

### Path and bin conversions for dag_arch model

In [0]:
bin_labels.head(5)

In [0]:
path_df.head()

In [0]:
bin_labels.head()

In [0]:
test_var = path_df['paths'][1].split(',')[1]
# Raw string: "\d" / "\w" in a plain literal are invalid escape sequences.
# Groups: leading digits, word characters, trailing digits.
temp = re.compile(r"(\d+)(\w+)(\d+)")
res = temp.match(test_var).groups()
res[1]

In [0]:
test_var

In [0]:
# Path tokens are matched as <digits><word characters><digits>; the middle
# group is looked up in bin_labels['label'].
# NOTE(review): \w also matches digits, so a label containing digits would be
# split differently — confirm labels are letters only.
token_pattern = re.compile(r"(\d+)(\w+)(\d+)")   # compiled once, raw string

bin_freq = {}
bin_labels['freq'] = 0
for path_string in path_df['paths']:
    for token in path_string.split(','):
        bin_label = token_pattern.match(token).groups()[1]
        bin_labels.loc[bin_labels['label'] == bin_label, 'freq'] += 1

In [0]:
bin_labels.shape

In [0]:
bin_labels.loc[bin_labels['freq']!=0,].shape

In [0]:
# bin_labels is loaded with names=['label', 'bins'], so there is no 'x'
# column; the bin values live in 'bins'.
bin_labels = bin_labels.sort_values(by=['bins']).reset_index(drop=True)

In [0]:
bin_labels = bin_labels.loc[bin_labels['freq'] != 0].reset_index(drop=True) ## Try fixing at R level.

In [0]:
bin_labels

In [0]:
# bin_labels is loaded with names=['label', 'bins'], so there is no 'x'
# column; the bin values live in 'bins'.
actual_bins = bin_labels['bins'].tolist()

In [0]:
len(actual_bins)

In [0]:
import pyemd
import random
from sklearn.preprocessing import normalize
from scipy.spatial import distance as scipy_distance
from copy import copy
import numpy as np

# Random search: each trial draws n_ext candidate bin positions between the
# first and last actual bin, spreads the actual frequencies onto them with a
# Gaussian kernel and records the EMD between the two histograms.
# NOTE(review): no random seed is set, so trial numbers differ between runs.
actual_freq = bin_labels['freq'].tolist()
actual_bin_values = np.asarray(actual_bins, dtype='float64')
actual_freq_values = np.asarray(actual_freq, dtype='float64')
n_actual = len(actual_bins)              # was hard-coded as 29
n_ext = 20                               # candidate bins per trial
gauss_norm = 1/(np.sqrt(2*np.pi))        # loop-invariant kernel constant

dist_list = []
emd_trials = {}
for index in range(1000):
    ext_bins = sorted(np.random.uniform(actual_bins[0],actual_bins[-1], n_ext))
    ext_values = np.asarray(ext_bins, dtype='float64')

    # ext_freq[i] = sum_j gauss_norm * exp(-(x_i - b_j)**2 / 2) * actual_freq[j]
    kernel = gauss_norm * np.exp(
        -(np.power(ext_values[:, None] - actual_bin_values[None, :], 2)) / 2)
    ext_freq = (kernel * actual_freq_values).sum(axis=1).tolist()

    combined_bins = copy(actual_bins)
    combined_bins.extend(ext_bins)
    combined_values = np.asarray(combined_bins, dtype='float64')

    ## Distance
    # For scalar bins the Euclidean distance is |a - b|. Computing it directly
    # avoids n**2 Python calls and scipy's requirement of 1-D vectors.
    dist_matrix = np.abs(combined_values[:, None] - combined_values[None, :])

    actual_freq_norm = normalize([actual_freq], norm='l1').ravel()
    ext_freq_norm = normalize([ext_freq], norm='l1').ravel()

    # Both histograms are laid out over combined_bins: the actual one is
    # zero-padded on the right, the candidate one on the left.
    ext_freq_norm = np.concatenate([np.zeros(n_actual), ext_freq_norm])
    actual_freq_norm = np.concatenate([actual_freq_norm, np.zeros(n_ext)])

    emd_val, min_cost_flow = pyemd.emd_with_flow(actual_freq_norm, ext_freq_norm, dist_matrix)
    dist_list.append(emd_val)
    emd_trials.update({index:{'distance':emd_val, 'flow_matrix':min_cost_flow,
                             'actual_bins':actual_bins, 'actual_freq_norm':actual_freq_norm,
                             'ext_bins':ext_bins, 'ext_freq_norm':ext_freq_norm,
                             'dist_matrix': dist_matrix}})

In [0]:
dist_list.index(min(dist_list))

In [0]:
min(dist_list)

In [0]:
# NOTE(review): trial 200 is hard-coded (as is the 200 passed to
# get_bin_mappings below). The trials are drawn without a fixed seed, so this
# index presumably came from one particular run's
# dist_list.index(min(dist_list)) — confirm.
reduced_bins = emd_trials[200]['ext_bins']

In [0]:
len(reduced_bins)

In [0]:
np.array(emd_trials[15]['flow_matrix']).shape

In [0]:
def get_bin_mappings(flow_matrix_dim, emd_index):
    """Collect the non-zero entries of one trial's flow matrix as a nested dict.

    Args:
        flow_matrix_dim: number of rows/columns of the flow matrix to scan.
        emd_index: key of the trial in the module-level `emd_trials`.

    Returns:
        {row: {column: flow}} with one inner entry for every
        `flow_matrix[row][column] != 0.0`; rows without any flow are absent.
    """
    flow_matrix = emd_trials[emd_index]['flow_matrix']
    mapping_dict = {}
    for i in range(flow_matrix_dim):
        for j in range(flow_matrix_dim):
            flow = flow_matrix[i][j]
            if flow != 0.0:
                # dict.setdefault builds the inner dict on first use, so the
                # function no longer depends on the third-party `deep_set`,
                # which is only imported in a later cell.
                mapping_dict.setdefault(i, {})[j] = flow

    return mapping_dict

In [0]:
from dict_deep import deep_set
bin_mappings = get_bin_mappings(49, 200)

In [0]:
bin_mappings

In [0]:
from string import ascii_uppercase

# Single letters first, then whole blocks of 26 two-letter labels sharing a
# first letter; a new block is only started while fewer than 41 labels exist.
labels = list(ascii_uppercase)
for first in ascii_uppercase:
    if len(labels) >= 41:
        break
    labels.extend(first + second for second in ascii_uppercase)

In [0]:
len(labels)

In [0]:
# new_bins later receives float bin positions, so it starts as a float
# column; writing floats into an int column is deprecated in recent pandas.
bin_labels['new_bins'] = 0.0
bin_labels['new_index'] = 0
bin_labels['new_labels'] = ""

In [0]:
bin_labels.head()

In [0]:
len(reduced_bins)

In [0]:
import operator

# Flow-matrix columns are ordered [actual bins..., reduced bins...], so a
# column position minus the number of actual bins (one bin_labels row each)
# is the index into reduced_bins. This replaces the hard-coded 29.
n_actual = len(bin_labels)
for k, flows in bin_mappings.items():
    # Reduced bin that receives the largest share of actual bin k's flow.
    heaviest_target = max(flows.items(), key=operator.itemgetter(1))[0]
    reduced_bin_index = heaviest_target - n_actual
    bin_labels.loc[k,'new_index'] = reduced_bin_index
    bin_labels.loc[k,'new_bins'] = reduced_bins[reduced_bin_index]
    bin_labels.loc[k,'new_labels'] = labels[reduced_bin_index]

In [0]:
bin_labels

### Updated paths

In [0]:
path_df.head()

In [0]:
path_df['updated_path'] = ""

In [0]:
path_df.head()

In [0]:
bin_labels.head()

In [0]:
# Rewrite every path token <feature><bin label><direction> with the reduced
# bin label from bin_labels['new_labels'].
# - The source column is 'paths': path_df has no 'new_col' column.
# - Tokens are split with the same regex the frequency count uses, so
#   multi-digit feature numbers are handled (the old token[1:-1] slice assumed
#   a single leading and a single trailing character).
token_pattern = re.compile(r"(\d+)(\w+)(\d+)")

# label -> new label; the first row wins if a label occurs more than once.
label_to_new = (
    bin_labels.drop_duplicates(subset='label')
    .set_index('label')['new_labels']
    .to_dict()
)

updated_paths = []
for path_string in path_df['paths']:
    updated_tokens = []
    for token in path_string.split(','):
        feature, bin_label, direction = token_pattern.match(token).groups()
        updated_tokens.append(feature + label_to_new[bin_label] + direction)
    updated_paths.append(','.join(updated_tokens))

path_df['updated_path'] = updated_paths

In [0]:
path_df

In [0]:
bin_labels.head()

In [0]:
test_data.head()

In [0]:
path_df.head()

In [0]:
# NOTE(review): no visible cell creates a "new_col" column on test_data (the
# token-list column added earlier is "new_path"); confirm the intended column
# before running, otherwise this drop raises KeyError.
test_data = test_data.drop(["new_col"], axis=1)
# test_data = test_data.drop(["updated_path"], axis=1)

In [0]:
test_data.head()

In [0]:
paths_lengths.argmax()

In [0]:
test_data.iloc[52,-1]

In [0]:
### Move to bottom trials

test_data = pd.concat([test_data, path_df.loc[:,'updated_path']], axis=1)

# Tokenise each updated path and wrap it in the 'S' / 'E' markers.
updated_path = []
for path_string in test_data['updated_path']:
    updated_path.append(['S'] + path_string.split(sep=",") + ['E'])

test_data['updated_path'] = updated_path

# test_data = test_data.drop(["updated_path"], axis=1)

paths_lengths = np.array([len(xi) for xi in test_data.iloc[:,-1]])

# Token vocabulary, in first-seen order over the unique paths.
label_char = []
for tokens in np.unique(test_data['updated_path']):
    for token in tokens:
        if token not in label_char:
            label_char.append(token)

label_indices = { j : i for i, j in enumerate(label_char) }
indices_label = { i : j for i, j in enumerate(label_char) }

input_path_sequence = []
next_chars = []
features = []
paths_maxlen = np.max(paths_lengths)
# path_vocab_size = len(bin_labels) # How is this working? Validate!
path_vocab_size = len(indices_label) # Temporary test for local trees
feature_size = 4
# One training sample per path prefix: the prefix is the input and the token
# that follows it is the target.
for i in range(0, len(test_data)):
    # get the feature
    curr_feat = np.array([test_data.iloc[i, 0:feature_size]])
    curr_path = test_data.iloc[i, -1]
    for j in range(1, len(curr_path)):
        features.append(curr_feat)
        input_path_sequence.append(curr_path[0:j])
        next_chars.append(curr_path[j])

## Vectorize inputs
# Builtin bool / float replace the np.bool / np.float aliases, which were
# removed in NumPy 1.24.
n_samples = len(input_path_sequence)

x_path = np.zeros((n_samples, paths_maxlen, path_vocab_size), dtype=bool)

path_latent_input = np.zeros((n_samples, feature_size), dtype=float)

y_path = np.zeros((n_samples, path_vocab_size), dtype=bool)

for i, sentence in enumerate(input_path_sequence):
    for t, char in enumerate(sentence):
        x_path[i, t, label_indices[char]] = 1
    y_path[i, label_indices[next_chars[i]]] = 1
    path_latent_input[i, :] = features[i]

In [0]:
test_data.head()

## RNN architecture changes

In [None]:
path_df.head(5)

In [None]:
# Occurrence count of every whole path token.
label_freq = {}
count = 0
for path_string in path_df.loc[:,'paths']:
    for token in path_string.split(','):
        label_freq[token] = label_freq.get(token, 0) + 1

In [None]:
len(label_freq)

In [0]:
path_df.loc[:, 'updated_path'].head()

In [0]:
# Occurrence count of every token with its first and last character removed.
label_freq = {}
count = 0
for path_string in path_df.loc[:,'paths']:
    for token in path_string.split(','):
        inner = token[1:-1]
        label_freq[inner] = label_freq.get(inner, 0) + 1

In [0]:
len(label_freq)

In [0]:
# Same count as the previous label_freq cell: tokens stripped of their first
# and last character.
label_freq = {}
count = 0
for path_string in path_df.loc[:,'paths']:
    for token in path_string.split(','):
        inner = token[1:-1]
        label_freq[inner] = label_freq.get(inner, 0) + 1

In [0]:
label_freq

In [0]:
import matplotlib.pyplot as plt

plt.bar(label_freq.keys(), label_freq.values(), 2, color='g')

In [0]:
bin_labels.head(5)

In [None]:
# Direction channel: start marker, end marker, then the two branch digits.
dir_indices = dict(zip(['S', 'E', '0', '1'], range(4)))

# Bin channel: slots 0 and 1 come first, then one slot per unique bin label
# (in np.unique order) starting at 2.
bin_indices = {0:0, 1:1}
for position, bin_label in enumerate(np.unique(bin_labels['label'])):
    bin_indices[bin_label] = position + 2

# Feature channel: 'S' -> 0, '1'..'14' -> 1..14, 'E' -> 15.
feature_indices = {'S':0, 'E': 15}
for feature_number in range(1,15):
    feature_indices[str(feature_number)] = feature_number

In [None]:
feature_indices

In [None]:
len(feature_indices)

In [None]:
len(bin_indices)

In [None]:
np.unique(bin_labels['label'])

In [None]:
shuffle_data = test_data.sample(frac=1).reset_index(drop=True)

In [0]:
test_data.iloc[0, -1]

In [None]:
feature_vocab_size = 16
# NOTE(review): 882 is hard-coded; it must be at least len(bin_indices) for the
# indexing below to stay in range — confirm against the loaded bin labels.
bin_vocab_size = 882
dir_vocab_size = 4
latent_dim = 25

input_path_sequence = []
next_chars = []
features = []
paths_maxlen = np.max(paths_lengths)
# path_vocab_size = len(bin_labels) # How is this working? Validate!
# path_vocab_size = len(indices_label) # Temporary test for local trees
feature_size = 104
# One training sample per path prefix: the prefix is the input and the token
# that follows it is the target.
for i in range(0, len(test_data)):
# for i in range(0, len(shuffle_data[:140])):
    # get the feature
    curr_feat = np.array([test_data.iloc[i, 0:feature_size]])
    curr_path = test_data.iloc[i, -1]
    for j in range(1, len(curr_path)):
        features.append(curr_feat)
        input_path_sequence.append(curr_path[0:j])
        next_chars.append(curr_path[j])

# Builtin bool / float replace the np.bool / np.float aliases, which were
# removed in NumPy 1.24.
n_samples = len(input_path_sequence)

x_feat = np.zeros((n_samples, paths_maxlen, feature_vocab_size), dtype=bool)
x_bin = np.zeros((n_samples, paths_maxlen, bin_vocab_size), dtype=bool)
x_dir = np.zeros((n_samples, paths_maxlen, dir_vocab_size), dtype=bool)

path_latent_input = np.zeros((n_samples, feature_size), dtype=float)

y_feat = np.zeros((n_samples, feature_vocab_size), dtype=bool)
y_bin = np.zeros((n_samples, bin_vocab_size), dtype=bool)
y_dir = np.zeros((n_samples, dir_vocab_size), dtype=bool)

# Tokens are matched as <feature digits><bin label><direction digits>;
# compiled once, as a raw string.
token_pattern = re.compile(r"(\d+)(\w+)(\d+)")

for i, sentence in enumerate(input_path_sequence):
    for t, char in enumerate(sentence):
        if char == 'S':
            # The start marker occupies slot 0 in the bin and direction channels.
            x_feat[i, t, feature_indices[char]] = 1
            x_bin[i, t, 0] = 1
            x_dir[i, t, 0] = 1
        else:
            feature, bin_label, direction = token_pattern.match(char).groups()
            x_feat[i, t, feature_indices[feature]] = 1
            x_bin[i, t, bin_indices[bin_label]] = 1
            x_dir[i, t, dir_indices[direction]] = 1
    if next_chars[i] == 'E':
        # The end marker occupies slot 1 in the bin and direction channels.
        y_feat[i, feature_indices[next_chars[i]]] = 1
        y_bin[i, 1] = 1 ## Cross check
        y_dir[i, 1] = 1 ## Cross check
    else:
        feature, bin_label, direction = token_pattern.match(next_chars[i]).groups()
        y_feat[i, feature_indices[feature]] = 1
        y_bin[i, bin_indices[bin_label]] = 1
        y_dir[i, dir_indices[direction]] = 1
    # y_path[i, label_indices[next_chars[i]]] = 1
    path_latent_input[i, :] = features[i]

## Trouble with "S" and "E" index values.

In [None]:
y_bin.shape

In [None]:
x_feat.shape

In [None]:
x_bin.shape

In [None]:
path_latent_input.shape

In [None]:
path_latent_input.shape

In [0]:
# import tensorflow.compat.v1 as tf
# tf.disable_v2_behavior()

In [0]:
## Approach 2 -- linking 3 subsequent rnns
# Builds `model` with four inputs (latent vector + feature / bin / direction
# sequences) and three softmax outputs (op_feat, op_bin, op_dir).
# gru_1, gru_2 and gru_3 are the layers that receive the latent vector as
# initial_state; the stacked gru_1b..gru_1i and gru_2b..gru_2c layers do not.
from tensorflow.keras.layers import Reshape, Flatten
import wandb
from wandb.keras import WandbCallback
wandb.init(project="dert")


label_model_latent = Input(shape=(latent_dim,), name='x_ip')

feature_input = Input(shape=(paths_maxlen, feature_vocab_size), name='feat_ip')

bin_input = Input(shape=(paths_maxlen, bin_vocab_size), name='bin_ip')

direction_input = Input(shape=(paths_maxlen, dir_vocab_size), name='dir_ip')

# masked_bin_input = Masking(mask_value=x_bin[0])(bin_input)

# masked_direction_input = Masking(mask_value=x_dir[0])(direction_input)

# if rnn_cell == 'gru':
#     RNN = GRU
# else:
RNN = GRU

# Feature stack: gru_1 reads the feature sequence, followed by eight more
# sequence-returning GRU layers (gru_1b .. gru_1i).
decoder_1 = RNN(latent_dim, return_state=False, return_sequences=True, name='gru_1',
                reset_after=True, recurrent_activation='sigmoid', activation= 'tanh',
                recurrent_dropout = 0, unroll = False, use_bias = True, )

decoder_1_outputs = decoder_1(feature_input, initial_state=label_model_latent)

decoder_1b_outputs = RNN(latent_dim, return_state=False, return_sequences=True, name='gru_1b', reset_after=True, recurrent_activation='sigmoid')(decoder_1_outputs)

decoder_1c_outputs = RNN(latent_dim, return_state=False, return_sequences=True, name='gru_1c', reset_after=True, recurrent_activation='sigmoid')(decoder_1b_outputs)

decoder_1d_outputs = RNN(latent_dim, return_state=False, return_sequences=True, name='gru_1d', reset_after=True, recurrent_activation='sigmoid')(decoder_1c_outputs)

decoder_1e_outputs = RNN(latent_dim, return_state=False, return_sequences=True, name='gru_1e', reset_after=True, recurrent_activation='sigmoid')(decoder_1d_outputs)


decoder_1f_outputs = RNN(latent_dim, return_state=False, return_sequences=True, name='gru_1f', reset_after=True, recurrent_activation='sigmoid')(decoder_1e_outputs)

decoder_1g_outputs = RNN(latent_dim, return_state=False, return_sequences=True, name='gru_1g', reset_after=True, recurrent_activation='sigmoid')(decoder_1f_outputs)

decoder_1h_outputs = RNN(latent_dim, return_state=False, return_sequences=True, name='gru_1h', reset_after=True, recurrent_activation='sigmoid')(decoder_1g_outputs)

decoder_1i_outputs = RNN(latent_dim, return_state=False, return_sequences=True, name='gru_1i', reset_after=True, recurrent_activation='sigmoid')(decoder_1h_outputs)

# Bin stack: gru_2 reads the bin sequence concatenated with the output of the
# last feature-stack layer (gru_1i), followed by gru_2b and gru_2c.
decoder_2 = RNN(latent_dim, return_state=False, return_sequences=True, name='gru_2', reset_after=True, recurrent_activation='sigmoid')

concat_gru_1_2 = concatenate([bin_input, decoder_1i_outputs], name='gru_1_2')

decoder_2_outputs = decoder_2(concat_gru_1_2, initial_state=label_model_latent)

decoder_2b_outputs = RNN(latent_dim, return_state=False, return_sequences=True, name='gru_2b', reset_after=True, recurrent_activation='sigmoid')(decoder_2_outputs)

decoder_2c_outputs = RNN(latent_dim, return_state=False, return_sequences=True, name='gru_2c', reset_after=True, recurrent_activation='sigmoid')(decoder_2b_outputs)

# Direction RNN: a single GRU that returns only its final output.
decoder_3 = RNN(latent_dim, return_state=False, return_sequences=False, name='gru_3', reset_after=True, recurrent_activation='sigmoid')

# NOTE(review): this concatenation uses the outputs of gru_1 and gru_2 (the
# first layer of each stack), not the stacked gru_1i / gru_2c outputs that
# the feature and bin heads use — confirm this is intended.
concat_gru_1_2_3 = concatenate([direction_input, decoder_1_outputs, decoder_2_outputs], name='gru_1_2_3')

decoder_3_outputs = decoder_3(concat_gru_1_2_3, initial_state=label_model_latent)

# Heads: each softmax layer sees the latent vector concatenated with its
# stack's output (flattened over time for the two sequence-returning stacks).
flatten_gru_1 = Flatten()(decoder_1i_outputs)
merge_layer_1 = concatenate([label_model_latent, flatten_gru_1], name='merge_1')

flatten_gru_2 = Flatten()(decoder_2c_outputs)
merge_layer_2 = concatenate([label_model_latent, flatten_gru_2], name='merge_2')

merge_layer_3 = concatenate([label_model_latent, decoder_3_outputs], name='merge_3')

output_feature = Dense(feature_vocab_size, activation='softmax', name='op_feat')(merge_layer_1)

output_bin = Dense(bin_vocab_size, activation='softmax', name='op_bin')(merge_layer_2)

# test_layer = Flatten()(decoder_3)

output_dir = Dense(dir_vocab_size, activation='softmax', name='op_dir')(merge_layer_3)

model = Model([label_model_latent, feature_input, bin_input, direction_input], [output_feature, output_bin, output_dir])

In [0]:
## Approach 3 -- single rnn, multi output model
# Rebinds `model`: the three one-hot sequences are concatenated and read by a
# single GRU whose initial state is the latent vector; the GRU output feeds
# three independent softmax heads.
from tensorflow.keras.layers import Reshape, Flatten
import wandb
from wandb.keras import WandbCallback
wandb.init(project="dert")


label_model_latent = Input(shape=(latent_dim,), name='x_ip')

feature_input = Input(shape=(paths_maxlen, feature_vocab_size), name='feat_ip')

bin_input = Input(shape=(paths_maxlen, bin_vocab_size), name='bin_ip')

direction_input = Input(shape=(paths_maxlen, dir_vocab_size), name='dir_ip')

RNN = GRU

merge_input = concatenate([feature_input, bin_input, direction_input], name='merge_ip')

# return_sequences is not passed here, unlike the sequence-returning layers
# of Approach 2.
decoder_1 = RNN(latent_dim, return_state=False, name='gru_1',
                reset_after=True, recurrent_activation='sigmoid', activation= 'tanh',
                recurrent_dropout = 0, unroll = False, use_bias = True, )

decoder_1_outputs = decoder_1(merge_input, initial_state=label_model_latent)

output_feature = Dense(feature_vocab_size, activation='softmax', name='op_feat')(decoder_1_outputs)

output_bin = Dense(bin_vocab_size, activation='softmax', name='op_bin')(decoder_1_outputs)

output_dir = Dense(dir_vocab_size, activation='softmax', name='op_dir')(decoder_1_outputs)

model = Model([label_model_latent, feature_input, bin_input, direction_input], [output_feature, output_bin, output_dir])

In [0]:
model.summary()

In [0]:
## Single RNN, multi-op approach
x_latent = get_hidden_x(path_latent_input, model=label_model)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([x_latent, x_feat, x_bin, x_dir], [y_feat, y_bin, y_dir],batch_size=6000, epochs=20000, verbose=1,
          callbacks=[WandbCallback()])

In [None]:
import wandb
from wandb.keras import WandbCallback
wandb.init(project="dert", name="stack_bilstm_loss_weights")

In [0]:
## Approach 4 -- single rnn, multi output model, multi dense
# Rebinds `model`: a single LSTM over the concatenated inputs, with extra
# Dense layers in front of the feature head (one) and the bin head (two).
from tensorflow.keras.layers import Reshape, Flatten

label_model_latent = Input(shape=(latent_dim,), name='x_ip')

feature_input = Input(shape=(paths_maxlen, feature_vocab_size), name='feat_ip')

bin_input = Input(shape=(paths_maxlen, bin_vocab_size), name='bin_ip')

direction_input = Input(shape=(paths_maxlen, dir_vocab_size), name='dir_ip')

RNN = LSTM

merge_input = concatenate([feature_input, bin_input, direction_input], name='merge_ip')

# The layer keeps the name 'gru_1' although RNN is LSTM in this cell.
decoder_1 = RNN(latent_dim, return_state=False, name='gru_1')

# The latent vector is passed twice, once per LSTM initial state.
decoder_1_outputs = decoder_1(merge_input, initial_state=[label_model_latent, label_model_latent])

# decoder_2 = RNN(latent_dim, return_state=False, name='gru_2')

# decoder_2_outputs = decoder_2(decoder_1_outputs, initial_state=label_model_latent)

# hidden_1 = Dense(100, activation='softmax', name='h_1')(decoder_1_outputs)

# hidden_2 = Dense(100, activation='softmax', name='h_2')(hidden_1)

# NOTE(review): the hidden Dense layers (f_1, b_1, b_2) use 'softmax' as their
# activation — confirm this is intended for hidden layers.
feat_hidden_1 = Dense(100, activation='softmax', name='f_1')(decoder_1_outputs)
output_feature = Dense(feature_vocab_size, activation='softmax', name='op_feat')(feat_hidden_1)

bin_hidden_1 = Dense(100, activation='softmax', name='b_1')(decoder_1_outputs)
bin_hidden_2 = Dense(100, activation='softmax', name='b_2')(bin_hidden_1)
output_bin = Dense(bin_vocab_size, activation='softmax', name='op_bin')(bin_hidden_2)

output_dir = Dense(dir_vocab_size, activation='softmax', name='op_dir')(decoder_1_outputs)

model = Model([label_model_latent, feature_input, bin_input, direction_input], [output_feature, output_bin, output_dir])

In [None]:
## Approach 5 -- two bilstm rnn, multi output model, multi dense
# Rebinds `model`: two stacked bidirectional LSTMs over the concatenated
# inputs; the second one's output feeds the same head layout as Approach 4.
from tensorflow.keras.layers import Reshape, Flatten, Bidirectional

label_model_latent = Input(shape=(latent_dim,), name='x_ip')

feature_input = Input(shape=(paths_maxlen, feature_vocab_size), name='feat_ip')

bin_input = Input(shape=(paths_maxlen, bin_vocab_size), name='bin_ip')

direction_input = Input(shape=(paths_maxlen, dir_vocab_size), name='dir_ip')

RNN = LSTM

merge_input = concatenate([feature_input, bin_input, direction_input], name='merge_ip')

# lstm_1 returns the full sequence so that lstm_2 can consume it.
decoder_1 = Bidirectional(RNN(latent_dim, return_state=False, return_sequences=True, name='lstm_1'))

# The latent vector is passed four times as initial_state for each
# bidirectional layer.
decoder_1_outputs = decoder_1(merge_input, initial_state=[label_model_latent, label_model_latent, label_model_latent, label_model_latent])

decoder_2 = Bidirectional(RNN(latent_dim, return_state=False, name='lstm_2'))

decoder_2_outputs = decoder_2(decoder_1_outputs, initial_state=[label_model_latent, label_model_latent, label_model_latent, label_model_latent])

# hidden_1 = Dense(100, activation='softmax', name='h_1')(decoder_1_outputs)

# hidden_2 = Dense(100, activation='softmax', name='h_2')(hidden_1)

# NOTE(review): the hidden Dense layers (f_1, b_1, b_2) use 'softmax' as their
# activation — confirm this is intended for hidden layers.
feat_hidden_1 = Dense(100, activation='softmax', name='f_1')(decoder_2_outputs)
output_feature = Dense(feature_vocab_size, activation='softmax', name='op_feat')(feat_hidden_1)

bin_hidden_1 = Dense(100, activation='softmax', name='b_1')(decoder_2_outputs)
bin_hidden_2 = Dense(100, activation='softmax', name='b_2')(bin_hidden_1)
output_bin = Dense(bin_vocab_size, activation='softmax', name='op_bin')(bin_hidden_2)

output_dir = Dense(dir_vocab_size, activation='softmax', name='op_dir')(decoder_2_outputs)

model = Model([label_model_latent, feature_input, bin_input, direction_input], [output_feature, output_bin, output_dir])

In [None]:
model.summary()

In [None]:
## multiple stacked RNN, multi-op approach
# Extract the latent representation of path_latent_input from label_model,
# then train the multi-output decoder with per-output loss weights.
# NOTE(review): get_hidden_x is defined in a cell further down the notebook
# and WandbCallback is imported in another cell; both must have been run first.
x_latent = get_hidden_x(path_latent_input, model=label_model)

# loss_weights is keyed by output layer name: the bin output carries the
# largest weight (1.0), the feature output 0.2 and the direction output 0.05.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'], loss_weights={'op_feat': 0.2,'op_bin': 1.0,'op_dir': 0.05})
model.fit([x_latent, x_feat, x_bin, x_dir], [y_feat, y_bin, y_dir],batch_size=128, epochs=9000, verbose=1,
          callbacks=[WandbCallback()])

In [0]:
 !pip install wandb

In [None]:
# SECURITY: never commit an API key to the notebook. The key that used to be
# hard-coded on this line has been published with the file and should be
# revoked in the W&B account settings.
# The key is read from the WANDB_API_KEY environment variable instead; when the
# variable is unset, wandb.login() falls back to stored credentials or a prompt.
import os

import wandb

wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
tf.test.is_built_with_cuda()

In [0]:
!cat /var/log/colab-jupyter.log

In [0]:
!pip show keras

In [0]:
!pip show tensorflow

In [0]:
model.summary()

In [None]:
def get_hidden_x(x, model, layer_num=3):
    """Return the output of one layer of ``model`` evaluated on ``x``.

    A backend function is built that maps the input of ``model.layers[0]`` to
    the output of ``model.layers[layer_num]``; it is called once with ``x``.

    Parameters
    ----------
    x : input batch fed to the first layer's input.
    model : model whose ``layers`` list is indexed.
    layer_num : index of the layer whose output is returned (default 3).

    Returns
    -------
    The first (and only) element of the backend function's result list.
    """
    source_tensor = model.layers[0].input
    target_tensor = model.layers[layer_num].output
    extract = K.function([source_tensor], [target_tensor])
    activations = extract([x])
    return activations[0]

### fit model

In [0]:
# Extract the latent representation of path_latent_input from label_model,
# then compile and train the multi-output decoder, logging through the
# Weights & Biases callback.
x_latent = get_hidden_x(path_latent_input, model=label_model)

# The same categorical cross-entropy loss is applied to all three outputs.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([x_latent, x_feat, x_bin, x_dir], [y_feat, y_bin, y_dir],batch_size=6000, epochs=20000, verbose=1,
          callbacks=[WandbCallback()])
## dag_arch

In [0]:
feature_indices

In [0]:
model.summary()

In [0]:
model.summary()

In [0]:
model.layers

In [0]:
# Scratch sketch: how one decision path is split into per-step decoder inputs.
path = ['S', '4A1', '3D0', 'E']
# The end token is the string 'E'; a bare E is an undefined name and raised
# NameError when this cell was run.
feat_ip = ['S', 4, 3, 'E']
ip_2 = [0, 'A', 'D', 0]
ip_3 = [0,'']
# Correct masking issue
# Interconnect RNNS, first to second

In [0]:
model.summary()

In [0]:
model.summary()

In [0]:
model.summary()

In [0]:
model.summary()

In [0]:
# `model` is built with tensorflow.keras layers, so take plot_model from the
# same package; mixing it with the standalone `keras` package is unsupported.
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='dag_approach_model.png')

In [0]:
path_latent_input.shape

In [0]:
x_bin.shape

In [0]:
x_dir.shape

In [0]:
x_dir[0,0]

In [0]:
x_bin[0,0]

In [0]:
# Training run: latent features come from label_model_trial_6; the decoder is
# compiled and trained for 20000 epochs with batch size 80.
# NOTE(review): y_cat is only consumed by the commented-out label-model
# training below; nothing live in this cell reads it.
y_cat = to_categorical(y)

# label_model_trial_6.compile(
#     optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# label_model_trial_6.fit(
#     X, y_cat, batch_size=30, epochs=150, verbose=1, shuffle=True, validation_split=0.2)

x_latent = get_hidden_x(path_latent_input, model=label_model_trial_6)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([x_latent, x_feat, x_bin, x_dir], [y_feat, y_bin, y_dir],batch_size=80, epochs=20000, verbose=1)
# Latent dim -- 10

In [0]:
# Training run: same pipeline as the neighbouring fit cells, 2000 epochs,
# batch size 80.
# NOTE(review): y_cat is computed but not read anywhere in this cell.
y_cat = to_categorical(y)

x_latent = get_hidden_x(path_latent_input, model=label_model_trial_6)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([x_latent, x_feat, x_bin, x_dir], [y_feat, y_bin, y_dir],batch_size=80, epochs=2000, verbose=1)
# latent_dim -- 10

In [0]:
# Training run: same pipeline as the neighbouring fit cells, 4000 epochs,
# batch size 80.
# NOTE(review): y_cat is computed but not read anywhere in this cell.
y_cat = to_categorical(y)

x_latent = get_hidden_x(path_latent_input, model=label_model_trial_6)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([x_latent, x_feat, x_bin, x_dir], [y_feat, y_bin, y_dir],batch_size=80, epochs=4000, verbose=1)
# latent_dim -- 10

In [0]:
# Training run: same pipeline as the neighbouring fit cells, 10000 epochs,
# batch size 80.
# NOTE(review): y_cat is computed but not read anywhere in this cell.
y_cat = to_categorical(y)

x_latent = get_hidden_x(path_latent_input, model=label_model_trial_6)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([x_latent, x_feat, x_bin, x_dir], [y_feat, y_bin, y_dir],batch_size=80, epochs=10000, verbose=1)
# latent_dim -- 10

In [0]:
# Training run: same pipeline as the neighbouring fit cells, 2000 epochs,
# batch size 80; the trailing note records a latent dimension of 100.
# NOTE(review): y_cat is computed but not read anywhere in this cell.
y_cat = to_categorical(y)

x_latent = get_hidden_x(path_latent_input, model=label_model_trial_6)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([x_latent, x_feat, x_bin, x_dir], [y_feat, y_bin, y_dir],batch_size=80, epochs=2000, verbose=1)
# latent_dim -- 100

In [0]:
model.summary()

In [0]:
from addition_rnn_sample_code import CharacterTable

### dag_arch inference flow

In [0]:
model.summary()

In [0]:
feat

In [0]:
dir_indices

In [0]:
iris['data'][0]

In [0]:
# Single-sample smoke test of the decoder: seed step 0 of each input stream
# with its start token and run one forward pass.
token = 'S'

x = iris['data'][0]

x_f = x.reshape(1, feature_size)

# Builtin bool replaces the np.bool alias, which was deprecated in NumPy 1.20
# and removed in 1.24.
x_feat = np.zeros((1, paths_maxlen, feature_vocab_size), dtype=bool)
x_bin = np.zeros((1, paths_maxlen, bin_vocab_size), dtype=bool)
x_dir = np.zeros((1, paths_maxlen, dir_vocab_size), dtype=bool)

x_latent = get_hidden_x(x_f, model=label_model_trial_6)
x_latent = x_latent.reshape(1, latent_dim)

x_feat[0, 0, feature_indices[token]] = 1
x_bin[0, 0, 0] = 1 # Start token for bin is 0. End token is 1.
# Fix: the direction start token belongs in x_dir. It used to be written into
# x_feat with a direction index, leaving x_dir empty and x_feat corrupted.
x_dir[0, 0, dir_indices[token]] = 1
pred = label_model_trial_6.predict(x_f)
label = [np.argmax(pred[0])]

# pred is a list of three arrays: feature, bin and direction probabilities.
pred = model.predict([x_latent, x_feat, x_bin, x_dir])

In [0]:
pred[2].shape

In [0]:
def predict(x):
    """Greedily decode a decision path and a class label for one sample.

    The sample is reshaped to ``(1, feature_size)``, its latent vector is
    taken from ``label_model_trial_6`` via ``get_hidden_x``, and the decoder
    ``model`` is called repeatedly. After each call the arg-max feature, bin
    and direction tokens are written into the next time step of the input
    arrays, until the feature head emits ``'E'`` or ``paths_maxlen`` steps
    have been filled.

    Relies on notebook globals: ``feature_size``, ``paths_maxlen``, the three
    ``*_vocab_size`` values, ``feature_indices``, ``dir_indices``,
    ``indices_feat``, ``indices_bin``, ``indices_dir``, ``model`` and
    ``label_model_trial_6``.

    Parameters
    ----------
    x : numpy.ndarray
        Feature values of a single sample; must be reshapeable to
        ``(1, feature_size)``.

    Returns
    -------
    list
        ``[path, label]`` where ``path`` is a list of tokens starting with
        ``'S'`` and ``label`` is a one-element list holding the arg-max class
        index predicted by ``label_model_trial_6``.
    """
    x_f = x.reshape(1, feature_size)
    token = 'S'
    path = [token]

    # Builtin bool replaces the np.bool alias removed in NumPy 1.24.
    x_feat = np.zeros((1, paths_maxlen, feature_vocab_size), dtype=bool)
    x_bin = np.zeros((1, paths_maxlen, bin_vocab_size), dtype=bool)
    x_dir = np.zeros((1, paths_maxlen, dir_vocab_size), dtype=bool)

    # Infer the latent width from the extracted array instead of a hard-coded
    # local `latent_dim = 25`, which shadowed the notebook-level setting.
    x_latent = get_hidden_x(x_f, model=label_model_trial_6).reshape(1, -1)

    # Seed time step 0 of every stream with its start token.
    x_feat[0, 0, feature_indices[token]] = 1
    x_bin[0, 0, 0] = 1  # Start token for bin is 0. End token is 1.
    # Fix: this used to write into x_feat, so x_dir never got a start token.
    x_dir[0, 0, dir_indices[token]] = 1

    pred = label_model_trial_6.predict(x_f)
    label = [np.argmax(pred[0])]

    index = 1
    while index < paths_maxlen:
        pred = model.predict([x_latent, x_feat, x_bin, x_dir])
        feature_index = np.argmax(pred[0])
        bin_index = np.argmax(pred[1])
        dir_index = np.argmax(pred[2])
        x_feat[0, index, feature_index] = 1
        x_bin[0, index, bin_index] = 1
        x_dir[0, index, dir_index] = 1
        next_feat = indices_feat[feature_index]
        next_bin = indices_bin[bin_index]
        next_dir = indices_dir[dir_index]

        if next_feat == 'E':
            path.append('E')
            break

        # A start/end token in the middle of a path is replaced by the best
        # prediction among vocabulary indices >= 2.
        # NOTE(review): this assumes the start/end tokens occupy indices 0 and
        # 1 of both the bin and the direction vocabularies -- confirm.
        bin_is_special = next_bin == 1 or next_bin == 0
        dir_is_special = next_dir == 'E' or next_dir == 'S'

        if bin_is_special:
            x_bin[0, index, bin_index] = 0
            bin_index = np.argmax(pred[1][0][2:]) + 2
            x_bin[0, index, bin_index] = 1
            next_bin = indices_bin[bin_index]
        # Fix: handled independently of the bin. The former if/else repaired
        # only the bin when both predictions were special tokens.
        if dir_is_special:
            x_dir[0, index, dir_index] = 0
            dir_index = np.argmax(pred[2][0][2:]) + 2
            x_dir[0, index, dir_index] = 1
            next_dir = indices_dir[dir_index]

        if bin_is_special or dir_is_special:
            print('-----from second ifelse', [next_feat, next_bin, next_dir])
        else:
            print('-----', [next_feat, next_bin, next_dir])

        # str() keeps the join working if a vocabulary holds non-string tokens.
        path.append(''.join(map(str, [next_feat, next_bin, next_dir])))
        index += 1

    return [path, label]

In [0]:
dir_indices

In [0]:
bin_indices

In [0]:
test_value = np.array([1,3,2,5])
np.argmax(np.argsort(test_value) == 2)
np.argmax(test_value)

In [0]:
# Store the labels y under column label 4 of shuffle_data.
# NOTE(review): score() reads the label positionally with iloc[i, 4] and the
# path with iloc[i, -1]; confirm that column label 4 is also position 4 and
# that this assignment does not become the last column.
shuffle_data[4] = y

In [0]:
shuffle_data.head()

In [0]:
shuffle_data.iloc[1, 0:4]

In [0]:
def score(rows=range(140, 150)):
    """Compare predicted decision paths with the stored ones and print metrics.

    For every positional row index in ``rows`` the first ``X.shape[1]``
    columns of ``shuffle_data`` are passed to ``predict``. The predicted
    label is compared with the value at column position 4 and the predicted
    path with the value in the last column.

    Relies on notebook globals: ``shuffle_data``, ``X``, ``predict``,
    ``return_yval`` and ``sentence_bleu``.

    Parameters
    ----------
    rows : iterable of int, optional
        Positional row indices of ``shuffle_data`` to evaluate. Defaults to
        ``range(140, 150)``, the slice that used to be hard-coded.

    Returns
    -------
    None
        All results are printed.
    """
    label_hits = []
    bleu_score = []
    path_mismatch_count = []
    traverse_check_count = []
    order_mismatch_count = []

    for i in rows:
        curr_feat = np.array([shuffle_data.iloc[i, 0:X.shape[1]]])
        path, label = predict(curr_feat)
        actual_path = shuffle_data.iloc[i, -1]
        true_label = shuffle_data.iloc[i, 4]

        print('actual vs predicted: ', actual_path, ' vs ', ' '.join(
            path), 'labels: ', true_label, label[0])
        label_hits.append(true_label == label[0])

        # NOTE(review): `path` is a list of tokens. If the stored path is a
        # plain string, this comparison is always unequal and the sorted()
        # check compares characters with tokens -- confirm the stored type.
        if actual_path != path:
            print(' -- Path mismatch -- ')
            if sorted(actual_path) == sorted(path):
                print(' -- Order mismatch -- ')
                order_mismatch_count.append(1)
            else:
                path_mismatch_count.append(1)
                # A differing path still counts as a right traversal when
                # return_yval yields the true label for it (-1 is excluded).
                pred_val = return_yval(path, i+1)
                if pred_val != -1 and pred_val == true_label:
                    traverse_check_count.append(1)

        # BLEU is computed on character level for both paths.
        pred_chars = list(''.join(path))
        actual_chars = list(''.join(actual_path))
        bleu_score.append(sentence_bleu([actual_chars], pred_chars))

    # TODO: Jaccard and Levenshtein path metrics were sketched here but never
    # enabled.
    print('\nLabel accuracy - ', np.mean(label_hits))
    print('Path mismatch count - ', np.sum(path_mismatch_count))
    print('Right traverse count - ', np.sum(traverse_check_count))
    print('Order mismatch count - ', np.sum(order_mismatch_count))
    print('Bleu score of paths - ', np.mean(bleu_score))

In [0]:
score() ## 3000 epochs, test set

In [0]:
bin_labels

In [0]:
score() ## 2000 epochs, test set

In [0]:
score() ## 3000 epochs, test set

In [0]:
## Global tree comparison
## Neural fingerprints

In [0]:
test_value = [0.99, 0.98, 0.32, 0.51]
a = np.argsort(test_value)

In [0]:
a

In [0]:
X

In [0]:
## Second best argmax for bin_gru and dir_gru
## Stack one more gru cell on bin_gru
## normal LSTM
## Bipartite
## Path invariance
## Agreement between all grus as error metric(in terms of end token)

In [0]:
# Invert the token -> index vocabularies into index -> token lookups, which
# predict() uses to turn arg-max indices back into tokens.
indices_feat = {idx: tok for tok, idx in feature_indices.items()}
indices_bin = {idx: tok for tok, idx in bin_indices.items()}
indices_dir = {idx: tok for tok, idx in dir_indices.items()}

In [0]:
indices_feat

In [0]:
indices_bin

In [0]:
indices_dir

In [0]:
test_data

In [0]:
bin_labels

In [0]:
bin_labels.iloc[:10]

### Bipartite graph trials

In [0]:
# Build an undirected graph B linking every value in bin_labels['x'] to the
# value in bin_labels['new_bins'] on the same row.
%matplotlib notebook
import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt

B = nx.Graph()
# The node attribute `bipartite` marks the side: 0 for 'x' values, 1 for
# 'new_bins' values.
# NOTE(review): a value present in both columns becomes a single node whose
# attribute is overwritten by the second call -- confirm the columns are disjoint.
B.add_nodes_from(bin_labels['x'], bipartite=0)
B.add_nodes_from(bin_labels['new_bins'], bipartite=1)
# Add edges only between nodes of opposite node sets
# B.add_edges_from([(1, 'a'), (1, 'b'), (2, 'b'), (2, 'c'), (3, 'c'), (4, 'a')])
B.add_edges_from([(row['x'], row['new_bins']) for idx, row in bin_labels.iterrows()])

In [0]:
# Draw B in two columns: 'x' nodes at x=0 and 'new_bins' nodes at x=1, each at
# the height of its row position in bin_labels.
# labels = dict((n, "(" + n + "," + d['_type'] + ")") for n,d in B.nodes(data=True))
# NOTE(review): `labels` is built here but not passed to either draw call below.
labels = dict((n, d) for n,d in B.nodes(data=True))
# Values that occur more than once keep the position of their last occurrence.
pos = {node:[0, i] for i,node in enumerate(bin_labels['x'])}
pos.update({node:[1, i] for i,node in enumerate(bin_labels['new_bins'])})
nx.draw(B, pos, with_labels=False)
# The position lists are shifted in place after the nodes are drawn, so the
# text labels land 0.25 above their nodes.
for p in pos:  # raise text positions
    pos[p][1] += 0.25
nx.draw_networkx_labels(B, pos)

plt.show()

In [0]:
B.nodes(data=True)

In [0]:
# Map each row's 'label' value to its 'x' value; a repeated 'label' keeps the
# 'x' of its last row.
labels_as_dict = dict((val['label'], val['x']) for key, val in bin_labels.iterrows() )

In [0]:
labels_as_dict

In [0]:
# Unique (new_labels, new_bins) pairs from bin_labels.
new_bins_df = bin_labels.loc[:,('new_labels','new_bins')].drop_duplicates()

In [0]:
new_bins_df

In [0]:
# Extend the lookup with new_labels -> new_bins; a key already present in
# labels_as_dict is overwritten by this update.
labels_as_dict.update(dict((val['new_labels'], val['new_bins']) for key, val in new_bins_df.iterrows() ))

In [0]:
labels_as_dict

In [0]:
dict((n, d) for n,d in B.nodes(data=True))

In [0]:
import networkx as nx
from networkx.algorithms import bipartite
%matplotlib notebook
import matplotlib.pyplot as plt

BG = nx.Graph()
employees = [str(i) for i in range(3)]
movies = ["mA", "mB", "mC"]
BG.add_nodes_from(employees, bipartite=0, _type='emp')
BG.add_nodes_from(movies, bipartite=1, _type='mov')
edges = [("0", "mA"), ("0", "mC"), ("1", "mA"),("1", "mB"), ("2", "mA")]
BG.add_edges_from(edges)
labels = dict((n, "(" + n + "," + d['_type'] + ")") for n,d in BG.nodes(data=True))

# Setting up pos for drawing bipartite graph. See the reference for more info
X, Y = bipartite.sets(BG)
pos = dict()
pos.update( (n, (1, i)) for i, n in enumerate(X) ) # put nodes from X at x=1
pos.update( (n, (2, i)) for i, n in enumerate(Y) ) # put nodes from Y at x=2

plt.figure()
edges = BG.edges()
nx.draw_networkx(BG, pos, edges=edges, labels=labels)

In [0]:
labels

In [0]:
bin_labels