In [5]:
import os
import json
import pandas as pd
import numpy as np

from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [9]:
data_name = 'user_defined'
def output_test(cols, data, target_names, real_min, real_max, y_pred, y_gt):
    """Write an evaluation payload to ``./output/<data_name>/test.json``.

    The seven arguments are stored unchanged in one JSON object under the keys
    ``columns``, ``data``, ``target_names``, ``real_min``, ``real_max``,
    ``y_pred`` and ``y_gt`` (in that order), so every argument must already be
    JSON-serialisable (e.g. NumPy arrays converted with ``.tolist()``).

    The output directory is created when missing; an existing file is
    overwritten.

    Raises:
        TypeError: if an argument cannot be serialised by ``json``.
        OSError: if the directory or file cannot be created.
    """
    filename = "./output/"+data_name+"/test.json"
    # exist_ok=True replaces the exists()-then-makedirs() pair, which fails when
    # the directory appears between the check and the creation.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    to_output = {
        'columns': cols,
        'data': data,
        'target_names': target_names,
        'real_min': real_min,
        'real_max': real_max,
        'y_pred': y_pred,
        'y_gt': y_gt,
    }
    with open(filename, 'w') as output:
        json.dump(to_output, output)

## Read Data
You can change the code below to read your own data.

In [4]:
# Load the input table; the first CSV row is the header and no column is used as the index.
df = pd.read_csv("./input/synthetic_data.csv", header=0, index_col=None)
df.head()

Unnamed: 0,x1,x2,x3,x4,x5,y
0,0.368111,-0.766453,-1.0,1.0,-1.0,0
1,0.855474,-0.245013,1.0,1.0,-1.0,0
2,-0.527025,0.626383,-1.0,1.0,1.0,0
3,0.616431,-0.475369,1.0,-1.0,1.0,0
4,0.016539,-0.975422,1.0,1.0,-1.0,1


In [8]:
# --- Prepare data ---
# The label column is removed by name, and the feature names are taken from the
# same frame, so X and the column labels stay aligned even if 'y' is not the
# last column of the CSV.
features = df.drop(columns=['y'])
X = features.values
y = df['y'].values.reshape(-1)

# A fixed random_state makes the 80/20 split (and therefore every number
# reported below) reproducible across kernel restarts.
train, test, train_labels, test_labels = train_test_split(
    X, y, test_size=0.2, random_state=1)

train_df = pd.DataFrame(train, columns=features.columns)
test_df = pd.DataFrame(test, columns=features.columns)

# --- Train the model ---
clf = MLPClassifier(random_state=1, max_iter=300)
clf.fit(train, train_labels)

# --- Report accuracy on the held-out split ---
print(clf.score(test, test_labels))

0.98




## Output Training Data

In [None]:
# Export the training split together with the classifier's predictions on it.
'''name the target classes'''
target_names = ["False", "True"]
# Per-column minimum / maximum over the full feature matrix X (train and test rows).
min_val = np.min(X, axis=0)
max_val = np.max(X, axis=0)

y_pred = clf.predict(train)
# NOTE(review): no definition of `output_data` is visible in this notebook (only
# `output_test` is defined), so on a fresh kernel this call raises NameError
# unless the helper is defined elsewhere — TODO confirm the intended function.
output_data(df.columns[:-1].values.tolist(), train.tolist(), target_names, 
            min_val.tolist(), max_val.tolist(),
            y_pred.tolist(), train_labels.tolist())

In [177]:
# Build one 10-bin histogram per feature column of X and write them all to
# ./output/<data_name>/histogram.json.
# NOTE(review): `to_keep`, `real_min` and `real_max` are not assigned in any
# visible cell — presumably the feature names and the per-column min/max of X;
# TODO confirm.
dist_list = []

for attr_idx in range(len(to_keep)):
    counts, bin_edges = np.histogram(
        X[:, attr_idx],
        bins=10,
        range=(real_min[attr_idx], real_max[attr_idx]),
    )
    dist_list.append({
        'hist': counts.tolist(),
        'bin_edges': bin_edges.tolist(),
    })

histogram_path = './output/'+ data_name + '/histogram.json'
# Create the output directory like the other writers in this notebook do, so
# this cell does not depend on an earlier cell having created it.
os.makedirs(os.path.dirname(histogram_path), exist_ok=True)
with open(histogram_path, 'w') as output:
    json.dump({'histogram': dist_list}, output)

In [178]:
# Predictions on every row of X. This rebinds `y_pred`, which an earlier cell
# assigned to the predictions on the training split only.
y_pred = clf.predict(X)

In [179]:
# def prob(data):
#     return np.array(list(zip(1-clf.predict(data),clf.predict(data))))

In [180]:
# Accuracy of the classifier over all rows (training and test rows together).
print(clf.score(X, y))

0.851782363977486


In [181]:
# Accuracy of the classifier on the held-out test split.
clf.score(test,test_labels )

0.821875

In [182]:
# Classifier predictions on the train split, the test split and the full data.
# NOTE(review): the `_svm` suffix is misleading — `clf` is the MLPClassifier
# trained above; `y_svm_` repeats the `clf.predict(X)` call already stored in `y_pred`.
y_train_svm = clf.predict(train)
y_test_svm = clf.predict(test)
y_svm_ = clf.predict(X)

In [183]:
# Transform numerical values to categorical ones (low, medium, high): the cut
# points are the per-column 33rd and 67th percentiles of X.
real_3_1, real_3_2 = np.percentile(X, q=[33, 67], axis=0)

def transform_func(col_idx, ele):
    """Map one numerical value to its category code for column ``col_idx``.

    Returns 0 when ``ele`` lies below ``real_3_1[col_idx]``, 1 when it lies
    below ``real_3_2[col_idx]``, and 2 otherwise. Both cut-point tables are
    read from the notebook namespace.
    """
    if ele < real_3_1[col_idx]:
        return 0
    if ele < real_3_2[col_idx]:
        return 1
    return 2
    
# Discretise X column by column, then transpose so that rows are samples and
# columns are features again.
cate_X = np.array([
    [transform_func(col_idx, ele) for ele in X[:, col_idx]]
    for col_idx in range(X.shape[1])
]).T
    

In [184]:
# Description of the low/medium/high discretisation: two thresholds per column,
# given as the per-column 33rd and 67th percentile values computed above.
real_percentile = {
    'num_threshold': 2,
    'percentile': [.33, .67],
    'percentile_table': [real_3_1.tolist(), real_3_2.tolist()]
}
# NOTE(review): this call passes eight positional arguments (including
# `real_percentile`), but `output_test` as defined in this notebook accepts seven,
# so it raises TypeError — TODO reconcile the call with the intended signature.
# NOTE(review): `to_keep`, `real_min` and `real_max` are not assigned in any
# visible cell — TODO confirm where they come from.
output_test(list(to_keep), X.tolist(), target_names, real_min.tolist(), real_max.tolist(), 
            real_percentile, y_pred.tolist(), y.tolist())

In [185]:
# new_label = np.zeros(shape=y_svm.shape)
# new_label[wrong_index] = 1

In [186]:
# Filtering parameters: 'support' is used below as the forest's
# min_samples_leaf; the whole dict is also handed to tree_node_info.
filter_threshold = dict(
    support=20,
    fidelity=0.9,
    num_feat=5,
    depth=20,
)

In [187]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the discretised features to reproduce the
# classifier's own predictions (y_svm_), then print how often it agrees with them.
rfc = RandomForestClassifier(
    min_samples_leaf=filter_threshold['support'],
    random_state=1234,
).fit(cate_X, y_svm_)
print(rfc.score(cate_X, y_svm_))

0.7823639774859287




In [188]:
''' get tree info from the forest '''
# The individual fitted decision trees that make up the random forest.
estimators = rfc.estimators_

In [189]:
import importlib

In [190]:
# Project-local helper module. It is reloaded so that edits to
# tree_node_info.py are picked up without restarting the kernel.
import tree_node_info
importlib.reload(tree_node_info)

# tree_node_obj = tree_node_info.tree_node_info()
# tree_node_obj.initialize(estimators[0], cate_X, y, y_svm_, to_keep, filter_threshold)
# tree_node_obj.node_info_dict

<module 'tree_node_info' from '/Users/junyuan/Documents/_/python/rule_vis/tree_node_info.py'>

In [191]:
# Collect the node-info dictionary of every tree in the forest.
tree_list = []    
tree_node_obj = tree_node_info.tree_node_info()

# NOTE(review): one `tree_node_obj` is re-initialised for every tree. If
# `initialize` updates `node_info_dict` in place rather than rebinding it, all
# entries of `tree_list` alias the same dict — TODO confirm in tree_node_info.py.
# NOTE: the loop variable `estimator` stays bound to the last tree afterwards
# and is read again by later cells.
for estimator in estimators:
    tree_node_obj.initialize(estimator, cate_X, y, y_svm_, to_keep, filter_threshold)
    tree_list.append(tree_node_obj.node_info_dict)

In [192]:
'''construct a tree'''
# Build one forest object from the per-tree node dictionaries. `forest`,
# `initialize` and `construct_tree` come from the project-local tree_node_info
# module; the chained call is the cell's last expression, so its return value
# is what the cell displays.
forest_obj = tree_node_info.forest()
forest_obj.initialize(tree_list, cate_X, y, y_svm_, to_keep).construct_tree()


<tree_node_info.forest at 0x11c237350>

In [193]:
# Hierarchical representation of the forest; written to tree.json by output_tree below.
vis_tree = forest_obj.get_vis_hierarchy()

In [194]:
# Number of leaves of the constructed forest; one rule is extracted per leaf further down.
len(forest_obj.leaves)

35

In [195]:
def output_tree(tree):
    """Write the tree data in hierarchical form to ``./output/<data_name>/tree.json``.

    ``tree`` is stored under the single key ``"tree"`` and must be
    JSON-serialisable. The output directory is created when missing; an
    existing file is overwritten.
    """
    filename = "./output/"+data_name+"/tree.json"
    # exist_ok=True avoids the check-then-create race of exists() + makedirs().
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w') as output:
        json.dump({"tree": tree}, output)

def output_node_info(node_info_arr, max_depth=None):
    """Write node information to ``./output/<data_name>/node_info.json``.

    Args:
        node_info_arr: JSON-serialisable node information, stored under the
            key ``"node_info_arr"``.
        max_depth: value stored under ``"max_depth"``. When omitted, the depth
            of the notebook-level ``estimator`` (``estimator.tree_.max_depth``)
            is used, which is the original behaviour.

    The output directory is created when missing; an existing file is
    overwritten.
    """
    if max_depth is None:
        # Resolved before the file is opened, so a missing `estimator` no longer
        # leaves a truncated node_info.json behind.
        max_depth = estimator.tree_.max_depth

    filename = "./output/"+data_name+"/node_info.json"
    # exist_ok=True avoids the check-then-create race of exists() + makedirs().
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w') as output:
        json.dump(
            {"node_info_arr": node_info_arr,
             "max_depth": max_depth,
            }, output)
        
# Write node_info.json and tree.json for the constructed forest.
# NOTE(review): output_node_info reads the notebook-level `estimator`, which at
# this point is the last tree visited by the loop over `estimators`, so the
# stored "max_depth" is that single tree's depth — TODO confirm this is intended.
output_node_info(forest_obj.tree_node_dict)
output_tree(vis_tree)

In [196]:
# rep_range[f][c] is the [lower, upper] real-valued interval that category c
# (0, 1 or 2) stands for on feature f: min..p33, p33..p67 and p67..max.
feature_count = len(to_keep)
rep_range = np.zeros(shape=(feature_count, 3, 2))

for idx in range(feature_count):
    cut_points = (real_min[idx], real_3_1[idx], real_3_2[idx], real_max[idx])
    for level in range(3):
        rep_range[idx, level] = (cut_points[level], cut_points[level + 1])
    
    
def translate_rule(feat_range, feat_idx):
    """Turn a category interval into a rule condition on real values.

    Args:
        feat_range: pair ``(low, high)`` bounding the category codes 0..2 that
            are still allowed for the feature.
        feat_idx: index of the feature; also selects the row of ``rep_range``.

    Returns:
        A dict with ``'feature'`` and ``'sign'``. For ``'<='`` and ``'>'`` it
        carries one ``'threshold'``; for ``'range'`` it carries ``'threshold0'``
        and ``'threshold1'``.

    Raises:
        IndexError: if no category code lies inside ``feat_range``.
    """
    low, high = feat_range[0], feat_range[1]
    # Category codes that fall inside the interval.
    levels = [code for code in range(3) if low <= code <= high]
    first, last = levels[0], levels[-1]
    bounds = rep_range[feat_idx]

    if first == 0:
        # (min, threshold]
        return {
            'feature': feat_idx,
            'sign': '<=',
            'threshold': bounds[last][1],
        }
    if last == 2:
        # (threshold, max]
        return {
            'feature': feat_idx,
            'sign': '>',
            'threshold': bounds[first][0],
        }
    # (threshold0, threshold1]
    return {
        'feature': feat_idx,
        'sign': 'range',
        'threshold0': bounds[first][0],
        'threshold1': bounds[last][1],
    }
        

In [197]:
# Trace back from every leaf to the root and collect, per feature, the interval
# of category codes (0..2) that the path still allows; each narrowed interval
# becomes one rule condition.
# NOTE(review): `estimator` is the variable left over from the loop over the
# forest's trees; `tree_features` and `node_threshold` are not used in this cell.
tree_features = estimator.tree_.feature
node_threshold = estimator.tree_.threshold
rule_lists = []

for i in range(len(forest_obj.leaves)):
    node_id = forest_obj.leaves[i]
    print("="*10, "leave node %d"%(node_id), "="*10)
    # Column 0 is the lower bound (starts at 0), column 1 the upper bound
    # (starts at 2). The default float64 dtype is used on purpose:
    # np.float128 does not exist on every platform.
    feature_range = np.zeros(shape=(len(to_keep), 2))
    feature_range[:, 1] = 2

    while node_id >= 0:
        p_id = forest_obj.tree_node_dict[node_id]['parent']
        sign = forest_obj.tree_node_dict[node_id]['sign']
        if (p_id >= 0):
            # The split that leads to this node is stored on the parent.
            f_idx = forest_obj.tree_node_dict[p_id]['feature']
            thrshd = forest_obj.tree_node_dict[p_id]['threshold']
            if (sign == '<='):
                # (min, thrshd): tighten the upper bound
                if (feature_range[f_idx][1] > thrshd):
                    feature_range[f_idx][1] = thrshd
            else:
                # (thrshd, max): tighten the lower bound
                if (feature_range[f_idx][0] < thrshd):
                    feature_range[f_idx][0] = thrshd
        node_id = p_id

    # Only features whose interval was narrowed contribute a condition.
    rules = [
        translate_rule(feature_range[j], j)
        for j in range(len(feature_range))
        if (feature_range[j][0] != 0 or feature_range[j][1] != 2)
    ]
    rule_lists.append({
        "label": int(np.argmax(forest_obj.tree_node_dict[forest_obj.leaves[i]]['value'])),
        "node_id": int(forest_obj.leaves[i]),
        "rules": rules,})



In [198]:
def output_rule_list(rule_lists, target_names):
    """Print the rule lists and write them to ``./output/<data_name>/list.json``.

    The file holds one JSON object with the keys ``"rule_lists"`` and
    ``"target_names"``; both arguments must be JSON-serialisable. The output
    directory is created when missing; an existing file is overwritten.
    """
    filename = "./output/"+data_name+"/list.json"
    # exist_ok=True avoids the check-then-create race of exists() + makedirs().
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    to_write = {"rule_lists": rule_lists, "target_names": target_names}
    print(to_write)
    with open(filename, 'w') as output:
        json.dump(to_write, output)
        

# NOTE(review): reverse() mutates rule_lists in place, so running this cell a
# second time flips the order back; later cells index rule_lists by position,
# so the order matters — run it exactly once after the trace-back cell.
rule_lists.reverse()
print("="*30)
# print(rule_lists)
output_rule_list(rule_lists, target_names)

{'rule_lists': [{'label': 0, 'node_id': 125, 'rules': [{'feature': 0, 'sign': '>', 'threshold': 8.8}, {'feature': 1, 'sign': '<=', 'threshold': 0.42999999999999994}, {'feature': 2, 'sign': '>', 'threshold': 0.37}, {'feature': 3, 'sign': '<=', 'threshold': 2.0}, {'feature': 4, 'sign': '<=', 'threshold': 0.07400000000000001}]}, {'label': 0, 'node_id': 120, 'rules': [{'feature': 1, 'sign': 'range', 'threshold0': 0.42999999999999994, 'threshold1': 0.6}, {'feature': 4, 'sign': '<=', 'threshold': 0.07400000000000001}, {'feature': 5, 'sign': '>', 'threshold': 10.0}, {'feature': 6, 'sign': '<=', 'threshold': 52.0}, {'feature': 10, 'sign': '>', 'threshold': 10.8}]}, {'label': 0, 'node_id': 113, 'rules': [{'feature': 2, 'sign': '>', 'threshold': 0.15}, {'feature': 4, 'sign': '<=', 'threshold': 0.07400000000000001}, {'feature': 5, 'sign': '<=', 'threshold': 10.0}, {'feature': 6, 'sign': '<=', 'threshold': 52.0}, {'feature': 10, 'sign': '>', 'threshold': 10.8}]}, {'label': 0, 'node_id': 109, 'rule

In [199]:
# import tree_node_info
# importlib.reload(tree_node_info)

# Select a subset of the extracted rules with the project-local forest_rules helper.
rs = tree_node_info.forest_rules()

# NOTE(review): `real_min` / `real_max` are not assigned in any visible cell — TODO confirm.
rs.initialize(df, rule_lists, real_min, real_max)
# `ts` is used below as a collection of positions into rule_lists.
ts = rs.find_the_min_set()

In [200]:
# Look up the 'fidelity' stored on the forest node behind every selected rule
# and print how many rules were selected.
leng = len(ts)
fidel_list = []
for rid in ts:
    # Each entry of ts is a position in rule_lists.
    node_id = rule_lists[rid]['node_id']
    fidel_list.append(forest_obj.tree_node_dict[node_id]['fidelity'])
print(leng)

24


In [201]:
# Display the fidelity of each selected rule, in the order of `ts`.
fidel_list

[0.9204545454545454,
 0.9186046511627907,
 0.9342105263157895,
 0.9302325581395349,
 0.9178082191780822,
 0.9705882352941176,
 0.9180327868852459,
 1.0,
 0.9032258064516129,
 0.9206349206349206,
 0.9482758620689655,
 0.9761904761904762,
 0.9069767441860465,
 0.9423076923076923,
 0.9666666666666667,
 1.0,
 0.9074074074074074,
 0.975609756097561,
 0.9767441860465116,
 0.9382716049382716,
 0.9705882352941176,
 0.9722222222222222,
 0.9259259259259259,
 0.9722222222222222]

In [202]:
# NOTE(review): the scaling code in this cell is commented out, so the cell
# only evaluates the string literal below.
''' scaling '''
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# scaler.fit(X)
# scaled = scaler.transform(X)

' scaling '

In [203]:
# NOTE(review): the PCA projection in this cell is commented out (it reads the
# `scaled` array from the disabled scaling code), so the cell only evaluates
# the string literal below.
''' projection '''
# from sklearn.decomposition import PCA
# import altair as alt

# result_pca = PCA(n_components=2).fit_transform(scaled)
# pca_df = pd.DataFrame(data=result_pca, columns=['x','y'])

# alt.Chart(pca_df).mark_point().encode(
#     x='x:Q',
#     y='y:Q',
# )

' projection '

In [204]:
# def output_projection(coordinate):
#     filename = "./output/"+data_name+"/projection.json"
#     directory = os.path.dirname(filename)
#     if not os.path.exists(directory):
#         os.makedirs(directory)
#     with open(filename, 'w') as output:
#         output.write(json.dumps(
#             {"projection": coordinate,
#             }))
# output_projection(result_pca.tolist())