In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from dtextract.data.consts import *
from dtextract.impl.funcs import *
from dtextract.impl.dists import *
from dtextract.util.util import *
from dtextract.data.consts_generated import *
from dtextract.examples.dt_interpreter import *
from dtextract.examples.runCompare import *

import pandas as pd
import numpy as np

In [None]:
# Run configuration.
# NOTE(review): the TEST_* names are not assigned in any visible cell; presumably they
# come from one of the wildcard imports in the import cell — confirm.
output = TEST_OUTPUT  # Log file handed to setCurOutput(); log messages are written there
path = TEST_PATH  # Path to the input dataset in CSV format
data_types = TEST_DATA_TYPES  # Forwarded to readCsv together with path and has_header
has_header = TEST_HAS_HEADER  # Forwarded to readCsv; also selects the labelled description branch
isClassify = True  # Whether the problem is a classification (True) or regression (False) problem

In [None]:
# The main algorithm's parameters
nComponents = 1000  # Number of components (the Gaussian mixtures); passed to CategoricalGaussianMixtureDist
maxSize = 64  # Maximum tree size; passed to ParamsLearn
nPts = 2000  # Number of points used in active sampling; passed to ParamsSimp
nTestPts = 2000  # Number of test points used in active sampling; passed to ParamsSimp

# Decision tree training parameters: the greedy baseline tree gets the same size
# budget as the extracted tree (its leaf limit is derived from maxDtSize).
maxDtSize = maxSize

In [None]:
# Register the configured log destination (presumably used by subsequent log() calls —
# verify against setCurOutput), then load the dataset.
setCurOutput(output)
log('Parsing CSV...', INFO)
# readCsv returns a 4-tuple: df feeds the train/test split, while res and catFeats are
# forwarded to constructDataMatrix. resMap is not used in the visible cells.
(df, res, resMap, catFeats) = readCsv(path, has_header, data_types)
log('Done!', INFO)

In [None]:
# Split the parsed frame into a training part and a test part.
# NOTE(review): trainingProp is not assigned in any visible cell; presumably it comes
# from one of the wildcard imports — confirm before a fresh-kernel run.
log('Splitting into training and test...', INFO)
(trainDf, testDf) = split(df, trainingProp)
log('Done!', INFO)

In [None]:
# Build the feature matrix and target for each partition, using the same res and
# catFeats for both. The training-side feature index lists are later passed to
# CategoricalGaussianMixtureDist; the test-side ones are not used in the visible cells.
log('Constructing data matrices...', INFO)
(XTrain, yTrain, catFeatIndsTrain, numericFeatIndsTrain) = constructDataMatrix(trainDf, res, catFeats)
(XTest, yTest, catFeatIndsTest, numericFeatIndsTest) = constructDataMatrix(testDf, res, catFeats)
log('Done!', INFO)

In [None]:
# Fit the random forest that the decision tree is later extracted from.
# NOTE(review): nTrees is not assigned in any visible cell; presumably it comes from
# one of the wildcard imports — confirm.
log('Training random forest...', INFO)
if isClassify:
    rfConstructor = RandomForestClassifier
else:
    rfConstructor = RandomForestRegressor
rf = rfConstructor(n_estimators=nTrees)
rf.fit(XTrain, yTrain)
log('Done!', INFO)

In [None]:
# Pick the scoring helper for the task type: f1Vec for classification, mseVec for
# regression. Both are called as (predict_fn, X, y).
if isClassify:
    rfScoreFunc = f1Vec
else:
    rfScoreFunc = mseVec

# Score the forest against the true targets on both partitions.
rfTrainScore = rfScoreFunc(rf.predict, XTrain, yTrain)
rfTestScore = rfScoreFunc(rf.predict, XTest, yTest)

log('Training score: ' + str(rfTrainScore), INFO)
log('Test score: ' + str(rfTestScore), INFO)

In [None]:
# Step 2: Set up decision tree extraction inputs
# NOTE(review): tgtScore and minGain are not assigned in any visible cell; presumably
# they come from one of the wildcard imports — confirm.
paramsLearn = ParamsLearn(tgtScore, minGain, maxSize)
paramsSimp = ParamsSimp(nPts, nTestPts, isClassify)

# Step 3: Function
# Wrap the trained forest; rfFunc is what learnDTSimp receives, and its output on
# XTrain is also used as the fit target for the greedy baseline tree.
rfFunc = getRfFunc(rf)

In [None]:
# Step 4: Build the distribution object handed to learnDTSimp. It is constructed from
# the training matrix, its categorical/numeric feature indices, and nComponents.
dist = CategoricalGaussianMixtureDist(XTrain, catFeatIndsTrain, numericFeatIndsTrain, nComponents)

In [None]:
# Step 5: Extract decision tree
# learnDTSimp returns the extracted tree plus a second value (dtMap); both are later
# passed to interpret_tree.
dtExtract, dtMap = learnDTSimp(genAxisAligned, rfFunc, dist, paramsLearn, paramsSimp)

In [None]:
# Log the extracted tree three ways: its string form, its node count, and the
# result of toDotGraph() (labelled as DOT language in the log).
log('Decision tree:', INFO)
log(str(dtExtract), INFO)
log('Node count: ' + str(dtExtract.nNodes()), INFO)
log('DT in DOT language:', INFO)
log(str(dtExtract.toDotGraph()), INFO)

In [None]:
# Pick the scoring helper for the task type: f1 for classification, mse for regression.
if isClassify:
    scoreFunc = f1
else:
    scoreFunc = mse

# Relative scores: the extracted tree is compared against the forest's own
# predictions, i.e. how closely the tree reproduces the model it was extracted from.
rfTrainPredictions = rf.predict(XTrain)
dtExtractRelTrainScore = scoreFunc(dtExtract.eval, XTrain, rfTrainPredictions)
rfTestPredictions = rf.predict(XTest)
dtExtractRelTestScore = scoreFunc(dtExtract.eval, XTest, rfTestPredictions)

log('Relative training score: ' + str(dtExtractRelTrainScore), INFO)
log('Relative test score: ' + str(dtExtractRelTestScore), INFO)

# Absolute scores: the extracted tree is compared against the true targets.
dtExtractTrainScore = scoreFunc(dtExtract.eval, XTrain, yTrain)
dtExtractTestScore = scoreFunc(dtExtract.eval, XTest, yTest)

log('Training score: ' + str(dtExtractTrainScore), INFO)
log('Test score: ' + str(dtExtractTestScore), INFO)

In [None]:
# Step 6: Train a (greedy) decision tree
# NOTE(review): DecisionTreeClassifier / DecisionTreeRegressor are not imported
# explicitly in the visible import cell; presumably one of the wildcard imports
# provides them — confirm.
log('Training greedy decision tree', INFO)
# A full binary tree with n nodes has (n + 1) / 2 leaves. Floor division keeps the
# result an int: under Python 3, "/" is true division and would give 32.5 for
# maxDtSize = 64, which scikit-learn rejects because max_leaf_nodes must be an
# int (or None).
maxLeaves = (maxDtSize + 1) // 2
dtConstructor = DecisionTreeClassifier if isClassify else DecisionTreeRegressor
dtTrain = dtConstructor(max_leaf_nodes=maxLeaves)
# The baseline tree is fit to the forest's outputs (rfFunc), not to yTrain, so it
# is trained on the same target the extracted tree approximates.
dtTrain.fit(XTrain, rfFunc(XTrain))
log('Done!', INFO)
log('Node count: ' + str(dtTrain.tree_.node_count), INFO)

In [None]:
def dtTrainEval(x):
    """Run the greedy tree on one input, reshaped into a single-row 2-D array for predict."""
    return dtTrain.predict(x.reshape(1, -1))


# Relative scores: the greedy tree is compared against the forest's own predictions.
dtTrainRelTrainScore = scoreFunc(dtTrainEval, XTrain, rf.predict(XTrain))
dtTrainRelTestScore = scoreFunc(dtTrainEval, XTest, rf.predict(XTest))

log('Relative training score: ' + str(dtTrainRelTrainScore), INFO)
log('Relative test score: ' + str(dtTrainRelTestScore), INFO)

In [None]:
# Absolute scores: the greedy tree is compared against the true targets.
# Each lambda reshapes its input x into a single-row 2-D array before calling
# dtTrain.predict, adapting predict to the callable that scoreFunc is given.
dtTrainTrainScore = scoreFunc(lambda x: dtTrain.predict(x.reshape(1, -1)), XTrain, yTrain)
dtTrainTestScore = scoreFunc(lambda x: dtTrain.predict(x.reshape(1, -1)), XTest, yTest)
    
log('Training score: ' + str(dtTrainTrainScore), INFO)
log('Test score: ' + str(dtTrainTestScore), INFO)

In [None]:
# Interpret the extracted tree on the test set; interpret_tree returns three values,
# which are then checked by assert_interpretation. contributions is also passed to
# interpret_samples when the per-sample descriptions are built.
predictions, biases, contributions = interpret_tree(dtExtract, dtMap, XTest, yTest)
assert_interpretation(predictions, biases, contributions)

In [None]:
# Concatenate the description of every explanation into one string.
descriptions = ''
if has_header:
    # NOTE: `headers` is not assigned in any visible cell — define it before running
    # this cell. It should be trimmed to the names of the input columns only, with
    # one entry per input column. For example, if the first column is an id column
    # and the last column is the label column, you can use:
    # headers = list(pd.read_csv(path))[1:-1]
    headers = np.array(headers)
    exps = interpret_samples(rf, dtExtract, XTest, contributions, labels=headers)
    descriptions += ''.join(
        exps[i].get_description(5, True, headers) for i in range(len(exps))
    )
else:
    exps = interpret_samples(rf, dtExtract, XTest, contributions)
    descriptions += ''.join(
        exps[i].get_description(5) for i in range(len(exps))
    )