In [1]:
%load_ext autoreload
%autoreload 1

In [1]:
import pandas as pd
import numpy as np
import heapq
import math
import time

import gmpy2
from gmpy2 import mpz
import re

from sklearn import tree

import cProfile

In [2]:
# Read in the dataset. pd.read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapper is needed.
df = pd.read_csv('../data/compas-binary.csv')

In [3]:
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the supported
# replacement. Convert the frame once and slice the resulting array.
data_matrix = df.to_numpy()

# Columns 0-12 form the feature matrix; column 13 is the label vector
# (it is passed as the target to the classifiers below).
x_all = data_matrix[:, :13]
y = data_matrix[:, 13]

In [4]:
# Association Rule Mining (Only one feature)

#support
#supp = [(x[:,i]*y).mean() for i in range(13)]
#supp

In [5]:
#confidence
#conf1 = [sum(x_all[:,i]*y)/sum(x_all[:,i]) for i in range(13)]
#conf1

In [6]:
#confidence
#conf0 = [sum((x_all[:,i]==0)*y)/sum((x_all[:,i]==0)) for i in range(13)]
#conf0

In [7]:
#x_idx = [conf1[i]>=0.5 or conf0[i]>=0.5 for i in range(len(conf1))]

# Using both conf1 and conf0 would select too many features,
# which makes it hard for the algorithm to run to completion,
# so only conf1 is used to select a small subset of the features.
#x_idx = [conf1[i]>=0.5 for i in range(len(conf1))]
#x_idx[0] = True # in the CORELS paper, gender is an important feature, so I add it manually
#x_idx

In [8]:
#select out these features
#x = x_all[:,x_idx]

In [9]:
# Manually select 5 features, following the CORELS paper with lambda = 0.01.
# Alternative set (sex:Female, age:18-20, age:21-22, juvenile-crimes:=0, priors:>3):
##x_idx = [0,1,2,8,12]

# Set in use: sex:Female, age:18-20, age:21-22, priors:2-3, priors:>3
x_idx = [0, 1, 2, 9, 12]
x = x_all[:, x_idx]
x

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 1, 0, 1],
       ...,
       [0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0],
       [1, 0, 1, 1, 0]])

In [10]:
# Manually select 6 features, following the CORELS paper with lambda = 0.01:
# sex:Female, age:18-20, age:21-22, juvenile-crimes:=0, priors:2-3, priors:>3
x_idx6 = [0, 1, 2, 8, 9, 12]
x6 = x_all[:, x_idx6]
x6

array([[0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 1],
       ...,
       [0, 0, 0, 1, 0, 0],
       [1, 0, 0, 1, 1, 0],
       [1, 0, 1, 1, 1, 0]])

In [11]:
# Problem dimensions: number of labelled samples and number of selected
# feature columns (one candidate rule per column of x).
ndata = y.shape[0]
nrule = x.shape[1]

In [12]:
# Baseline: scikit-learn decision tree with default settings on the selected features.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split; without a fixed seed, ties between equally good
# splits can produce a different tree (and node count) on each run.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(x, y)

In [13]:
# Regularized objective of the fitted tree on the training data:
# misclassification rate plus 0.001 per leaf.
accu = clf.score(x, y)

# Leaf count derived from the node count as (nodes + 1) / 2; this assumes every
# internal node has exactly two children. The division yields a float.
nleaves = (clf.tree_.node_count + 1) / 2

R_c = (1 - accu) + 0.001 * nleaves
R_c

0.3520089764007529

In [14]:
# Total number of nodes in the fitted tree, read from clf.tree_.node_count.
print(clf.tree_.node_count)

35


## With similar support bound

### Calculate the similar support bound when highly correlated features substitute for each other

### regular expression is used

In [15]:
from corels_dt_nosimilar import bbound_nosimilar
from corels_dt_similar import bbound_similar
from corels_dt_similar_when_big import bbound_similar_when_big

from corels_dt_similar_when_sub import bbound_similar_when_sub

In [16]:
# This run uses the actual loss for both the support bound and the
# accurate support bound.
bbound_nosimilar(
    x,
    y,
    lamb=0.0035,
    prior_metric="objective",
    MAXDEPTH=4,
)

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
COUNT: 100000
COUNT: 200000
COUNT: 300000
COUNT: 400000
COUNT: 500000
COUNT: 600000
COUNT: 700000
COUNT: 800000
COUNT: 900000
COUNT: 1000000
COUNT: 1100000
COUNT: 1200000
COUNT: 1300000
COUNT: 1400000
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  184.81254839897156
lambda:  0.0035
leaves:  [(3,), (-3, 1), (-4, -3, -1), (-5, -3, -1, 4), (-3, -1, 4, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  121295
time when the best tree is achieved:  14.326935529708862
TOTAL COUNT:  1477303


<corels_dt_nosimilar.CacheTree at 0x7f0eeadb9400>

In [19]:
# bbound_nosimilar on the 5-feature matrix x with lamb=0.01.
bbound_nosimilar(
    x,
    y,
    lamb=0.01,
    prior_metric="objective",
    MAXDEPTH=4,
)

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
COUNT: 100000
COUNT: 200000
COUNT: 300000
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  36.048810720443726
lambda:  0.01
leaves:  [(1,), (-3, -1), (-1, 3)]
prediction:  [1, 0, 1]
Objective:  0.3764601129289127
COUNT of the best tree:  7
time when the best tree is achieved:  0.31281542778015137
TOTAL COUNT:  303874


<corels_dt_nosimilar.CacheTree at 0x7f0eeb6304e0>

In [18]:
# bbound_similar on the 5-feature matrix x with lamb=0.01.
bbound_similar(
    x,
    y,
    lamb=0.01,
    prior_metric="objective",
    MAXDEPTH=4,
)

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  81.10475730895996
lambda:  0.01
leaves:  [(1,), (-3, -1), (-1, 3)]
prediction:  [1, 0, 1]
Objective:  0.3764601129289127
COUNT of the best tree:  7
time when the best tree is achieved:  0.31783342361450195
TOTAL COUNT:  57485


<corels_dt_similar.CacheTree at 0x7f0eeb630e80>

In [17]:
# Condition noted for this run: `if lb + lamb - deadprefix_lb < 0: continue`,
# with entries added to the head of the deadprefix list.
# The actual loss is used for both the support bound and the accurate support bound.
bbound_similar(
    x,
    y,
    lamb=0.0035,
    prior_metric="objective",
    MAXDEPTH=4,
)

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
COUNT: 100000
COUNT: 200000
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  739.2445468902588
lambda:  0.0035
leaves:  [(3,), (-3, 1), (-4, -3, -1), (-5, -3, -1, 4), (-3, -1, 4, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  119783
time when the best tree is achieved:  103.84894824028015
TOTAL COUNT:  264202


<corels_dt_similar.CacheTree at 0x7f0f326d86a0>

In [20]:
# Profile a bbound_similar run (lamb=0.0035) with cProfile. The statement is
# single-quoted so the inner double quotes need no escaping.
cProfile.run('bbound_similar(x, y, lamb=0.0035, prior_metric="objective", MAXDEPTH = 4)')

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
COUNT: 100000
COUNT: 200000
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  2763.292908191681
lambda:  0.0035
leaves:  [(3,), (-3, 1), (-4, -3, -1), (-5, -3, -1, 4), (-3, -1, 4, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  119783
time when the best tree is achieved:  512.4111704826355
TOTAL COUNT:  264202
         446076681 function calls (446076679 primitive calls) in 2763.462 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        5    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:997(_handle_fromlist)
        1    0.023    0.023 2763.462 2763.462 <string>:1(<module>)
        3    0.000    0.000    0.000    0.000 __init__.py:200(iteritems)
        1    0.000    0.000    0.000    0.000 _bootlocale.py:23(getpref

        1    0.000    0.000    0.000    0.000 {pandas._libs.algos.rank_1d_float64}
        2    0.001    0.001    0.001    0.001 {pandas._libs.lib.maybe_convert_objects}




In [None]:
# Condition noted for this run: `if lb + lamb - deadprefix_lb <= 0:`
bbound_similar_when_sub(
    x,
    y,
    lamb=0.0035,
    corr_threshold=0.15,
    prior_metric="objective",
    MAXDEPTH=4,
)

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]


In [None]:
# Condition noted for this run: `if lb + lamb - deadprefix_lb <= 0:`
# Profile a bbound_similar_when_sub run with cProfile. The statement is
# single-quoted so the inner double quotes need no escaping.
cProfile.run('bbound_similar_when_sub(x, y, lamb=0.0035, corr_threshold = 0.15, prior_metric="objective", MAXDEPTH = 4)')