In [1]:
%load_ext autoreload
%autoreload 1

In [1]:
import pandas as pd
import numpy as np
import heapq
import math
import time

import gmpy2
from gmpy2 import mpz
import re

from sklearn import tree

import cProfile

In [2]:
# Read in the dataset. pd.read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapper is needed.
df = pd.read_csv('../data/compas-binary.csv')

In [3]:
# DataFrame.as_matrix() is deprecated (pandas 0.23) and removed in pandas 1.0;
# .values returns the same NumPy array and works on old and new versions.
# Columns 0..12 are the candidate features, column 13 is the target that is
# later passed as y to the classifier and to the bbound_* functions.
x_all = df.values[:, :13]

y = df.values[:, 13]

In [4]:
# Association Rule Mining (Only one feature)

#support
#supp = [(x[:,i]*y).mean() for i in range(13)]
#supp

In [5]:
#confidence
#conf1 = [sum(x_all[:,i]*y)/sum(x_all[:,i]) for i in range(13)]
#conf1

In [6]:
#confidence
#conf0 = [sum((x_all[:,i]==0)*y)/sum((x_all[:,i]==0)) for i in range(13)]
#conf0

In [7]:
#x_idx = [conf1[i]>=0.5 or conf0[i]>=0.5 for i in range(len(conf1))]

# Using both conf1 and conf0 would select too many features,
# which makes the algorithm too slow to finish,
# so use only conf1 to select a small subset of the features.
#x_idx = [conf1[i]>=0.5 for i in range(len(conf1))]
#x_idx[0] = True # in the CORELS paper, gender is an important feature, so I add it manually
#x_idx

In [8]:
#select out these features
#x = x_all[:,x_idx]

In [9]:
# Manually select 5 features, according to the CORELS paper when lambda=0.01.
# Alternative choice (disabled):
## sex:Female, age:18-20, age:21-22, juvenile-crimes:=0, priors:>3
##x_idx = [0,1,2,8,12]

# Active choice: sex:Female, age:18-20, age:21-22, priors:2-3, priors:>3
x_idx = [0, 1, 2, 9, 12]
x = x_all[:, x_idx]  # keep only the selected columns of x_all
x

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 1, 0, 1],
       ...,
       [0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0],
       [1, 0, 1, 1, 0]])

In [10]:
# Manually select 6 features, according to the CORELS paper when lambda=0.01:
# sex:Female, age:18-20, age:21-22, juvenile-crimes:=0, priors:2-3, priors:>3
x_idx6 = [0, 1, 2, 8, 9, 12]
x6 = x_all[:, x_idx6]  # keep only the selected columns of x_all
x6

array([[0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 1],
       ...,
       [0, 0, 0, 1, 0, 0],
       [1, 0, 0, 1, 1, 0],
       [1, 0, 1, 1, 1, 0]])

In [11]:
# Problem size: ndata is the number of labels in y, nrule the number of
# columns in the selected feature matrix x.
ndata = len(y)
nrule = x.shape[1]

In [12]:
# Fit a default-parameter decision tree on the selected features; it is used
# below to compute R_c.
# random_state is pinned because scikit-learn randomly permutes the features at
# each split, so ties between equally good splits could otherwise produce a
# different tree (and node count) from run to run.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(x, y)

In [13]:
# Regularised training risk of the fitted tree: (1 - training accuracy) plus a
# penalty of 0.001 per leaf.
# NOTE(review): the bbound_* calls in this notebook use lamb=0.0035, not 0.001 —
# confirm the penalty here is intended if R_c is compared with their Objective.
accu = clf.score(x, y)
# Leaf count derived from the node count as (nodes + 1) / 2, which holds when
# every internal node has exactly two children.
nleaves = (1 + clf.tree_.node_count) / 2
R_c = (1 - accu) + nleaves * 0.001
R_c

0.3520089764007529

In [14]:
print(clf.tree_.node_count) # total number of nodes in the fitted tree (the value nleaves is derived from)

35


## With similar support bound

### calculate the similar support bound when highly correlated features substitute for each other

### regular expression is used

In [16]:
from corels_dt_nosimilar import bbound_nosimilar
from corels_dt_similar import bbound_similar
from corels_dt_similar_when_big import bbound_similar_when_big

from corels_dt_similar_when_sub import bbound_similar_when_sub

In [18]:
# All bounds, all data rows, but only the first 3 of the selected feature
# columns (x[:, :3]); the recorded output below reports nrule: 3.
# NOTE(review): z is not assigned in any cell visible in this notebook —
# confirm it is defined before this cell runs on a fresh kernel.
bbound_similar_when_sub(
    x[:, :3], y, z,
    lamb=0.0035,
    corr_threshold=0.15,
    prior_metric="objective",
    MAXDEPTH=4,
)

nrule: 3
ndata: 6907
the rank of x's columns:  [0, 1, 2]
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  0.635047197341919
lambda:  0.0035
leaves:  [(1,), (-1, 2), (-3, -2, -1), (-2, -1, 3)]
prediction:  [0, 1, 0, 1]
Objective:  0.4442881135080353
COUNT of the best tree:  96
time when the best tree is achieved:  0.6119098663330078
TOTAL COUNT:  121


<corels_dt_similar_when_sub.CacheTree at 0x7ff6ecfa0898>

In [19]:
# Profile the same 3-column run; the command string is unchanged, only the
# outer quotes are single so the inner double quotes need no escaping.
cProfile.run('bbound_similar_when_sub(x[:,:3], y, z, lamb=0.0035, corr_threshold=0.15,prior_metric="objective", MAXDEPTH = 4)')

nrule: 3
ndata: 6907
the rank of x's columns:  [0, 1, 2]
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  1.0165972709655762
lambda:  0.0035
leaves:  [(1,), (-1, 2), (-3, -2, -1), (-2, -1, 3)]
prediction:  [0, 1, 0, 1]
Objective:  0.4442881135080353
COUNT of the best tree:  96
time when the best tree is achieved:  0.8459556102752686
TOTAL COUNT:  121
         79010 function calls (78697 primitive calls) in 1.112 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        7    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:997(_handle_fromlist)
        1    0.000    0.000    1.112    1.112 <string>:1(<module>)
        7    0.000    0.000    0.000    0.000 __init__.py:200(iteritems)
        1    0.000    0.000    0.000    0.000 _bootlocale.py:23(getpreferredencoding)
        1    0.000    0.000    0.001    0.001 _decorators.

        7    0.000    0.000    0.000    0.000 internals.py:3532(_consolidate_check)
        7    0.000    0.000    0.000    0.000 internals.py:3533(<listcomp>)
        1    0.000    0.000    0.001    0.001 internals.py:3580(get_numeric_data)
        1    0.000    0.000    0.000    0.000 internals.py:3588(<listcomp>)
        1    0.000    0.000    0.001    0.001 internals.py:3590(combine)
        1    0.000    0.000    0.000    0.000 internals.py:3596(<listcomp>)
        2    0.000    0.000    0.000    0.000 internals.py:3666(as_matrix)
        1    0.000    0.000    0.000    0.000 internals.py:373(fillna)
        3    0.000    0.000    0.000    0.000 internals.py:3813(consolidate)
       10    0.000    0.000    0.000    0.000 internals.py:3829(_consolidate_inplace)
        3    0.000    0.000    0.000    0.000 internals.py:3836(get)
        3    0.000    0.000    0.000    0.000 internals.py:3865(iget)
        5    0.000    0.000    0.000    0.000 internals.py:4363(__init__)
       35  

In [23]:
# Profile bbound_similar on all 5 selected columns; the command string is
# unchanged, only the outer quotes are single to avoid escaping.
cProfile.run('bbound_similar(x, y, z, lamb=0.0035, prior_metric="objective", MAXDEPTH = 4)')

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
         157979259 function calls (157979258 primitive calls) in 755.301 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        3    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:997(_handle_fromlist)
        1    0.000    0.000  755.301  755.301 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 __init__.py:200(iteritems)
        1    0.000    0.000    0.000    0.000 _weakrefset.py:70(__contains__)
        1    0.000    0.000    0.000    0.000 abc.py:180(__instancecheck__)
        1    0.000    0.000    0.000    0.000 algorithms.py:217(_get_data_algo)
        1    0.000    0.000    0.000    0.000 algorithms.py:39(_ensure_data)
        1    0.000    0.000    0.000    0.000 algorithms.py:680(rank)
        1    0.000    0.000    0.000    0.000 base.py:4155(_ensure_index)
        1    0.000    0.000    0.000    0.000 base.py:551(_r

KeyboardInterrupt: 

In [22]:
# Profile bbound_nosimilar on all 5 selected columns; the command string is
# unchanged, only the outer quotes are single to avoid escaping.
cProfile.run('bbound_nosimilar(x, y, z, lamb=0.0035, prior_metric="objective", MAXDEPTH = 4)')

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
COUNT: 100000
COUNT: 200000
COUNT: 300000
COUNT: 400000
COUNT: 500000
COUNT: 600000
COUNT: 700000
COUNT: 800000
COUNT: 900000
COUNT: 1000000
COUNT: 1100000
COUNT: 1200000
COUNT: 1300000
COUNT: 1400000
COUNT: 1500000
COUNT: 1600000
COUNT: 1700000
COUNT: 1800000
COUNT: 1900000
COUNT: 2000000
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  1477.3101377487183
lambda:  0.0035
leaves:  [(3,), (-3, 1), (-4, -3, -1), (-5, -3, -1, 4), (-3, -1, 4, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  499510
time when the best tree is achieved:  358.1839289665222
TOTAL COUNT:  2053987
         265200784 function calls (265200783 primitive calls) in 1477.579 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        3    0.000    0.000    0.000    0.00

      206    0.001    0.000    0.001    0.000 {method 'reshape' of 'numpy.ndarray' objects}
      617    1.612    0.003    1.612    0.003 {method 'sub' of '_sre.SRE_Pattern' objects}
        1    0.000    0.000    0.000    0.000 {method 'tolist' of 'numpy.ndarray' objects}
        1    0.000    0.000    0.000    0.000 {method 'update' of 'dict' objects}
        2    0.000    0.000    0.000    0.000 {method 'write' of '_io.TextIOWrapper' objects}
        1    0.000    0.000    0.000    0.000 {pandas._libs.algos.rank_1d_float64}
        1    0.000    0.000    0.000    0.000 {pandas._libs.lib.maybe_convert_objects}




In [17]:
# Unprofiled bbound_nosimilar run on the 5 selected columns.
bbound_nosimilar(
    x, y, z,
    lamb=0.0035,
    prior_metric="objective",
    MAXDEPTH=4,
)

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
COUNT: 100000
COUNT: 200000
COUNT: 300000
COUNT: 400000
COUNT: 500000
COUNT: 600000
COUNT: 700000
COUNT: 800000
COUNT: 900000
COUNT: 1000000
COUNT: 1100000
COUNT: 1200000
COUNT: 1300000
COUNT: 1400000
COUNT: 1500000
COUNT: 1600000
COUNT: 1700000
COUNT: 1800000
COUNT: 1900000
COUNT: 2000000
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  278.02116656303406
lambda:  0.0035
leaves:  [(3,), (-3, 1), (-4, -3, -1), (-5, -3, -1, 4), (-3, -1, 4, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  499510
time when the best tree is achieved:  69.548579454422
TOTAL COUNT:  2053987


<corels_dt_nosimilar.CacheTree at 0x7f3d7824f550>

In [18]:
# Unprofiled bbound_similar run on the 5 selected columns.
bbound_similar(
    x, y, z,
    lamb=0.0035,
    prior_metric="objective",
    MAXDEPTH=4,
)

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
COUNT: 100000
COUNT: 200000
COUNT: 300000
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  7529.429073572159
lambda:  0.0035
leaves:  [(3,), (-3, 1), (-4, -3, -1), (-5, -3, -1, 4), (-3, -1, 4, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  281332
time when the best tree is achieved:  3372.203535079956
TOTAL COUNT:  395261


<corels_dt_similar.CacheTree at 0x7f3d745e7160>

In [17]:
# bbound_similar_when_big run on the 5 selected columns, same settings as the
# other 5-column runs.
bbound_similar_when_big(
    x, y, z,
    lamb=0.0035,
    prior_metric="objective",
    MAXDEPTH=4,
)

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
COUNT: 100000
COUNT: 200000
COUNT: 300000
COUNT: 400000
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  9282.356282234192
lambda:  0.0035
leaves:  [(3,), (-3, 1), (-4, -3, -1), (-5, -3, -1, 4), (-3, -1, 4, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  296389
time when the best tree is achieved:  3656.3707978725433
TOTAL COUNT:  418928


<corels_dt_similar_when_big.CacheTree at 0x7f465a34dc88>

In [17]:
# Variant run: the deadprefix list is reversed.
# (The call is the same as the other bbound_similar cells; presumably the
# change was made inside corels_dt_similar — TODO confirm.)
bbound_similar(
    x, y, z,
    lamb=0.0035,
    prior_metric="objective",
    MAXDEPTH=4,
)

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
COUNT: 100000
COUNT: 200000
COUNT: 300000
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  6817.165247440338
lambda:  0.0035
leaves:  [(3,), (-3, 1), (-4, -3, -1), (-5, -3, -1, 4), (-3, -1, 4, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  281332
time when the best tree is achieved:  2978.3366239070892
TOTAL COUNT:  395261


<corels_dt_similar.CacheTree at 0x7f0cc2290a20>

In [17]:
# Variant run: the deadprefix list is reversed, and an element is dropped from
# deadprefix when it has just been used by the similar support bound.
# Slow: more than 4 hours (the recorded run ends in KeyboardInterrupt).
bbound_similar(
    x, y, z,
    lamb=0.0035,
    prior_metric="objective",
    MAXDEPTH=4,
)

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
COUNT: 100000
COUNT: 200000
COUNT: 300000
COUNT: 400000
COUNT: 500000
COUNT: 600000
COUNT: 700000
COUNT: 800000


KeyboardInterrupt: 

In [17]:
# Variant run: deadprefix is a priority queue ordered by tree.risk.
bbound_similar(
    x, y, z,
    lamb=0.0035,
    prior_metric="objective",
    MAXDEPTH=4,
)

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
COUNT: 100000
COUNT: 200000
COUNT: 300000
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  13787.366845369339
lambda:  0.0035
leaves:  [(3,), (-3, 1), (-4, -3, -1), (-5, -3, -1, 4), (-3, -1, 4, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  281332
time when the best tree is achieved:  6265.260919570923
TOTAL COUNT:  395261


<corels_dt_similar.CacheTree at 0x7f860588c7b8>