In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
import pandas as pd
import numpy as np
import heapq
import math
import time

import gmpy2
from gmpy2 import mpz
import re

from sklearn import tree

import cProfile

In [3]:
# Read in the dataset. pd.read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapper is needed.
df = pd.read_csv('../data/compas-binary.csv')

In [4]:
# Convert the frame to a NumPy array once. DataFrame.as_matrix() is deprecated
# and was removed in pandas 1.0; the .values attribute is available on both
# old and new pandas versions.
data = df.values

# Columns 0..12 are used as the feature matrix, column 13 as the label vector.
x_all = data[:, :13]
y = data[:, 13]

In [5]:
# Association Rule Mining (Only one feature)

#support
#supp = [(x[:,i]*y).mean() for i in range(13)]
#supp

In [6]:
#confidence
#conf1 = [sum(x_all[:,i]*y)/sum(x_all[:,i]) for i in range(13)]
#conf1

In [7]:
#confidence
#conf0 = [sum((x_all[:,i]==0)*y)/sum((x_all[:,i]==0)) for i in range(13)]
#conf0

In [8]:
#x_idx = [conf1[i]>=0.5 or conf0[i]>=0.5 for i in range(len(conf1))]

# Using both conf1 and conf0 would select too many features,
# which makes it hard for the algorithm to finish running,
# so only conf1 is used to select a small subset of the features.
#x_idx = [conf1[i]>=0.5 for i in range(len(conf1))]
#x_idx[0] = True # in the CORELS paper, gender is an important feature, so I add it manually
#x_idx

In [9]:
#select out these features
#x = x_all[:,x_idx]

In [10]:
# Manually select 5 features, according to the CORELS paper when lambda=0.01
## sex:Female, age:18-20,age:21-22, juvenile-crimes:=0, priors:>3
##x_idx = [0,1,2,8,12]

# sex:Female, age:18-20,age:21-22, priors:2-3, priors:>3
x_idx = [0,1,2,9,12]
x = x_all[:,x_idx]
x

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 1, 0, 1],
       ...,
       [0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0],
       [1, 0, 1, 1, 0]])

In [11]:
# Manually select 6 features, according to the CORELS paper when lambda=0.01
# sex:Female, age:18-20,age:21-22, juvenile-crimes:=0, priors:2-3, priors:>3
x_idx6 = [0,1,2,8,9,12]
x6 = x_all[:,x_idx6]
x6

array([[0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 1],
       ...,
       [0, 0, 0, 1, 0, 0],
       [1, 0, 0, 1, 1, 0],
       [1, 0, 1, 1, 1, 0]])

In [12]:
# Dimensions of the 5-feature problem: number of columns in x and number of
# labels in y. Neither name is used again in the visible cells of this notebook.
nrule = x.shape[1]
ndata = len(y)

In [13]:
# Baseline: an unconstrained scikit-learn decision tree on the 5 selected features.
# random_state is fixed because scikit-learn randomly permutes the features at
# each split; when several splits improve the criterion equally, the chosen
# split (and therefore the resulting tree) can otherwise change between runs.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(x, y)

In [14]:
# Regularised risk of the decision-tree baseline: training misclassification
# rate plus a penalty of 0.001 per leaf.
# The leaf count is derived from the node count as (nodes + 1) / 2.
# NOTE(review): the per-leaf penalty used here (0.001) differs from the lamb
# values (0.0035 and 0.01) passed to the bbound_* runs -- confirm this is intended.
nleaves = (clf.tree_.node_count + 1) / 2
accu = clf.score(x, y)
R_c = (1 - accu) + nleaves * 0.001
R_c

0.3520089764007529

In [15]:
print(clf.tree_.node_count) # total number of nodes in the fitted tree (the leaf count above is derived from it)

35


## With similar support bound

### Calculate the similar support bound when highly correlated features substitute for each other

### regular expression is used

In [16]:
from corels_dt_nosimilar import bbound_nosimilar
from corels_dt_similar import bbound_similar
from corels_dt_similar_when_big import bbound_similar_when_big

from corels_dt_similar_when_sub import bbound_similar_when_sub

In [19]:
# Search on the 5-feature matrix `x` with lamb=0.0035 and MAXDEPTH=4, using the
# implementation from corels_dt_nosimilar (presumably the variant without the
# similar support bound -- confirm in that module).
# The call's return value (a CacheTree) is displayed as the cell output.
bbound_nosimilar(x, y, lamb=0.0035, prior_metric="objective", MAXDEPTH = 4)

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  5.075555086135864
lambda:  0.0035
leaves:  [(3,), (-3, 1), (-5, -3, -1), (-4, -3, -1, 5), (-3, -1, 4, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  2927
time when the best tree is achieved:  2.8839590549468994
TOTAL COUNT:  12764


<corels_dt_nosimilar.CacheTree at 0x7f496b036b00>

In [20]:
## with incremental
# Same data and parameters as the bbound_nosimilar run on `x` at MAXDEPTH=4,
# but using the implementation from corels_dt_similar (presumably the variant
# with the similar support bound -- confirm in that module).
# In the recorded output this run reports a smaller TOTAL COUNT (9157 vs 12764)
# but a longer total time (about 12.6 s vs 5.1 s), with the same objective.
bbound_similar(x, y, lamb=0.0035, prior_metric="objective", MAXDEPTH = 4)

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  12.5983304977417
lambda:  0.0035
leaves:  [(3,), (-3, 1), (-5, -3, -1), (-4, -3, -1, 5), (-3, -1, 4, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  2916
time when the best tree is achieved:  3.1878578662872314
TOTAL COUNT:  9157


<corels_dt_similar.CacheTree at 0x7f496b051ac8>

In [21]:
# Profile the 5-feature, MAXDEPTH=4 run of bbound_nosimilar. The statement is
# passed to cProfile as a string; single quotes on the outside avoid having to
# escape the double quotes around "objective".
cProfile.run('bbound_nosimilar(x, y, lamb=0.0035, prior_metric="objective", MAXDEPTH = 4)')

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  10.082534074783325
lambda:  0.0035
leaves:  [(3,), (-3, 1), (-5, -3, -1), (-4, -3, -1, 5), (-3, -1, 4, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  2927
time when the best tree is achieved:  3.949885606765747
TOTAL COUNT:  12764
         1033248 function calls (1033246 primitive calls) in 10.233 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        5    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:997(_handle_fromlist)
        1    0.001    0.001   10.233   10.233 <string>:1(<module>)
        3    0.000    0.000    0.000    0.000 __init__.py:200(iteritems)
        1    0.000    0.000    0.000    0.000 _bootlocale.py:23(getpreferredencoding)
    29392    0.107   

In [22]:
# Profile the 5-feature, MAXDEPTH=4 run of bbound_similar. The statement is
# passed to cProfile as a string; single quotes on the outside avoid having to
# escape the double quotes around "objective".
cProfile.run('bbound_similar(x, y, lamb=0.0035, prior_metric="objective", MAXDEPTH = 4)')

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  53.315651416778564
lambda:  0.0035
leaves:  [(3,), (-3, 1), (-5, -3, -1), (-4, -3, -1, 5), (-3, -1, 4, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  2916
time when the best tree is achieved:  5.6540985107421875
TOTAL COUNT:  9157
         9280221 function calls (9280219 primitive calls) in 53.476 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        5    0.000    0.000    0.000    0.000 <frozen importlib._bootstrap>:997(_handle_fromlist)
        1    0.001    0.001   53.476   53.476 <string>:1(<module>)
        3    0.000    0.000    0.000    0.000 __init__.py:200(iteritems)
        1    0.000    0.000    0.000    0.000 _bootlocale.py:23(getpreferredencoding)
    19345    0.071   

        3    0.000    0.000    0.000    0.000 {method 'join' of 'str' objects}
     9158    0.023    0.000    0.023    0.000 {method 'pop' of 'list' objects}
    19346    0.172    0.000    0.172    0.000 {method 'reduce' of 'numpy.ufunc' objects}
      194    0.001    0.000    0.001    0.000 {method 'reshape' of 'numpy.ndarray' objects}
        1    0.000    0.000    0.000    0.000 {method 'search' of '_sre.SRE_Pattern' objects}
      578    1.506    0.003    1.506    0.003 {method 'sub' of '_sre.SRE_Pattern' objects}
    19345    0.093    0.000    0.337    0.000 {method 'sum' of 'numpy.ndarray' objects}
        1    0.000    0.000    0.000    0.000 {method 'tolist' of 'numpy.ndarray' objects}
        1    0.000    0.000    0.000    0.000 {method 'update' of 'dict' objects}
        2    0.000    0.000    0.000    0.000 {method 'write' of '_io.TextIOWrapper' objects}
        1    0.000    0.000    0.000    0.000 {pandas._libs.algos.rank_1d_float64}
        2    0.001    0.001    0.001  

In [23]:
# Same 5-feature search with bbound_nosimilar and lamb=0.0035, with MAXDEPTH
# raised from 4 to 5. The recorded output reports the same leaves and objective
# as the MAXDEPTH=4 run.
bbound_nosimilar(x, y, lamb=0.0035, prior_metric="objective", MAXDEPTH = 5)

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  11.018238544464111
lambda:  0.0035
leaves:  [(3,), (-3, 1), (-5, -3, -1), (-4, -3, -1, 5), (-3, -1, 4, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  14826
time when the best tree is achieved:  4.4402501583099365
TOTAL COUNT:  58505


<corels_dt_nosimilar.CacheTree at 0x7f496b060d68>

In [24]:
# Same 5-feature search with bbound_similar and lamb=0.0035, with MAXDEPTH
# raised from 4 to 5. In the recorded output this run takes about 320 s,
# versus about 11 s for bbound_nosimilar with the same arguments.
bbound_similar(x, y, lamb=0.0035, prior_metric="objective", MAXDEPTH = 5)

nrule: 5
ndata: 6907
the rank of x's columns:  [4, 0, 1, 2, 3]
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  319.85523533821106
lambda:  0.0035
leaves:  [(3,), (-3, 1), (-5, -3, -1), (-4, -3, -1, 5), (-3, -1, 4, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  14056
time when the best tree is achieved:  16.275825262069702
TOTAL COUNT:  48117


<corels_dt_similar.CacheTree at 0x7f496af9d5f8>

In [17]:
# Search on the 6-feature matrix `x6` (adds column 8, juvenile-crimes:=0, to
# the 5-feature selection) with lamb=0.0035 and MAXDEPTH=4, using bbound_nosimilar.
bbound_nosimilar(x6, y, lamb=0.0035, prior_metric="objective", MAXDEPTH = 4)

nrule: 6
ndata: 6907
the rank of x's columns:  [5, 0, 4, 1, 2, 3]
COUNT: 100000
COUNT: 200000
COUNT: 300000
COUNT: 400000
COUNT: 500000
COUNT: 600000
COUNT: 700000
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  82.06066346168518
lambda:  0.0035
leaves:  [(4,), (-4, 1), (-4, -3, -1), (-5, -4, -1, 3), (-4, -1, 3, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  288521
time when the best tree is achieved:  37.849639892578125
TOTAL COUNT:  714461


<corels_dt_nosimilar.CacheTree at 0x7f11d7455ef0>

In [18]:
# Same 6-feature search (lamb=0.0035, MAXDEPTH=4) using bbound_similar.
# In the recorded output this run takes about 1387 s, versus about 82 s for
# bbound_nosimilar with the same arguments, and reaches the same objective.
bbound_similar(x6, y, lamb=0.0035, prior_metric="objective", MAXDEPTH = 4)

nrule: 6
ndata: 6907
the rank of x's columns:  [5, 0, 4, 1, 2, 3]
COUNT: 100000
COUNT: 200000
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  1387.365823507309
lambda:  0.0035
leaves:  [(4,), (-4, 1), (-4, -3, -1), (-5, -4, -1, 3), (-4, -1, 3, 5)]
prediction:  [1, 1, 0, 0, 1]
Objective:  0.3541150282322281
COUNT of the best tree:  196307
time when the best tree is achieved:  518.8560793399811
TOTAL COUNT:  281145


<corels_dt_similar.CacheTree at 0x7f11d7171630>

In [19]:
# 6-feature search with bbound_nosimilar at a larger regularisation value
# (lamb=0.01 instead of 0.0035) and MAXDEPTH=5. The recorded output reports a
# 3-leaf tree for this setting.
bbound_nosimilar(x6, y, lamb=0.01, prior_metric="objective", MAXDEPTH = 5)

nrule: 6
ndata: 6907
the rank of x's columns:  [5, 0, 4, 1, 2, 3]
COUNT: 100000
COUNT: 200000
COUNT: 300000
COUNT: 400000
COUNT: 500000
COUNT: 600000
COUNT: 700000
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  104.39202117919922
lambda:  0.01
leaves:  [(1,), (-6, -1), (-1, 6)]
prediction:  [1, 1, 0]
Objective:  0.3748675256985667
COUNT of the best tree:  11
time when the best tree is achieved:  0.5258162021636963
TOTAL COUNT:  754374


<corels_dt_nosimilar.CacheTree at 0x7f11d7fe2518>

In [20]:
# Same 6-feature, lamb=0.01 search with bbound_nosimilar, with MAXDEPTH raised
# from 5 to 6. The recorded output reports the same 3-leaf tree and objective
# as the MAXDEPTH=5 run.
bbound_nosimilar(x6, y, lamb=0.01, prior_metric="objective", MAXDEPTH = 6)

nrule: 6
ndata: 6907
the rank of x's columns:  [5, 0, 4, 1, 2, 3]
COUNT: 100000
COUNT: 200000
COUNT: 300000
COUNT: 400000
COUNT: 500000
COUNT: 600000
COUNT: 700000
COUNT: 800000
COUNT: 900000
COUNT: 1000000
COUNT: 1100000
COUNT: 1200000
COUNT: 1300000
>>> log: False
>>> support bound: True
>>> accurate support bound: True
>>> equiv points bound: True
>>> lookahead bound: True
total time:  206.15556597709656
lambda:  0.01
leaves:  [(1,), (-6, -1), (-1, 6)]
prediction:  [1, 1, 0]
Objective:  0.3748675256985667
COUNT of the best tree:  11
time when the best tree is achieved:  0.49757909774780273
TOTAL COUNT:  1336065


<corels_dt_nosimilar.CacheTree at 0x7f11d4dd66a0>

In [None]:
# 6-feature search with bbound_similar at MAXDEPTH=6. This cell has no
# execution count or recorded output in the saved notebook.
# NOTE(review): the bbound_nosimilar runs at MAXDEPTH 5 and 6 on x6 use
# lamb=0.01, while this call uses lamb=0.0035 -- confirm which value is
# intended if the results are meant to be compared.
bbound_similar(x6, y, lamb=0.0035, prior_metric="objective", MAXDEPTH = 6)