In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
import pandas as pd
import numpy as np
import heapq
import math
import time

import gmpy2
from gmpy2 import mpz
import re

In [3]:
# Load the compas-binary CSV; read_csv already returns a DataFrame.
df = pd.read_csv('../data/compas-binary.csv')

In [4]:
# Convert the frame to a NumPy array once.  `.values` replaces
# `DataFrame.as_matrix()`, which was deprecated and later removed from pandas.
data_matrix = df.values

# Columns 0..12 are the candidate features, column 13 is the label.
x_all = data_matrix[:, :13]

y = data_matrix[:, 13]

In [5]:
# Association Rule Mining (Only one feature)

#support
#supp = [(x[:,i]*y).mean() for i in range(13)]
#supp

In [6]:
#confidence
#conf1 = [sum(x_all[:,i]*y)/sum(x_all[:,i]) for i in range(13)]
#conf1

In [7]:
#confidence
#conf0 = [sum((x_all[:,i]==0)*y)/sum((x_all[:,i]==0)) for i in range(13)]
#conf0

In [8]:
#x_idx = [conf1[i]>=0.5 or conf0[i]>=0.5 for i in range(len(conf1))]

# Using both conf1 and conf0 would select too many features, which makes it
# hard for the algorithm to finish running, so only conf1 is used to select
# a small fraction of the features.
#x_idx = [conf1[i]>=0.5 for i in range(len(conf1))]
#x_idx[0] = True # in the CORELS paper, gender is an important feature, so I add it manually
#x_idx

In [9]:
#select out these features
#x = x_all[:,x_idx]

In [10]:
# Manually select 5 features, according to the CORELS paper when lambda=0.01.
# Alternative selection (commented out):
## sex:Female, age:18-20, age:21-22, juvenile-crimes:=0, priors:>3
##x_idx = [0,1,2,8,12]

# Selection in use:
# sex:Female, age:18-20, age:21-22, priors:2-3, priors:>3
x_idx = [0, 1, 2, 9, 12]
x = x_all[:, x_idx]
x

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 1, 0, 1],
       ..., 
       [0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0],
       [1, 0, 1, 1, 0]])

In [11]:
# Manually select 6 features, according to the CORELS paper when lambda=0.01:
# sex:Female, age:18-20, age:21-22, juvenile-crimes:=0, priors:2-3, priors:>3
x_idx6 = [0, 1, 2, 8, 9, 12]
x6 = x_all[:, x_idx6]
x6

array([[0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 1],
       ..., 
       [0, 0, 0, 1, 0, 0],
       [1, 0, 0, 1, 1, 0],
       [1, 0, 1, 1, 1, 0]])

In [12]:
# Problem size: number of samples (labels) and number of selected features.
ndata = len(y)
nrule = x.shape[1]

In [13]:
"""
calculate z, which is for the equivalent points bound
z is the vector defined in algorithm 5 of the CORELS paper
z is a binary vector indicating the data with a minority lable in its equivalent set
"""
z = pd.DataFrame([-1]*ndata).as_matrix()
# enumerate through theses samples
for i in range(ndata):
    #if z[i,0]==-1, this sample i has not been put into its equivalent set
    if z[i,0] == -1:
        tag1 = np.array([True]*ndata)
        for j in range(nrule):
            rule_label = x[i][j]
            #tag1 indicates which samples have exactly the same features with sample i
            tag1 = (x[:,j] == rule_label)*tag1
            
        y_l = y[tag1]
        pred = int(y_l.sum()/len(y_l) >= 0.5)
        #tag2 indicates the samples in a equiv set which have the minority label
        tag2 = (y_l != pred)
        z[tag1,0] = tag2

In [14]:
from corels_dt import bbound

# All rows, the 5 selected feature columns in `x`, and the precomputed `z`.
bbound(
    x, y, z,
    lamb=0.001,
    prior_metric="gini",
    MAXDEPTH=4,
)

nrule: 5
ndata: 6907


KeyboardInterrupt: 

#### compas-binary

In [None]:
import cProfile

# Profile a single bbound run with prior_metric="entropy".
# NOTE(review): the executed bbound call earlier in this notebook passes `z`
# as the third positional argument; this call omits it -- confirm that
# bbound's signature allows that.
cProfile.run('bbound(x[:,:], y, lamb=0.01, prior_metric="entropy")')

In [None]:
# Profile a single bbound run with prior_metric="gini" (needs the earlier
# `import cProfile` cell to have been run).
# NOTE(review): `z` is omitted here although the executed bbound call earlier
# in this notebook passes it -- confirm that bbound's signature allows that.
cProfile.run('bbound(x[:,:], y, lamb=0.01, prior_metric="gini")')

In [None]:
import memory_profiler

In [None]:
%%time
#all data

bbound(x, y, lamb=0.01, prior_metric="curiosity")

In [None]:
%%time
#all data, 5 features

bbound(x, y, lamb=0.01, prior_metric="bound")

In [None]:
%%time
#all data, 5 features

bbound(x, y, lamb=0.01, prior_metric="entropy")

In [None]:
%%time
#all data, 5 features

bbound(x, y, lamb=0.005, prior_metric="gini")

In [None]:
%%time
#all data, 5 features

bbound(x, y, lamb=0.005, prior_metric="gini", MAXDEPTH = 5)

In [None]:
%%time
#all data, 5 features

bbound(x, y, lamb=0.0005, prior_metric="gini", MAXDEPTH = 5)

In [None]:
%%time
#all data, 6 features

bbound(x6, y, lamb=0.01, prior_metric="entropy")

In [None]:
%%time
#all data, 6 features

bbound(x6, y, lamb=0.01, prior_metric="gini")