# Analysis of r $Q_{max}$ 

In this experiment we use $Q^*_{max}$ to understand the effect of $\alpha$ on the selection of subinstances. 

Experiment settings: 

 *  $Q^*$ is the oracle 
 *  The oracle is a logistic regression $L_1$ regularized with $C=0.3$ for IMDB and $C=0.01$ for SRAA 
 *  $Q_{max}$ is the confidence of the oracle for a subinstance of  size k: 
     - $Q_k = \max_y Q^*(y|x_i^k)$
 
 

In [2]:
%matplotlib inline

import sys
import os
sys.path.append(os.path.abspath('C:/cygwin/home/mramire8/python_code/sr/active'))
sys.path.append(os.path.abspath('/Users/maru/MyCode/sr/active'))

from sklearn import linear_model

from datautil.textutils import StemTokenizer
from datautil.load_data import *

import numpy as np
from time import time
import pickle

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np

mpl.style.use('bmh')

## Get the data ready
imdb_path = 'C:/Users/mramire8/Documents/Research/Oracle confidence and Interruption/dataset/aclImdb/raw-data'
# imdb_path = '/Users/maru/MyCode/data/imdb'

categories = [['alt.atheism', 'talk.religion.misc'],
              ['comp.graphics', 'comp.windows.x'],
              ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
              ['rec.sport.baseball', 'sci.crypt']]

vct = CountVectorizer(encoding='latin-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                      token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())



In [None]:

dataset = load_dataset("imdb", 100, categories[0], vct, 100, raw=True,  percent=.5, keep_subject=True)
sraa = load_dataset("aviation", 100, categories[0], vct2, 100, raw=True,  percent=.5, keep_subject=True)

# data = load_dataset('imdb', None, categories[0], vct, 100, percent=.5, keep_subject=True)


In [115]:
print("Data size %s" % len(dataset.train.data))
print

kvalues = [10, 25, 50, 75, 100]
cost = np.array([5.7, 8.2, 10.9, 15.9, 16.7])

threshold = .4

def compute_q_max(data_name, dataset, vct, penalty):
    n = len(dataset.train.data)
    print "Data size: %s" %  n
    q_max = np.zeros((5,n))

    for ki, fixk in enumerate(kvalues):

        fixk_saved = "{0}{1}.p".format(data_name, fixk)

        data = process_data(dataset, fixk, 100, vct, silent=True)

        train_x = data.test.bow
        train_y = data.test.target

        test_x = data.train.bowk
        test_y = data.train.target

        print "*"*60
        print
        print("K= %s" % fixk)
        

        clf = linear_model.LogisticRegression(penalty='l1', C=penalty)
        print "penalty: %s" % penalty 
        clf.fit(train_x, train_y)


        prob_y = clf.predict_proba(test_x)

        q = prob_y.max(axis=1)

        unc = prob_y.min(axis=1)

        print "N: %s" % len(test_y)
        print "q: %s" % q.shape

        print "Ave. unc: %s" % unc.mean()
        print "Ave. qmax: %s" % q.mean()
        
        q_max[ki] = q
        
    return q_max



Data size 22267



In [91]:

q_max = compute_q_max('imdb', dataset, vct, 0.3)


Data size 22267

Total Documents: 22267
Minimum size: 100
Fix k: 10
Docs left: 22267
Vectorizing ...
************************************************************

K= 10
penalty: 0.3
Ave. unc: 0.383709611847
Ave. qmax: 0.616290388153
Total Documents: 22267
Minimum size: 100
Fix k: 25
Docs left: 22267
Vectorizing ...
************************************************************

K= 25
penalty: 0.3
Ave. unc: 0.315394757857
Ave. qmax: 0.684605242143
Total Documents: 22267
Minimum size: 100
Fix k: 50
Docs left: 22267
Vectorizing ...
************************************************************

K= 50
penalty: 0.3
Ave. unc: 0.25426018805
Ave. qmax: 0.74573981195
Total Documents: 22267
Minimum size: 100
Fix k: 75
Docs left: 22267
Vectorizing ...
************************************************************

K= 75
penalty: 0.3
Ave. unc: 0.214129386017
Ave. qmax: 0.785870613983
Total Documents: 22267
Minimum size: 100
Fix k: 100
Docs left: 22267
Vectorizing ...
************************************

In [93]:
q_max_sel = q_max.argmax(axis=0)
print q_max_sel.shape

(22267L,)


In [16]:
for i in range(5):
    print "\t".join(["{:.3f}".format(a) for a in q_max[i][:10]])

0.731	0.511	0.659	0.505	0.889	0.516	0.603	0.760	0.804	0.631
0.765	0.582	0.623	0.541	0.922	0.655	0.593	0.953	0.880	0.747
0.719	0.726	0.691	0.529	0.934	0.862	0.764	0.988	0.890	0.818
0.798	0.590	0.756	0.652	0.898	0.777	0.718	0.999	0.930	0.770
0.676	0.563	0.779	0.635	0.677	0.522	0.650	0.999	0.981	0.588


In [94]:
# Percentage of k-word subinstances where Q_max = Q_k
def print_q_max(q_max_sel):
    for i in range(5):
    #     print "k=%s max=%s" % (kvalues[i], 1. * sum((q_max_sel == 0) [(1-q_max[1]) < .4]) / len(q_max_sel) * 100)
        print "k=%s\tcount=%s\tmax=%.4f" % (kvalues[i],sum(q_max_sel == i), np.mean(q_max_sel == i))

print_q_max(q_max_sel)

k=10	count=623	max=0.0280
k=25	count=1767	max=0.0794
k=50	count=2978	max=0.1337
k=75	count=4540	max=0.2039
k=100	count=12359	max=0.5550


In [221]:
def fn(x, alpha):
    return np.power(x,alpha)

def fn_obj(x, cost, alpha):
    ''' Objective function  x^alpha / cost '''
    return fn(x,alpha) / cost

def compute_alpha(q_max, alpha, cost):
    q_obj = np.zeros(q_max.shape)
    
    # For every row = subinstance k, compute objective function
    for i in range(len(q_obj)):
        q_obj[i] = fn_obj(q_max[i], cost[i], alpha)
        
    return q_obj

def max_alpha(q_max, alpha, cost, epsilon=0.0):
    q = compute_alpha(q_max+epsilon, alpha, cost)
    return q.argmax(axis=0)

def stats_k(q_mx_sl, q_alpha_sl):
    ''' Compares two matrix'''

    stats ={}
    stats['match'] = (np.sum(q_mx_sl == q_alpha_sl),np.mean(q_mx_sl == q_alpha_sl))
    stats['cheaper'] = (np.sum(q_mx_sl > q_alpha_sl),np.mean(q_mx_sl > q_alpha_sl))
    stats['expensive'] = (np.sum(q_mx_sl < q_alpha_sl),np.mean(q_mx_sl < q_alpha_sl))
    
    
    for a in np.unique(q_mx_sl):
        stats['k=%s'%kvalues[a]] = (np.sum((q_mx_sl[q_mx_sl == q_alpha_sl]) == a ),np.mean((q_mx_sl[q_mx_sl == q_alpha_sl]) == a ))
    
    
    return stats

def print_stats(stats):
    for k,v in stats.items():
        if k in ['match']:
            print "{}:\t{}\t{:.4f}".format(k,*v)


# Analysis of $Q_{max}$ for $\alpha$ values on IMDB

In [169]:
alphas = np.array([0,1,5,10,100,200,1000,2000])
print "Alpha \t Count \t Percentage" 
for a in alphas:
    
    st = stats_k(q_max_sel, max_alpha(q_max, a, cost))
    print "%s \t" % a,
    print_stats(st)
    

Alpha 	 Count 	 Percentage
0 	match:	623	0.0280
1 	match:	849	0.0381
5 	match:	11881	0.5336
10 	match:	16543	0.7429
100 	match:	21593	0.9697
200 	match:	21890	0.9831
1000 	match:	22176	0.9959
2000 	match:	19992	0.8978


# Analysis of $Q_{max}$ for $\alpha$ values on SRAA

In [117]:
vct2 = CountVectorizer(encoding='latin-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 3),
                      token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())

sraa = load_dataset("aviation", 100, categories[0], vct2, 100, raw=False,  percent=.5, keep_subject=True)

q_max_sraa = compute_q_max('aviation', sraa, vct2, 0.01)
q_max_sel_sraa = q_max_sraa.argmax(axis=0)
print_q_max(q_max_sel_sraa)


Total Documents: 36609
Minimum size: 100
Fix k: 100
Docs left: 22451
Vectorizing ...
Data size: 22451
Total Documents: 22451
Minimum size: 100
Fix k: 10
Docs left: 22451
Vectorizing ...
************************************************************

K= 10
penalty: 0.01
N: 22451
q: 22451
Ave. unc: 0.284436192237
Ave. qmax: 0.715563807763
Total Documents: 22451
Minimum size: 100
Fix k: 25
Docs left: 22451
Vectorizing ...
************************************************************

K= 25
penalty: 0.01
N: 22451
q: 22451
Ave. unc: 0.248522570855
Ave. qmax: 0.751477429145
Total Documents: 22451
Minimum size: 100
Fix k: 50
Docs left: 22451
Vectorizing ...
************************************************************

K= 50
penalty: 0.01
N: 22451
q: 22451
Ave. unc: 0.194642761251
Ave. qmax: 0.805357238749
Total Documents: 22451
Minimum size: 100
Fix k: 75
Docs left: 22451
Vectorizing ...
************************************************************

K= 75
penalty: 0.01
N: 22451
q: 22451
Ave. unc:

In [142]:
alphas = np.array([0,1,5,10,100,200,1000,1254,2000])
cost_sraa= np.array([5.2, 6.5, 7.6, 9.1, 10.3])

print "Alpha \t Count \t Percentage" 

for a in alphas:
    st = stats_k(q_max_sel_sraa, max_alpha(q_max_sraa, a, cost_sraa))
    print "%s \t" % a,
    print_stats(st)
  

Alpha 	 Count 	 Percentage
0 	match:	2266	0.1009
1 	match:	2983	0.1329
5 	match:	7290	0.3247
10 	match:	8578	0.3821
100 	match:	13173	0.5867
200 	match:	14529	0.6471
1000 	match:	17408	0.7754
1254 	match:	17650	0.7862
2000 	match:	16070	0.7158


In [136]:

def print_stats_all(stats):
    keys = [k for k in sorted(stats.keys(), key= lambda x: x[0][2:])]
    for k in keys:
            v = stats[k]
            print "{}:\t{}\t{:.4f}".format(k,*v)

            
alphas = np.array([0,1,5,10,100,200,1000,1300,2000])
cost_sraa= np.array([5.2, 6.5, 7.6, 9.1, 10.3])

print "Alpha \t Count \t Percentage" 

for a in alphas:
    print 
    st = stats_k(q_max_sel_sraa, max_alpha(q_max_sraa, a, cost_sraa))
    print "== Alpha: %s ==" % a
    print_stats_all(st)

Alpha 	 Count 	 Percentage

== Alpha: 0 ==
k=25:	0	0.0000
k=100:	0	0.0000
cheaper:	20185	0.8991
match:	2266	0.1009
k=50:	0	0.0000
k=75:	0	0.0000
k=10:	2266	1.0000
expensive:	0	0.0000

== Alpha: 1 ==
k=25:	239	0.0801
k=100:	0	0.0000
cheaper:	19468	0.8671
match:	2983	0.1329
k=50:	440	0.1475
k=75:	38	0.0127
k=10:	2266	0.7596
expensive:	0	0.0000

== Alpha: 5 ==
k=25:	348	0.0477
k=100:	2553	0.3502
cheaper:	15161	0.6753
match:	7290	0.3247
k=50:	859	0.1178
k=75:	1264	0.1734
k=10:	2266	0.3108
expensive:	0	0.0000

== Alpha: 10 ==
k=25:	421	0.0491
k=100:	3361	0.3918
cheaper:	13873	0.6179
match:	8578	0.3821
k=50:	990	0.1154
k=75:	1540	0.1795
k=10:	2266	0.2642
expensive:	0	0.0000

== Alpha: 100 ==
k=25:	574	0.0436
k=100:	6064	0.4603
cheaper:	9278	0.4133
match:	13173	0.5867
k=50:	1656	0.1257
k=75:	2613	0.1984
k=10:	2266	0.1720
expensive:	0	0.0000

== Alpha: 200 ==
k=25:	748	0.0515
k=100:	6782	0.4668
cheaper:	7922	0.3529
match:	14529	0.6471
k=50:	1858	0.1279
k=75:	2875	0.1979
k=10:	2266	0.1560
expen

In [209]:
print "\t".join("{:.4}".format(rr) for rr in q_max_sraa.max(axis=0)[:10])
print "\t".join("{:.4}".format(q_max_sraa[rr,i]) for i,rr in enumerate(sraa_sel[:10]))

print "\t".join("{:.4}".format(rr) for rr in q_alpha[:10])

0.997	0.6602	0.9999	0.926	0.9996	0.7188	0.9991	0.9424	0.9926	0.9912
0.997	0.6602	0.9999	0.9259	0.9996	0.7188	0.9991	0.9424	0.9925	0.9912
0.997	0.6602	0.9999	0.9259	0.9996	0.7188	0.9991	0.9424	0.9925	0.9912


# Recounting Matches within $\epsilon$

In [219]:
# Q_alpha picked by formula
sraa_sel = max_alpha(q_max_sraa, 1000, cost_sraa)
q_alpha = np.array([q_max_sraa[rr,i] for i,rr in enumerate(sraa_sel)])

# Q_max picked by max
max_sraa = q_max_sraa.max(axis=0)

for epsilon in [0,0.1,0.01,0.001,0.0001, 0.00001, 0.000001]:
    count = sum((q_alpha + epsilon >= max_sraa)
    print "%s=\t%s" % (epsilon, count )

0=	17408
0.1=	22451
0.01=	22451
0.001=	22451
0.0001=	21424
1e-05=	18609
1e-06=	17751


In [238]:
alphas = np.array([0,1,5,10,100,200,1000,2000])
cost_sraa= np.array([5.2, 6.5, 7.6, 9.1, 10.3])

def compute_counts_epsilon(q_max, cost, alphas, epsilon):

    print "Alpha \t Count \t Percentage" 

    for a in alphas:
        q_alpha_sel = max_alpha(q_max, a, cost)
        print "%s \t" % a,

        # Q_alpha picked by formula
        
        q_alpha = np.array([q_max[rr,i] for i,rr in enumerate(q_alpha_sel)])

        # Q_max picked by max
        q_max_sel = q_max.max(axis=0)

        match = ((q_alpha + epsilon) >= q_max_sel)
        count = sum(match)
        print "%s\t%.4f" % (count, np.mean(match))

def all_epsilons(q_max, cost, alphas, eps):

    for e in eps:
        print 
        print "Epsilon=%s" % e
        compute_counts_epsilon(q_max, cost, alphas, e)
        


In [235]:
print "== IMDB =="
compute_counts_epsilon(q_max, cost, alphas, 0.01)

print "== SRAA =="
compute_counts_epsilon(q_max_sraa, cost_sraa, alphas, 0.01)


== IMDB ==
Alpha 	 Count 	 Percentage
0 	760	0.0341
1 	1021	0.0459
5 	13225	0.5939
10 	18094	0.8126
100 	22267	1.0000
200 	22267	1.0000
1000 	22267	1.0000
2000 	20112	0.9032
== SRAA ==
Alpha 	 Count 	 Percentage
0 	6475	0.2884
1 	8878	0.3954
5 	15823	0.7048
10 	18138	0.8079
100 	22451	1.0000
200 	22451	1.0000
1000 	22451	1.0000
2000 	21193	0.9440


In [241]:
print "== IMDB =="
alphas = np.array([0,1,5,10,25,50,75,100,1000,2000])
all_epsilons(q_max, cost, alphas, [0., 0.001, 0.01, 0.05, 0.1])

== IMDB ==

Epsilon=0.0
Alpha 	 Count 	 Percentage
0 	623	0.0280
1 	849	0.0381
5 	11881	0.5336
10 	16543	0.7429
25 	19757	0.8873
50 	20922	0.9396
75 	21354	0.9590
100 	21593	0.9697
1000 	22176	0.9959
2000 	19992	0.8978

Epsilon=0.001
Alpha 	 Count 	 Percentage
0 	643	0.0289
1 	872	0.0392
5 	12019	0.5398
10 	16747	0.7521
25 	20051	0.9005
50 	21267	0.9551
75 	21672	0.9733
100 	21879	0.9826
1000 	22267	1.0000
2000 	20044	0.9002

Epsilon=0.01
Alpha 	 Count 	 Percentage
0 	760	0.0341
1 	1021	0.0459
5 	13225	0.5939
10 	18094	0.8126
25 	21379	0.9601
50 	22196	0.9968
75 	22264	0.9999
100 	22267	1.0000
1000 	22267	1.0000
2000 	20112	0.9032

Epsilon=0.05
Alpha 	 Count 	 Percentage
0 	1670	0.0750
1 	2199	0.0988
5 	17684	0.7942
10 	21727	0.9757
25 	22267	1.0000
50 	22267	1.0000
75 	22267	1.0000
100 	22267	1.0000
1000 	22267	1.0000
2000 	20568	0.9237

Epsilon=0.1
Alpha 	 Count 	 Percentage
0 	3591	0.1613
1 	4528	0.2034
5 	20998	0.9430
10 	22267	1.0000
25 	22267	1.0000
50 	22267	1.0000
75 	22267	1.0

In [242]:
print "== SRAA =="

all_epsilons(q_max_sraa, cost_sraa, alphas, [0, 0.001, 0.01, 0.05, 0.1])

== SRAA ==

Epsilon=0
Alpha 	 Count 	 Percentage
0 	2266	0.1009
1 	2983	0.1329
5 	7290	0.3247
10 	8578	0.3821
25 	10413	0.4638
50 	11998	0.5344
75 	12674	0.5645
100 	13173	0.5867
1000 	17408	0.7754
2000 	16070	0.7158

Epsilon=0.001
Alpha 	 Count 	 Percentage
0 	4453	0.1983
1 	6106	0.2720
5 	12043	0.5364
10 	13939	0.6209
25 	16780	0.7474
50 	18900	0.8418
75 	19838	0.8836
100 	20522	0.9141
1000 	22451	1.0000
2000 	20689	0.9215

Epsilon=0.01
Alpha 	 Count 	 Percentage
0 	6475	0.2884
1 	8878	0.3954
5 	15823	0.7048
10 	18138	0.8079
25 	21186	0.9437
50 	22397	0.9976
75 	22451	1.0000
100 	22451	1.0000
1000 	22451	1.0000
2000 	21193	0.9440

Epsilon=0.05
Alpha 	 Count 	 Percentage
0 	10571	0.4708
1 	13947	0.6212
5 	21531	0.9590
10 	22370	0.9964
25 	22451	1.0000
50 	22451	1.0000
75 	22451	1.0000
100 	22451	1.0000
1000 	22451	1.0000
2000 	21968	0.9785

Epsilon=0.1
Alpha 	 Count 	 Percentage
0 	11846	0.5276
1 	15968	0.7112
5 	22350	0.9955
10 	22451	1.0000
25 	22451	1.0000
50 	22451	1.0000
75 	2245