In [1]:
import cvxpy as cp
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt
import scipy.io
# mosek only needed if we don't use MW
#import mosek
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import cProfile
#from baselines import *
from scipy.linalg import sqrtm

from scipy.special import expit,logit 

In [2]:
#synthetic example with N datapoints, in d dimensions, with k actions, random regressors
#returns a dictionary of covariates x, labels y, regressors ell.  
def synthetic_example(N,d,k):
    #covariates
    x = np.random.normal(0,1,(N,d,k))
    #regressors
    ell = np.random.normal(0,1,d)
    #labels
    y = np.zeros((N,k))
    for i in range(N):
        for j in range(k):
            feature = x[i,:]
            regressor = ell[j,:]
            y[i,j] = np.inner(feature,regressor) + np.random.normal(0,1)
    data = {'cov': x, 'label': y, 'reg': ell}
    return data

from scipy.special import logit, expit

def logistic_synthetic_example(N,d,k,eta,corrupt=False):
    #covariates, scaling the covariates to be of large norm amplifies the effect of corruptions
    #good parameters: scale x by 100, d = 100, N = 1000
    #scaling x produces the altmin beating logistic regression phenomenon
    x = np.random.normal(0,1,(N,d,k))*100
    #regressors
    ell = np.random.normal(0,1,d)
    ell = ell/np.linalg.norm(ell)
    #labels
    y = np.zeros((N,k))
    prob_list = np.zeros((N,k))
    for i in range(N):
        for j in range(k):
            feature = x[i,:,j]
            prob = expit(np.inner(feature,ell))
            prob_list[i,j] = prob
            y[i,j] = np.random.binomial(1,prob)
            #deterministic labels
            #if prob > 0.5:
            #    y[i,j] = 1
            #else:
            #    y[i,j] = 0
    if corrupt:
        corr = int(eta*N)
        print('number of corruptions')
        print(corr)
        poison = False
        if poison:
            print('TODO: poisoning')
            for i in range(corr):
                for j in range(k): 
                    x[i,:,j] = ell*100
                    y[i,j] = -1
        else: 
            select = np.zeros((corr,k))
            for j in range(k):
                order = np.argsort(prob_list[:,j])
                select[:,j] = order[:corr]
            for i in range(corr):
                for j in range(k):
                    index = int(select[i,j])
                    y[index,j] = 1 - y[index,j]
    data = {'cov': x, 'label': y, 'reg': ell}
    return data

#data = synthetic_example(1000,100,10)    
#print(data)

import pandas as pd, numpy as np, re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.datasets import load_svmlight_file

def parse_data(filename):
    with open(filename, "rb") as f:
        infoline = f.readline()
        infoline = re.sub(r"^b'", "", str(infoline))
        n_features = int(re.sub(r"^\d+\s(\d+)\s\d+.*$", r"\1", infoline))
        features, labels = load_svmlight_file(f, n_features=n_features, multilabel=True)
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(labels)
    features = np.array(features.todense())
    features = np.ascontiguousarray(features)
    return features, labels


data_mode = 'logistic_synth'
#data_mode = 'synthetic'
#data_mode = 'real'
N = 1000
d = 3
k = 1000
eta = 0.1

#good settings
#N= 1000, d = 50, k = 50, eta = 0.1
if data_mode == 'logistic_synth':
    data = logistic_synthetic_example(N,d,k,eta,corrupt=True)
    #data = logistic_synthetic_example(N,d,k,eta,corrupt=False)
if data_mode == 'synthetic':
    data = synthetic_example(N,d,k)    
elif data_mode == 'real': 
    x, y = parse_data("Bibtex_data.txt")
    print(x.shape)
    print(y.shape)
    data = {'cov': x, 'label': y}

x = data['cov']
y = data['label']

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', random_state=0, fit_intercept ='False')
lab = y[:,0]
#error on this line TODO
cov = x[:,:,0]
print(cov.shape)
model.fit(cov, lab)
print(model.coef_)
print(data['reg'])
print('score')
print(model.score(x[:,:,0],lab))

vanilla = model.coef_

number of corruptions
100
(1000, 3)
[[ 0.0029023  -0.00537385 -0.01188337]]
[ 0.2719751  -0.52660966 -0.80542647]
score
0.765


In [3]:
#logistic regression version of scram
from scipy.linalg import eigh

def logistic_reg_oracle(x,y,mode='robust-logistic'):
    if mode == 'robust-logistic':
        lr = 0.1
        lam = 0.0
        MW_steps = 30
        #eta = 0.1
        AM_steps = 30
        params = (AM_steps,lr,lam,MW_steps,eta)
        regressor = logistic_scram(x,y,params)
    if mode == 'vanilla':
        (x_train,y_train) = hack(x,y)
        model = LogisticRegression(solver='liblinear', random_state=0, fit_intercept ='False')
        model.fit(x_train, y_train)
        regressor = model.coef_[0]
    return regressor

#ensures two classes  
def hack(x,y):
    (N,d) = x.shape
    x_train = np.zeros((N+2,d))
    y_train = np.zeros(N+2)
    x_train[:N,:] = x
    y_train[:N] = y
    x_train[N:N+2,:] = np.zeros((2,d))
    y_train[N:N+2] = np.array([0,1])
    return (x_train,y_train)

def logistic_scram(x,y,params):
    AM_steps = params[0]
    altmin_params = params[1:]
    N,d = x.shape
    w = [0.]*d
    a = [1.]*N
    #print('a logistic scram first: ', a)
    #print('first length of a:', len(a))
    #TODO: fix isotropic step, for now identity covariance
    #iso_x, Sig_sqrt = isotropic(x)
    iso_x = x
    Sig_sqrt = np.eye(d)
    for i in range(AM_steps):
        #print('a logistic scram: ', a)
        w,a = logistic_altmin_step(iso_x,y,a,altmin_params)
    final_w = np.matmul(Sig_sqrt,w)    
    return final_w

def logistic_altmin_step(Xs,Ys,a,params):
    N,d = Xs.shape
    #print('a logistic altmin step: ', a)
    w = weighted_logistic_reg(Xs,Ys,a)
    a = logistic_get_weights(Xs,Ys,w,params)
    return w, a

def weighted_logistic_reg(x,y,a):
    n,d = x.shape
    hard_weights = np.zeros(n)
    m = min(int(n*(1 - eta))+1,n) 
    prob = a*m
    for i in range(len(a)):
        if prob[i] > 1: 
            prob[i] = 1
        if prob[i] < 0:
            prob[i] = 0
        ind = np.random.binomial(1,prob[i])
        hard_weights[i] = ind
    
    N_train = int(np.sum(hard_weights))
    #hack to ensure two classes
    x_train = np.zeros((N_train+2,d)) 
    y_train = np.zeros(N_train+2)
    x_train[N_train:N_train+2,:] = np.zeros((2,d))
    y_train[N_train:N_train+2] =  np.array([0,1])
    
    hold = 0
    for i in range(len(a)):
        if hard_weights[i] == 1:
            x_train[hold,:] = x[i,:]
            y_train[hold] = y[i]
            hold = hold + 1
    
    model = LogisticRegression(solver='liblinear', random_state=0, fit_intercept ='False')
    #lab = y[:,0]
    model.fit(x_train, y_train)
    regressor = model.coef_[0,:]
    return regressor

def logistic_get_weights(X,y,w,params):
    n,d = X.shape
    lr,lam,steps,eta = params
    a = np.ones(n)/float(n)
    Sig = np.matmul(X.T,X)
    Xw = np.matmul(X,w)
    pred = expit(Xw)
    resids = np.absolute(y - pred)
    for j in range(steps):
        if lam > 0:
            v = v_step(a,X,Sig,eta)
        else:
            v = np.zeros(d)
        a = a_step(a,X,resids,v,lr=lr,lam=lam,eta=eta,n=n)
    return a

def outerprod(a,X):
    """ Compute X^T diag(a) X """
    left = np.multiply(X.T,a)
    return np.dot(left,X)

def cap(a,m):
    if np.max(a) <= 1./m:
        return a
    #sorted_a = np.sort(a)
    ## BUG FIX: use -a to get descending order
    sort_ids = np.argsort(-a)
    # Faster code
    Z = np.sum(a)
    aprime = np.copy(a)
    for i in range(1,min(m,len(a))):
        Z -= a[sort_ids[i - 1]]
        aprime_next = (1-i/m)*a[sort_ids[i]]/float(Z)
        aprime[sort_ids[i-1]] = 1./m
        #aprime_next = (m - i) * a[sort_ids[i]]/float(m * Z)
        #this chunk of code sometimes does not execute 
        if aprime_next <= 1.0/m:
            aprime[sort_ids[i:]] *= (1-i/m)/Z
            #aprime[sort_ids[i:]] *= (m - i)/float(m * Z)
            break
    return aprime   
        

def v_step(a,X,Sig,eta,tol=0):
    """ Compute top eigenvector of X^T diag(1/n-a) X"""
    # Bug fix: this was formerly 1 and not 1/n
    n = a.shape[0]
    #m = int(n*(1 - eta)) 
    M = outerprod(1./n-a,X)

    # this method is slower sometimes and faster sometimes vs other one, depending on d?
    d = M.shape[0]

    # Want top eigenvalue algebraically
    eigenvalue, v = eigh(M, eigvals=(d-1,d-1)) 
    #eigenvalue, v = largest_eigsh(M, 1, which='LA',tol=tol)
    
    ## Don't regularize if constraint is satisfied
    if eigenvalue > 0:
        return v[:,0]
    else:
        return np.zeros(shape=v[:,0].shape)
    
# @jit(nopython=True)
def a_step(a,X,resids_sq,v,lr,lam,eta,n):
    """ Step to minimize \sum_i a_i resids_sq[i]^2 + \lambda sigma_{max}(X^T diag(1/n - a) X)"""
    m = min(int(n*(1 - eta))+1,n) 
    
    Xv_squared = np.dot(X,v)**2
    penalties = resids_sq - lam * Xv_squared

    # multiplicative update
    a *= np.exp(-lr * penalties)
    a /= np.sum(a)
    a = cap(a,m)
    return a

def logistic_eval(x,y,reg):
    n,d = x.shape
    error = 0
    for i in range(n):
        error = error + np.absolute(y[i] - expit(np.inner(reg,x[i,:])))
        
    error = error/n
    return error 
    
#reg = logistic_reg_oracle(x,y[:,0])
#print('non robust performance: ', logistic_eval(x,y,vanilla))
#print('robust performance: ',logistic_eval(x,y,reg))
#print('vanilla: ', vanilla)
#print('reg: ', reg)

In [4]:
#contextual bandits takes covariates and labels
def contextual_bandit(cov_label, mode = 'vanilla'):
    cov = cov_label['cov']
    labels = cov_label['label']
    (N,d,k) = cov.shape
    (N,k) = labels.shape
    estimator = np.zeros(d)
    action_list = []
    mu = k
    delta = 0.1
    gamma = np.sqrt(k*N/(d*np.log(N/d) + 1./(2*delta)))
    params = (mu,gamma)
    rewards = []
    mean_reward = []
    total_x = np.zeros((N,d))
    total_y = np.zeros(N)
    tot_error = 0
    for i in range(N):
        print('iteration: ',i)
        if i < 5:
            action = 0
        else: 
            arms = cov[i,:,:]
            values = np.zeros(k)
            for j in range(k):
                values[j] = expit(np.inner(estimator,arms[:,j]))
            action = select_action(values,params)
            action_list.append(action)
            bandit_feedback = labels[i,action]
            rewards.append(bandit_feedback)
            
            #rewriting get_data
            total_x[i,:] = arms[:,action]
            total_y[i] = labels[i,action]
            data_x = total_x[:i+1,:]
            data_y = total_y[:i+1]

            #bug, ols can run on one datapoint but scram can't
            if i%100 == 0:
                if mode == 'vanilla':
                    estimator = logistic_reg_oracle(data_x,data_y,mode='vanilla')
                if mode == 'robust-logistic':  
                    estimator = logistic_reg_oracle(data_x,data_y,mode='robust-logistic')
            print('estimator: ', estimator)
            #estimators[action,:] = regression_oracle(data_x,data_y,mode='ols')
            #estimators[action,:] = regression_oracle(data_x,data_y,mode='scram')
            print('action')
            print(action)
            print('average reward')
            avg_reward = sum(rewards)/len(rewards)
            mean_reward.append(avg_reward)
            print(avg_reward) 
            print('logistic evaluation: ', logistic_eval(data_x,data_y,estimator))
            
            tot_error += logistic_eval(np.array([arms[:,action]]), np.array([labels[i,action]]), estimator)
            print('average error per step: ', tot_error/i)
            
    return mean_reward

# def get_data(cov_label,action_list):
#     cov = cov_label['cov']
#     labels = cov_label['label']
#     (N,d,k) = cov.shape
#     (N,k) = labels.shape
#     count = len(action_list)
    
#     #ensure two classes in data
#     data_x = np.zeros((count+2,d))
#     data_y = np.zeros(count+2)
#     for i in range(count):
#         data_x[i,:] = cov[i,:,action_list[i]]
#         data_y[i] = labels[i,action_list[i]]
#     data_x[count:count+2,:] = np.zeros((2,d))
#     data_y[count] = 0
#     data_y[count+1] = 1
#     return (data_x,data_y) 


def select_action(values,params):
    (mu,gamma) = params
    k = mu
    max_value = np.amax(values)
    max_index = np.where(values == max_value)[0][0]
    prob = np.zeros(len(values))
    for i in range(k): 
        if i == max_index:
            next
        else: 
            prob[i] = 1./(mu + gamma*(max_value - values[i]))
    prob[max_index] = 1 - np.sum(prob)
    prob = prob/np.sum(prob)
    #TODO roulette wheel
    draw = np.random.rand()
    sums = 0
    action = 0
    for i in range(k):
        sums = sums + prob[i]
        if sums >= draw:
            action = i
            break
    return action

mean_reward = contextual_bandit(data, mode='vanilla')


iteration:  0
iteration:  1
iteration:  2
iteration:  3
iteration:  4
iteration:  5
estimator:  [0. 0. 0.]
action
71
average reward
1.0
logistic evaluation:  0.5
average error per step:  0.1
iteration:  6
estimator:  [0. 0. 0.]
action
57
average reward
1.0
logistic evaluation:  0.5
average error per step:  0.16666666666666666
iteration:  7
estimator:  [0. 0. 0.]
action
691
average reward
0.6666666666666666
logistic evaluation:  0.5
average error per step:  0.21428571428571427
iteration:  8
estimator:  [0. 0. 0.]
action
420
average reward
0.75
logistic evaluation:  0.5
average error per step:  0.25
iteration:  9
estimator:  [0. 0. 0.]
action
83
average reward
0.8
logistic evaluation:  0.5
average error per step:  0.2777777777777778
iteration:  10
estimator:  [0. 0. 0.]
action
993
average reward
0.6666666666666666
logistic evaluation:  0.5
average error per step:  0.3
iteration:  11
estimator:  [0. 0. 0.]
action
725
average reward
0.7142857142857143
logistic evaluation:  0.5
average erro

estimator:  [0. 0. 0.]
action
501
average reward
0.5
logistic evaluation:  0.5
average error per step:  0.4696969696969697
iteration:  67
estimator:  [0. 0. 0.]
action
23
average reward
0.5079365079365079
logistic evaluation:  0.5
average error per step:  0.4701492537313433
iteration:  68
estimator:  [0. 0. 0.]
action
128
average reward
0.5
logistic evaluation:  0.5
average error per step:  0.47058823529411764
iteration:  69
estimator:  [0. 0. 0.]
action
705
average reward
0.5076923076923077
logistic evaluation:  0.5
average error per step:  0.47101449275362317
iteration:  70
estimator:  [0. 0. 0.]
action
425
average reward
0.5151515151515151
logistic evaluation:  0.5
average error per step:  0.4714285714285714
iteration:  71
estimator:  [0. 0. 0.]
action
96
average reward
0.5223880597014925
logistic evaluation:  0.5
average error per step:  0.47183098591549294
iteration:  72
estimator:  [0. 0. 0.]
action
369
average reward
0.5294117647058824
logistic evaluation:  0.5
average error per

estimator:  [ 0.00810757 -0.00453581 -0.01507658]
action
315
average reward
0.5384615384615384
logistic evaluation:  0.31798013695207983
average error per step:  0.4440317396764079
iteration:  122
estimator:  [ 0.00810757 -0.00453581 -0.01507658]
action
105
average reward
0.5423728813559322
logistic evaluation:  0.3154002204190382
average error per step:  0.44039746642814204
iteration:  123
estimator:  [ 0.00810757 -0.00453581 -0.01507658]
action
88
average reward
0.5462184873949579
logistic evaluation:  0.320599428440939
average error per step:  0.444622706661529
iteration:  124
estimator:  [ 0.00810757 -0.00453581 -0.01507658]
action
841
average reward
0.55
logistic evaluation:  0.3193970047104414
average error per step:  0.4424103982378774
iteration:  125
estimator:  [ 0.00810757 -0.00453581 -0.01507658]
action
98
average reward
0.5454545454545454
logistic evaluation:  0.321407001451645
average error per step:  0.44345236780479114
iteration:  126
estimator:  [ 0.00810757 -0.00453581

logistic evaluation:  0.33151323978881725
average error per step:  0.4251683136077159
iteration:  164
estimator:  [ 0.00810757 -0.00453581 -0.01507658]
action
675
average reward
0.55
logistic evaluation:  0.33492455926403497
average error per step:  0.4280293662881551
iteration:  165
estimator:  [ 0.00810757 -0.00453581 -0.01507658]
action
693
average reward
0.5527950310559007
logistic evaluation:  0.33468389948148203
average error per step:  0.42722297640374346
iteration:  166
estimator:  [ 0.00810757 -0.00453581 -0.01507658]
action
884
average reward
0.5555555555555556
logistic evaluation:  0.3340443048386693
average error per step:  0.42602206446234603
iteration:  167
estimator:  [ 0.00810757 -0.00453581 -0.01507658]
action
772
average reward
0.5521472392638037
logistic evaluation:  0.33345636745757223
average error per step:  0.42487984147044183
iteration:  168
estimator:  [ 0.00810757 -0.00453581 -0.01507658]
action
391
average reward
0.5548780487804879
logistic evaluation:  0.332

logistic evaluation:  0.3552186046237082
average error per step:  0.402350704775765
iteration:  210
estimator:  [ 0.00261652 -0.00298828 -0.01192176]
action
277
average reward
0.5679611650485437
logistic evaluation:  0.3555479545129973
average error per step:  0.40245718442570766
iteration:  211
estimator:  [ 0.00261652 -0.00298828 -0.01192176]
action
88
average reward
0.5700483091787439
logistic evaluation:  0.3539838787371877
average error per step:  0.40066337734331736
iteration:  212
estimator:  [ 0.00261652 -0.00298828 -0.01192176]
action
344
average reward
0.5721153846153846
logistic evaluation:  0.3532495093353636
average error per step:  0.39970535762070103
iteration:  213
estimator:  [ 0.00261652 -0.00298828 -0.01192176]
action
341
average reward
0.5741626794258373
logistic evaluation:  0.35245171850741563
average error per step:  0.39868571872179864
iteration:  214
estimator:  [ 0.00261652 -0.00298828 -0.01192176]
action
684
average reward
0.5761904761904761
logistic evaluati

estimator:  [ 0.00261652 -0.00298828 -0.01192176]
action
252
average reward
0.5775193798449613
logistic evaluation:  0.3447079431215305
average error per step:  0.38226557010732326
iteration:  263
estimator:  [ 0.00261652 -0.00298828 -0.01192176]
action
181
average reward
0.5752895752895753
logistic evaluation:  0.34483522300319597
average error per step:  0.3822505292775662
iteration:  264
estimator:  [ 0.00261652 -0.00298828 -0.01192176]
action
147
average reward
0.573076923076923
logistic evaluation:  0.3445559337697919
average error per step:  0.38182845748542055
iteration:  265
estimator:  [ 0.00261652 -0.00298828 -0.01192176]
action
285
average reward
0.5747126436781609
logistic evaluation:  0.3434671546320444
average error per step:  0.3805949187142641
iteration:  266
estimator:  [ 0.00261652 -0.00298828 -0.01192176]
action
113
average reward
0.5725190839694656
logistic evaluation:  0.3441434784005213
average error per step:  0.3811342069928397
iteration:  267
estimator:  [ 0.00

0.5784313725490197
logistic evaluation:  0.34040980670014265
average error per step:  0.37825308274823105
iteration:  311
estimator:  [ 0.00345074 -0.00430667 -0.01189801]
action
294
average reward
0.5765472312703583
logistic evaluation:  0.3397822124540298
average error per step:  0.3775017879545484
iteration:  312
estimator:  [ 0.00345074 -0.00430667 -0.01189801]
action
775
average reward
0.577922077922078
logistic evaluation:  0.33949840943171783
average error per step:  0.3770961792318428
iteration:  313
estimator:  [ 0.00345074 -0.00430667 -0.01189801]
action
564
average reward
0.5792880258899676
logistic evaluation:  0.3395170343599455
average error per step:  0.37699474299434554
iteration:  314
estimator:  [ 0.00345074 -0.00430667 -0.01189801]
action
150
average reward
0.5806451612903226
logistic evaluation:  0.33846821552924405
average error per step:  0.375823228216303
iteration:  315
estimator:  [ 0.00345074 -0.00430667 -0.01189801]
action
690
average reward
0.581993569131832

estimator:  [ 0.00345074 -0.00430667 -0.01189801]
action
793
average reward
0.6028571428571429
logistic evaluation:  0.3314916724617547
average error per step:  0.3646060720116672
iteration:  355
estimator:  [ 0.00345074 -0.00430667 -0.01189801]
action
275
average reward
0.603988603988604
logistic evaluation:  0.3308255145550053
average error per step:  0.3638447576050399
iteration:  356
estimator:  [ 0.00345074 -0.00430667 -0.01189801]
action
815
average reward
0.6051136363636364
logistic evaluation:  0.3299378661898628
average error per step:  0.3628618651628886
iteration:  357
estimator:  [ 0.00345074 -0.00430667 -0.01189801]
action
684
average reward
0.6062322946175638
logistic evaluation:  0.3300852872364452
average error per step:  0.3629174750668199
iteration:  358
estimator:  [ 0.00345074 -0.00430667 -0.01189801]
action
838
average reward
0.6073446327683616
logistic evaluation:  0.3318386417916556
average error per step:  0.36458401723858
iteration:  359
estimator:  [ 0.0034507

estimator:  [ 0.00345074 -0.00430667 -0.01189801]
action
674
average reward
0.6198979591836735
logistic evaluation:  0.32690125209810034
average error per step:  0.35649192639180066
iteration:  397
estimator:  [ 0.00345074 -0.00430667 -0.01189801]
action
457
average reward
0.6208651399491094
logistic evaluation:  0.3267810195281421
average error per step:  0.3562968552655108
iteration:  398
estimator:  [ 0.00345074 -0.00430667 -0.01189801]
action
437
average reward
0.6218274111675127
logistic evaluation:  0.32809863550608975
average error per step:  0.35754362144506796
iteration:  399
estimator:  [ 0.00345074 -0.00430667 -0.01189801]
action
284
average reward
0.6227848101265823
logistic evaluation:  0.3281392496800023
average error per step:  0.3575105404516495
iteration:  400
estimator:  [ 0.00377509 -0.00472776 -0.01170844]
action
69
average reward
0.6237373737373737
logistic evaluation:  0.3265272082347744
average error per step:  0.35664805371747826
iteration:  401
estimator:  [ 0.

estimator:  [ 0.00377509 -0.00472776 -0.01170844]
action
141
average reward
0.6275862068965518
logistic evaluation:  0.32661172824754686
average error per step:  0.35405688249149747
iteration:  440
estimator:  [ 0.00377509 -0.00472776 -0.01170844]
action
542
average reward
0.6284403669724771
logistic evaluation:  0.32617721061860055
average error per step:  0.3535590019719309
iteration:  441
estimator:  [ 0.00377509 -0.00472776 -0.01170844]
action
448
average reward
0.6292906178489702
logistic evaluation:  0.32641575093794456
average error per step:  0.3537359929692024
iteration:  442
estimator:  [ 0.00377509 -0.00472776 -0.01170844]
action
976
average reward
0.6301369863013698
logistic evaluation:  0.3257123471904313
average error per step:  0.3529691873081625
iteration:  443
estimator:  [ 0.00377509 -0.00472776 -0.01170844]
action
303
average reward
0.6309794988610479
logistic evaluation:  0.3259510776432234
average error per step:  0.3531469288000857
iteration:  444
estimator:  [ 0.

estimator:  [ 0.00377509 -0.00472776 -0.01170844]
action
42
average reward
0.6263048016701461
logistic evaluation:  0.3300081978649775
average error per step:  0.3549602044544428
iteration:  484
estimator:  [ 0.00377509 -0.00472776 -0.01170844]
action
368
average reward
0.625
logistic evaluation:  0.32981484948809636
average error per step:  0.3547149028648213
iteration:  485
estimator:  [ 0.00377509 -0.00472776 -0.01170844]
action
55
average reward
0.6257796257796258
logistic evaluation:  0.32916263277922214
average error per step:  0.3540100010629871
iteration:  486
estimator:  [ 0.00377509 -0.00472776 -0.01170844]
action
349
average reward
0.6244813278008299
logistic evaluation:  0.32882921932695813
average error per step:  0.35362477530262426
iteration:  487
estimator:  [ 0.00377509 -0.00472776 -0.01170844]
action
199
average reward
0.6252587991718427
logistic evaluation:  0.3291470343129446
average error per step:  0.35389232798678383
iteration:  488
estimator:  [ 0.00377509 -0.00

estimator:  [ 0.00335636 -0.00607438 -0.01201304]
action
248
average reward
0.628731343283582
logistic evaluation:  0.3219834555836751
average error per step:  0.35130119978935914
iteration:  541
estimator:  [ 0.00335636 -0.00607438 -0.01201304]
action
382
average reward
0.62756052141527
logistic evaluation:  0.3217760514700425
average error per step:  0.35103922054020104
iteration:  542
estimator:  [ 0.00335636 -0.00607438 -0.01201304]
action
321
average reward
0.6282527881040892
logistic evaluation:  0.32168390552687176
average error per step:  0.3508929134992197
iteration:  543
estimator:  [ 0.00335636 -0.00607438 -0.01201304]
action
476
average reward
0.6289424860853432
logistic evaluation:  0.3212941530438416
average error per step:  0.35044865132842645
iteration:  544
estimator:  [ 0.00335636 -0.00607438 -0.01201304]
action
369
average reward
0.6277777777777778
logistic evaluation:  0.3210963409804333
average error per step:  0.3501968828121726
iteration:  545
estimator:  [ 0.003

logistic evaluation:  0.32132255797823595
average error per step:  0.3478845059202896
iteration:  597
estimator:  [ 0.00335636 -0.00607438 -0.01201304]
action
736
average reward
0.6222596964586846
logistic evaluation:  0.3212893742729772
average error per step:  0.3478067742558226
iteration:  598
estimator:  [ 0.00335636 -0.00607438 -0.01201304]
action
872
average reward
0.6212121212121212
logistic evaluation:  0.3210871812593372
average error per step:  0.347559899648543
iteration:  599
estimator:  [ 0.00335636 -0.00607438 -0.01201304]
action
615
average reward
0.6218487394957983
logistic evaluation:  0.3209137380219955
average error per step:  0.3473419720011403
iteration:  600
estimator:  [ 0.00328266 -0.00649672 -0.01141064]
action
945
average reward
0.62248322147651
logistic evaluation:  0.32294173505447554
average error per step:  0.3467881889898595
iteration:  601
estimator:  [ 0.00328266 -0.00649672 -0.01141064]
action
371
average reward
0.6231155778894473
logistic evaluation: 

logistic evaluation:  0.3212049596754796
average error per step:  0.34362836968465105
iteration:  639
estimator:  [ 0.00328266 -0.00649672 -0.01141064]
action
341
average reward
0.6220472440944882
logistic evaluation:  0.3213398950422423
average error per step:  0.3437284248094069
iteration:  640
estimator:  [ 0.00328266 -0.00649672 -0.01141064]
action
209
average reward
0.6210691823899371
logistic evaluation:  0.321103653113324
average error per step:  0.3434568316747134
iteration:  641
estimator:  [ 0.00328266 -0.00649672 -0.01141064]
action
290
average reward
0.6216640502354788
logistic evaluation:  0.32068155103595813
average error per step:  0.3429991987383168
iteration:  642
estimator:  [ 0.00328266 -0.00649672 -0.01141064]
action
241
average reward
0.622257053291536
logistic evaluation:  0.3208552589652746
average error per step:  0.3431384145496067
iteration:  643
estimator:  [ 0.00328266 -0.00649672 -0.01141064]
action
986
average reward
0.622848200312989
logistic evaluation: 

estimator:  [ 0.00328266 -0.00649672 -0.01141064]
action
562
average reward
0.6273653566229985
logistic evaluation:  0.32480667008383074
average error per step:  0.3455154071261748
iteration:  692
estimator:  [ 0.00328266 -0.00649672 -0.01141064]
action
281
average reward
0.627906976744186
logistic evaluation:  0.32501995293177277
average error per step:  0.3456990722657434
iteration:  693
estimator:  [ 0.00328266 -0.00649672 -0.01141064]
action
236
average reward
0.6284470246734397
logistic evaluation:  0.32457619611553
average error per step:  0.3452248351087356
iteration:  694
estimator:  [ 0.00328266 -0.00649672 -0.01141064]
action
672
average reward
0.6275362318840579
logistic evaluation:  0.32467863238351874
average error per step:  0.34529766589729316
iteration:  695
estimator:  [ 0.00328266 -0.00649672 -0.01141064]
action
873
average reward
0.6266280752532561
logistic evaluation:  0.3248461552157371
average error per step:  0.34543576209543736
iteration:  696
estimator:  [ 0.00

logistic evaluation:  0.3305139270799797
average error per step:  0.3440823594522855
iteration:  748
estimator:  [ 0.00298317 -0.00642082 -0.01071586]
action
821
average reward
0.6236559139784946
logistic evaluation:  0.330480377305528
average error per step:  0.3440306252097231
iteration:  749
estimator:  [ 0.00298317 -0.00642082 -0.01071586]
action
610
average reward
0.6241610738255033
logistic evaluation:  0.3304560968389774
average error per step:  0.343988221207297
iteration:  750
estimator:  [ 0.00298317 -0.00642082 -0.01071586]
action
614
average reward
0.6246648793565683
logistic evaluation:  0.3303763280248644
average error per step:  0.34389030320227404
iteration:  751
estimator:  [ 0.00298317 -0.00642082 -0.01071586]
action
908
average reward
0.6238286479250335
logistic evaluation:  0.3302225624061268
average error per step:  0.343718338194993
iteration:  752
estimator:  [ 0.00298317 -0.00642082 -0.01071586]
action
754
average reward
0.6243315508021391
logistic evaluation:  

logistic evaluation:  0.33072653578383454
average error per step:  0.34342807913072193
iteration:  799
estimator:  [ 0.00298317 -0.00642082 -0.01071586]
action
937
average reward
0.6289308176100629
logistic evaluation:  0.3306030592448064
average error per step:  0.34328855125266255
iteration:  800
estimator:  [ 0.0029648  -0.00649299 -0.01061606]
action
313
average reward
0.628140703517588
logistic evaluation:  0.330995109663269
average error per step:  0.3432168569411562
iteration:  801
estimator:  [ 0.0029648  -0.00649299 -0.01061606]
action
723
average reward
0.6286072772898369
logistic evaluation:  0.33171033054750804
average error per step:  0.343917712623905
iteration:  802
estimator:  [ 0.0029648  -0.00649299 -0.01061606]
action
934
average reward
0.6278195488721805
logistic evaluation:  0.33181910609888515
average error per step:  0.3440114026309866
iteration:  803
estimator:  [ 0.0029648  -0.00649299 -0.01061606]
action
464
average reward
0.6282853566958698
logistic evaluatio

logistic evaluation:  0.3309523419624369
average error per step:  0.34244159584564365
iteration:  852
estimator:  [ 0.0029648  -0.00649299 -0.01061606]
action
597
average reward
0.6344339622641509
logistic evaluation:  0.33076347853590277
average error per step:  0.34223902570865206
iteration:  853
estimator:  [ 0.0029648  -0.00649299 -0.01061606]
action
733
average reward
0.6336866902237926
logistic evaluation:  0.3308551086973805
average error per step:  0.3423173101292022
iteration:  854
estimator:  [ 0.0029648  -0.00649299 -0.01061606]
action
819
average reward
0.6341176470588236
logistic evaluation:  0.33095714539735593
average error per step:  0.3424060445285548
iteration:  855
estimator:  [ 0.0029648  -0.00649299 -0.01061606]
action
757
average reward
0.6345475910693302
logistic evaluation:  0.3306657746754689
average error per step:  0.3421009424968981
iteration:  856
estimator:  [ 0.0029648  -0.00649299 -0.01061606]
action
745
average reward
0.6349765258215962
logistic evaluat

estimator:  [ 0.00306351 -0.00644878 -0.01048289]
action
935
average reward
0.6377777777777778
logistic evaluation:  0.33082584923158814
average error per step:  0.3407256608090598
iteration:  905
estimator:  [ 0.00306351 -0.00644878 -0.01048289]
action
679
average reward
0.6381798002219756
logistic evaluation:  0.33102450163728836
average error per step:  0.3409135937018631
iteration:  906
estimator:  [ 0.00306351 -0.00644878 -0.01048289]
action
776
average reward
0.6374722838137472
logistic evaluation:  0.33116369179249916
average error per step:  0.3410420223759377
iteration:  907
estimator:  [ 0.00306351 -0.00644878 -0.01048289]
action
648
average reward
0.6378737541528239
logistic evaluation:  0.3308646484186025
average error per step:  0.3407317580825732
iteration:  908
estimator:  [ 0.00306351 -0.00644878 -0.01048289]
action
649
average reward
0.6382743362831859
logistic evaluation:  0.33071144654667906
average error per step:  0.3405675206252578
iteration:  909
estimator:  [ 0.

logistic evaluation:  0.32838643531025574
average error per step:  0.3377452117246
iteration:  957
estimator:  [ 0.00306351 -0.00644878 -0.01048289]
action
994
average reward
0.640083945435467
logistic evaluation:  0.3284600746805578
average error per step:  0.3378091487573429
iteration:  958
estimator:  [ 0.00306351 -0.00644878 -0.01048289]
action
179
average reward
0.640461215932914
logistic evaluation:  0.32902343437078907
average error per step:  0.3383633375557302
iteration:  959
estimator:  [ 0.00306351 -0.00644878 -0.01048289]
action
854
average reward
0.6408376963350786
logistic evaluation:  0.3291403923891462
average error per step:  0.33847067832156746
iteration:  960
estimator:  [ 0.00306351 -0.00644878 -0.01048289]
action
535
average reward
0.641213389121339
logistic evaluation:  0.329297677177438
average error per step:  0.33861840790033415
iteration:  961
estimator:  [ 0.00306351 -0.00644878 -0.01048289]
action
86
average reward
0.6415882967607106
logistic evaluation:  0.

In [5]:
mean_reward = contextual_bandit(data, mode='robust-logistic')

iteration:  0
iteration:  1
iteration:  2
iteration:  3
iteration:  4
iteration:  5
estimator:  [0. 0. 0.]
action
138
average reward
1.0
logistic evaluation:  0.5
average error per step:  0.1
iteration:  6
estimator:  [0. 0. 0.]
action
674
average reward
1.0
logistic evaluation:  0.5
average error per step:  0.16666666666666666
iteration:  7
estimator:  [0. 0. 0.]
action
26
average reward
1.0
logistic evaluation:  0.5
average error per step:  0.21428571428571427
iteration:  8
estimator:  [0. 0. 0.]
action
396
average reward
0.75
logistic evaluation:  0.5
average error per step:  0.25
iteration:  9
estimator:  [0. 0. 0.]
action
972
average reward
0.8
logistic evaluation:  0.5
average error per step:  0.2777777777777778
iteration:  10
estimator:  [0. 0. 0.]
action
566
average reward
0.6666666666666666
logistic evaluation:  0.5
average error per step:  0.3
iteration:  11
estimator:  [0. 0. 0.]
action
546
average reward
0.5714285714285714
logistic evaluation:  0.5
average error per step:  

estimator:  [0. 0. 0.]
action
598
average reward
0.6721311475409836
logistic evaluation:  0.5
average error per step:  0.46923076923076923
iteration:  66
estimator:  [0. 0. 0.]
action
946
average reward
0.6774193548387096
logistic evaluation:  0.5
average error per step:  0.4696969696969697
iteration:  67
estimator:  [0. 0. 0.]
action
188
average reward
0.6825396825396826
logistic evaluation:  0.5
average error per step:  0.4701492537313433
iteration:  68
estimator:  [0. 0. 0.]
action
0
average reward
0.6875
logistic evaluation:  0.5
average error per step:  0.47058823529411764
iteration:  69
estimator:  [0. 0. 0.]
action
972
average reward
0.6923076923076923
logistic evaluation:  0.5
average error per step:  0.47101449275362317
iteration:  70
estimator:  [0. 0. 0.]
action
231
average reward
0.696969696969697
logistic evaluation:  0.5
average error per step:  0.4714285714285714
iteration:  71
estimator:  [0. 0. 0.]
action
553
average reward
0.6865671641791045
logistic evaluation:  0.5


estimator:  [-0.00041743 -0.01169471 -0.01387338]
action
981
average reward
0.6614173228346457
logistic evaluation:  0.3316056811703542
average error per step:  0.43098901450082444
iteration:  132
estimator:  [-0.00041743 -0.01169471 -0.01387338]
action
797
average reward
0.6640625
logistic evaluation:  0.32913977945571565
average error per step:  0.4277515276722078
iteration:  133
estimator:  [-0.00041743 -0.01169471 -0.01387338]
action
221
average reward
0.6589147286821705
logistic evaluation:  0.33027626669643206
average error per step:  0.4281551182138582
iteration:  134
estimator:  [-0.00041743 -0.01169471 -0.01387338]
action
387
average reward
0.6615384615384615
logistic evaluation:  0.3305484032364981
average error per step:  0.42769884643319767
iteration:  135
estimator:  [-0.00041743 -0.01169471 -0.01387338]
action
407
average reward
0.6641221374045801
logistic evaluation:  0.32866101621151833
average error per step:  0.4250778458510203
iteration:  136
estimator:  [-0.00041743

logistic evaluation:  0.3109114629718021
average error per step:  0.38079599226262495
iteration:  187
estimator:  [-0.00041743 -0.01169471 -0.01387338]
action
9
average reward
0.6775956284153005
logistic evaluation:  0.31064707859357404
average error per step:  0.38015648000381375
iteration:  188
estimator:  [-0.00041743 -0.01169471 -0.01387338]
action
836
average reward
0.6793478260869565
logistic evaluation:  0.30972580559969004
average error per step:  0.3788605757630992
iteration:  189
estimator:  [-0.00041743 -0.01169471 -0.01387338]
action
728
average reward
0.6810810810810811
logistic evaluation:  0.31017783042755664
average error per step:  0.3789491998219948
iteration:  190
estimator:  [-0.00041743 -0.01169471 -0.01387338]
action
961
average reward
0.6827956989247311
logistic evaluation:  0.30857689113842074
average error per step:  0.3769778799608401
iteration:  191
estimator:  [-0.00041743 -0.01169471 -0.01387338]
action
795
average reward
0.6844919786096256
logistic evaluat

estimator:  [ 0.01560379 -0.04555082 -0.06367874]
action
90
average reward
0.7022222222222222
logistic evaluation:  0.14767026417170162
average error per step:  0.33718191900875355
iteration:  230
estimator:  [ 0.01560379 -0.04555082 -0.06367874]
action
992
average reward
0.6991150442477876
logistic evaluation:  0.14779261006051916
average error per step:  0.3364808331195353
iteration:  231
estimator:  [ 0.01560379 -0.04555082 -0.06367874]
action
45
average reward
0.7004405286343612
logistic evaluation:  0.1471563788352053
average error per step:  0.3350250155120381
iteration:  232
estimator:  [ 0.01560379 -0.04555082 -0.06367874]
action
601
average reward
0.6973684210526315
logistic evaluation:  0.14652495004662247
average error per step:  0.33358108644127676
iteration:  233
estimator:  [ 0.01560379 -0.04555082 -0.06367874]
action
368
average reward
0.6943231441048034
logistic evaluation:  0.1461281344366301
average error per step:  0.33237975172396833
iteration:  234
estimator:  [ 0.

estimator:  [ 0.01560379 -0.04555082 -0.06367874]
action
717
average reward
0.6855123674911661
logistic evaluation:  0.14279549898328506
average error per step:  0.2939916459954679
iteration:  288
estimator:  [ 0.01560379 -0.04555082 -0.06367874]
action
120
average reward
0.6866197183098591
logistic evaluation:  0.14230139691092536
average error per step:  0.2929708416693424
iteration:  289
estimator:  [ 0.01560379 -0.04555082 -0.06367874]
action
460
average reward
0.6877192982456141
logistic evaluation:  0.14525897743846491
average error per step:  0.2954173084798201
iteration:  290
estimator:  [ 0.01560379 -0.04555082 -0.06367874]
action
984
average reward
0.6888111888111889
logistic evaluation:  0.14480999734778463
average error per step:  0.29444899283351217
iteration:  291
estimator:  [ 0.01560379 -0.04555082 -0.06367874]
action
750
average reward
0.6898954703832753
logistic evaluation:  0.14487563097731634
average error per step:  0.2940006286559779
iteration:  292
estimator:  [ 

logistic evaluation:  0.208588888213647
average error per step:  0.28644687776031325
iteration:  353
estimator:  [ 0.00538747 -0.0236434  -0.0333178 ]
action
646
average reward
0.6905444126074498
logistic evaluation:  0.20803520127986636
average error per step:  0.28567106143140386
iteration:  354
estimator:  [ 0.00538747 -0.0236434  -0.0333178 ]
action
793
average reward
0.6914285714285714
logistic evaluation:  0.20744920100026606
average error per step:  0.28486409544437097
iteration:  355
estimator:  [ 0.00538747 -0.0236434  -0.0333178 ]
action
270
average reward
0.6923076923076923
logistic evaluation:  0.20687242163843517
average error per step:  0.2840676212267487
iteration:  356
estimator:  [ 0.00538747 -0.0236434  -0.0333178 ]
action
769
average reward
0.6903409090909091
logistic evaluation:  0.20636052070713762
average error per step:  0.28333744192320504
iteration:  357
estimator:  [ 0.00538747 -0.0236434  -0.0333178 ]
action
149
average reward
0.6883852691218131
logistic eval

estimator:  [ 0.00538747 -0.0236434  -0.0333178 ]
action
761
average reward
0.673469387755102
logistic evaluation:  0.203926705424375
average error per step:  0.2731220340547721
iteration:  397
estimator:  [ 0.00538747 -0.0236434  -0.0333178 ]
action
909
average reward
0.6743002544529262
logistic evaluation:  0.20418698083774464
average error per step:  0.27320866953560513
iteration:  398
estimator:  [ 0.00538747 -0.0236434  -0.0333178 ]
action
211
average reward
0.6725888324873096
logistic evaluation:  0.20376544802108315
average error per step:  0.272612656262877
iteration:  399
estimator:  [ 0.00538747 -0.0236434  -0.0333178 ]
action
825
average reward
0.6734177215189874
logistic evaluation:  0.2038604795062535
average error per step:  0.27253537652810594
iteration:  400
estimator:  [ 0.01063294 -0.02154379 -0.04221296]
action
956
average reward
0.6742424242424242
logistic evaluation:  0.18554204366676014
average error per step:  0.27185725151673645
iteration:  401
estimator:  [ 0.0

logistic evaluation:  0.18417444773606406
average error per step:  0.2619328160785412
iteration:  445
estimator:  [ 0.01063294 -0.02154379 -0.04221296]
action
587
average reward
0.671201814058957
logistic evaluation:  0.18600317572884087
average error per step:  0.26359091566603776
iteration:  446
estimator:  [ 0.01063294 -0.02154379 -0.04221296]
action
601
average reward
0.6719457013574661
logistic evaluation:  0.18561341929493402
average error per step:  0.2630263217963213
iteration:  447
estimator:  [ 0.01063294 -0.02154379 -0.04221296]
action
753
average reward
0.672686230248307
logistic evaluation:  0.18563721096170013
average error per step:  0.2628769834612202
iteration:  448
estimator:  [ 0.01063294 -0.02154379 -0.04221296]
action
83
average reward
0.6734234234234234
logistic evaluation:  0.18522376512642227
average error per step:  0.2622902045493022
iteration:  449
estimator:  [ 0.01063294 -0.02154379 -0.04221296]
action
424
average reward
0.6741573033707865
logistic evaluati

estimator:  [ 0.01063294 -0.02154379 -0.04221296]
action
275
average reward
0.6694214876033058
logistic evaluation:  0.1802631428100903
average error per step:  0.2510024957591351
iteration:  489
estimator:  [ 0.01063294 -0.02154379 -0.04221296]
action
584
average reward
0.6701030927835051
logistic evaluation:  0.1798952600966429
average error per step:  0.25048919947582576
iteration:  490
estimator:  [ 0.01063294 -0.02154379 -0.04221296]
action
450
average reward
0.6707818930041153
logistic evaluation:  0.17952887483294597
average error per step:  0.24997799722306174
iteration:  491
estimator:  [ 0.01063294 -0.02154379 -0.04221296]
action
739
average reward
0.6714579055441479
logistic evaluation:  0.17917824700097573
average error per step:  0.2494831743804559
iteration:  492
estimator:  [ 0.01063294 -0.02154379 -0.04221296]
action
119
average reward
0.6721311475409836
logistic evaluation:  0.1789351498378522
average error per step:  0.24909668692354656
iteration:  493
estimator:  [ 0

estimator:  [ 0.02071734 -0.0423126  -0.07084983]
action
122
average reward
0.6642066420664207
logistic evaluation:  0.14003466033772674
average error per step:  0.24425151302248851
iteration:  547
estimator:  [ 0.02071734 -0.0423126  -0.07084983]
action
905
average reward
0.6629834254143646
logistic evaluation:  0.1406719733420582
average error per step:  0.24469946672210255
iteration:  548
estimator:  [ 0.02071734 -0.0423126  -0.07084983]
action
104
average reward
0.6617647058823529
logistic evaluation:  0.14105510285219264
average error per step:  0.24489346418137944
iteration:  549
estimator:  [ 0.02071734 -0.0423126  -0.07084983]
action
871
average reward
0.6623853211009174
logistic evaluation:  0.14080014149868106
average error per step:  0.24444889750421997
iteration:  550
estimator:  [ 0.02071734 -0.0423126  -0.07084983]
action
277
average reward
0.663003663003663
logistic evaluation:  0.14062206899781443
average error per step:  0.24408204895152355
iteration:  551
estimator:  

logistic evaluation:  0.13672875146688154
average error per step:  0.2336607713255
iteration:  588
estimator:  [ 0.02071734 -0.0423126  -0.07084983]
action
528
average reward
0.6643835616438356
logistic evaluation:  0.13650030477362354
average error per step:  0.23326708574354835
iteration:  589
estimator:  [ 0.02071734 -0.0423126  -0.07084983]
action
363
average reward
0.6649572649572649
logistic evaluation:  0.13796382815229152
average error per step:  0.23456880393105964
iteration:  590
estimator:  [ 0.02071734 -0.0423126  -0.07084983]
action
3
average reward
0.6655290102389079
logistic evaluation:  0.1377303868187005
average error per step:  0.23417122968710868
iteration:  591
estimator:  [ 0.02071734 -0.0423126  -0.07084983]
action
317
average reward
0.666098807495741
logistic evaluation:  0.13749807417631504
average error per step:  0.23377534148548326
iteration:  592
estimator:  [ 0.02071734 -0.0423126  -0.07084983]
action
391
average reward
0.6666666666666666
logistic evaluatio

logistic evaluation:  0.19515783936640274
average error per step:  0.22621470977677982
iteration:  640
estimator:  [ 0.0094739  -0.01889786 -0.03252139]
action
992
average reward
0.6666666666666666
logistic evaluation:  0.19560591633644403
average error per step:  0.22661496050707056
iteration:  641
estimator:  [ 0.0094739  -0.01889786 -0.03252139]
action
757
average reward
0.6671899529042387
logistic evaluation:  0.19685820275828164
average error per step:  0.2278208245299241
iteration:  642
estimator:  [ 0.0094739  -0.01889786 -0.03252139]
action
487
average reward
0.6661442006269592
logistic evaluation:  0.19718595022027877
average error per step:  0.22810085411916478
iteration:  643
estimator:  [ 0.0094739  -0.01889786 -0.03252139]
action
802
average reward
0.6666666666666666
logistic evaluation:  0.19843223108350508
average error per step:  0.22930099404454404
iteration:  644
estimator:  [ 0.0094739  -0.01889786 -0.03252139]
action
783
average reward
0.6671875
logistic evaluation:

logistic evaluation:  0.19662783645783674
average error per step:  0.22514336833904702
iteration:  697
estimator:  [ 0.0094739  -0.01889786 -0.03252139]
action
189
average reward
0.6637806637806638
logistic evaluation:  0.1970332070079308
average error per step:  0.2255084086720233
iteration:  698
estimator:  [ 0.0094739  -0.01889786 -0.03252139]
action
737
average reward
0.6628242074927954
logistic evaluation:  0.19743519552068278
average error per step:  0.22587017768169312
iteration:  699
estimator:  [ 0.0094739  -0.01889786 -0.03252139]
action
579
average reward
0.6633093525179856
logistic evaluation:  0.19857523562530152
average error per step:  0.22697116922829128
iteration:  700
estimator:  [ 0.01058692 -0.02647629 -0.04324704]
action
560
average reward
0.6623563218390804
logistic evaluation:  0.17186742015449297
average error per step:  0.22693486341046729
iteration:  701
estimator:  [ 0.01058692 -0.02647629 -0.04324704]
action
105
average reward
0.6614060258249641
logistic eva

estimator:  [ 0.01058692 -0.02647629 -0.04324704]
action
985
average reward
0.654421768707483
logistic evaluation:  0.1746760269010885
average error per step:  0.22684114041384712
iteration:  740
estimator:  [ 0.01058692 -0.02647629 -0.04324704]
action
479
average reward
0.6535326086956522
logistic evaluation:  0.1746562067476931
average error per step:  0.2267508000798218
iteration:  741
estimator:  [ 0.01058692 -0.02647629 -0.04324704]
action
330
average reward
0.6540027137042063
logistic evaluation:  0.17442917823777365
average error per step:  0.22645316209373226
iteration:  742
estimator:  [ 0.01058692 -0.02647629 -0.04324704]
action
35
average reward
0.6544715447154471
logistic evaluation:  0.17421542590768388
average error per step:  0.2261690085019362
iteration:  743
estimator:  [ 0.01058692 -0.02647629 -0.04324704]
action
233
average reward
0.6535859269282814
logistic evaluation:  0.17398676267901622
average error per step:  0.22587011344847324
iteration:  744
estimator:  [ 0.

estimator:  [ 0.01058692 -0.02647629 -0.04324704]
action
444
average reward
0.650571791613723
logistic evaluation:  0.17299607026440936
average error per step:  0.22172974779828036
iteration:  792
estimator:  [ 0.01058692 -0.02647629 -0.04324704]
action
175
average reward
0.649746192893401
logistic evaluation:  0.17279680140294418
average error per step:  0.22146869491358873
iteration:  793
estimator:  [ 0.01058692 -0.02647629 -0.04324704]
action
249
average reward
0.6501901140684411
logistic evaluation:  0.17328616092022095
average error per step:  0.22189729461498484
iteration:  794
estimator:  [ 0.01058692 -0.02647629 -0.04324704]
action
731
average reward
0.6506329113924051
logistic evaluation:  0.1731421276194922
average error per step:  0.2216918568218184
iteration:  795
estimator:  [ 0.01058692 -0.02647629 -0.04324704]
action
102
average reward
0.6510745891276865
logistic evaluation:  0.17418039351315562
average error per step:  0.22267035986855271
iteration:  796
estimator:  [ 

estimator:  [ 0.01303255 -0.02207324 -0.03631994]
action
829
average reward
0.6470588235294118
logistic evaluation:  0.18534922588017627
average error per step:  0.22069832509988394
iteration:  838
estimator:  [ 0.01303255 -0.02207324 -0.03631994]
action
727
average reward
0.6474820143884892
logistic evaluation:  0.1851291646114366
average error per step:  0.22043581853223204
iteration:  839
estimator:  [ 0.01303255 -0.02207324 -0.03631994]
action
470
average reward
0.6479041916167665
logistic evaluation:  0.18492206198258884
average error per step:  0.2201863872305003
iteration:  840
estimator:  [ 0.01303255 -0.02207324 -0.03631994]
action
596
average reward
0.6483253588516746
logistic evaluation:  0.18470217863479746
average error per step:  0.21992426077723787
iteration:  841
estimator:  [ 0.01303255 -0.02207324 -0.03631994]
action
394
average reward
0.6487455197132617
logistic evaluation:  0.1844828186086809
average error per step:  0.21966275872713967
iteration:  842
estimator:  [

logistic evaluation:  0.18476269102780893
average error per step:  0.21804346663190674
iteration:  890
estimator:  [ 0.01303255 -0.02207324 -0.03631994]
action
378
average reward
0.6512415349887133
logistic evaluation:  0.18514745951730294
average error per step:  0.218391273315654
iteration:  891
estimator:  [ 0.01303255 -0.02207324 -0.03631994]
action
400
average reward
0.6516347237880497
logistic evaluation:  0.18606045294924575
average error per step:  0.2192679807539196
iteration:  892
estimator:  [ 0.01303255 -0.02207324 -0.03631994]
action
603
average reward
0.652027027027027
logistic evaluation:  0.18697191527140203
average error per step:  0.22014323672463806
iteration:  893
estimator:  [ 0.01303255 -0.02207324 -0.03631994]
action
597
average reward
0.6512935883014623
logistic evaluation:  0.18676852186831067
average error per step:  0.21990246962069976
iteration:  894
estimator:  [ 0.01303255 -0.02207324 -0.03631994]
action
982
average reward
0.650561797752809
logistic evalua

logistic evaluation:  0.17299713490272495
average error per step:  0.2202995924723888
iteration:  935
estimator:  [ 0.01506109 -0.02583264 -0.0440754 ]
action
200
average reward
0.6487647690655209
logistic evaluation:  0.17282298613325367
average error per step:  0.22007466658383823
iteration:  936
estimator:  [ 0.01506109 -0.02583264 -0.0440754 ]
action
225
average reward
0.648068669527897
logistic evaluation:  0.17264382945600074
average error per step:  0.21984483593529489
iteration:  937
estimator:  [ 0.01506109 -0.02583264 -0.0440754 ]
action
469
average reward
0.6484458735262594
logistic evaluation:  0.17274918720770863
average error per step:  0.21989993152187196
iteration:  938
estimator:  [ 0.01506109 -0.02583264 -0.0440754 ]
action
849
average reward
0.6488222698072805
logistic evaluation:  0.1725662419492483
average error per step:  0.21966652390779048
iteration:  939
estimator:  [ 0.01506109 -0.02583264 -0.0440754 ]
action
842
average reward
0.6491978609625668
logistic eval

estimator:  [ 0.01506109 -0.02583264 -0.0440754 ]
action
493
average reward
0.6448979591836734
logistic evaluation:  0.17429469559253433
average error per step:  0.21919489166037565
iteration:  985
estimator:  [ 0.01506109 -0.02583264 -0.0440754 ]
action
993
average reward
0.6442405708460754
logistic evaluation:  0.17448084406335143
average error per step:  0.21933564515901305
iteration:  986
estimator:  [ 0.01506109 -0.02583264 -0.0440754 ]
action
193
average reward
0.6446028513238289
logistic evaluation:  0.1743047414165724
average error per step:  0.21911387222446277
iteration:  987
estimator:  [ 0.01506109 -0.02583264 -0.0440754 ]
action
473
average reward
0.6439471007121058
logistic evaluation:  0.17418047066571263
average error per step:  0.21894407624406018
iteration:  988
estimator:  [ 0.01506109 -0.02583264 -0.0440754 ]
action
430
average reward
0.6443089430894309
logistic evaluation:  0.17501498422904765
average error per step:  0.2197341271636553
iteration:  989
estimator:  

In [6]:
def max_reward(data):
    est = data['reg']
    cov = data['cov']
    lab = data['label']
    
    (N,d,k) = cov.shape
    cum_reward = 0
    for i in range(N):
        values = []
        for j in range(k):
            values.append(expit(np.inner(est,cov[i,:,j])))
        cum_reward += max(values)
    return cum_reward/N

print(max_reward(data))



1.0


In [7]:
a = np.array([1,2,3])
a[:2] = np.array([5,6])
print(a)

[5 6 3]


In [8]:
b = np.random.normal(0,1,(2,3,4))
print(b[0,:,0])
print(b.shape)

[ 1.52697291  1.76998695 -0.58336886]
(2, 3, 4)
