In [1]:
from __future__ import print_function
from __future__ import division

import matplotlib.pyplot as plt
import numpy as np
from numpy.linalg import norm
from math import sqrt
import sklearn
from sklearn.linear_model import LogisticRegression
import pickle
import time
import torch

from utils import test_accuracy, projection, veccos, binary_search_cx, \
                  find_exp_score, find_slab_score, project_l2_centroid, \
                  project_l2_centroid_straight, project_slab, project_slab_straight,\
                  contaminate_dataset

from attackers import StraightAttack, SemiOnlineAttack, ConcentratedAttack, GreedyAttack

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
## fully online experiment

def experiment(dataset, taus=None):
    """Run the fully-online poisoning experiment on ``dataset``.

    For every ``i`` in ``range(n_exp)`` the pickled splits stored at
    ``./data/<dataset>/<i>`` are loaded, a logistic-regression model is fit on
    the initialisation split to derive the attack target ``w_t``, and an
    attacker is run over an ``n_horizon``-step stream once per defense
    threshold.  Per-experiment results are pickled to
    ``./results/fully_<dataset>_<attack>_<defense>_<range>_<i>``.

    Besides its arguments, the function reads these notebook-level globals:
    ``n_exp``, ``d``, ``eta``, ``R``, ``n_attack``, ``n_iter_warmup``,
    ``n_horizon``, ``loc``, ``attack_method``, ``defense_method``,
    ``defense`` and ``defense_range``.

    Parameters
    ----------
    dataset : str
        Name of the sub-directory of ``./data/`` holding the pickled splits;
        also used in the results file name.
    taus : sequence, optional
        Defense thresholds to evaluate.  When ``None`` (the default) the
        thresholds are taken from the attacker's score list at the levels
        0.3, 0.5, 0.7, 0.9 and 1.

    Returns
    -------
    tuple
        ``(w_list, (X_test, Y_test))`` for the last experiment run, where
        ``w_list[j]`` holds ``attacker.w_curr`` after every step of the run
        with the ``j``-th threshold.

    Raises
    ------
    ValueError
        If ``n_exp`` is smaller than 1, or ``attack_method`` /
        ``defense_method`` is not one of the supported names.
    """
    attack_classes = {
        "simplistic": StraightAttack,
        "greedy": GreedyAttack,
        "concentrated": ConcentratedAttack,
        "semi-online": SemiOnlineAttack,
    }
    # Fail fast with a readable message instead of a NameError further down.
    if n_exp < 1:
        raise ValueError("n_exp must be at least 1, got {}".format(n_exp))
    if attack_method not in attack_classes:
        raise ValueError("Unknown attack_method: {!r}".format(attack_method))
    if defense_method not in ("slab", "norm", "L2"):
        raise ValueError("Unknown defense_method: {!r}".format(defense_method))

    # Kept separate from the results path so that later iterations still read
    # from the data directory.
    data_dir = "./data/" + dataset + "/"
    clf = LogisticRegression(fit_intercept=False, solver='liblinear')
    tau_levels = [0.3, 0.5, 0.7, 0.9, 1]

    for i in range(n_exp):
        print("Running the {}-th experiment".format(i))
        start_time = time.time()
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(data_dir + str(i), "rb") as f:
            datasets = pickle.load(f)

        X_init, Y_init = datasets[0]    # generate defense constraints and init w if necessary.
        X_clean, Y_clean = datasets[1]  # the clean data stream
        X_valid, Y_valid = datasets[2]  # validation set
        X_test, Y_test = datasets[3]    # the actual test set

        clf.fit(X_init, Y_init)
        w_0 = np.zeros((1, d))
        print("Initial accuracy is {}".format(test_accuracy(X_test, Y_test, w_0)))
        # The attack target is the negated, unit-norm fitted weight vector.
        w_t = -clf.coef_
        w_t /= norm(w_t)
        print("Target accuracy is {}".format(test_accuracy(X_test, Y_test, w_t)))

        attacker = attack_classes[attack_method]()
        attacker_params = [datasets, w_0, w_t, R, eta,
                           defense, n_iter_warmup, n_attack]
        if attack_method == "semi-online":
            # The semi-online attacker additionally receives the negated
            # first n_attack clean points with their labels.
            X_adv, Y_adv = X_clean[:n_attack, :], Y_clean[:n_attack]
            attacker_params.append((-X_adv, Y_adv))
        attacker.set_param(*attacker_params)

        attacker.set_init_set(X_init, Y_init)
        if defense_method == "slab":
            scores = attacker.slab_scores()[-1]
        elif defense_method == "norm":
            scores = attacker.l2_norms()[-1]
        else:  # "L2", validated above
            scores = attacker.l2_distances_to_centroid()[-1]

        if taus is None:
            # presumably `scores` is sorted ascending so these are quantiles
            # -- TODO confirm against the attacker implementation.
            exp_taus = [scores[int(tau_level * len(scores)) - 1]
                        for tau_level in tau_levels]
        else:
            exp_taus = list(taus)
        print(exp_taus)

        w_list = [[] for _ in exp_taus]
        res = [[] for _ in exp_taus]
        accs = [[] for _ in exp_taus]
        online_res = [[] for _ in exp_taus]
        for j, tau in enumerate(exp_taus):

            attacker.set_defense_threshold(defense_method, tau)
            attacker.set_defense_range(defense_range)
            attacker.set_init_set(X_init, Y_init)

            i_clean = 0

            for k in range(n_horizon):
                if loc[k]:  # attack
                    if attack_method == "semi-online":
                        attacker.set_param_lite(attacker.w_curr, w_t, 0, 1,
                                                (-X_clean[k].reshape(1, -1), Y_clean[k].reshape(1)))

                    pt = attacker.find_best_poisoning_points()
                    attacker.update_w(pt)
                    online_res[j].append(0)
                else:       # update on clean point
                    pt = (attacker.X_clean[k, :], attacker.Y_clean[k])
                    # Score the current model on the clean point before
                    # (possibly) learning from it.
                    online_res[j].append(round(test_accuracy(X_clean[k].reshape(1, -1),
                                                             Y_clean[k].reshape(1),
                                                             attacker.w_curr)))
                    if attacker.meet_constraints(pt):
                        attacker.update_w(pt)
                    i_clean += 1
                acc = test_accuracy(X_test, Y_test, attacker.w_curr)
                # NOTE(review): this stores a reference to attacker.w_curr; if
                # update_w mutates it in place the entries alias -- confirm.
                w_list[j].append(attacker.w_curr)
                accs[j].append(acc)
                if k % 50 == 0:
                    print("Accuracy at time {} is {}".format(k, acc))

            # Guard against a schedule consisting solely of attack steps.
            clean_acc = sum(online_res[j]) / i_clean if i_clean else float("nan")
            print("acc on clean stream is", clean_acc)

            print(tau, sum(accs[j]) / n_horizon)
            res[j] = (sum(accs[j]) / n_horizon)
            attacker.reset()
            print(res)
            print("--- %s seconds ---" % (time.time() - start_time))

        results = [res, online_res, accs, w_list, exp_taus, loc]
        result_path = "_".join(["./results/fully", dataset, attack_method,
                                defense_method, defense_range, str(i)])
        print(result_path)
        with open(result_path, "wb") as f:
            pickle.dump(results, f)
    return w_list, (X_test, Y_test)

In [3]:
# Experiment configuration shared by every cell below.
n_exp = 1                      # number of pickled data splits to run
d, eta = 9, 0.05               # d sizes the initial weight vector; eta is handed to the attacker
n_attack, n_clean, n_init, n_test, n_valid = 80, 400, 100, 100, 50
n_iter_warmup = n_clean
dataset = "BreastCancer"
n_horizon = 400                # number of steps in the online stream
attack_chance = 0.1            # fraction of stream positions that are attack steps
random_seed = 0
# Seed NumPy so the attack schedule is identical after Restart & Run All.
np.random.seed(random_seed)
# Boolean mask over the stream: True marks an attack step.  Exactly the
# positions whose permuted index falls below n_horizon*attack_chance are set.
loc = (np.random.permutation(n_horizon) < n_horizon*attack_chance)
R = 3

In [4]:
# Norm-based defense: run every attack method against the same configuration.
# experiment() picks up attack_method, defense_method, defense and
# defense_range from the notebook globals assigned here.
defense_method = "norm"
defense = dict([(defense_method, 0)])
attack_methods = ["simplistic", "greedy", "semi-online"]
for attack_method in attack_methods:
    defense_range = "all-pts"
    w_list1, (X_test, Y_test) = experiment(dataset)
    # Re-score every weight vector recorded for the last threshold.
    # w_list1 and res1 are overwritten on each pass, so after the loop they
    # describe the final attack method only.
    res1 = list(map(lambda w: test_accuracy(X_test, Y_test, w), w_list1[-1]))

Running the 0-th experiment
Initial accuracy is 0.55
Target accuracy is 0.02
[0.7302562831920678, 0.8955347814864546, 1.1345017796356214, 1.9014260080608296, 2.8311477761324935]
Accuracy at time 0 is 0.55
Accuracy at time 50 is 0.98
Accuracy at time 100 is 0.98
Accuracy at time 150 is 0.98
Accuracy at time 200 is 0.98
Accuracy at time 250 is 0.98
Accuracy at time 300 is 0.98
Accuracy at time 350 is 0.98
acc on clean stream is 0.9555555555555556
0.7302562831920678 0.9786000000000034
[0.9786000000000034, [], [], [], []]
--- 0.1676332950592041 seconds ---
Accuracy at time 0 is 0.97
Accuracy at time 50 is 0.97
Accuracy at time 100 is 0.97
Accuracy at time 150 is 0.97
Accuracy at time 200 is 0.97
Accuracy at time 250 is 0.97
Accuracy at time 300 is 0.97
Accuracy at time 350 is 0.97
acc on clean stream is 0.9666666666666667
0.8955347814864546 0.9708250000000088
[0.9786000000000034, 0.9708250000000088, [], [], []]
--- 0.32782602310180664 seconds ---
Accuracy at time 0 is 0.97
Accuracy at time

In [5]:
# "L2" defense run.  attack_methods is not assigned in this cell; it must
# already exist in the kernel.
defense_method = "L2"
defense = dict([(defense_method, 0)])

for attack_method in attack_methods:
    defense_range = "all-pts"
    w_list2, (X_test, Y_test) = experiment(dataset)
    # Re-score every weight vector recorded for the last threshold.
    # w_list2 and res2 are overwritten on each pass, so after the loop they
    # describe the final attack method only.
    res2 = list(map(lambda w: test_accuracy(X_test, Y_test, w), w_list2[-1]))

Running the 0-th experiment
Initial accuracy is 0.55
Target accuracy is 0.02
[0.3859318136380788, 0.4376050076260244, 0.9070230503049862, 1.3741854968587637, 1.922456238081762]
Accuracy at time 0 is 0.55
Accuracy at time 50 is 0.98
Accuracy at time 100 is 0.97
Accuracy at time 150 is 0.97
Accuracy at time 200 is 0.98
Accuracy at time 250 is 0.97
Accuracy at time 300 is 0.97
Accuracy at time 350 is 0.97
acc on clean stream is 0.9666666666666667
0.3859318136380788 0.9711250000000079
[0.9711250000000079, [], [], [], []]
--- 0.1730663776397705 seconds ---
Accuracy at time 0 is 0.55
Accuracy at time 50 is 0.97
Accuracy at time 100 is 0.97
Accuracy at time 150 is 0.97
Accuracy at time 200 is 0.97
Accuracy at time 250 is 0.97
Accuracy at time 300 is 0.97
Accuracy at time 350 is 0.97
acc on clean stream is 0.9666666666666667
0.4376050076260244 0.9690000000000086
[0.9711250000000079, 0.9690000000000086, [], [], []]
--- 0.33561182022094727 seconds ---
Accuracy at time 0 is 0.97
Accuracy at time 

In [6]:
# "slab" defense run.  attack_methods is not assigned in this cell; it must
# already exist in the kernel.
defense_method = "slab"
defense = dict([(defense_method, 0)])

for attack_method in attack_methods:
    defense_range = "all-pts"
    w_list3, (X_test, Y_test) = experiment(dataset)
    # Re-score every weight vector recorded for the last threshold.
    # w_list3 and res3 are overwritten on each pass, so after the loop they
    # describe the final attack method only.
    res3 = list(map(lambda w: test_accuracy(X_test, Y_test, w), w_list3[-1]))

Running the 0-th experiment
Initial accuracy is 0.55
Target accuracy is 0.02
[0.2306696389777392, 0.3837293717261453, 0.556452819684853, 1.433274847482336, 3.8435622629189736]
Accuracy at time 0 is 0.97
Accuracy at time 50 is 0.98
Accuracy at time 100 is 0.97
Accuracy at time 150 is 0.97
Accuracy at time 200 is 0.97
Accuracy at time 250 is 0.97
Accuracy at time 300 is 0.97
Accuracy at time 350 is 0.97
acc on clean stream is 0.9666666666666667
0.2306696389777392 0.9715250000000087
[0.9715250000000087, [], [], [], []]
--- 0.18408513069152832 seconds ---
Accuracy at time 0 is 0.97
Accuracy at time 50 is 0.97
Accuracy at time 100 is 0.97
Accuracy at time 150 is 0.97
Accuracy at time 200 is 0.97
Accuracy at time 250 is 0.97
Accuracy at time 300 is 0.97
Accuracy at time 350 is 0.97
acc on clean stream is 0.9666666666666667
0.3837293717261453 0.9700000000000086
[0.9715250000000087, 0.9700000000000086, [], [], []]
--- 0.3767735958099365 seconds ---
Accuracy at time 0 is 0.97
Accuracy at time 5