# Bayesian Optimisation Adversarial Examples

This notebook examines some adversarial examples produced using the Bayesian Optimiation method for both the random forest and linear regression models.

In [1]:
import pickle
import pandas as pd
import os
import numpy as np
from skopt import load

from processing.PostcodeEncoder import PostcodeEncoder



In [2]:
pd.options.display.max_rows = 999

In [3]:
main_dir = os.path.dirname(os.getcwd())

In [4]:
with open(os.path.join(main_dir, "Processing", r"train_preproc.p"), 'rb') as data_file:
    train_data = pickle.load(data_file)
X_trainProc, y_trainProc = train_data
# forgot in preprocessing: convert from bool to int
X_trainProc['hasGNotice'] = X_trainProc['hasGNotice'].apply(int)

In [5]:
#load in model to attack - trying rf and lr
rfmodel = pickle.load(open(os.path.join(main_dir, "fitted_models", "rf_0.929"), "rb"))

lrmodel = pickle.load(open(os.path.join(main_dir, "fitted_models", "lr_0.888"), "rb"))

In [6]:
co_file = os.path.join(main_dir, "Co_600K_Jul2019_6M.pkl")
raw_data = pd.read_pickle(co_file)

Finding the intial samples that were attacked:

In [7]:
failed_indices = [i for i in range(y_trainProc.size) if y_trainProc[i] == 1]
failed_companies = X_trainProc.iloc[failed_indices]

In [8]:
rf_indices = [i for i in range(failed_companies.shape[0])
             if rfmodel.predict(failed_companies.iloc[i].to_numpy().reshape(1,-1)) == 1]
rf_starts = failed_companies.iloc[rf_indices]

In [9]:
rf_starts = rf_starts.join(raw_data['pcd'].loc[rf_starts.index])
# A hack here to get around there being multiple entries in the original file with the same name
rf_starts.index = pd.io.parsers.ParserBase({'names':rf_starts.index})._maybe_dedup_names(rf_starts.index)
rf_starts = rf_starts.drop('362579.1')

In [10]:
rf_starts

Unnamed: 0,AccountsAccountCategory,AccountsAccountRefDay,AccountsAccountRefMonth,CompanyCategory,CompanyNameCountNum,CompanyNameCountX,CompanyNameLen,CompanyNameWordLen,Field1014,Field1129,...,oac11,oac2,oseast1m,osnrth1m,ru11ind,dAccountsTimeGap,dConfStmtTimeGap,dReturnsTimeGap,OtherCompInPcd,pcd
362579,9,31.0,12.0,0,0,0,30,3,9.291331e-16,-4.091809e-16,...,1,0,-1.260094,0.749476,21,1.747945,1.038356,1.074654,-0.464777,LL130TS
1485379,12,31.0,3.0,0,0,0,27,3,-2.064843,-4.091809e-16,...,40,14,0.828136,-0.222618,17,1.753425,1.040243,1.074564,0.501456,SG129QL
1862639,13,31.0,3.0,0,0,0,25,4,9.291331e-16,-4.091809e-16,...,39,14,-0.367096,1.132286,15,1.753425,1.038356,1.077873,0.748939,HX59DG
2332145,13,31.0,3.0,0,4,0,29,4,-1.720744,-4.091809e-16,...,25,8,0.771464,-0.547594,15,2.0,1.040213,1.076628,0.094017,W1T2DB


In [11]:
lr_indices = [i for i in range(failed_companies.shape[0])
             if lrmodel.predict(failed_companies.iloc[i].to_numpy().reshape(1,-1)) == 1]
lr_starts = failed_companies.iloc[lr_indices]

In [12]:
lr_starts = lr_starts.join(raw_data['pcd'].loc[lr_starts.index])
lr_starts.index = pd.io.parsers.ParserBase({'names':lr_starts.index})._maybe_dedup_names(lr_starts.index)

In [13]:
lr_starts.drop(['28020.1', '258890.1', '869707.1', '928470.1'])

Unnamed: 0,AccountsAccountCategory,AccountsAccountRefDay,AccountsAccountRefMonth,CompanyCategory,CompanyNameCountNum,CompanyNameCountX,CompanyNameLen,CompanyNameWordLen,Field1014,Field1129,...,oac11,oac2,oseast1m,osnrth1m,ru11ind,dAccountsTimeGap,dConfStmtTimeGap,dReturnsTimeGap,OtherCompInPcd,pcd
28020,9,31.0,1.0,0,0,0,26,4,9.291331e-16,-4.091809e-16,...,75,25,-0.030581,1.154983,17,1.747945,1.038236,1.076628,-0.464777,WF102JT
258890,5,30.0,9.0,0,0,0,24,3,9.291331e-16,-4.091809e-16,...,14,4,-0.684619,1.018469,15,1.747945,1.038356,1.077154,0.793153,M33EB
319554,6,28.0,9.0,0,0,0,41,7,9.291331e-16,-4.091809e-16,...,44,15,0.529201,-0.296668,21,1.742466,1.038356,1.076787,-0.341515,HP12UJ
474889,13,31.0,3.0,0,0,0,37,4,1.146551,0.7933831,...,39,14,-0.310511,-0.494891,17,1.753425,1.038356,1.076855,-0.847661,SN28HW
611766,5,30.0,9.0,0,0,0,28,3,9.291331e-16,-4.091809e-16,...,39,14,-0.147746,1.114317,15,1.747945,1.038356,1.076712,0.142081,WF43BA
646897,5,30.0,9.0,0,0,0,21,3,9.291331e-16,-4.091809e-16,...,67,22,-0.439899,0.460651,15,1.749323,1.038266,1.076712,-0.081892,WS98BX
824696,13,31.0,12.0,0,0,0,18,3,0.9634879,0.3938493,...,1,0,1.052535,0.273346,19,1.747945,1.04031,1.074654,-0.847661,CB75RB
842847,5,31.0,12.0,0,0,0,25,3,9.291331e-16,-4.091809e-16,...,25,8,0.752185,-0.550276,15,1.747945,1.038356,1.076735,1.318781,W22UT
850058,13,27.0,2.0,0,0,0,12,3,1.697521,-4.091809e-16,...,35,12,-0.845328,1.169142,17,1.747945,1.038356,1.077573,-0.155653,BB15QR
869707,13,31.0,12.0,0,0,0,25,3,0.5188562,-4.091809e-16,...,54,18,-1.305087,1.003135,15,1.747945,1.038356,1.076712,0.896221,L207EP


Load in the adversarial exampes we created:

In [14]:
rf_results = []
for i in range(0, 4):
    file_name = 'rf_result' + str(i) + '.p'
    with open(os.path.join(main_dir, 'BayesianOptimization', file_name), 'rb') as file:
        rf_res = load(file)
        rf_results.append(rf_res.x)


In [15]:
with open(os.path.join(main_dir, 'BayesianOptimization', 'lr_examples.p'), 'rb') as file:
    lr_results = pickle.load(file)

In [16]:
derived_features = ["oseast1m", "osnrth1m", "cty", "lat", "long", "ru11ind", "oac11",
                    "country", "oac1", "oac2", "imdu", "OtherCompInPcd"]

In [18]:
encoder = PostcodeEncoder()

# A function to put the original samples and corresponding adversarial example together into a table.
def build_table(originals, adversarials):
    non_derived_originals = originals.drop(derived_features)
    df_index = non_derived_originals.index
    new_df = non_derived_originals.iloc[:, 0:1]
    new_name = str(non_derived_originals.columns[0]) + " - Adversarial"
    new_df.insert(1, new_name, adversarials[0])
    new_df.loc['pcd', new_name] = encoder.search_decode(new_df.loc['pcd', new_name])
    for i in range(1, len(adversarials)):
        sample_number = non_derived_originals.columns[i]
        new_df.insert(2*i, sample_number, non_derived_originals.iloc[:, i])
        new_name = str(sample_number) + " - Adversarial"
        new_df.insert(2*i+1, new_name, adversarials[i])
        new_df.loc['pcd', new_name] = encoder.search_decode(new_df.loc['pcd', new_name])
    return new_df

In [23]:
rf_examples = build_table(rf_starts.transpose(), rf_results)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Following are the adversarial samples that we generated for the random forest model. Two out of the four samples had adversarial samples successfully generated - the others could also be found if we allow the distance from the original to be greater.

In [24]:
rf_examples

Unnamed: 0,362579,362579 - Adversarial,1485379,1485379 - Adversarial,1862639,1862639 - Adversarial,2332145,2332145 - Adversarial
AccountsAccountCategory,9,9.0,12,8.0,13,15.0,13,13.0
AccountsAccountRefDay,31.0,31.0,31.0,30.963754,31.0,30.857434,31.0,31.0
AccountsAccountRefMonth,12.0,12.0,3.0,2.776115,3.0,3.318986,3.0,3.0
CompanyCategory,0,0.0,0,0.0,0,0.0,0,0.0
CompanyNameCountNum,0,0.0,0,0.0,0,0.0,4,4.0
CompanyNameCountX,0,0.0,0,1.0,0,0.0,0,0.0
CompanyNameLen,30,30.0,27,27.0,25,25.0,29,29.0
CompanyNameWordLen,3,3.0,3,3.0,4,4.0,4,4.0
Field1014,0.0,0.0,-2.064843,-2.062716,0.0,-0.01979,-1.720744,-1.720744
Field1129,-0.0,-0.0,-0.0,0.020931,-0.0,0.0126,-0.0,-0.0


In [21]:
lr_examples = build_table(lr_starts.transpose(), lr_results)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [22]:
lr_examples

Unnamed: 0,28020,28020 - Adversarial,28020.1,28020.1 - Adversarial,258890,258890 - Adversarial,258890.1,258890.1 - Adversarial,319554,319554 - Adversarial,...,928470,928470 - Adversarial,928470.1,928470.1 - Adversarial,1015117,1015117 - Adversarial,1153850,1153850 - Adversarial,1701595,1701595 - Adversarial
AccountsAccountCategory,9,9.0,9,5.0,5,5.0,5,15.0,6,11.0,...,12,14.0,12,0.0,6,13.0,13,13.0,13,15.0
AccountsAccountRefDay,31.0,31.0,31.0,30.0,30.0,27.965259,30.0,31.24015,28.0,30.141871,...,30.0,31.030467,30.0,31.205965,30.0,31.01272,31.0,30.0,31.0,30.0
AccountsAccountRefMonth,1.0,1.0,1.0,9.0,9.0,9.085777,9.0,3.120943,9.0,8.954606,...,9.0,8.068285,9.0,6.782097,9.0,3.043233,3.0,5.0,8.0,11.0
CompanyCategory,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
CompanyNameCountNum,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
CompanyNameCountX,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
CompanyNameLen,26,26.0,26,24.0,24,41.0,24,37.0,41,28.0,...,32,36.0,32,23.0,24,24.0,38,24.0,36,33.0
CompanyNameWordLen,4,4.0,4,3.0,3,7.0,3,4.0,7,3.0,...,4,5.0,4,3.0,3,4.0,4,4.0,5,5.0
Field1014,0.0,0.0,0.0,0.0,0.0,-0.004541,0.0,1.139979,0.0,-0.025715,...,0.0,-1.976219,0.0,-0.036676,0.0,0.007546,1.366133,0.951875,-1.953473,1.038937
Field1129,-0.0,-0.0,-0.0,-0.0,-0.0,0.011961,-0.0,0.80499,-0.0,-0.002901,...,-0.0,0.009901,-0.0,0.052507,-0.0,1.339914,1.713946,-0.0,-0.0,-2.194843
