In [1]:
# # Load the Drive helper and mount
# from google.colab import drive

# # This will prompt for authorization.
# drive.mount('/content/drive')

# %cd '/content/drive/MyDrive/Colab Notebooks/C73_A2_1/data'
# !ls

# data_dir = '/content/drive/MyDrive/Colab Notebooks/C73_A2_1/data'
# result_dir = '/content/drive/MyDrive/Colab Notebooks/C73_A2_1/result'

In [2]:
data_dir = 'data'
result_dir = 'result'

# Import

In [3]:
import numpy as np
import pandas as pd
import pickle
import copy
from IPython.display import display

import warnings
warnings.filterwarnings("ignore")

# Read data

In [4]:
""" Load original data (aggregated and unaggregated) """
af = 'u'

pf1 = '7200-3600'
pf = f'{pf1}.__2__'
ra = '__97.39__'

df = pd.read_csv(f'{data_dir}/t.{af}4.{pf1}.csv', index_col=[0]).reset_index().drop(columns=['index'])
dfo = pd.read_csv(f'{data_dir}/t.{af}1.csv', index_col=[0])
# dfo.head()

In [5]:
""" Load model """
model = pickle.load(open(f'{result_dir}/{af}4-5.{pf}.model_lr.{ra}.pkl', 'rb'))

""" Load preprocessed data used for feeding the model """
X_test = np.load(f'{result_dir}/{af}4-5.{pf}.data_lr.{ra}.t.X.npy')
y_test = np.load(f'{result_dir}/{af}4-5.{pf}.data_lr.{ra}.t.y.npy')

""" Load predicted output for test set """
y_pred = np.load(f'{result_dir}/{af}4-5.{pf}.output_lr.{ra}.t.y_pred.npy')

""" Load used features """
fts_names = [line.strip() for line in open(f'{result_dir}/{af}4-5.{pf}.fts_lr.{ra}.txt').readlines()]

# Choose samples

In [6]:
selected_Conv = '41.232.73.23 -> 150.35.87.168'
df_selecteds = df.loc[df['Conversation'] == selected_Conv]

In [7]:
""" chosen records """
selected_idx_all = np.where(y_test == 1)[0]

# Test pertubing

### Chose sample

In [8]:
""" chosen record """
selected_idx = selected_idx_all[0:1] #? choose the first record
X_chosen = X_test[selected_idx,:]
y_chosen = y_test[selected_idx]

""" test model with original sample """
y_chosen_pred = model.predict(X_chosen)
y_chosen_pred

array([1], dtype=int64)

In [9]:
X_chosen

array([[5.29152304e-02, 8.45864081e-08, 1.80145862e-08, 1.01759312e-01,
        1.01746382e-01, 0.00000000e+00, 5.64817162e-02, 1.86306400e-08,
        8.95980765e-09, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00]])

### Generate adversarial sample

In [10]:
""" delta: gradient of cost with respect to weight """
y_chosen_score = model.predict_proba(X_chosen)[:,1]
print(y_chosen_score)

delta = y_chosen_score.T[[0]].T - y_chosen.reshape(X_chosen.shape[0], 1)
print(delta)
print(delta.shape)

[0.83274558]
[[-0.16725442]]
(1, 1)


In [11]:
""" 
get direction matrix
gradient of cost with respect to X = gradient of cost with respect to wight * gradient of weight with respect to X
gradient of weight with respect to X is the model.coef_
the magnitude of the gradient is not important here, only the direction is necessary.
"""
direction = np.sign(np.matmul(delta, model.coef_))
print(direction)

[[-1.  1.  1.  1. -1.  1.  1.  1.  1. -1.  1.  1.  1. -1. -1.  1.  0.  1.]]


In [12]:
""" compute adversarial samples using the formula """
epsilon = 0.00002
X_chosen_pertubed = X_chosen + epsilon * direction
print(X_chosen_pertubed)

[[ 5.28952304e-02  2.00845864e-05  2.00180146e-05  1.01779312e-01
   1.01726382e-01  2.00000000e-05  5.65017162e-02  2.00186306e-05
   2.00089598e-05  9.99980000e-01  2.00000000e-05  2.00000000e-05
   2.00000000e-05  9.99980000e-01 -2.00000000e-05  2.00000000e-05
   0.00000000e+00  2.00000000e-05]]


In [13]:
""" test the model with pertubed sample """
model.predict(X_chosen_pertubed)

array([1], dtype=int64)

The model classified the pertubed sample as normal with very small epsilon

# Check epsilon

In [14]:
def gen_adv_samples(X, Y, epsilon=0.005):
    Y_score = model.predict_proba(X)[:,1]
    delta = Y_score.T[[0]].T - Y.reshape(X.shape[0], 1)
    
    #? compute delta and direction matrix
    direction = np.sign(np.matmul(delta, model.coef_))

    #? compute adversarial samples
    X_perturbed = X + epsilon * direction

    return X_perturbed

In [15]:
def check_model_on_perturbed_samples(X_pertubed):
    y_pertubed = model.predict(X_pertubed)
    print(y_pertubed)
    misclassified_idx = np.where(y_pertubed == 0)
    misclassified_ratio = round(len(misclassified_idx[0])/len(y_pertubed) * 100, 2)
    print('Misclassification percentage: {}'.format(misclassified_ratio))
    return misclassified_idx

In [16]:
""" gen adversarial samples for all samples """
selected_idx = selected_idx_all[:] #? 0:10 for 10 samples
X_chosen = X_test[selected_idx,:]
y_chosen = y_test[selected_idx]

In [17]:
X_chosen_pertubed = gen_adv_samples(X_chosen, y_chosen, epsilon=0.00002)
misclassified_idx_1 = check_model_on_perturbed_samples(X_chosen_pertubed)
misclassified_idx_1

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Misclassification percentage: 0.0


(array([], dtype=int64),)

In [18]:
X_chosen_pertubed = gen_adv_samples(X_chosen, y_chosen, epsilon=0.000021)
misclassified_idx_2 = check_model_on_perturbed_samples(X_chosen_pertubed)
misclassified_idx_2

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Misclassification percentage: 0.0


(array([], dtype=int64),)

In [19]:
X_chosen_pertubed = gen_adv_samples(X_chosen, y_chosen, epsilon=0.000022)
misclassified_idx_3 = check_model_on_perturbed_samples(X_chosen_pertubed)
misclassified_idx_3

[0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1]
Misclassification percentage: 5.0


(array([ 0,  1, 50], dtype=int64),)

In [20]:
X_chosen_pertubed = gen_adv_samples(X_chosen, y_chosen, epsilon=0.000023)
misclassified_idx_4 = check_model_on_perturbed_samples(X_chosen_pertubed)
misclassified_idx_4

[0 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 0 0 0 1 1 1]
Misclassification percentage: 18.33


(array([ 0,  1,  6,  7, 28, 30, 50, 51, 54, 55, 56], dtype=int64),)

# Reproduce attack

How to generate flow to attack the model ?

### Choose sample

In [21]:
"','".join(list(df_selecteds.columns))

"Conversation','SrcAddr','DstAddr','State','Proto','n_flows','StreamID_unique','Sport_nunique','Sport_mean','Sport_std','Sport_max','Sport_median','Sport_RU','Dport_nunique','Dport_mean','Dport_std','Dport_max','Dport_median','Dport_RU','TotBytes_sum','TotBytes_mean','TotBytes_std','TotBytes_max','TotBytes_median','TotBytes_RU','SrcBytes_sum','SrcBytes_mean','SrcBytes_std','SrcBytes_max','SrcBytes_median','SrcBytes_RU','DstBytes_sum','DstBytes_mean','DstBytes_std','DstBytes_max','DstBytes_median','DstBytes_RU','TotPkts_sum','TotPkts_mean','TotPkts_std','TotPkts_max','TotPkts_median','TotPkts_RU','PktsPerSec_mean','PktsPerSec_std','PktsPerSec_max','PktsPerSec_median','PktsPerSec_RU','BytesPerSec_mean','BytesPerSec_std','BytesPerSec_max','BytesPerSec_median','BytesPerSec_RU','SrcBytesPerSec_mean','SrcBytesPerSec_std','SrcBytesPerSec_max','SrcBytesPerSec_median','SrcBytesPerSec_RU','DstBytesPerSec_mean','DstBytesPerSec_std','DstBytesPerSec_max','DstBytesPerSec_median','DstBytesPerSec_RU',

In [26]:
# """ selected aggregated records """
# df_selecteds[['SrcAddr','DstAddr','State','Proto','n_flows','StreamID_unique','Sport_nunique','Sport_mean','Label','window_id']]

In [21]:
""" selected aggregated records """
df_selecteds.head()

Unnamed: 0,Conversation,SrcAddr,DstAddr,State,Proto,n_flows,StreamID_unique,Sport_nunique,Sport_mean,Sport_std,...,DstBytesPerSec_median,DstBytesPerSec_RU,BytesPerPkt_mean,BytesPerPkt_std,BytesPerPkt_max,BytesPerPkt_median,BytesPerPkt_RU,Label,window_id,LabelStr
2218,41.232.73.23 -> 150.35.87.168,41.232.73.23,150.35.87.168,alltcp,tcp,2,[ 1976 51591],2,6668.0,1.414214,...,1.725578,1.0,81.047175,10.549573,88.506849,81.047175,1.0,5,0,flow=From-Botnet-V44-TCP-CC107-IRC-Not-Encrypted
3615,41.232.73.23 -> 150.35.87.168,41.232.73.23,150.35.87.168,alltcp,tcp,2,[51591 84191],2,6666.5,0.707107,...,1.735833,1.0,81.148486,10.406297,88.506849,81.148486,1.0,5,1,flow=From-Botnet-V44-TCP-CC107-IRC-Not-Encrypted
4322,41.232.73.23 -> 150.35.87.168,41.232.73.23,150.35.87.168,alltcp,tcp,2,[ 84191 101058],2,6667.0,1.414214,...,0.609414,1.0,73.265432,0.742026,73.790123,73.265432,1.0,5,2,flow=From-Botnet-V44-TCP-CC107-IRC-Not-Encrypted
4675,41.232.73.23 -> 150.35.87.168,41.232.73.23,150.35.87.168,alltcp,tcp,2,[101058 109906],2,6667.5,0.707107,...,0.593343,1.0,73.04537,0.430811,73.35,73.04537,1.0,5,3,flow=From-Botnet-V44-TCP-CC107-IRC-Not-Encrypted
4777,41.232.73.23 -> 150.35.87.168,41.232.73.23,150.35.87.168,alltcp,tcp,2,[109906 115675],2,6667.5,0.707107,...,0.587141,1.0,73.2125,0.194454,73.35,73.2125,1.0,5,4,flow=From-Botnet-V44-TCP-CC107-IRC-Not-Encrypted


All records have the same n_flows value. Just choose one.

In [22]:
""" choose the first record """
selected_idx = selected_idx_all[[0]]
selected_idx

array([2218], dtype=int64)

In [23]:
df_selected = df.iloc[selected_idx]
df_X = pd.DataFrame(X_test[selected_idx], columns=fts_names)
df_ytrue = pd.DataFrame(y_test[selected_idx], columns=['Label'])
df_ypred = pd.DataFrame(y_pred[selected_idx], columns=['Label_Pred'])
df_Xy = pd.concat([df_selected.reset_index(), df_X, df_ytrue, df_ypred], axis=1)
df_Xy

Unnamed: 0,index,Conversation,SrcAddr,DstAddr,State,Proto,n_flows,StreamID_unique,Sport_nunique,Sport_mean,...,P_udp,P_other,S_CON,S_alltcp,S_INT,S_RED,S_other,S_ECO,Label,Label_Pred
0,2218,41.232.73.23 -> 150.35.87.168,41.232.73.23,150.35.87.168,alltcp,tcp,2,[ 1976 51591],2,6668.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,1


In [24]:
""" get streams ids aggregated into this record """
stream_ids = [int(s) for s in df_selected['StreamID_unique'].unique()[0].replace('[', '').replace(']', '').strip().split(' ')]
stream_ids

[1976, 51591]

In [25]:
""" get original flows """
dfo_selected = dfo.loc[dfo['StreamID'].isin(stream_ids)]
dfo_selected

Unnamed: 0,StreamID,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,...,SrcBytes,Label,LabelStr,PktsPerSec,BytesPerSec,SrcBytesPerSec,BytesPerPkt,DstBytes,DstBytesPerSec,State_orig
1975,1976,2022-07-25 23:30:50.093002,3571.229975,tcp,41.232.73.23,6669,<?>,150.35.87.168,1027,alltcp,...,3750,5,flow=From-Botnet-V44-TCP-CC107-IRC-Not-Encrypted,0.022401,1.648452,1.050058,73.5875,2137,0.598393,PA_PA
51590,51591,2022-07-26 00:32:56.114638,3527.809143,tcp,41.232.73.23,6667,<?>,150.35.87.168,1027,alltcp,...,9319,5,flow=From-Botnet-V44-TCP-CC107-IRC-Not-Encrypted,0.062078,5.494345,2.641583,88.506849,10064,2.852762,PA_PA


In [26]:
"','".join(fts_names)

"BytesPerPkt_mean','PktsPerSec_mean','BytesPerSec_mean','Sport_max','Sport_mean','n_flows','BytesPerPkt_max','BytesPerSec_max','SrcBytesPerSec_max','P_tcp','P_udp','P_other','S_CON','S_alltcp','S_INT','S_RED','S_other','S_ECO"

### Gen adv sample excluding some fields (not perturbing)
One-hot fields should not be changed. Sport should not be changed as well.  
Changing these fields might cause the bot not function correctly.  
`n_flows` can be changed but can only be increased (to ensure the bot intention is retained)  

In [27]:
""" choose the fields to be remained """
fixed_cols = ['Sport_max', 'Sport_mean', 'P_tcp', 'P_udp', 'P_other', 'S_CON', 'S_alltcp', 'S_INT', 'S_RED', 'S_other', 'S_ECO']
idx_fixed = [fts_names.index(c) for c in fixed_cols]
idx_fixed

[3, 4, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [28]:
""" store the true values of the fixed fields """
X_chosen = X_test[selected_idx,:]
y_chosen = y_test[selected_idx]

#? store true value of one-hot cols cause we'll keep these fields.
X_chosen_fixed = X_chosen[:,idx_fixed]
X_chosen_fixed

array([[0.10175931, 0.10174638, 1.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        ]])

In [29]:
""" generate adversarial sample whose fixed fields remain old value """
#? generate adversarial sample
X_chosen_pertubed = gen_adv_samples(X_chosen, y_chosen, epsilon=0.00002)
print(X_chosen_pertubed)

#? replace one-hot cols with true values
X_chosen_pertubed[:,idx_fixed] = X_chosen_fixed
print(X_chosen_pertubed)

#? check model output
_ = check_model_on_perturbed_samples(X_chosen_pertubed)

[[ 5.28952304e-02  2.00845864e-05  2.00180146e-05  1.01779312e-01
   1.01726382e-01  2.00000000e-05  5.65017162e-02  2.00186306e-05
   2.00089598e-05  9.99980000e-01  2.00000000e-05  2.00000000e-05
   2.00000000e-05  9.99980000e-01 -2.00000000e-05  2.00000000e-05
   0.00000000e+00  2.00000000e-05]]
[[5.28952304e-02 2.00845864e-05 2.00180146e-05 1.01759312e-01
  1.01746382e-01 2.00000000e-05 5.65017162e-02 2.00186306e-05
  2.00089598e-05 1.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]
[1]
Misclassification percentage: 0.0


Old epsilon cannot change the model output. Increase epsilon.

In [30]:
""" change epsilon till the model breaks """
X_chosen_pertubed = gen_adv_samples(X_chosen, y_chosen, epsilon=0.00005)
X_chosen_pertubed[:,idx_fixed] = X_chosen_fixed
_ = check_model_on_perturbed_samples(X_chosen_pertubed)

[0]
Misclassification percentage: 100.0


`epsilon = 0.00005` is enough

The fields we should change are: `BytesPerPkt_mean`, `PktsPerSec_mean`, `BytesPerSec_mean`, `BytesPerPkt_max`, `BytesPerSec_max`, `SrcBytesPerSec_max`.  
Note that the model doing minmax scaling on data. 

In [31]:
""" difference between original and perturbed sample """
print(X_chosen)
print(X_chosen_pertubed)

diff = np.subtract(X_chosen_pertubed, X_chosen)
diff

[[5.29152304e-02 8.45864081e-08 1.80145862e-08 1.01759312e-01
  1.01746382e-01 0.00000000e+00 5.64817162e-02 1.86306400e-08
  8.95980765e-09 1.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]
[[5.28652304e-02 5.00845864e-05 5.00180146e-05 1.01759312e-01
  1.01746382e-01 5.00000000e-05 5.65317162e-02 5.00186306e-05
  5.00089598e-05 1.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]


array([[-5.e-05,  5.e-05,  5.e-05,  0.e+00,  0.e+00,  5.e-05,  5.e-05,
         5.e-05,  5.e-05,  0.e+00,  0.e+00,  0.e+00,  0.e+00,  0.e+00,
         0.e+00,  0.e+00,  0.e+00,  0.e+00]])

### New values ?

df_selected['BytesPerPkt_mean'].values[0]   ~  X_chosen[0,0]   
dfn_selected['BytesPerPkt_mean'].values[0]  ~  X_chosen_pertubed[0,0]

In [32]:
""" change epsilon till the model breaks """
X_chosen_pertubed = gen_adv_samples(X_chosen, y_chosen, epsilon=0.00005)
X_chosen_pertubed[:,idx_fixed] = X_chosen_fixed
_ = check_model_on_perturbed_samples(X_chosen_pertubed)

[0]
Misclassification percentage: 100.0


In [33]:
df_selected_n = copy.deepcopy(df_selected)

num_cols = ['BytesPerPkt_mean', 'PktsPerSec_mean', 'n_flows', 'BytesPerSec_mean', 'BytesPerPkt_max', 'BytesPerSec_max', 'SrcBytesPerSec_max']

idx_num = [fts_names.index(c) for c in num_cols]
for c in num_cols:
    idx = fts_names.index(c)
    print(c, X_chosen_pertubed[0,idx], diff[0,idx])
    if X_chosen[0,idx] == 0: #? n_flows is small therefore it is encoded into 0
        df_selected_n[c].values[0] = -1
    else:
        df_selected_n[c].values[0] = X_chosen_pertubed[0,idx] * df_selected[c].values[0] / X_chosen[0,idx]

display(df_selected[num_cols])
display(df_selected_n[num_cols])

BytesPerPkt_mean 0.052865230413430074 -5.000000000000143e-05
PktsPerSec_mean 5.0084586408050573e-05 5e-05
n_flows 5e-05 5e-05
BytesPerSec_mean 5.00180145862275e-05 5e-05
BytesPerPkt_max 0.05653171621893331 5.000000000000143e-05
BytesPerSec_max 5.001863064003617e-05 5e-05
SrcBytesPerSec_max 5.000895980765106e-05 5e-05


Unnamed: 0,BytesPerPkt_mean,PktsPerSec_mean,n_flows,BytesPerSec_mean,BytesPerPkt_max,BytesPerSec_max,SrcBytesPerSec_max
2218,81.047175,0.04224,2,3.571398,88.506849,5.494345,2.641583


Unnamed: 0,BytesPerPkt_mean,PktsPerSec_mean,n_flows,BytesPerSec_mean,BytesPerPkt_max,BytesPerSec_max,SrcBytesPerSec_max
2218,80.970593,25.010623,-1,9916.089546,88.585199,14750.948178,14743.93356


In [125]:
df_selected

Unnamed: 0,Conversation,SrcAddr,DstAddr,State,Proto,n_flows,StreamID_unique,Sport_nunique,Sport_mean,Sport_std,...,DstBytesPerSec_median,DstBytesPerSec_RU,BytesPerPkt_mean,BytesPerPkt_std,BytesPerPkt_max,BytesPerPkt_median,BytesPerPkt_RU,Label,window_id,LabelStr
2218,41.232.73.23 -> 150.35.87.168,41.232.73.23,150.35.87.168,alltcp,tcp,2,[ 1976 51591],2,6668.0,1.414214,...,1.725578,1.0,81.047175,10.549573,88.506849,81.047175,1.0,5,0,flow=From-Botnet-V44-TCP-CC107-IRC-Not-Encrypted


In [130]:
dfo_selected_n = copy.deepcopy(dfo_selected)
dfo_selected_n[['StreamID','StartTime','Dur','Proto','SrcAddr','Sport','Dir','DstAddr','Dport','State','Label','LabelStr','PktsPerSec','BytesPerSec','SrcBytesPerSec','BytesPerPkt','State_orig']].to_csv(f'{result_dir}/dfo_new.csv')

In [68]:
(0.022401+0.062078)/2

0.0422395

In [114]:
9916.089546*7 - 1.648452 - 5.494345 - 14750.948178*4

10401.691312999996

In [124]:
(80.970593*7 - 73.587500 - 88.506849 - 88.585199)/4

79.02865075

In [123]:
(25.010623*7 - 0.022401 - 0.062078)/4

43.74747049999999

In [109]:
99.958013/2

49.9790065

In [None]:
dfo_selected['BytesPerPkt'].mean(), dfo_selected_n['BytesPerPkt'].mean()

(81.04717465753424, 81.04717465753424)

In [None]:
dfo_selected_n.iloc[[0]]['BytesPerPkt']

1975    73.5875
Name: BytesPerPkt, dtype: float64

In [None]:
dfo_selected_n.iloc[[0]]['BytesPerPkt'] = 0

In [None]:
dfo_selected_n.iloc[[0]]['BytesPerPkt']

1975    73.5875
Name: BytesPerPkt, dtype: float64