In [1]:
import sys
import numpy as np

import torch
from torch.nn import Parameter
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.autograd import Variable
import Sklearn_PyTorch

import syft as sy

  from ._conv import register_converters as _register_converters





## Data Preprocessing Functions

In [2]:
from sklearn import preprocessing

### Normalization

In [3]:
def normalize(df): 
    x = df.values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df = pd.DataFrame(x_scaled)
    return df

### Transforming categorical feature to numerical feature

In [4]:
def encoding(data):
    for col in data.columns:
        if data[col].dtype == type(object):
            le_x = preprocessing.LabelEncoder()
            le_x.fit(data[col])
            data[col] = le_x.transform(data[col])
    return data

### One-Hot Encoding

In [5]:
from sklearn.preprocessing import OneHotEncoder
def label_encoder(df):
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(df)
    print(enc.categories_)
    df_array = enc.transform(df).toarray() #Encode the classes to a binary array 
    return df_array

## ACM KDD'99

In [None]:
data_path = "../../../Dataset/KDD99/kddcup99.csv"

dataset = pd.read_csv(data_path, sep=',', usecols=range(0, 42))

print("Dataset Shape:", dataset.shape)

In [None]:
#Randomly split data into three parts
data_server = dataset.sample(frac=0.5, random_state=1)
dataset = dataset.drop(data_server.index)
data_alice = dataset.sample(frac=0.5, random_state=1)
data_bob = dataset.drop(data_alice.index)

In [None]:
#Divide data into x and y
data_server_x = pd.DataFrame(data_server.iloc[:, 0:41])
data_server_y = pd.DataFrame(data_server.iloc[:, 41])
data_alice_x = pd.DataFrame(data_alice.iloc[:, 0:41])
data_alice_y = pd.DataFrame(data_alice.iloc[:, 41])
data_bob_x = pd.DataFrame(data_bob.iloc[:, 0:41])
data_bob_y = pd.DataFrame(data_bob.iloc[:, 41])

In [None]:
# Relabel data into biclasses
new_class = {'back':'abnormal', 'buffer_overflow':'abnormal', 'ftp_write':'abnormal', 'guess_passwd':'abnormal', 'imap':'abnormal',
            'ipsweep':'abnormal', 'land':'abnormal', 'loadmodule':'abnormal', 'multihop':'abnormal', 'neptune':'abnormal', 'nmap':'abnormal',
            'perl':'abnormal', 'phf':'abnormal', 'pod':'abnormal', 'portsweep':'abnormal', 'rootkit':'abnormal', 'satan':'abnormal',
            'smurf':'abnormal', 'spy':'abnormal', 'teardrop':'abnormal', 'warezclient':'abnormal', 'warezmaster':'abnormal'}
data_server_y = data_server_y.replace(new_class)
data_alice_y = data_alice_y.replace(new_class)
data_bob_y = data_bob_y.replace(new_class)

In [None]:
# Encode the string datatype to numerial
data_server_x = encoding(data_server_x)
data_server_y = encoding(data_server_y)
data_alice_x = encoding(data_alice_x)
data_alice_y = encoding(data_alice_y)
data_bob_x = encoding(data_bob_x)
data_bob_y = encoding(data_bob_y)

In [None]:
#Normalize x data
data_server_x = normalize(data_server_x)
data_alice_x = normalize(data_alice_x)
data_bob_x = normalize(data_bob_x)

In [None]:
#One-Hot encoding labels
data_server_y = label_encoder(data_server_y)
data_alice_y = label_encoder(data_alice_y)
data_bob_y = label_encoder(data_bob_y)

In [None]:
print(data_server_y)

## IoT Botnet Stream Data

In [6]:
# Load all the data from the CSV file 
BM_DATA_PATH = "../../../Dataset/Botnet_Detection/Philips_B120N10_Baby_Monitor"
DB_DATA_PATH = "../../../Dataset/Botnet_Detection/Danmini_Doorbell"
ET_DATA_PATH = "../../../Dataset/Botnet_Detection/Ecobee_Thermostat"
PT_DATA_PATH = "../../../Dataset/Botnet_Detection/PT_838_Security Camera"
XC_DATA_PATH = "../../../Dataset/Botnet_Detection/XCS7_1002_WHT_Security_Camera"
df_bm_b = pd.read_csv(BM_DATA_PATH+"/benign_traffic.csv")
df_bm_m = pd.read_csv(BM_DATA_PATH+"/Mirai/udp.csv")
df_db_b = pd.read_csv(DB_DATA_PATH+"/benign_traffic.csv")
df_db_m = pd.read_csv(DB_DATA_PATH+"/Mirai/udp.csv")
df_et_b = pd.read_csv(ET_DATA_PATH+"/benign_traffic.csv")
df_et_m = pd.read_csv(ET_DATA_PATH+"/Mirai/udp.csv")
df_pt_b = pd.read_csv(ET_DATA_PATH+"/benign_traffic.csv")
df_pt_m = pd.read_csv(ET_DATA_PATH+"/Mirai/udp.csv")
df_xc_b = pd.read_csv(XC_DATA_PATH+"/benign_traffic.csv")
df_xc_m = pd.read_csv(XC_DATA_PATH+"/Mirai/udp.csv")

In [7]:
#Assign the label to each dataframe
df_bm_b = df_bm_b.assign(label = 'b') 
df_db_b = df_db_b.assign(label = 'b') 
df_et_b = df_et_b.assign(label = 'b')
df_pt_b = df_pt_b.assign(label = 'b')
df_xc_b = df_xc_b.assign(label = 'b')
df_bm_m = df_bm_m.assign(label = 'm')
df_db_m = df_db_m.assign(label = 'm') 
df_et_m = df_et_m.assign(label = 'm') 
df_pt_m = df_pt_m.assign(label = 'm')
df_xc_m = df_xc_m.assign(label = 'm')

In [9]:
print(df_pt_m.shape)
print(df_pt_b.shape)
print(df_xc_m.shape)
print(df_xc_b.shape)

(151481, 116)
(13113, 116)
(151879, 116)
(46585, 116)


In [10]:
#Combine the benign traffic and malicious traffic
df_bm = df_bm_b
df_bm = df_bm.append(df_bm_m, ignore_index = True)
df_db = df_db_b
df_db = df_db.append(df_db_m, ignore_index = True)
df_et = df_et_b
df_et = df_et.append(df_et_m, ignore_index = True)
df_pt = df_pt_b
df_pt = df_pt.append(df_pt_m, ignore_index = True)
df_xc = df_xc_b
df_xc = df_xc.append(df_xc_m, ignore_index = True)

In [11]:
def shuffler(df):
  return df.reindex(np.random.permutation(df.index))

In [12]:
# Shuffle the rows in dataframe
df_bm = shuffler(df_bm)
df_db = shuffler(df_db)
df_et = shuffler(df_et)
df_pt = shuffler(df_pt)
df_xc = shuffler(df_xc)

In [13]:
# Create a dataset on server for initial model (second version)
df_server = pd.DataFrame()
df_server = df_server.append(df_et.sample(frac =.25), ignore_index=True)
df_server = df_server.append(df_pt.sample(frac =.25), ignore_index=True)
df_server = df_server.append(df_xc.sample(frac =.25), ignore_index=True)

In [None]:
# Create a dataset on server for initial model (first version)
df_server = pd.DataFrame()
df_server = df_server.append(df_bm.sample(frac =.25), ignore_index=True)
df_server = df_server.append(df_db.sample(frac =.25), ignore_index=True)
df_server = df_server.append(df_et.sample(frac =.25), ignore_index=True)

In [14]:
df_server.shape

(131912, 116)

In [15]:
#Divide dataframe into x and y
df_s_x = pd.DataFrame(df_server.iloc[:, 0:115])
df_s_y = pd.DataFrame(df_server.iloc[:, 115])
df_bm_x = pd.DataFrame(df_bm.iloc[:, 0:115])
df_bm_y = pd.DataFrame(df_bm.iloc[:, 115])
df_db_x = pd.DataFrame(df_db.iloc[:, 0:115])
df_db_y = pd.DataFrame(df_db.iloc[:, 115])

In [16]:
#Normalize the x dataframe
df_s_x = normalize(df_s_x)
df_bm_x = normalize(df_bm_x)
df_db_x = normalize(df_db_x)

In [17]:
print(df_bm.shape)

(392274, 116)


In [18]:
print(df_s_x.shape)

(131912, 115)


In [19]:
print(df_db_x.shape)

(287213, 115)


In [20]:
#One-Hot encoding labels and transform into array
s_y = label_encoder(df_s_y)
bm_y = label_encoder(df_bm_y)
db_y = label_encoder(df_db_y)

[array(['b', 'm'], dtype=object)]
[array(['b', 'm'], dtype=object)]
[array(['b', 'm'], dtype=object)]


In [21]:
s_y

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])

## Start Transfering data to workers 

In [22]:
class LogisticRegression(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        outputs = self.linear(x)
        return outputs

In [None]:
class ELM(torch.nn.Module):
    def __init__(self, n_inputs: int, hidden_units = 1000):
        self.random_weights = np.random.normal(size=[n_inputs, hidden_units]) # A random weight is assigned
    
    def learn(self, X: np.ndarray, Y: np.ndarray):
        H = self._hidden_layer(X)
        self.output_weights = np.linalg.pinv(H) @ Y
    
    def _f(self, x: np.ndarray): 
        return 1. / (1. + np.exp(-x)) #activation function: sigmoid
    
    def _hidden_layer(self, inputs: np.ndarray): 
        return self._f(inputs @ self.random_weights)
  
    def _output_layer(self, hidden: np.ndarray): 
        return hidden @ self.output_weights
  
    def __call__(self, inputs: np.ndarray):  #infer
        return self._output_layer(self._hidden_layer(inputs))

In [23]:
hook = sy.TorchHook(torch)

### (IoT BotNet)

In [24]:
BM = sy.VirtualWorker(hook, id='BM')
DB = sy.VirtualWorker(hook, id='DB')

In [25]:
from sklearn.model_selection import train_test_split
b_train_x, b_test_x, b_train_y, b_test_y = train_test_split(df_bm_x, bm_y, test_size=0.20)
d_train_x, d_test_x, d_train_y, d_test_y = train_test_split(df_db_x, db_y, test_size=0.20)

In [26]:
tensor_server_x = torch.FloatTensor(df_s_x.values.astype(np.float32))
tensor_server_y = torch.FloatTensor(s_y.astype(np.float32))
t_b_train_x = torch.FloatTensor(b_train_x.values.astype(np.float32))
t_b_test_x = torch.tensor(b_test_x.values.astype(np.float32))
t_b_train_y = torch.tensor(b_train_y.astype(np.float32))
t_b_test_y = torch.tensor(b_test_y.astype(np.float32))
t_d_train_x = torch.tensor(d_train_x.values.astype(np.float32))
t_d_test_x = torch.tensor(d_test_x.values.astype(np.float32))
t_d_train_y = torch.tensor(d_train_y.astype(np.float32))
t_d_test_y = torch.tensor(d_test_y.astype(np.float32))

In [27]:
print(t_d_train_x.shape)
print(t_d_test_x.shape)

torch.Size([229770, 115])
torch.Size([57443, 115])


In [28]:
b_x_train_ptr = t_b_train_x.send(BM)
b_x_test_ptr = t_b_test_x.send(BM)
b_y_train_ptr = t_b_train_y.send(BM)
b_y_test_ptr = t_b_test_y.send(BM)
d_x_train_ptr = t_d_train_x.send(DB)
d_x_test_ptr = t_d_test_x.send(DB)
d_y_train_ptr = t_d_train_y.send(DB)
d_y_test_ptr = t_d_test_y.send(DB)

### (ACM KDD)

In [None]:
Alice = sy.VirtualWorker(hook, id='Alice')
Bob = sy.VirtualWorker(hook, id='Bob')

In [None]:
from sklearn.model_selection import train_test_split
a_train_x, a_test_x, a_train_y, a_test_y = train_test_split(data_alice_x, data_alice_y, test_size=0.20)
b_train_x, b_test_x, b_train_y, b_test_y = train_test_split(data_bob_x, data_bob_y, test_size=0.20)

In [None]:
tensor_server_x = torch.FloatTensor(data_server_x.values.astype(np.float32))
tensor_server_y = torch.FloatTensor(data_server_y.values.astype(np.float32))
t_a_train_x = torch.tensor(a_train_x.values.astype(np.float32))
t_a_test_x = torch.tensor(a_test_x.values.astype(np.float32))
t_a_train_y = torch.tensor(a_train_y.values.astype(np.float32))
t_a_test_y = torch.tensor(a_test_y.values.astype(np.float32))
t_b_train_x = torch.FloatTensor(b_train_x.values.astype(np.float32))
t_b_test_x = torch.tensor(b_test_x.values.astype(np.float32))
t_b_train_y = torch.tensor(b_train_y.values.astype(np.float32))
t_b_test_y = torch.tensor(b_test_y.values.astype(np.float32))

In [None]:
print(t_b_test_y.shape)

In [None]:
a_x_train_ptr = t_a_train_x.send(Alice)
a_x_test_ptr = t_a_test_x.send(Alice)
a_y_train_ptr = t_a_train_y.send(Alice)
a_y_test_ptr = t_a_test_y.send(Alice)
b_x_train_ptr = t_b_train_x.send(Bob)
b_x_test_ptr = t_b_test_x.send(Bob)
b_y_train_ptr = t_b_train_y.send(Bob)
b_y_test_ptr = t_b_test_y.send(Bob)

In [None]:
print(Bob._objects)

In [None]:
print(Alice._objects)

In [None]:
print(tensor_server_x)
print(tensor_server_y)

In [None]:
from Sklearn_PyTorch import TorchRandomForestClassifier

# Initialisation of the model
my_model = TorchRandomForestClassifier(nb_trees=100, nb_samples=3, max_depth=5, bootstrap=True)

# Fitting function
my_model.fit(tensor_server_x, tensor_server_y)


In [None]:
print(my_model)

### Initialize the parameters

In [29]:
epochs = 3
input_dim = 115
output_dim = 2 #Number of clasees
lr_rate = 0.001

In [None]:
tensor_server_x.size(0)

In [None]:
n_hidden = 1000

In [None]:
model=ELM(tensor_server_x.size(0), n_hidden)

In [None]:
def training():
    

### (Logistic Regression)

In [30]:
model = LogisticRegression(input_dim, output_dim)
optimizer = torch.optim.SGD(model.parameters(), lr=lr_rate)

In [31]:
def training(epochs, model, data, labels):
    print(epochs)
    for epochs in range(int(epochs)):    
        print("In the loop")
        optimizer.zero_grad() ## Zero out the gradient
        outputs = model(data) ## Call forward
        print(outputs)
        print(labels)
        loss = ((outputs - labels)**2).sum() ## softmax
        print(loss)
        loss.backward() ## Accumulated gradient updates into x
        optimizer.step()

In [32]:
tensor_server_y = tensor_server_y.squeeze()
print(tensor_server_y.shape)
training(epochs, model, tensor_server_x, tensor_server_y) ## Train the initial model on Server

torch.Size([131912, 2])
3
In the loop
tensor([[-0.0124,  0.3223],
        [-0.2458,  0.3140],
        [-0.0091,  0.3751],
        ...,
        [-0.2767,  0.2999],
        [ 0.0223,  0.3692],
        [-0.2024,  0.3914]], grad_fn=<AddmmBackward>)
tensor([[1., 0.],
        [0., 1.],
        [0., 1.],
        ...,
        [0., 1.],
        [0., 1.],
        [0., 1.]])
tensor(78220.0781, grad_fn=<SumBackward0>)
In the loop
tensor([[ 654.7180, 1291.6458],
        [1291.6085, 3076.9915],
        [ 806.3953, 2153.6287],
        ...,
        [1312.4276, 3134.3572],
        [ 780.8770, 2067.3516],
        [1193.5680, 2788.5806]], grad_fn=<AddmmBackward>)
tensor([[1., 0.],
        [0., 1.],
        [0., 1.],
        ...,
        [0., 1.],
        [0., 1.],
        [0., 1.]])
tensor(1.1475e+12, grad_fn=<SumBackward0>)
In the loop
tensor([[ -2573060.5000,  -6170566.0000],
        [ -5868652.5000, -14119151.0000],
        [ -3904625.0000,  -9520482.0000],
        ...,
        [ -5973299.5000, -14372

## Transfer model to clients

### (IoTBot)

In [33]:
BM_model = model.copy().send(BM)
DB_model = model.copy().send(DB)

BM_opt = torch.optim.SGD(params=BM_model.parameters(),lr=lr_rate)
DB_opt = torch.optim.SGD(params=DB_model.parameters(),lr=lr_rate)

In [34]:
print(t_b_test_y.shape)
print(t_d_test_y.shape)

torch.Size([78455, 2])
torch.Size([57443, 2])


In [35]:
for i in range(6):

    # Train Bob's Model
    BM_opt.zero_grad()
    BM_pred = BM_model(b_x_train_ptr)
    BM_loss = ((BM_pred - b_y_train_ptr)**2).sum()
    BM_loss.backward()

    BM_opt.step()
    BM_loss = BM_loss.get().data

    # Train Alice's Model
    DB_opt.zero_grad()
    DB_pred = DB_model(d_x_train_ptr)
    DB_loss = ((DB_pred - d_y_train_ptr)**2).sum()
    DB_loss.backward()

    DB_opt.step()
    DB_loss = DB_loss.get().data

    total_b = 78455
    correct = 0
    outputs_b = BM_model(b_x_test_ptr)
    _b, pred_b = torch.max(outputs_b.data, 1)
    vb, labels_b = torch.max(b_y_test_ptr.data, 1)
    correct+= (pred_b == labels_b).sum()
    accuracy_b = 100*correct/total_b
    print("Iteration:", i, "BM Accuracy: ", accuracy_b.get().data)

    total_d = 57443
    correct = 0
    outputs_d = DB_model(d_x_test_ptr)
    _d, pred_d = torch.max(outputs_d.data, 1)
    vd, labels_d = torch.max(d_y_test_ptr.data, 1)
    correct+= (pred_d == labels_d).sum()
    accuracy_d = 100*correct/total_d
    print("Iteration:", i, "DB Accuracy: ", accuracy_d.get().data)

Iteration: 0 BM Accuracy:  tensor(44)
Iteration: 0 DB Accuracy:  tensor(17)
Iteration: 1 BM Accuracy:  tensor(55)
Iteration: 1 DB Accuracy:  tensor(82)


### (ACM KDD)

In [None]:
bobs_model = my_model.copy().send(Bob)
alices_model = my_model.copy().send(Alice)

bobs_opt = torch.optim.SGD(params=bobs_model.parameters(),lr=lr_rate)
alices_opt = torch.optim.SGD(params=alices_model.parameters(),lr=lr_rate)

In [None]:
print(Bob._objects)

## Sencond Training with local data

In [None]:
for i in range(2):

    # Train Bob's Model
    bobs_opt.zero_grad()
    bobs_pred = bobs_model(b_x_train_ptr)
    bobs_loss = ((bobs_pred - b_y_train_ptr)**2).sum()
    bobs_loss.backward()

    bobs_opt.step()
    bobs_loss = bobs_loss.get().data

    # Train Alice's Model
    alices_opt.zero_grad()
    alices_pred = alices_model(a_x_train_ptr)
    alices_loss = ((alices_pred - a_y_train_ptr)**2).sum()
    alices_loss.backward()

    alices_opt.step()
    alices_loss = alices_loss.get().data

    total = 24701
    correct = 0
    outputs_a = alices_model(a_x_test_ptr)
    _a, pred_a = torch.max(outputs_a.data, 1)
    va, labels_a = torch.max(a_y_test_ptr.data, 1)
    correct+= (pred_a == labels_a).sum()
    accuracy_a = 100*correct/total
    print("Iteration:", i, "ALice Accuracy: ", accuracy_a.get().data)

    correct = 0
    outputs_b = bobs_model(b_x_test_ptr)
    _b, pred_b = torch.max(outputs_b.data, 1)
    vb, labels_b = torch.max(b_y_test_ptr.data, 1)
    correct+= (pred_b == labels_b).sum()
    accuracy_b = 100*correct/total
    print("Iteration:", i, "Bob Accuracy: ", accuracy_b.get().data)
    

In [None]:
print(Bob._objects)

In [None]:
print(Alice._objects)