In [1]:
import sys
import numpy as np

import torch
from torch.nn import Parameter
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.autograd import Variable
import syft as sy

  from ._conv import register_converters as _register_converters





## Data Preprocessing Functions

In [2]:
from sklearn import preprocessing

### Normalization

In [3]:
def normalize(df):
    """Rescale the values of ``df`` with a min-max scaler and return a new frame.

    A fresh ``MinMaxScaler`` (default settings) is fitted on ``df`` itself, so
    the scaling is relative to this frame only.

    Returns:
        A ``DataFrame`` built from the scaled array. The original column
        labels and index are not carried over; the result uses the default
        integer labels.
    """
    scaler = preprocessing.MinMaxScaler()
    scaled_values = scaler.fit_transform(df.values)
    return pd.DataFrame(scaled_values)

### Transforming categorical feature to numerical feature

In [4]:
def encoding(data):
    """Label-encode every text-valued column of ``data`` in place.

    Each object- or string-typed column is replaced by the integer codes
    produced by a ``LabelEncoder`` fitted on that column alone. Other columns
    are left untouched.

    The column test uses the pandas dtype helpers. The previous check,
    ``dtype == type(object)``, compared against the metaclass ``type``; it only
    matched NumPy object columns by coincidence and silently skipped columns
    stored with a pandas string dtype.

    Args:
        data: DataFrame to encode. It is modified in place.

    Returns:
        The same ``data`` object, for call chaining.
    """
    from pandas.api.types import is_object_dtype, is_string_dtype

    for col in data.columns:
        col_dtype = data[col].dtype
        if is_object_dtype(col_dtype) or is_string_dtype(col_dtype):
            # One encoder per column: codes are only meaningful within the
            # column (and the frame) they were fitted on.
            data[col] = preprocessing.LabelEncoder().fit_transform(data[col])
    return data

### One-Hot Encoding

In [5]:
from sklearn.preprocessing import OneHotEncoder
def label_encoder(df):
    """One-hot encode ``df`` and return the result as a dense array.

    Despite the name this is a one-hot encoder: a ``OneHotEncoder`` is fitted
    on ``df`` itself, the categories it learned are printed, and the encoded
    matrix is converted to a dense array with ``toarray()``.
    """
    one_hot = OneHotEncoder(handle_unknown='ignore')
    one_hot.fit(df)
    print(one_hot.categories_)
    encoded = one_hot.transform(df)
    return encoded.toarray()

## ACM KDD'99

In [None]:
data_path = "../../../Dataset/KDD99/kddcup99.csv"

dataset = pd.read_csv(data_path, sep=',', usecols=range(0, 42))

print("Dataset Shape:", dataset.shape)

In [None]:
# Randomly split the data into three parts: half for the server, then the
# remaining rows are halved again between Alice and Bob.
# The remainder is kept under its own name instead of overwriting `dataset`,
# so re-running this cell gives the same split rather than shrinking the data.
data_server = dataset.sample(frac=0.5, random_state=1)
data_clients = dataset.drop(data_server.index)
data_alice = data_clients.sample(frac=0.5, random_state=1)
data_bob = data_clients.drop(data_alice.index)

In [None]:
# Separate every partition into features (columns 0-40) and label (column 41).
data_server_x, data_server_y = (
    pd.DataFrame(data_server.iloc[:, columns]) for columns in (slice(0, 41), 41)
)
data_alice_x, data_alice_y = (
    pd.DataFrame(data_alice.iloc[:, columns]) for columns in (slice(0, 41), 41)
)
data_bob_x, data_bob_y = (
    pd.DataFrame(data_bob.iloc[:, columns]) for columns in (slice(0, 41), 41)
)

In [None]:
# Collapse every listed attack label into a single 'abnormal' class.
# Values that are not keys of `new_class` are left unchanged by `replace`.
attack_labels = [
    'back', 'buffer_overflow', 'ftp_write', 'guess_passwd', 'imap',
    'ipsweep', 'land', 'loadmodule', 'multihop', 'neptune', 'nmap',
    'perl', 'phf', 'pod', 'portsweep', 'rootkit', 'satan',
    'smurf', 'spy', 'teardrop', 'warezclient', 'warezmaster',
]
new_class = dict.fromkeys(attack_labels, 'abnormal')
data_server_y, data_alice_y, data_bob_y = (
    labels.replace(new_class)
    for labels in (data_server_y, data_alice_y, data_bob_y)
)

In [None]:
# Encode the string-typed columns as numbers.
# Each call fits its own label encoders on the frame it is given.
(
    data_server_x, data_server_y,
    data_alice_x, data_alice_y,
    data_bob_x, data_bob_y,
) = (
    encoding(part)
    for part in (
        data_server_x, data_server_y,
        data_alice_x, data_alice_y,
        data_bob_x, data_bob_y,
    )
)

In [None]:
# Min-max scale the feature frames; each frame is scaled on its own.
data_server_x, data_alice_x, data_bob_x = (
    normalize(features) for features in (data_server_x, data_alice_x, data_bob_x)
)

In [None]:
# One-hot encode the labels; the results are dense arrays, not DataFrames.
data_server_y, data_alice_y, data_bob_y = (
    label_encoder(labels) for labels in (data_server_y, data_alice_y, data_bob_y)
)

In [None]:
print(data_server_y)

## IoT Botnet Stream Data

In [6]:
# Folders holding the capture files of the four security cameras.
PT_DATA_PATH = "../../../Dataset/Botnet_Detection/PT_838_Security Camera"
PT2_DATA_PATH = "../../../Dataset/Botnet_Detection/PT737E_Security Camera"
XC_DATA_PATH = "../../../Dataset/Botnet_Detection/XCS7_1002_WHT_Security_Camera"
XC2_DATA_PATH = "../../../Dataset/Botnet_Detection/XCS7_1003_WHT_Security_Camera"

# Load the benign-traffic CSV of every device, in the order listed above.
df_pt_1, df_pt_2, df_xc_1, df_xc_2 = (
    pd.read_csv(device_dir + "/benign_traffic.csv")
    for device_dir in (PT_DATA_PATH, PT2_DATA_PATH, XC_DATA_PATH, XC2_DATA_PATH)
)

In [7]:
# Tag every row with the device it came from; `assign` returns new frames,
# so the raw df_*_N frames stay unlabeled.
df_pt1, df_pt2, df_xc1, df_xc2 = (
    raw.assign(label=device)
    for raw, device in (
        (df_pt_1, 'pt1'),
        (df_pt_2, 'pt2'),
        (df_xc_1, 'xc1'),
        (df_xc_2, 'xc2'),
    )
)

In [8]:
def shuffler(df):
    """Return ``df`` with its rows in a random order, keeping the index labels.

    The order is drawn from NumPy's global random state.
    """
    shuffled_labels = np.random.permutation(df.index)
    return df.reindex(shuffled_labels)

In [9]:
# Shuffle the rows of every labeled frame (same call order as before, so the
# global NumPy random state is consumed identically).
df_pt1, df_pt2, df_xc1, df_xc2 = (
    shuffler(labeled) for labeled in (df_pt1, df_pt2, df_xc1, df_xc2)
)

In [10]:
# Create a dataset on the server for the initial model (second version):
# a random 25% sample from each of pt1, pt2 and xc1. xc2 is not included
# (its line was commented out in the original cell).
# `DataFrame.append` was removed in pandas 2.0, so the parts are combined with
# a single `pd.concat`; the samples are drawn in the same order as before.
df_server = pd.concat(
    [device.sample(frac=.25) for device in (df_pt1, df_pt2, df_xc1)],
    ignore_index=True,
)

In [11]:
# Create one dataframe per device: all of the device's own rows mixed with a
# random 25% sample from two other classes.
# `DataFrame.append` was removed in pandas 2.0, so each frame is built with a
# single `pd.concat`; the samples are drawn in the same order as before.
df_pt = pd.concat(
    [df_pt1, df_pt2.sample(frac=.25), df_xc2.sample(frac=.25)],
    ignore_index=True,
)
df_xc = pd.concat(
    [df_xc1, df_xc2.sample(frac=.25), df_pt2.sample(frac=.25)],
    ignore_index=True,
)

In [12]:
# Separate every frame into features (columns 0-114) and label (column 115).
df_s_x, df_s_y = (
    pd.DataFrame(df_server.iloc[:, columns]) for columns in (slice(0, 115), 115)
)
df_pt_x, df_pt_y = (
    pd.DataFrame(df_pt.iloc[:, columns]) for columns in (slice(0, 115), 115)
)
df_xc_x, df_xc_y = (
    pd.DataFrame(df_xc.iloc[:, columns]) for columns in (slice(0, 115), 115)
)

In [13]:
# Min-max scale the feature frames. `normalize` fits a new scaler per call,
# so the server, PT and XC features are each scaled on their own range.
df_s_x, df_pt_x, df_xc_x = (
    normalize(features) for features in (df_s_x, df_pt_x, df_xc_x)
)

In [14]:
# Sanity check: (rows, columns) of the server, PT and XC feature frames.
for features in (df_s_x, df_pt_x, df_xc_x):
    print(features.shape)

(51812, 115)
(118934, 115)
(67005, 115)


In [15]:
# One-hot encode the labels; the results are dense arrays.
# NOTE(review): every call fits its own encoder, and the categories printed by
# this cell differ per frame (pt1/pt2/xc1, pt1/pt2/xc2, pt2/xc1/xc2). A given
# one-hot column therefore does not denote the same class in s_y, pt_y and
# xc_y - confirm this is intended before sharing a model across them.
s_y, pt_y, xc_y = (label_encoder(labels) for labels in (df_s_y, df_pt_y, df_xc_y))

[array(['pt1', 'pt2', 'xc1'], dtype=object)]
[array(['pt1', 'pt2', 'xc2'], dtype=object)]
[array(['pt2', 'xc1', 'xc2'], dtype=object)]


## Start transferring data to workers

In [None]:
class LogisticRegression(torch.nn.Module):
    """A single linear layer mapping ``input_dim`` features to ``output_dim`` scores.

    ``forward`` returns the raw linear outputs; no sigmoid or softmax is
    applied inside the model.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = torch.nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return the linear layer's output for ``x``."""
        return self.linear(x)

In [None]:
class ELM(torch.nn.Module):
    """Extreme Learning Machine working on NumPy arrays.

    The hidden layer uses fixed random weights followed by a sigmoid; only the
    output weights are learned, in closed form, via the pseudo-inverse of the
    hidden activations.

    ``__call__`` is overridden, so calling an instance runs the NumPy
    inference path directly and does not go through ``Module.forward``.
    """

    def __init__(self, n_inputs: int, hidden_units: int = 1000):
        """Draw the fixed random hidden weights.

        Args:
            n_inputs: number of input features (columns of the data matrix).
            hidden_units: width of the random hidden layer.
        """
        # Initialise torch.nn.Module's internal state. Without this call the
        # generic Module helpers (parameters(), state_dict(), ...) raise
        # AttributeError on an ELM instance.
        super().__init__()
        # Random weights drawn from NumPy's global random state; never updated.
        self.random_weights = np.random.normal(size=[n_inputs, hidden_units])

    def learn(self, X: np.ndarray, Y: np.ndarray):
        """Fit the output weights so that ``hidden(X) @ output_weights`` approximates ``Y``."""
        H = self._hidden_layer(X)
        # Least-squares solution through the Moore-Penrose pseudo-inverse.
        self.output_weights = np.linalg.pinv(H) @ Y

    def _f(self, x: np.ndarray):
        """Activation function: element-wise sigmoid."""
        return 1. / (1. + np.exp(-x))

    def _hidden_layer(self, inputs: np.ndarray):
        """Project ``inputs`` through the random weights and apply the sigmoid."""
        return self._f(inputs @ self.random_weights)

    def _output_layer(self, hidden: np.ndarray):
        """Map hidden activations to outputs; requires ``learn`` to have been called."""
        return hidden @ self.output_weights

    def __call__(self, inputs: np.ndarray):
        """Infer: hidden layer followed by the learned output layer."""
        return self._output_layer(self._hidden_layer(inputs))

In [16]:
class Net(torch.nn.Module):
    """Two-layer perceptron: linear -> clamp at zero -> linear."""

    def __init__(self, in_dim, h_dim, out_dim):
        super().__init__()
        self.linear1 = torch.nn.Linear(in_dim, h_dim)
        # NOTE(review): bn1 is registered (its weight and bias are part of the
        # model's parameters) but forward() never applies it - confirm whether
        # batch normalization was meant to be used.
        self.bn1 = nn.BatchNorm1d(h_dim)
        self.linear2 = torch.nn.Linear(h_dim, out_dim)

    def forward(self, x):
        """Return the raw output scores for ``x``."""
        hidden = self.linear1(x)
        activated = hidden.clamp(min=0)  # ReLU written as a clamp
        return self.linear2(activated)

In [17]:
hook = sy.TorchHook(torch)

### (IoT BotNet)

In [18]:
PT = sy.VirtualWorker(hook, id='PT')
XC = sy.VirtualWorker(hook, id='XC')

In [19]:
from sklearn.model_selection import train_test_split
# Hold out a fifth of each device's rows for evaluation (unseeded split).
IOT_TEST_FRACTION = 0.20
p_train_x, p_test_x, p_train_y, p_test_y = train_test_split(
    df_pt_x, pt_y, test_size=IOT_TEST_FRACTION
)
x_train_x, x_test_x, x_train_y, x_test_y = train_test_split(
    df_xc_x, xc_y, test_size=IOT_TEST_FRACTION
)

In [20]:
def _as_f32(table):
    """Return ``table`` (a DataFrame or an ndarray) as a float32 ndarray."""
    return getattr(table, "values", table).astype(np.float32)


# Server data (features are a DataFrame, one-hot labels an ndarray).
tensor_server_x = torch.FloatTensor(_as_f32(df_s_x))
tensor_server_y = torch.FloatTensor(_as_f32(s_y))

# PT device: train / test features and labels.
t_p_train_x = torch.FloatTensor(_as_f32(p_train_x))
t_p_test_x = torch.tensor(_as_f32(p_test_x))
t_p_train_y = torch.tensor(_as_f32(p_train_y))
t_p_test_y = torch.tensor(_as_f32(p_test_y))

# XC device: train / test features and labels.
t_x_train_x = torch.tensor(_as_f32(x_train_x))
t_x_test_x = torch.tensor(_as_f32(x_test_x))
t_x_train_y = torch.tensor(_as_f32(x_train_y))
t_x_test_y = torch.tensor(_as_f32(x_test_y))

In [21]:
print(t_p_train_x.shape)
print(t_p_test_x.shape)

torch.Size([95147, 115])
torch.Size([23787, 115])


In [22]:
# Send the PT splits to worker PT and the XC splits to worker XC; the *_ptr
# names hold whatever `.send()` returns for each tensor.
p_x_train_ptr, p_x_test_ptr, p_y_train_ptr, p_y_test_ptr = (
    local.send(PT) for local in (t_p_train_x, t_p_test_x, t_p_train_y, t_p_test_y)
)
x_x_train_ptr, x_x_test_ptr, x_y_train_ptr, x_y_test_ptr = (
    local.send(XC) for local in (t_x_train_x, t_x_test_x, t_x_train_y, t_x_test_y)
)

### (ACM KDD)

In [None]:
Alice = sy.VirtualWorker(hook, id='Alice')
Bob = sy.VirtualWorker(hook, id='Bob')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out a fifth of each client's rows for evaluation (unseeded split).
KDD_TEST_FRACTION = 0.20
a_train_x, a_test_x, a_train_y, a_test_y = train_test_split(
    data_alice_x, data_alice_y, test_size=KDD_TEST_FRACTION
)
b_train_x, b_test_x, b_train_y, b_test_y = train_test_split(
    data_bob_x, data_bob_y, test_size=KDD_TEST_FRACTION
)

In [None]:
def _kdd_as_f32(table):
    """Return ``table`` (a DataFrame or an ndarray) as a float32 ndarray.

    The label sets are plain ndarrays at this point (``label_encoder`` returns
    a dense array), so calling ``.values`` on them unconditionally raised
    AttributeError; only DataFrames are unwrapped here.
    """
    return getattr(table, "values", table).astype(np.float32)


# NOTE(review): tensor_server_x / tensor_server_y are also assigned by the IoT
# section; running both sections overwrites whichever ran first.
tensor_server_x = torch.FloatTensor(_kdd_as_f32(data_server_x))
tensor_server_y = torch.FloatTensor(_kdd_as_f32(data_server_y))

# Alice: train / test features and labels.
t_a_train_x = torch.tensor(_kdd_as_f32(a_train_x))
t_a_test_x = torch.tensor(_kdd_as_f32(a_test_x))
t_a_train_y = torch.tensor(_kdd_as_f32(a_train_y))
t_a_test_y = torch.tensor(_kdd_as_f32(a_test_y))

# Bob: train / test features and labels.
t_b_train_x = torch.FloatTensor(_kdd_as_f32(b_train_x))
t_b_test_x = torch.tensor(_kdd_as_f32(b_test_x))
t_b_train_y = torch.tensor(_kdd_as_f32(b_train_y))
t_b_test_y = torch.tensor(_kdd_as_f32(b_test_y))

In [None]:
print(t_b_test_y.shape)

In [None]:
# Send Alice's splits to worker Alice and Bob's splits to worker Bob; the
# *_ptr names hold whatever `.send()` returns for each tensor.
a_x_train_ptr, a_x_test_ptr, a_y_train_ptr, a_y_test_ptr = (
    local.send(Alice) for local in (t_a_train_x, t_a_test_x, t_a_train_y, t_a_test_y)
)
b_x_train_ptr, b_x_test_ptr, b_y_train_ptr, b_y_test_ptr = (
    local.send(Bob) for local in (t_b_train_x, t_b_test_x, t_b_train_y, t_b_test_y)
)

In [None]:
print(Bob._objects)

In [None]:
print(Alice._objects)

In [None]:
print(tensor_server_x)
print(tensor_server_y)

In [None]:
from Sklearn_PyTorch import TorchRandomForestClassifier

# Initialisation of the model
my_model = TorchRandomForestClassifier(nb_trees=100, nb_samples=3, max_depth=5, bootstrap=True)

# Fitting function
my_model.fit(tensor_server_x, tensor_server_y)


In [None]:
print(my_model)

## Initialize the parameters

In [23]:
# Hyper-parameters shared by the model-construction and training cells.
epochs = 600  # number of full-batch passes used for the initial server training
input_dim = 115  # number of feature columns (the IoT frames are split at column 115)
output_dim = 3 #Number of classes
h_dim = 100  # hidden-layer width passed to Net
lr_rate = 1e-6  # learning rate used for every SGD optimizer in this notebook

### (Neural Network)

In [24]:
# Server-side model (input_dim -> h_dim -> output_dim) with its loss and optimizer.
# `criterion` and `optimizer` are module-level names that the training code reads.
model = Net(input_dim, h_dim, output_dim)
criterion = torch.nn.MSELoss(reduction='sum')  # summed (not averaged) squared error
optimizer = torch.optim.SGD(model.parameters(), lr=lr_rate)  # plain SGD over this model's parameters

In [25]:
def training(epochs, model, data, labels, loss_fn=None, opt=None):
    """Train ``model`` with full-batch gradient steps.

    Args:
        epochs: number of passes over ``data``; converted with ``int()``.
        model: callable returning predictions for ``data``.
        data: input tensor handed to ``model`` on every pass.
        labels: target tensor handed to the loss together with the predictions.
        loss_fn: loss callable ``loss_fn(predictions, labels)``. Defaults to
            the notebook-level ``criterion``.
        opt: optimizer to zero and step. Defaults to the notebook-level
            ``optimizer``. Pass the optimizer that owns ``model``'s parameters
            when training a model other than the one the global optimizer was
            built for; otherwise the wrong parameters are updated.

    Prints ``epoch loss`` on every 100th pass (epochs 99, 199, ...).
    Returns ``None``.
    """
    # Fall back to the module-level objects so existing four-argument calls
    # keep their behavior.
    if loss_fn is None:
        loss_fn = criterion
    if opt is None:
        opt = optimizer

    for e in range(int(epochs)):
        y_pred = model(data)

        # Compute and print loss
        loss = loss_fn(y_pred, labels)
        if e % 100 == 99:
            print(e, loss.item())

        # Zero gradients, perform a backward pass, and update the weights.
        opt.zero_grad()
        loss.backward()
        opt.step()

In [26]:
training(epochs, model, tensor_server_x, tensor_server_y) ## Train the initial model on Server

99 24540.1640625
199 22018.076171875
299 20343.76953125
399 19262.943359375
499 18585.546875
599 18140.708984375


### (ELM)

In [None]:
tensor_server_x.size(0)

In [None]:
n_hidden = 1000

In [None]:
# ELM's first argument is the number of input features: its hidden layer
# computes `inputs @ random_weights` with weights of shape
# [n_inputs, hidden_units]. size(1) is the feature count of the server data;
# size(0) is the number of rows and would make that product fail.
model = ELM(tensor_server_x.size(1), n_hidden)

In [None]:
def training(epochs, model, data, labels):
    """Fit an ELM-style ``model`` in closed form.

    The cell previously held only the ``def`` line, which is a syntax error.
    This body hands the data to ``model.learn``, which is the only training
    entry point the ELM class defines.

    Args:
        epochs: unused; kept so the signature matches the other ``training``
            functions in this notebook.
        model: object exposing ``learn(X, Y)`` on NumPy arrays.
        data: features, as a torch tensor or anything ``np.asarray`` accepts.
        labels: targets, as a torch tensor or anything ``np.asarray`` accepts.

    Returns ``None``.
    """
    def _to_numpy(values):
        # Torch tensors expose .detach(); anything else is handed to NumPy as-is.
        if hasattr(values, "detach"):
            return values.detach().cpu().numpy()
        return np.asarray(values)

    model.learn(_to_numpy(data), _to_numpy(labels))

### (Logistic Regression)

In [None]:
model = LogisticRegression(input_dim, output_dim)
optimizer = torch.optim.SGD(model.parameters(), lr=lr_rate)

In [None]:
def training(epochs, model, data, labels, opt=None):
    """Train ``model`` with full-batch gradient steps on a summed squared error.

    Args:
        epochs: number of passes over ``data``; converted with ``int()``.
        model: callable returning predictions for ``data``.
        data: input tensor handed to ``model`` on every pass.
        labels: target tensor compared against the predictions.
        opt: optimizer to zero and step. Defaults to the notebook-level
            ``optimizer``. Pass the optimizer that owns ``model``'s parameters
            when training a model other than the one the global optimizer was
            built for.

    Prints the loss tensor on every 100th pass (epochs 99, 199, ...).
    Returns ``None``.
    """
    if opt is None:
        opt = optimizer

    # A separate loop variable: the original loop reused the name `epochs`,
    # shadowing the parameter.
    for epoch in range(int(epochs)):
        opt.zero_grad()  # zero out the gradients
        outputs = model(data)  # forward pass

        # Sum of squared errors over all outputs (not a softmax loss).
        loss = ((outputs - labels)**2).sum()
        if epoch % 100 == 99:
            print(loss)
        loss.backward()  # accumulate gradients into the model parameters
        opt.step()

In [None]:
tensor_server_y = tensor_server_y.squeeze()
print(tensor_server_y.shape)
training(epochs, model, tensor_server_x, tensor_server_y) ## Train the initial model on Server

## Transfer model to clients

### (IoTBot)

In [27]:
# Give each device its own copy of the server-trained model, sent to that
# device's worker, and build one SGD optimizer per remote copy.
PT_model, XC_model = (model.copy().send(device) for device in (PT, XC))

PT_opt, XC_opt = (
    torch.optim.SGD(params=remote.parameters(), lr=lr_rate)
    for remote in (PT_model, XC_model)
)

In [28]:
print(t_p_test_y.shape)
print(t_x_test_y.shape)

torch.Size([23787, 3])
torch.Size([13401, 3])


In [29]:
n_pt, y_pt = t_p_test_y.shape

In [30]:
n_xc, y_xc = t_x_test_y.shape

## Secondary Training on the device with local data

### Neural Network

In [None]:
print(PT._objects)

In [None]:
print(XC._objects)

In [31]:
def _remote_accuracy(remote_model, x_test_ptr, y_test_ptr, n_test):
    """Percentage of test rows whose arg-max prediction matches the one-hot label.

    The computation is expressed on the pointers; only the final value is
    fetched with ``.get()``, as in the other evaluation code of this notebook.
    """
    outputs = remote_model(x_test_ptr)
    _, predicted = torch.max(outputs.data, 1)
    _, expected = torch.max(y_test_ptr.data, 1)
    n_correct = (predicted == expected).sum()
    return (100 * n_correct / n_test).get().data


for e in range(100):

    # PT camera: one full-batch step on the data held by worker PT.
    # The training tensors are used through their pointers. The earlier
    # `.get()` calls pulled them back from the worker, which mixes local data
    # with a remote model and leaves nothing to fetch on the next iteration.
    PT_opt.zero_grad()
    PT_pred = PT_model(p_x_train_ptr)
    # Summed squared error, written as in the other device-training cells; it
    # is the same quantity as MSELoss(reduction='sum').
    PT_loss = ((PT_pred - p_y_train_ptr) ** 2).sum()
    PT_loss.backward()
    PT_opt.step()

    # XC camera: one full-batch step on the data held by worker XC.
    XC_opt.zero_grad()
    XC_pred = XC_model(x_x_train_ptr)
    XC_loss = ((XC_pred - x_y_train_ptr) ** 2).sum()
    XC_loss.backward()
    XC_opt.step()

    if e % 100 == 99:
        # The losses live on the workers; fetch them before reading the value.
        print(e, "PT_loss:", PT_loss.get().item())
        print(e, "XC_loss:", XC_loss.get().item())
        # `e` is the loop variable; the earlier prints referenced an undefined `i`.
        accuracy_p = _remote_accuracy(PT_model, p_x_test_ptr, p_y_test_ptr, n_pt)
        print("Iteration:", e, "PT Accuracy: ", accuracy_p)
        accuracy_x = _remote_accuracy(XC_model, x_x_test_ptr, x_y_test_ptr, n_xc)
        print("Iteration:", e, "XC Accuracy: ", accuracy_x)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

### Logistic Regression

In [None]:
# This cell referenced BM_*/DB_*/d_*/n_bm/n_db, none of which are defined in
# this notebook. It now uses the PT / XC models, optimizers, pointers and
# test-set sizes that the earlier cells create.
for i in range(4):

    # Train the PT device's model on the data held by worker PT
    PT_opt.zero_grad()
    PT_pred = PT_model(p_x_train_ptr)
    PT_loss = ((PT_pred - p_y_train_ptr)**2).sum()  # summed squared error
    PT_loss.backward()

    PT_opt.step()
    PT_loss = PT_loss.get().data

    # Train the XC device's model on the data held by worker XC
    XC_opt.zero_grad()
    XC_pred = XC_model(x_x_train_ptr)
    XC_loss = ((XC_pred - x_y_train_ptr)**2).sum()  # summed squared error
    XC_loss.backward()

    XC_opt.step()
    XC_loss = XC_loss.get().data

    # Accuracy of the PT model on PT's held-out rows
    total_p = n_pt
    correct = 0
    outputs_p = PT_model(p_x_test_ptr)
    _p, pred_p = torch.max(outputs_p.data, 1)
    vp, labels_p = torch.max(p_y_test_ptr.data, 1)
    correct+= (pred_p == labels_p).sum()
    accuracy_p = 100*correct/total_p
    print("Iteration:", i, "PT Accuracy: ", accuracy_p.get().data)

    # Accuracy of the XC model on XC's held-out rows
    total_x = n_xc
    correct = 0
    outputs_x = XC_model(x_x_test_ptr)
    _x, pred_x = torch.max(outputs_x.data, 1)
    vx, labels_x = torch.max(x_y_test_ptr.data, 1)
    correct+= (pred_x == labels_x).sum()
    accuracy_x = 100*correct/total_x
    print("Iteration:", i, "XC Accuracy: ", accuracy_x.get().data)

### (ACM KDD)

In [None]:
# Give Bob and Alice each a copy of `my_model`, sent to their worker, and
# build one SGD optimizer per copy.
# NOTE(review): `my_model` is the TorchRandomForestClassifier built earlier;
# confirm it provides .copy(), .send() and .parameters() before running this.
bobs_model, alices_model = (my_model.copy().send(client) for client in (Bob, Alice))

bobs_opt, alices_opt = (
    torch.optim.SGD(params=remote.parameters(), lr=lr_rate)
    for remote in (bobs_model, alices_model)
)

In [None]:
print(Bob._objects)

In [None]:
# Accuracy denominators: the actual number of held-out rows of each client.
# A single hard-coded 24701 was previously used for both clients.
total_a = t_a_test_y.shape[0]
total_b = t_b_test_y.shape[0]

for i in range(2):

    # Train Bob's Model
    bobs_opt.zero_grad()
    bobs_pred = bobs_model(b_x_train_ptr)
    bobs_loss = ((bobs_pred - b_y_train_ptr)**2).sum()  # summed squared error
    bobs_loss.backward()

    bobs_opt.step()
    bobs_loss = bobs_loss.get().data

    # Train Alice's Model
    alices_opt.zero_grad()
    alices_pred = alices_model(a_x_train_ptr)
    alices_loss = ((alices_pred - a_y_train_ptr)**2).sum()  # summed squared error
    alices_loss.backward()

    alices_opt.step()
    alices_loss = alices_loss.get().data

    # Accuracy of Alice's model on Alice's held-out rows
    correct = 0
    outputs_a = alices_model(a_x_test_ptr)
    _a, pred_a = torch.max(outputs_a.data, 1)
    va, labels_a = torch.max(a_y_test_ptr.data, 1)
    correct+= (pred_a == labels_a).sum()
    accuracy_a = 100*correct/total_a
    print("Iteration:", i, "ALice Accuracy: ", accuracy_a.get().data)

    # Accuracy of Bob's model on Bob's held-out rows
    correct = 0
    outputs_b = bobs_model(b_x_test_ptr)
    _b, pred_b = torch.max(outputs_b.data, 1)
    vb, labels_b = torch.max(b_y_test_ptr.data, 1)
    correct+= (pred_b == labels_b).sum()
    accuracy_b = 100*correct/total_b
    print("Iteration:", i, "Bob Accuracy: ", accuracy_b.get().data)
    

In [None]:
print(Bob._objects)

In [None]:
print(Alice._objects)