In [145]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.tree import export_graphviz, DecisionTreeClassifier
from sklearn import tree
from scipy import stats
import seaborn as sns
import os
import pickle
import re
import sys
import tempfile
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
import warnings
warnings.filterwarnings('ignore')

pd.options.mode.chained_assignment = None
from IPython.display import display, HTML

In [None]:
# IoT device list for labeling.
# The "List of Devices" and "MAC ADDRESS" columns of this table are the ones
# read by the class filter and by label_flows further down in the notebook.
IoT_Device_List = pd.read_csv("iot_device_list.csv")

In [182]:
# The 16 device classes under consideration.
# The position of a name in this list is the integer id used for that class
# when labels are encoded (prepare_train / prepare_test / get_test_labels).
classes = [
    "Withings Smart Baby Monitor",
    "Withings Aura smart sleep sensor",
    "Dropcam",
    "TP-Link Day Night Cloud camera",
    "Samsung SmartCam",
    "Netatmo weather station",
    "Netatmo Welcome",
    "Amazon Echo",
    "Laptop",
    "NEST Protect smoke alarm",
    "Insteon Camera",
    "Belkin Wemo switch",
    "Belkin wemo motion sensor",
    "Light Bulbs LiFX Smart Bulb",
    "Triby Speaker",
    "Smart Things",
]
classes_df = pd.DataFrame({'class': classes})

# Keep only the device-list rows for those classes, renumbered from 0 so that
# positional lookups on the filtered table work.
IoT_Device_List_16 = (
    IoT_Device_List[IoT_Device_List["List of Devices"].isin(classes)]
    .reset_index(drop=True)
)

## Helper Functions

In [117]:
""" Run grid search on RF hyperparameters - maximum tree depth and number of estimators """
def RF_evaluation(X_train, y_train, X_test, y_test, threshold, mode, depth_range, tree_range, labels, target_names):
    """Grid-search a random forest over maximum tree depth and number of estimators.

    Every (max_depth, n_estimators) pair is fitted on the training data and
    scored on the test data. The best-scoring combination is returned; on a tie
    the combination evaluated first wins. With a single-point grid this is
    simply the one fitted model.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : evaluation features and labels.
    threshold : F1-score in percent (0-100); combinations scoring above it are
        printed.
    mode : averaging mode forwarded to ``f1_score`` (e.g. ``'macro'``).
    depth_range, tree_range : iterables of ``max_depth`` / ``n_estimators``
        values to try.
    labels, target_names : forwarded to ``classification_report``.

    Returns
    -------
    (class_report, RF) : the ``classification_report`` dictionary and the
        fitted ``RandomForestClassifier`` of the best-scoring combination.

    Raises
    ------
    ValueError
        If ``depth_range`` or ``tree_range`` is empty.
    """
    depth_range, tree_range = list(depth_range), list(tree_range)
    if not depth_range or not tree_range:
        raise ValueError("depth_range and tree_range must each contain at least one value")

    best_score, best_report, best_model = None, None, None
    for max_depth in depth_range:
        for num_trees in tree_range:
            RF = RandomForestClassifier(max_depth = max_depth, n_estimators = num_trees, random_state=42, bootstrap=False)
            RF = RF.fit(X_train, y_train)
            y_pred_rf = RF.predict(X_test)
            F1score = 100*f1_score(y_test, y_pred_rf, average=mode)
            if F1score > threshold:
                print('\n ##')
                print("F1-score:", F1score)
            # Only a strictly better score replaces the current best.
            if best_score is None or F1score > best_score:
                best_report = classification_report(y_test, y_pred_rf, labels = labels,
                                                    target_names=target_names, output_dict = True)
                best_score, best_model = F1score, RF
    return best_report, best_model

In [118]:
"""
Function to Fit model based on optimal values of depth and number of estimators and use it
to compute feature importance for all the features according to MDI
"""
def get_feature_importance(evaluation, X_train, y_train):
    """Rank the training features by random-forest feature importance.

    ``evaluation[1]`` is used as ``max_depth`` and ``evaluation[2]`` as
    ``n_estimators``. The returned DataFrame is indexed by feature name, has a
    single column (``0``) holding ``feature_importances_``, and is sorted from
    most to least important.
    """
    depth, n_estimators = evaluation[1], evaluation[2]
    forest = RandomForestClassifier(max_depth=depth, n_estimators=n_estimators, random_state=42)
    forest.fit(X_train, y_train)

    ranking = pd.DataFrame(forest.feature_importances_)
    ranking.index = X_train.columns
    return ranking.sort_values(by=list(ranking.columns), axis=0, ascending=False)

In [119]:
"""
Function to Fit model based on optimal values of depth and number of estimators and feature importance
to find the fewest possible features to exceed the previously attained score with all selected features
"""
def get_fewest_features(evaluation, importance, X_train, y_train, X_test, y_test, f1_threshold, mode):
    """Find the smallest prefix of the ranked features that reaches ``f1_threshold``.

    Features are taken in the order of ``importance.index``. For each prefix
    length a forest (``max_depth=evaluation[1]``, ``n_estimators=evaluation[2]``)
    is fitted and its F1-score on the test data is printed. The score comes
    straight from ``f1_score`` (a fraction, not a percentage), and that is the
    value compared with ``f1_threshold``.

    Returns ``[n_features, score, feature_names]`` for the first prefix whose
    score is at least ``f1_threshold``, or ``None`` if no prefix reaches it.
    """
    ranked_names = importance.index
    n_used = 1
    while n_used <= len(ranked_names):
        chosen = ranked_names[0:n_used]
        forest = RandomForestClassifier(max_depth=evaluation[1], n_estimators=evaluation[2],
                                        random_state=42)
        forest.fit(X_train[chosen], y_train)
        achieved = f1_score(y_test, forest.predict(X_test[chosen]), average=mode)
        candidate = [n_used, achieved, chosen]
        print(candidate)
        if achieved >= f1_threshold:
            return candidate
        n_used += 1
    return None

In [120]:
""" Function to extract bits n to m from a bitstring """
def extractKBits(num, start_bit, end_bit):
    """Extract bits ``start_bit`` (inclusive) to ``end_bit`` (exclusive) of a number.

    ``num`` is converted with ``int()`` and reduced to its low 32 bits. Those
    32 bits are written most-significant-bit first, so position 0 is the MSB
    and position 31 the LSB. The selected positions are returned as an integer.
    The positions follow Python slice semantics.

    Edge cases:
    * Values wider than 32 bits only contribute their low 32 bits, whatever
      their magnitude.
    * An empty selection (e.g. ``start_bit == end_bit``) yields 0.
    * Negative values have no valid bit pattern here and are treated as 0.
    """
    value = max(int(num), 0)
    # Mask before formatting: string padding alone never truncates, so a very
    # large value would otherwise shift which characters the slice selects.
    low_32_bits = format(value & 0xFFFFFFFF, '032b')
    selected = low_32_bits[start_bit:end_bit]
    if not selected:
        return 0
    return int(selected, 2)
    
# Build new train and test frames in which `feature` keeps only m bits starting at bit n
def compress_feature(X_train, X_test, n, m,  feature):
    """Return copies of both frames with ``feature`` replaced by a bit slice.

    Each value of ``feature`` is passed through ``extractKBits(value, n, n + m)``,
    so ``n`` is the first bit position and ``m`` the number of bits kept.
    The input frames are left untouched.
    """
    def _with_compressed_column(frame):
        squeezed = frame.copy()
        squeezed[feature] = [extractKBits(value, n, n + m) for value in squeezed[feature]]
        return squeezed

    return _with_compressed_column(X_train), _with_compressed_column(X_test)

In [122]:
""" Function to fit the final models which we will translate to P4 """
def fit_final_model(few, evaluation, X_train, y_train):
    """Fit the final forest on the selected feature columns.

    ``few`` lists the columns of ``X_train`` to train on. ``evaluation[1]`` is
    the maximum depth and ``evaluation[2]`` the number of trees. Bootstrapping
    is disabled. Returns the fitted classifier.
    """
    forest_settings = {
        'max_depth': evaluation[1],
        'n_estimators': evaluation[2],
        'random_state': 42,
        'bootstrap': False,
    }
    final_forest = RandomForestClassifier(**forest_settings)
    final_forest.fit(X_train[few], y_train)
    return final_forest

In [66]:
""" Save trained model for onward processing """
def save_model(RF, filename):
    """Pickle a trained model to ``filename``.

    The file is opened in a ``with`` block so the handle is flushed and closed
    as soon as the dump finishes, even if pickling raises.
    """
    with open(filename, 'wb') as model_file:
        pickle.dump(RF, model_file)

In [237]:
""" Label Flows based on MAC address information in the IoT device list file """
def label_flows(IoT_Train, IoT_Device_List, gateway_mac="14:cc:20:51:33:ea"):
    """Label flows with a device name based on their MAC addresses.

    Labelling rules:
    * A flow is labelled with the device whose MAC equals its 'Src MAC'.
    * A flow sent by ``gateway_mac`` whose 'Dst MAC' is a known device is
      labelled with that destination device instead.
    * If a MAC appears several times in the device list, the last entry wins.

    Flows that match no device get the sentinel label "0" and are dropped,
    as are flows labelled as the gateway, "Nest Dropcam" or "MacBook/Iphone".

    Parameters
    ----------
    IoT_Train : DataFrame with 'Src MAC' and 'Dst MAC' columns. It is not
        modified; the result is a new frame.
    IoT_Device_List : DataFrame with 'MAC ADDRESS' and 'List of Devices'
        columns. Its index does not need to be 0..n-1.
    gateway_mac : MAC address of the gateway.

    Returns
    -------
    DataFrame : the labelled flows with a new 'Label_New' column, keeping the
        original row index of the retained flows.
    """
    excluded_labels = ["TPLink Router Bridge LAN (Gateway)", "0", "Nest Dropcam", "MacBook/Iphone"]

    known_devices = IoT_Device_List.dropna(subset=["MAC ADDRESS"])
    mac_to_device = dict(zip(known_devices["MAC ADDRESS"], known_devices["List of Devices"]))

    labelled = IoT_Train.copy()
    # astype(object) keeps the label columns string-friendly even when nothing matches.
    label_by_src = labelled['Src MAC'].map(mac_to_device).astype(object)
    label_by_dst = labelled['Dst MAC'].map(mac_to_device).astype(object)

    # Gateway-originated traffic belongs to the destination device when it is known.
    use_destination = (labelled['Src MAC'] == gateway_mac) & label_by_dst.notna()
    labels = label_by_src.where(~use_destination, label_by_dst)

    labelled['Label_New'] = labels.fillna("0")
    return labelled[~labelled['Label_New'].isin(excluded_labels)]

""" Extract features from train data"""
def prepare_train(IoT, class_names=None):
    """Split labelled training flows into a feature frame and encoded labels.

    Parameters
    ----------
    IoT : labelled DataFrame; must contain 'Label_New' and the identifier /
        unused columns dropped below.
    class_names : ordered list of class names. Each label is replaced by its
        position in this list. Defaults to the notebook-level ``classes``.

    Returns
    -------
    (X_train, y_train) : the remaining feature columns and the labels encoded
        as class positions.

    The encoding always uses the full class list, so it neither fails nor
    shifts when the data happens to lack some of the classes.
    """
    if class_names is None:
        class_names = classes
    X_train = IoT.drop(["Flow ID",'Label', 'Src MAC', 'Dst MAC', 'Protocol', 'Packet Count',
                        'Label_New','Flow IAT Mean','Packet Length Mean'], axis=1)
    y_train = IoT['Label_New'].replace(list(class_names), list(range(len(class_names))))
    return X_train, y_train

""" Extract features from test data"""
def prepare_test(Test, Train, class_names=None):
    """Split labelled test flows into a feature frame and encoded labels.

    Parameters
    ----------
    Test : labelled DataFrame; must contain 'Label_New' and the identifier /
        unused columns dropped below.
    Train : kept for backward compatibility only; the encoding no longer
        depends on which labels occur in the training data.
    class_names : ordered list of class names. Each label is replaced by its
        position in this list. Defaults to the notebook-level ``classes``.

    Returns
    -------
    (X_test, y_test) : the remaining feature columns and the labels encoded as
        class positions.
    """
    if class_names is None:
        class_names = classes
    X_test = Test.drop(["Flow ID", 'Src MAC', 'Dst MAC', 'Protocol', 'Packet Count', 'Label_New',
                        'Flow IAT Mean','Packet Length Mean'], axis=1)
    y_test = Test['Label_New'].replace(list(class_names), list(range(len(class_names))))
    return X_test, y_test

In [74]:
""" Get combinations of models for different number of trees, maximum tree depth and number of features"""
def analyze_models_features(depths, n_trees, X_train, y_train, X_test, y_test, max_feats,
                            f1_threshold=0.99, mode='macro'):
    """Print feature rankings and feature-count sweeps for a grid of forest sizes.

    For every (depth, number of trees) pair the feature-importance ranking is
    printed. ``get_fewest_features`` is then run on the ``max_feats``
    highest-ranked features and prints the score of each feature prefix.

    Parameters
    ----------
    depths, n_trees : iterables of maximum depths / numbers of trees to try.
    X_train, y_train, X_test, y_test : training and evaluation data.
    max_feats : number of top-ranked features to consider.
    f1_threshold : target F1-score as a fraction in [0, 1], the same scale as
        the score ``get_fewest_features`` compares it with. The sweep for a
        model stops at the first prefix reaching it.
    mode : F1 averaging mode passed to ``get_fewest_features``.

    Returns None; all results are printed.
    """
    for depth in depths:
        for n_tree in n_trees:
            print("Depth, Tree:", depth, n_tree)
            importance = get_feature_importance([None, depth, n_tree], X_train, y_train)
            print(importance)
            get_fewest_features([None, depth, n_tree], importance[0:max_feats], X_train, y_train,
                                X_test, y_test, f1_threshold, mode)

In [75]:
""" Get names and indices of classes present in the test data """
def get_test_labels(IoT_Test):
    """Return the class names present in the test data and their class indices.

    The names are the unique values of ``IoT_Test["Label_New"]`` in order of
    appearance. Each index is the row position of that name in the
    notebook-level ``classes_df`` table.
    """
    present_names = IoT_Test["Label_New"].unique()
    class_positions = [
        classes_df.loc[classes_df['class'] == name].index.values[0]
        for name in present_names
    ]
    return present_names, class_positions

In [2]:
""" 
Find the best number of bits and their positions, that yield similar or better model performance.
As range matches in hardware cannot support all sizes of variables, we propose this technique to 
extract just n bits from the features with longer bit sizes and use just those n bits in model training.
In the switch as well, only the n bits that were used in training will be extracted for that feature and 
used for in-switch inference.
"""
def find_time_shifts(X_train, X_test, y_train, y_test, features, test_ind, test_lab, d_range, t_range,
                     min_macro_f1=92):
    """Sweep how many leading bits of the two time features are kept.

    For every pair (m1, m2) with 1 <= m1, m2 <= 15:
    * 'Flow Duration' is reduced to its first m1 bits and 'Flow IAT Max' to its
      first m2 bits (via ``compress_feature`` with start bit 0).
    * A forest is evaluated on the compressed data with ``RF_evaluation``.
    * The pair is printed when the macro-averaged F1-score from the
      classification report, in percent, exceeds ``min_macro_f1``.

    The printed ``32 - m`` is the number of bits of the 32-bit value that are
    discarded.

    Parameters
    ----------
    X_train, X_test, y_train, y_test : training and evaluation data.
    features : feature columns passed to the model.
    test_ind, test_lab : class indices and names forwarded to ``RF_evaluation``
        as ``labels`` and ``target_names``.
    d_range, t_range : depth and tree-count grids forwarded to ``RF_evaluation``.
    min_macro_f1 : reporting threshold in percent.

    Returns None; results are printed.
    """
    for m1 in range(1, 16):
        # 'Flow Duration' compression depends on m1 only, so it is done once per m1.
        X_train1, X_test1 = compress_feature(X_train, X_test, 0, m1, 'Flow Duration')
        for m2 in range(1, 16):
            X_train2, X_test2 = compress_feature(X_train1, X_test1, 0, m2, 'Flow IAT Max')
            c_report, RF = RF_evaluation(X_train2[features], y_train, X_test2[features], y_test,
                                     90, 'macro', d_range, t_range, test_ind, test_lab)
            macro_f1 = 100*c_report['macro avg']['f1-score']
            if macro_f1 > min_macro_f1:
                print("Compression applied to 'Flow Duration': 32 -> ", 32 - m1)
                print("Compression applied to 'Flow IAT Max': 32 -> ", 32 - m2)
                print('Macro F1-score (from c. rep.): ', macro_f1)

""" Once the bit positions are known, we can then regenerate the train and test data features with the modified features"""
def get_final_compression(X_train, X_test, y_train, y_test, features, test_ind, test_lab, d_range, t_range, m1, m2):
    """Rebuild the data with the chosen bit widths and fit the model on it.

    'Flow Duration' keeps its first ``m1`` bits and 'Flow IAT Max' its first
    ``m2`` bits. The compressed data, restricted to ``features``, is passed to
    ``RF_evaluation``, and the resulting macro F1-score is printed.

    Returns ``(X_train_compressed, X_test_compressed, model)``, where both
    frames contain only the ``features`` columns.
    """
    train_stage, test_stage = X_train, X_test
    for column, kept_bits in (('Flow Duration', m1), ('Flow IAT Max', m2)):
        train_stage, test_stage = compress_feature(train_stage, test_stage, 0, kept_bits, column)

    train_selected = train_stage[features]
    test_selected = test_stage[features]
    report, final_rf = RF_evaluation(train_selected, y_train, test_selected, y_test,
                                     90, 'macro', d_range, t_range, test_ind, test_lab)
    macro_f1 = 100*report['macro avg']['f1-score']

    print("Compression applied to 'Flow Duration': 32 -> ", 32 - m1)
    print("Compression applied to 'Flow IAT Max': 32 -> ", 32 - m2)
    print('Macro F1-score (from c. rep.): ', macro_f1)

    return train_stage[features], test_stage[features], final_rf

In [244]:
""" Obtains final trained model and performance statistics based on selected value of max_leaf_nodes"""
def prune_model(X_train, y_train, X_test, y_test, depth, trees, max_leaves, indices, labels,
                max_ternary_bits=0):
    """Fit a forest limited to ``max_leaves`` leaves per tree and report its size.

    Parameters
    ----------
    X_train, y_train, X_test, y_test : training and evaluation data.
    depth, trees : ``max_depth`` and ``n_estimators`` of the forest.
    max_leaves : ``max_leaf_nodes`` applied to every tree.
    indices, labels : class indices and names forwarded to
        ``classification_report`` as ``labels`` and ``target_names``.
    max_ternary_bits : maximum number of bits supported for a ternary match key
        in hardware. The default of 0 is the placeholder used by the published
        notebook (the actual value is confidential). With that default every
        tree having at least one split is reported as an error, so pass the
        real limit to make the check meaningful.

    When the macro F1-score from the classification report exceeds 80 %, the
    scores are printed, followed by each tree's number of internal nodes
    (``node_count - n_leaves``), reported as "Number of bits in P4". Trees
    above ``max_ternary_bits`` are flagged.

    Returns
    -------
    (model, c_report) : the fitted classifier and the classification-report
        dictionary.
    """
    model = RandomForestClassifier(max_depth = depth, n_estimators = trees, max_leaf_nodes=max_leaves, random_state=42, bootstrap=False)
    model = model.fit(X_train, y_train)
    y_pred_rf = model.predict(X_test)
    F1score = 100*f1_score(y_test, y_pred_rf, average='macro')
    c_report = classification_report(y_test, y_pred_rf, labels = indices, target_names=labels, output_dict = True)
    macro_f1 = 100*c_report['macro avg']['f1-score']

    if macro_f1 > 80:
        print("####")
        print("Macro F1-score:", F1score)
        print('Macro F1-score (from c. rep.): ', macro_f1)

        for estimator in model.estimators_:
            nbitsp4 = estimator.tree_.node_count - estimator.tree_.n_leaves
            print("Number of bits in P4: ", nbitsp4)
            if nbitsp4 > max_ternary_bits:
                print("############### ERRROR ###############")

    return model, c_report

## Feature Selection, Model Training and Evaluation

#### Load and label training data

In [201]:
# load the labelled excel files
# (the data is read with pd.read_csv, i.e. from a CSV file)
IoT_Train_csv = pd.read_csv("rf3_train.csv")

# Label the flows, convert time features to nanoseconds
IoT_Train = label_flows(IoT_Train_csv, IoT_Device_List_16)
X_train, y_train = prepare_train(IoT_Train)
# The three time-based features are multiplied by 1000
# (presumably microseconds -> nanoseconds; TODO confirm the unit used in the CSV).
X_train['Flow IAT Min'] = X_train['Flow IAT Min']*1000
X_train['Flow IAT Max'] = X_train['Flow IAT Max']*1000
X_train['Flow Duration'] = X_train['Flow Duration']*1000

In [None]:
# See number of samples per device class
# (count() reports, for every column, the number of non-null entries within each label group)
IoT_Train.groupby("Label_New").count()

#### Load and label test data

In [203]:
# Load test data
IoT_Test_csv = pd.read_csv("rf3_test.csv")

# Label test data
IoT_Test = label_flows(IoT_Test_csv, IoT_Device_List_16)

# Separate features from the dependent variable
# NOTE(review): the training features 'Flow IAT Min', 'Flow IAT Max' and 'Flow Duration'
# are multiplied by 1000 after prepare_train, but no such scaling is applied to X_test here.
# Confirm that rf3_test.csv already stores these features in the same unit.
X_test, y_test = prepare_test(IoT_Test, IoT_Train)

# Get names and indices of represented classes
test_labels, test_indices = get_test_labels(IoT_Test)

# See if any missing classes
# (classes from the 16-class list that do not occur in the test data; an empty set means all are present)
set(classes) - set(test_labels)

#### Model analysis

In [None]:
# Call signature:
# analyze_models_features(set_of_max_tree_depths, set_of_number_of_trees, X_train, y_train, X_test[X_train.columns], y_test, max_num_of_features)
# Sweeps depths 5-10 and 1-10 trees, considering at most the 12 highest-ranked features per model.
# X_test is restricted to X_train's columns so both frames expose the same features in the same order.
analyze_models_features([5,6,7,8,9,10], [1,2,3,4,5,6,7,8,9,10], X_train, y_train, X_test[X_train.columns], y_test, 12)

In [214]:
# Best model is (depth = 10, n_trees = 3) with 7 features - 94.38 %
# (presumably read off the printed output of the model sweep; the list is hard-coded here, not derived)
Selected_Features = ['Packet Length Total', 'Flow Duration', 'Max Packet Length','Source Port', 'Min Packet Length', 'Destination Port', 'Flow IAT Max']

In [None]:
# Find optimal bits for time-based features e.g Flow Duration and Flow IAT Max
# Uses the selected model size only: max depth 10 ([10]) and 3 trees ([3]).
find_time_shifts(X_train, X_test, y_train, y_test, Selected_Features, test_indices, test_labels, [10], [3])

In [None]:
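# Recorded result: the best models keep only the first 3 MSBs of Flow Duration and the first 5 MSBs of Flow IAT Max, i.e. 29 and 27 of the 32 bits are dropped (the "32 -> 29" / "32 -> 27" lines below).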
"""
Accuracy: 98.47165532879819
F1-score: 93.91843072591722
Compression applied to 'Flow Duration': 32 ->  29
Compression applied to 'Flow IAT Max': 32 ->  27
Macro F1-score (from c. rep.):  93.91843072591722
"""

In [227]:
# Use selected bits to obtain compressed X_train, X_test and final model
# The last two arguments are m1 = 3 and m2 = 5: 'Flow Duration' keeps its first 3 bits
# and 'Flow IAT Max' its first 5 bits of the 32-bit value.
X_train_comp_29_27, X_test_comp_29_27, rf_final_comp_29_27 = get_final_compression(X_train, X_test, y_train, y_test, 
                                                                                   Selected_Features,test_indices, 
                                                                                   test_labels, [10], [3], 
                                                                                   3, 5)


 ######
Accuracy: 98.47165532879819
F1-score: 93.91843072591722
Compression applied to 'Flow Duration': 32 ->  29
Compression applied to 'Flow IAT Max': 32 ->  27
Macro F1-score (from c. rep.):  93.91843072591722


In [228]:
# Check if Pruning is required for nbits to fit hardware constraints
# For every tree of the forest, print its number of internal (non-leaf) nodes,
# which this notebook reports as the number of bits needed in P4.
for num in range(len(rf_final_comp_29_27.estimators_)):
    nbitsp4 = rf_final_comp_29_27.estimators_[num].tree_.node_count - rf_final_comp_29_27.estimators_[num].tree_.n_leaves
    print("Number of bits in P4: ", nbitsp4)

Number of bits in P4:  437
Number of bits in P4:  523
Number of bits in P4:  474


In [None]:
# Find optimal value of max_leaf_nodes that preserves accuracy reasonably 
# but cuts down number of bits required to encode tree
# Tries max_leaf_nodes = 200, 210, ..., 510 with depth 10 and 3 trees; prune_model prints the
# scores and per-tree sizes for each candidate, and its return value is not kept here.
for leaf in range(200, 512, 10):
    print("leaves:", leaf)
    prune_model(X_train_comp_29_27, y_train, X_test_comp_29_27, y_test, 10, 3, leaf, test_indices, test_labels)

In [232]:
# Use selected value of 350 as max_leaf_nodes to get final model and performance in classification report
# Same depth (10) and number of trees (3) as in the sweep; this time the fitted model and its
# classification-report dictionary are kept.
final_10_3_29_27_model, cl_report_29_27_full = prune_model(X_train_comp_29_27, y_train, X_test_comp_29_27, 
                                          y_test, 10, 3, 350, test_indices, test_labels)

####
Accuracy: 98.71655328798185
Macro F1-score: 90.67521742444109
Macro F1-score (from c. rep.):  90.67521742444109
Number of bits in P4:  349
Number of bits in P4:  349
Number of bits in P4:  349


In [235]:
# save model for onward conversion into M/A entries
# The model is written with pickle (see save_model); only load such files from trusted sources.
save_model(final_10_3_29_27_model, "unsw_per_flow_saved_model_16_classes.sav")