In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.tree import export_graphviz, DecisionTreeClassifier
from sklearn import tree
from scipy import stats
import seaborn as sns
import os
import pickle
import re
import sys
import tempfile
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
import warnings
warnings.filterwarnings('ignore')

pd.options.mode.chained_assignment = None
from IPython.display import display, HTML

In [None]:
# Device inventory (MAC address -> device name) used to label traffic
IoT_Device_List = pd.read_csv("iot_device_list.csv")

# The 16 device classes under consideration; the label helpers further down
# encode a class as its position in this list.
classes = [
    "Withings Smart Baby Monitor",
    "Withings Aura smart sleep sensor",
    "Dropcam",
    "TP-Link Day Night Cloud camera",
    "Samsung SmartCam",
    "Netatmo weather station",
    "Netatmo Welcome",
    "Amazon Echo",
    "Laptop",
    "NEST Protect smoke alarm",
    "Insteon Camera",
    "Belkin Wemo switch",
    "Belkin wemo motion sensor",
    "Light Bulbs LiFX Smart Bulb",
    "Triby Speaker",
    "Smart Things",
]

classes_df = pd.DataFrame({'class': classes})

# Keep only the inventory rows of those 16 devices, renumbered from 0
IoT_Device_List_16 = (
    IoT_Device_List[IoT_Device_List["List of Devices"].isin(classes)]
    .reset_index(drop=True)
)

## Helper Functions

In [None]:
""" Run grid search on RF hyperparameters - maximum tree depth and number of estimators """
def RF_evaluation(X_train, y_train, X_test, y_test, threshold, mode, depth_range, tree_range, labels, target_names):
    """Grid-search a random forest over maximum tree depth and number of trees.

    One forest is fitted per ``(max_depth, n_estimators)`` pair, always with
    ``random_state=42`` and ``bootstrap=False``, and scored on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Evaluation features and labels.
    threshold : number or None
        F1-score in percent (the score is multiplied by 100 before the
        comparison); configurations scoring above it are printed. ``None``
        falls back to 70, the cut-off that was hard-coded before this
        argument was honoured.
    mode : str
        ``average`` argument forwarded to ``f1_score``.
    depth_range, tree_range : iterable
        Candidate ``max_depth`` and ``n_estimators`` values.
    labels, target_names
        Forwarded to ``classification_report``.

    Returns
    -------
    tuple
        ``(class_report, RF)``: the classification report (as a dict) and the
        fitted forest of the best-scoring configuration. On a tie the
        configuration tried first is kept.

    Raises
    ------
    ValueError
        If ``depth_range`` or ``tree_range`` is empty.
    """
    depth_range = list(depth_range)
    tree_range = list(tree_range)
    if not depth_range or not tree_range:
        raise ValueError("depth_range and tree_range must each contain at least one value")

    print_cutoff = 70 if threshold is None else threshold

    best_score = None
    best_report = None
    best_model = None
    for max_depth in depth_range:
        for num_trees in tree_range:
            RF = RandomForestClassifier(max_depth=max_depth, n_estimators=num_trees,
                                        random_state=42, bootstrap=False)
            RF.fit(X_train, y_train)
            y_pred_rf = RF.predict(X_test)
            F1score = 100 * f1_score(y_test, y_pred_rf, average=mode)
            if F1score > print_cutoff:
                print('\n ##')
                print("F1-score:", F1score)
            # Remember the best configuration instead of whichever was fitted last
            if best_score is None or F1score > best_score:
                best_score = F1score
                best_model = RF
                best_report = classification_report(y_test, y_pred_rf, labels=labels,
                                                    target_names=target_names, output_dict=True)
    return best_report, best_model

"""
Function to Fit model based on optimal values of depth and number of estimators and use it
to compute feature importance for all the features according to MDI
"""
def get_feature_importance(evaluation, X_train, y_train):
    """Rank the training features by the importance a fitted random forest assigns them.

    Parameters
    ----------
    evaluation : sequence
        Only positions 1 and 2 are read: ``max_depth`` and ``n_estimators``.
    X_train : pandas.DataFrame
        Training features; its column names become the index of the result.
    y_train
        Training labels.

    Returns
    -------
    pandas.DataFrame
        One row per feature and a single column (named ``0``) holding the
        forest's ``feature_importances_``, sorted from most to least important.
    """
    depth, n_trees = evaluation[1], evaluation[2]
    forest = RandomForestClassifier(max_depth=depth, n_estimators=n_trees, random_state=42)
    forest.fit(X_train, y_train)

    ranking = pd.DataFrame(forest.feature_importances_, index=X_train.columns)
    return ranking.sort_values(by=list(ranking.columns), axis=0, ascending=False)

"""
Function to Fit model based on optimal values of depth and number of estimators and feature importance
to find the fewest possible features to exceed the previously attained score with all selected features
"""
def get_fewest_features(evaluation, importance, X_train, y_train, X_test, y_test, f1_threshold, mode):
    """Find the shortest prefix of the ranked features whose F1-score reaches a threshold.

    Forests are fitted on the top 1, 2, 3, ... features (in the order of
    ``importance.index``); every attempt is printed.

    Parameters
    ----------
    evaluation : sequence
        Only positions 1 and 2 are read: ``max_depth`` and ``n_estimators``.
    importance : pandas.DataFrame
        Feature ranking; only its index (feature names, best first) is used.
    X_train, y_train, X_test, y_test
        Train and evaluation splits; the feature frames are indexed by name.
    f1_threshold : float
        Compared directly against the value returned by ``f1_score`` (the
        score is not rescaled here).
    mode : str
        ``average`` argument forwarded to ``f1_score``.

    Returns
    -------
    list or None
        ``[number_of_features, score, feature_names]`` for the first prefix
        whose score is at least ``f1_threshold``; ``None`` when no prefix
        reaches it.
    """
    ranked_names = importance.index
    depth, n_trees = evaluation[1], evaluation[2]

    for n_features, _ in enumerate(ranked_names, start=1):
        subset = ranked_names[0:n_features]
        candidate = RandomForestClassifier(max_depth=depth, n_estimators=n_trees, random_state=42)
        candidate.fit(X_train[subset], y_train)
        score = f1_score(y_test, candidate.predict(X_test[subset]), average=mode)

        attempt = [n_features, score, subset]
        print(attempt)
        if score >= f1_threshold:
            return attempt

    return None

""" Label Flows based on MAC address information in the IoT device list file """
def label_flows(IoT_Train, IoT_Device_List):
    """Label each row with a device name looked up from its MAC addresses.

    A row gets the device whose MAC equals its 'Src MAC'. Rows whose
    'Src MAC' is 14:cc:20:51:33:ea (presumably the gateway - confirm against
    the device list) are instead labeled by 'Dst MAC' when that address
    belongs to a listed device. Rows that match no device get the
    placeholder "0".

    Parameters
    ----------
    IoT_Train : pandas.DataFrame
        Must contain 'Src MAC' and 'Dst MAC' columns. It is not modified.
    IoT_Device_List : pandas.DataFrame
        Must contain "MAC ADDRESS" and "List of Devices" columns. Any index
        is accepted; if a MAC appears more than once the last row wins.

    Returns
    -------
    pandas.DataFrame
        A copy of ``IoT_Train`` with a 'Label_New' column, without the rows
        labeled "TPLink Router Bridge LAN (Gateway)", "0", "Nest Dropcam" or
        "MacBook/Iphone". The original row index is preserved.
    """
    gateway_mac = "14:cc:20:51:33:ea"
    excluded_labels = ["TPLink Router Bridge LAN (Gateway)", "0", "Nest Dropcam", "MacBook/Iphone"]

    # Pair the two columns positionally so the device list's index does not matter
    mac_to_device = dict(zip(IoT_Device_List["MAC ADDRESS"], IoT_Device_List["List of Devices"]))

    # astype(object) keeps the label columns string-capable even when nothing matches
    src_labels = IoT_Train['Src MAC'].map(mac_to_device).astype(object)
    dst_labels = IoT_Train['Dst MAC'].map(mac_to_device).astype(object)

    # Traffic sent by the gateway is attributed to the device it is addressed to
    use_dst = (IoT_Train['Src MAC'] == gateway_mac) & dst_labels.notna()
    labels = src_labels.where(~use_dst, dst_labels).fillna("0")

    labeled = IoT_Train.copy()
    labeled['Label_New'] = labels
    return labeled[~labeled['Label_New'].isin(excluded_labels)]

In [None]:
""" Get combinations of models for different number of trees, maximum tree depth and number of features"""
def analyze_models_features(depths, n_trees, X_train, y_train, X_test, y_test, max_feats,
                            f1_threshold=0.99, mode='macro'):
    """Sweep tree depth and forest size, printing feature rankings and per-feature-count scores.

    For every ``(depth, n_tree)`` pair the feature-importance ranking is
    printed, then ``get_fewest_features`` is run on the ``max_feats``
    top-ranked features; it prints the score reached with 1, 2, ... features.

    Parameters
    ----------
    depths, n_trees : iterable
        Candidate ``max_depth`` and ``n_estimators`` values.
    X_train, y_train, X_test, y_test
        Train and evaluation splits.
    max_feats : int
        Number of top-ranked features considered.
    f1_threshold : float, optional
        Score at which the per-configuration feature sweep stops early. It is
        compared against the raw value of ``f1_score`` (between 0 and 1), so
        the default is 0.99; the previously hard-coded 99 could never be
        reached on that scale.
    mode : str, optional
        ``average`` argument used for the F1-score.

    Returns
    -------
    None
        Results are only printed.
    """
    for depth in depths:
        for n_tree in n_trees:
            print("Depth, Tree:", depth, n_tree)
            evaluation = [None, depth, n_tree]
            importance = get_feature_importance(evaluation, X_train, y_train)
            print(importance)
            get_fewest_features(evaluation, importance.iloc[:max_feats], X_train, y_train,
                                X_test, y_test, f1_threshold, mode)

""" Get names and indices of classes present in the test data """
def get_test_labels(IoT_Test):
    """List the classes that occur in the test data together with their class indices.

    Parameters
    ----------
    IoT_Test : pandas.DataFrame
        Must contain a "Label_New" column of class names.

    Returns
    -------
    tuple
        ``(unique_labels, array_of_indices)``: the distinct label values in
        order of first appearance, and for each one the index of its row in
        the module-level ``classes_df``.

    Raises
    ------
    IndexError
        If a label does not appear in ``classes_df``.
    """
    unique_labels = IoT_Test["Label_New"].unique()
    array_of_indices = [
        classes_df[classes_df['class'] == lab].index.values[0]
        for lab in unique_labels
    ]
    return unique_labels, array_of_indices

""" Obtains final trained model and performance statistics based on selected value of max_leaf_nodes"""
def prune_model(X_train, y_train, X_test, y_test, depth, trees, max_leaves, indices, labels,
                max_ternary_bits=0, min_report_f1=80):
    """Fit a forest with a cap on leaves per tree and report its score and per-tree size.

    Parameters
    ----------
    X_train, y_train, X_test, y_test
        Train and evaluation splits.
    depth, trees, max_leaves
        ``max_depth``, ``n_estimators`` and ``max_leaf_nodes`` of the forest,
        which is always built with ``random_state=42`` and ``bootstrap=False``.
    indices, labels
        ``labels`` and ``target_names`` forwarded to ``classification_report``.
    max_ternary_bits : int, optional
        Maximum number of bits supported for a ternary match key in hardware.
        The real value is confidential, hence the placeholder default of 0,
        with which every tree that has at least one split is flagged.
    min_report_f1 : number, optional
        Macro F1-score in percent that must be exceeded before anything is
        printed.

    Returns
    -------
    tuple
        ``(model, c_report)``: the fitted forest and the classification
        report as a dict.
    """
    model = RandomForestClassifier(max_depth=depth, n_estimators=trees, max_leaf_nodes=max_leaves,
                                   random_state=42, bootstrap=False)
    model = model.fit(X_train, y_train)
    y_pred_rf = model.predict(X_test)

    # Macro F1 computed twice: directly, and from the report restricted to `indices`
    F1score = 100 * f1_score(y_test, y_pred_rf, average='macro')
    c_report = classification_report(y_test, y_pred_rf, labels=indices, target_names=labels,
                                     output_dict=True)
    macro_f1 = 100 * c_report['macro avg']['f1-score']

    if macro_f1 > min_report_f1:
        print("####")
        print("Macro F1-score:", F1score)
        print('Macro F1-score (from c. rep.): ', macro_f1)

        for estimator in model.estimators_:
            # Number of non-leaf nodes of the tree, reported as its bit count in P4
            nbitsp4 = estimator.tree_.node_count - estimator.tree_.n_leaves
            print("Number of bits in P4: ", nbitsp4)
            if nbitsp4 > max_ternary_bits:
                print("############### ERRROR ###############")

    return model, c_report

In [None]:
""" Function to fit the final models which we will translate to P4 """
def fit_final_model(few, evaluation, X_train, y_train):
    """Fit the forest that will be translated to P4 on a chosen subset of features.

    Parameters
    ----------
    few : list
        Names of the feature columns of ``X_train`` to train on.
    evaluation : sequence
        Only positions 1 and 2 are read: ``max_depth`` and ``n_estimators``.
    X_train, y_train
        Training features and labels.

    Returns
    -------
    RandomForestClassifier
        Forest fitted with ``random_state=42`` and ``bootstrap=False``.
    """
    depth, n_trees = evaluation[1], evaluation[2]
    forest = RandomForestClassifier(max_depth=depth, n_estimators=n_trees,
                                    random_state=42, bootstrap=False)
    # fit() returns the estimator itself
    return forest.fit(X_train[few], y_train)

""" Save trained model for onward processing """
def save_model(RF, filename):
    """Serialise a trained model to ``filename`` with pickle.

    Parameters
    ----------
    RF
        The object to store (any picklable object).
    filename : str or path-like
        Destination file; an existing file is overwritten.
    """
    # The context manager closes the file even if pickling fails
    with open(filename, 'wb') as model_file:
        pickle.dump(RF, model_file)

In [19]:
""" Extract features from test data"""
def prepare_test(packet_data, IoT_Device_List):
    """Clean a raw per-packet table and label every packet with a device name.

    Parameters
    ----------
    packet_data : pandas.DataFrame
        Raw packet table. Its columns are overwritten with the 17 field names
        assigned below, so it is assumed to hold exactly those fields in that
        order - TODO confirm against the capture export. The rename is applied
        to the caller's object.
    IoT_Device_List : pandas.DataFrame
        Must contain "MAC ADDRESS" and "List of Devices" columns. Rows are
        read with the labels 0..len-1, so a freshly reset index is assumed.

    Returns
    -------
    pandas.DataFrame
        The cleaned packets with merged "srcport"/"dstport" columns and a
        "label" column holding the device name. Rows labeled
        "TPLink Router Bridge LAN (Gateway)", "0", "Nest Dropcam" or
        "MacBook/Iphone" are removed.

    Notes
    -----
    The device name is stored under "label", whereas label_flows and
    get_xtest_ytest in this notebook use 'Label_New'.
    """

    packet_data.columns = ["frame.time_relative","ip.src","ip.dst","tcp.srcport","tcp.dstport","ip.len",
                       "tcp.flags.syn","tcp.flags.ack","tcp.flags.push","tcp.flags.fin",
                       "tcp.flags.reset","tcp.flags.ece","ip.proto","udp.srcport","udp.dstport",
                       "eth.src","eth.dst"]
    # Discard rows whose ip.proto is the string "1,17" or "1,6" - presumably packets for which
    # the capture reported two protocol numbers; confirm. Rows without ip.proto are dropped next.
    packet_data = packet_data[(packet_data["ip.proto"] != "1,17") & (packet_data["ip.proto"] != "1,6")].reset_index(drop=True)
    packet_data = packet_data.dropna(subset=['ip.proto'])
    # Addresses as strings, protocol number and IP length as plain integers
    packet_data["ip.src"] = packet_data["ip.src"].astype(str)
    packet_data["ip.dst"] = packet_data["ip.dst"].astype(str)
    packet_data["ip.proto"] = packet_data["ip.proto"].astype('int')
    packet_data["ip.len"] = packet_data["ip.len"].astype("int")
    # Ports and TCP flags: nullable integers, with missing values replaced by 0
    packet_data["tcp.srcport"] = packet_data["tcp.srcport"].astype('Int64').fillna(0)
    packet_data["tcp.dstport"] = packet_data["tcp.dstport"].astype('Int64').fillna(0)
    packet_data["udp.srcport"] = packet_data["udp.srcport"].astype('Int64').fillna(0)
    packet_data["udp.dstport"] = packet_data["udp.dstport"].astype('Int64').fillna(0)
    packet_data["tcp.flags.syn"] = packet_data["tcp.flags.syn"].astype('Int64').fillna(0)
    packet_data["tcp.flags.ack"] = packet_data["tcp.flags.ack"].astype('Int64').fillna(0)
    packet_data["tcp.flags.push"] = packet_data["tcp.flags.push"].astype('Int64').fillna(0)
    packet_data["tcp.flags.fin"] = packet_data["tcp.flags.fin"].astype('Int64').fillna(0)
    packet_data["tcp.flags.reset"] = packet_data["tcp.flags.reset"].astype('Int64').fillna(0)
    packet_data["tcp.flags.ece"] = packet_data["tcp.flags.ece"].astype('Int64').fillna(0)
    # Single port pair: the TCP ports when ip.proto is 6, the UDP ports for every other protocol
    packet_data["srcport"] = np.where(packet_data["ip.proto"] == 6, packet_data["tcp.srcport"], packet_data["udp.srcport"])
    packet_data["dstport"] = np.where(packet_data["ip.proto"] == 6, packet_data["tcp.dstport"], packet_data["udp.dstport"])
    packet_data["srcport"] = packet_data["srcport"].astype('Int64')
    packet_data["dstport"] = packet_data["dstport"].astype('Int64')
    packet_data = packet_data.drop(["tcp.srcport","tcp.dstport","udp.srcport","udp.dstport"],axis=1)
    # Labeling, pass 1: name each packet after the device whose MAC equals its source MAC
    packet_data["label"] = [0] * len(packet_data)
    for i in range(len(IoT_Device_List)):
        packet_data["label"] = np.where((packet_data["eth.src"]==IoT_Device_List["MAC ADDRESS"][i]), 
                                          IoT_Device_List["List of Devices"][i], packet_data["label"])
    # Labeling, pass 2: packets sent from 14:cc:20:51:33:ea (presumably the gateway - confirm)
    # are relabeled after the device they are addressed to
    for i in range(len(IoT_Device_List)):
        packet_data["label"] = np.where((packet_data["eth.dst"] ==IoT_Device_List["MAC ADDRESS"][i]) & 
                                       (packet_data["eth.src"]=="14:cc:20:51:33:ea"), 
                                      IoT_Device_List["List of Devices"][i], packet_data["label"])
    # Remove the labels that are not classified.
    # NOTE(review): unlabeled rows are matched as the string "0" although the column starts as
    # integer 0 - this presumably relies on np.where turning the mixed values into strings; confirm.
    packet_data = packet_data[packet_data['label']!="TPLink Router Bridge LAN (Gateway)"]
    packet_data = packet_data[packet_data['label']!="0"]
    packet_data = packet_data[packet_data['label']!="Nest Dropcam"]
    packet_data = packet_data[packet_data['label']!="MacBook/Iphone"]
    
    return packet_data

In [None]:
""" Extract features from test data"""
def _split_features_labels(frame):
    """Return the ten model input columns and the integer-encoded 'Label_New' column of ``frame``."""
    feature_columns = ['ip.len', 'tcp.flags.syn', 'tcp.flags.ack', 'tcp.flags.push',
                       'tcp.flags.fin', 'tcp.flags.reset', 'tcp.flags.ece', 'ip.proto',
                       'srcport', 'dstport']
    class_to_index = {name: position for position, name in enumerate(classes)}

    # Fail early on labels that have no class index instead of handing a
    # mixed string/integer target to the classifier
    known = frame['Label_New'].isin(list(class_to_index))
    if not known.all():
        unknown = list(frame.loc[~known, 'Label_New'].unique())
        raise ValueError(f"labels not present in `classes`: {unknown}")

    # Explicit mapping and integer dtype; Series.replace left the dtype to
    # implicit downcasting of an object column
    encoded = frame['Label_New'].map(class_to_index).astype('int64')
    return frame[feature_columns], encoded


def get_xtest_ytest(Test):
    """Split labeled test data into features and integer class ids.

    Parameters
    ----------
    Test : pandas.DataFrame
        Must contain the ten feature columns and a 'Label_New' column whose
        values are names from the module-level ``classes`` list.

    Returns
    -------
    tuple
        ``(X_test, y_test)``: the feature columns, and 'Label_New' with each
        name replaced by its position in ``classes`` (dtype int64).

    Raises
    ------
    ValueError
        If 'Label_New' holds a value that is not in ``classes``.
    """
    X_test, y_test = _split_features_labels(Test)
    return X_test, y_test


def get_xtrain_ytrain(Test):
    """Split labeled training data into features and integer class ids.

    Parameters
    ----------
    Test : pandas.DataFrame
        The labeled training frame (the parameter keeps its historical name).
        Same column requirements as for ``get_xtest_ytest``.

    Returns
    -------
    tuple
        ``(X_train, y_train)``: the feature columns, and 'Label_New' with each
        name replaced by its position in ``classes`` (dtype int64).

    Raises
    ------
    ValueError
        If 'Label_New' holds a value that is not in ``classes``.
    """
    X_train, y_train = _split_features_labels(Test)
    return X_train, y_train

## Feature Selection, Model Training and Evaluation

### Load and process training data

In [None]:
# Read the training packets and discard every row that has a missing field
IoT_Train = pd.read_csv("unsw_packet_data_train.csv").dropna(axis=0)

# Attach the device name of each packet
IoT_Train = label_flows(IoT_Train, IoT_Device_List_16)

# Split into the feature matrix and the integer class ids
X_train, y_train = get_xtrain_ytrain(IoT_Train)

### Load and process testing data

In [103]:
# Load test data csv
# NOTE(review): unlike the training file, this path is absolute (filesystem root) and ends
# in .txt - confirm that it resolves in the environment where the notebook is re-run.
IoT_Test_csv = pd.read_csv("/unsw_packet_data_test.txt")

In [104]:
# Attach the device name of each packet, then discard rows with any missing field
IoT_Test = label_flows(IoT_Test_csv, IoT_Device_List_16).dropna(axis=0)

# Split into the feature matrix and the integer class ids
X_test, y_test = get_xtest_ytest(IoT_Test)

# Names of the classes that occur in the test data, and their class indices
test_labels, test_indices = get_test_labels(IoT_Test)

### Run model analysis to get optimal model

In [None]:
# Sweep maximum tree depth 5..13 and forest size 3..10, considering the 10 top-ranked features
analyze_models_features(
    depths=list(range(5, 14)),
    n_trees=list(range(3, 11)),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    max_feats=10,
)

In [108]:
# Best model is (depth = 11, n_trees = 3) with 6 features, macro F1 score = 73.47 %
# The six features that model is trained on:
selected_features = [
    'srcport',
    'ip.len',
    'dstport',
    'ip.proto',
    'tcp.flags.push',
    'tcp.flags.ack',
]

In [109]:
# Fit the forest with the selected hyperparameters (depth 11, 3 trees) on the selected features
final_model = fit_final_model(
    few=selected_features,
    evaluation=[None, 11, 3],
    X_train=X_train,
    y_train=y_train,
)

In [110]:
# Check number of bits required to encode paths to leaves in P4:
# one per non-leaf node of each tree in the forest
for num, estimator in enumerate(final_model.estimators_):
    fitted_tree = estimator.tree_
    nbitsp4 = fitted_tree.node_count - fitted_tree.n_leaves
    print("Number of bits in P4: ", nbitsp4)

Number of bits in P4:  926
Number of bits in P4:  754
Number of bits in P4:  779


In [None]:
# Try max_leaf_nodes from 400 to 500 in steps of 10 to find the smallest
# value that preserves accuracy reasonably
for leaf in range(400, 510, 10):
    print("leaves:", leaf)
    prune_model(
        X_train=X_train[selected_features],
        y_train=y_train,
        X_test=X_test[selected_features],
        y_test=y_test,
        depth=11,
        trees=3,
        max_leaves=leaf,
        indices=test_indices,
        labels=test_labels,
    )

In [111]:
# Optimal max_leaf_nodes chosen = 450 leaves
# Get final model and classification report
# NOTE(review): the variable name says "10_3", but the call fits depth 11 with 3 trees.
# NOTE(review): the output recorded under this cell contains an "Accuracy:" line and a macro
# F1-score below 80, neither of which prune_model as defined in this notebook would print -
# it appears to come from an earlier version of the function; re-run to refresh it.
final_10_3_model_packet, cl_report_packet = prune_model(X_train[selected_features], y_train, 
                                                                    X_test[selected_features], 
                                                                    y_test, 11, 3, 450, 
                                                                    test_indices, test_labels)

####
Accuracy: 88.98372781065089
Macro F1-score: 74.2461953520498
Macro F1-score (from c. rep.):  74.2461953520498
Number of bits in P4:  449
Number of bits in P4:  449
Number of bits in P4:  449


In [93]:
# Save model for onward conversion into M/A table entries
# The file is written with pickle (see save_model); only load it back from a trusted location.
save_model(final_10_3_model_packet, "unsw_per_packet_saved_model_16_classes.sav")