In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import pickle
pd.options.mode.chained_assignment = None
from IPython.display import display, HTML

In [2]:
# Device inventory: the "List of Devices" (name) and "MAC ADDRESS" columns are used below to label packets
IoT_Device_List = pd.read_csv(filepath_or_buffer="../Data/iot_device_list.csv")

In [3]:
# The 21 device names kept for the device identification task
classes = [
    'Belkin Wemo switch',
    'iHome',
    'TP-Link Smart plug',
    'Light Bulbs LiFX Smart Bulb',
    "Withings Aura smart sleep sensor",
    "Belkin wemo motion sensor",
    "NEST Protect smoke alarm",
    "Withings Smart Baby Monitor",
    "Insteon Camera",
    "TP-Link Day Night Cloud camera",
    "Samsung SmartCam",
    "Dropcam",
    "Netatmo Welcome",
    "Amazon Echo",
    "Triby Speaker",
    'Netatmo weather station',
    'Withings Smart scale',
    'Smart Things',
    'PIX-STAR Photo-frame',
    'Laptop',
    'MacBook',
]

classes_df = pd.DataFrame({'class': classes})

# Keep only the inventory rows of those devices and renumber the rows 0..n-1
IoT_Device_List = (
    IoT_Device_List[IoT_Device_List["List of Devices"].isin(classes)]
    .reset_index(drop=True)
)

In [4]:
# First stage of the hierarchy: the 21 devices are split into 5 groups
Plugs = [
    'Belkin Wemo switch',
    'iHome',
    'TP-Link Smart plug',
    'Light Bulbs LiFX Smart Bulb',
]
Sensors = [
    "Withings Aura smart sleep sensor",
    "Belkin wemo motion sensor",
    "NEST Protect smoke alarm",
]
Video = [
    "Withings Smart Baby Monitor",
    "Insteon Camera",
    "TP-Link Day Night Cloud camera",
    "Samsung SmartCam",
    "Dropcam",
    "Netatmo Welcome",
]
Appliances = [
    "Amazon Echo",
    "Triby Speaker",
    'Netatmo weather station',
    'Withings Smart scale',
    'Smart Things',
    'PIX-STAR Photo-frame',
]
Computers = [
    'Laptop',
    'MacBook',
]

# Group names and their member lists, kept in the same order
Group_Names = ["Plugs", "Sensors", "Video", "Appliances", "Computers"]
Groups = [Plugs, Sensors, Video, Appliances, Computers]

### Functions for model analysis

In [5]:
# Rank features by random-forest importance
def get_feature_importance(evaluation, X_train, y_train):
    """Fit a random forest on all columns of X_train and rank them by importance.

    Parameters
    ----------
    evaluation : sequence
        Item 1 is used as ``max_depth`` and item 2 as ``n_estimators``;
        item 0 is not read.
    X_train : DataFrame of training features.
    y_train : training labels.

    Returns
    -------
    DataFrame with a single column (named 0) holding ``feature_importances_``,
    indexed by the column names of X_train and sorted in descending order.
    """
    depth, n_trees = evaluation[1], evaluation[2]

    forest = RandomForestClassifier(n_estimators=n_trees, max_depth=depth,
                                    bootstrap=False, random_state=42)
    forest.fit(X_train, y_train)

    ranking = pd.DataFrame(forest.feature_importances_, index=X_train.columns)
    return ranking.sort_values(by=list(ranking.columns), axis=0, ascending=False)


In [6]:
# Get fewest features required to provide an f1 score beyond a given threshold
def get_fewest_features(evaluation, importance, X_train, y_train, X_test, y_test, f1_threshold, mode):
    """Find the shortest prefix of ranked features whose F1 score reaches a threshold.

    Forests are trained on the top 1, 2, ... features (in the order of
    ``importance.index``) and scored on the test set; one line is printed per
    attempt. The search stops at the first attempt that reaches the threshold.

    Parameters
    ----------
    evaluation : sequence
        Item 1 is used as ``max_depth`` and item 2 as ``n_estimators``.
    importance : DataFrame
        Its index lists the feature names, most important first.
    X_train, y_train, X_test, y_test : train / test features and labels.
    f1_threshold : float
        Compared directly with the ``'f1-score'`` values of
        ``classification_report``, which scikit-learn reports as fractions
        between 0 and 1 (so 0.99, not 99).
    mode : str
        ``'weighted'`` compares the weighted-average F1; any other value
        (e.g. ``'macro'``) compares the macro-average F1.

    Returns
    -------
    list or None
        ``[n_features, macro_f1, weighted_f1, feature_names]`` for the first
        attempt that reaches the threshold, or ``None`` if none does.
    """
    sorted_feature_names = importance.index
    use_weighted = (mode == 'weighted')

    for f in range(1, len(sorted_feature_names) + 1):
        selected = sorted_feature_names[0:f]
        rf_try = RandomForestClassifier(max_depth=evaluation[1], n_estimators=evaluation[2],
                                        random_state=42, bootstrap=False)
        rf_try.fit(X_train[selected], y_train)
        y_pred = rf_try.predict(X_test[selected])

        class_report = classification_report(y_test, y_pred, output_dict=True)
        macro_score = class_report['macro avg']['f1-score']
        weighted_score = class_report['weighted avg']['f1-score']
        tuple_feat = [f, macro_score, weighted_score, selected]
        print(tuple_feat)

        # Honour the requested averaging mode instead of always using macro.
        score = weighted_score if use_weighted else macro_score
        if score >= f1_threshold:
            return tuple_feat

    # Threshold never reached with any prefix of the ranked features.
    return None

In [7]:
## Fit the final model (the one later translated to P4) on a chosen feature subset
def fit_final_model(few, evaluation, X_train, y_train):
    """Train a random forest on the columns ``few`` of X_train and return it.

    ``evaluation[1]`` is used as ``max_depth`` and ``evaluation[2]`` as
    ``n_estimators``; ``evaluation[0]`` is not read.
    """
    depth, n_trees = evaluation[1], evaluation[2]
    forest = RandomForestClassifier(n_estimators=n_trees, max_depth=depth,
                                    bootstrap=False, random_state=42)
    forest.fit(X_train[few], y_train)
    return forest

In [8]:
# Classification report of a fitted model on held-out data
def show_performance(X_test, y_test, RF, features, target_names):
    """Predict with ``RF`` on the ``features`` columns of X_test and score the result.

    Returns the ``classification_report`` of the predictions against y_test
    as a dict (``output_dict=True``), with classes named by ``target_names``.
    """
    predictions = RF.predict(X_test[features])
    return classification_report(y_test, predictions,
                                 target_names=target_names, output_dict=True)

In [9]:
# Dump model to file for later conversion to M/A
def save_model(RF, filename):
    """Pickle ``RF`` to ``filename`` (opened in binary write mode, overwriting it).

    The file is opened in a ``with`` block so the handle is flushed and closed
    even if pickling raises.
    """
    with open(filename, 'wb') as model_file:
        pickle.dump(RF, model_file)

In [10]:
# Assign labels based on MAC addresses
def label_flows(IoT_Train, IoT_Device_List, gateway_mac="14:cc:20:51:33:ea"):
    """Label every packet with the device it belongs to and drop unwanted rows.

    A packet is labelled with the device whose MAC equals ``eth.src``. Packets
    sent by the gateway (``eth.src == gateway_mac``) to a listed device are
    labelled with the destination device instead. Packets matching no device
    get the placeholder label ``"0"``.

    Parameters
    ----------
    IoT_Train : DataFrame
        Must contain ``'eth.src'`` and ``'eth.dst'``. A ``'Label_New'`` column
        is added to this frame in place.
    IoT_Device_List : DataFrame
        Must contain ``'MAC ADDRESS'`` and ``'List of Devices'``. Rows are paired
        positionally, so any index is accepted. If a MAC is listed more than
        once, the last entry wins.
    gateway_mac : str
        Source MAC whose traffic is attributed to the destination device.

    Returns
    -------
    DataFrame
        The rows of ``IoT_Train`` whose label is not ``"0"``,
        ``"TPLink Router Bridge LAN (Gateway)"``, ``"Nest Dropcam"`` or
        ``"MacBook/Iphone"``.
    """
    # One dict lookup per packet instead of one full-column pass per device.
    mac_to_device = dict(zip(IoT_Device_List["MAC ADDRESS"], IoT_Device_List["List of Devices"]))

    src_mac = IoT_Train['eth.src'].astype(object)
    dst_mac = IoT_Train['eth.dst'].astype(object)
    src_device = src_mac.map(mac_to_device)
    dst_device = dst_mac.map(mac_to_device)

    # Gateway -> known device: the destination decides the label.
    from_gateway = (src_mac == gateway_mac).to_numpy() & dst_device.notna().to_numpy()
    labels = np.where(from_gateway,
                      dst_device.to_numpy(dtype=object),
                      src_device.to_numpy(dtype=object))
    # Always use the string placeholder so the filter below removes unmatched rows.
    labels[pd.isna(labels)] = "0"
    IoT_Train['Label_New'] = labels

    dropped_labels = ["TPLink Router Bridge LAN (Gateway)", "0", "Nest Dropcam", "MacBook/Iphone"]
    keep = ~IoT_Train['Label_New'].isin(dropped_labels)
    return IoT_Train[keep]

In [11]:
# Map each device label to the name of the group that contains it
def assign_new_labels(IoT_Train, Groups, Group_Names):
    """Add a ``'Group_Label'`` column derived from ``'Label_New'``.

    ``Groups`` and ``Group_Names`` are paired positionally: rows whose
    ``'Label_New'`` is a member of ``Groups[k]`` get ``Group_Names[k]``.
    The column is written to ``IoT_Train`` in place; the returned frame keeps
    only the rows whose group label is not ``"0"``.
    """
    IoT_Train['Group_Label'] = [0] * len(IoT_Train)

    for members, name in zip(Groups, Group_Names):
        in_group = IoT_Train['Label_New'].isin(members)
        IoT_Train['Group_Label'] = np.where(in_group, name, IoT_Train['Group_Label'])

    has_group = IoT_Train['Group_Label'] != "0"
    return IoT_Train[has_group]

In [12]:
# Load and clean test data
def prepare_test(packet_data, IoT_Device_List):
    """Clean a raw 17-column packet table and label each packet with its device.

    Steps: name the columns, drop rows whose ``ip.proto`` is ``"1,17"``,
    ``"1,6"`` or missing, cast the columns, merge the TCP/UDP port columns into
    ``srcport`` / ``dstport``, then label and filter the rows with
    ``label_flows``.

    Parameters
    ----------
    packet_data : DataFrame
        Must have exactly 17 columns in the order listed below; its column
        names are overwritten in place.
    IoT_Device_List : DataFrame
        Device inventory, passed through to ``label_flows``.

    Returns
    -------
    DataFrame
        The cleaned, labelled rows as returned by ``label_flows`` (including
        its ``'Label_New'`` column).
    """
    packet_data.columns = ["frame.time_relative","ip.src","ip.dst","tcp.srcport","tcp.dstport","ip.len",
                       "tcp.flags.syn","tcp.flags.ack","tcp.flags.push","tcp.flags.fin",
                       "tcp.flags.reset","tcp.flags.ece","ip.proto","udp.srcport","udp.dstport",
                       "eth.src","eth.dst"]

    # Rows with two comma-separated protocol values, or none, are discarded.
    packet_data = packet_data[(packet_data["ip.proto"] != "1,17") & (packet_data["ip.proto"] != "1,6")].reset_index(drop=True)
    packet_data = packet_data.dropna(subset=['ip.proto'])

    for col in ("ip.src", "ip.dst"):
        packet_data[col] = packet_data[col].astype(str)
    for col in ("ip.proto", "ip.len"):
        packet_data[col] = packet_data[col].astype('int')

    # Ports and TCP flags may be missing (e.g. no TCP fields on a UDP packet):
    # cast to nullable integers and replace the gaps with 0.
    nullable_int_columns = ("tcp.srcport", "tcp.dstport", "udp.srcport", "udp.dstport",
                            "tcp.flags.syn", "tcp.flags.ack", "tcp.flags.push",
                            "tcp.flags.fin", "tcp.flags.reset", "tcp.flags.ece")
    for col in nullable_int_columns:
        packet_data[col] = packet_data[col].astype('Int64').fillna(0)

    # Single port pair: TCP ports when ip.proto == 6, UDP ports otherwise.
    is_tcp = packet_data["ip.proto"] == 6
    packet_data["srcport"] = np.where(is_tcp, packet_data["tcp.srcport"], packet_data["udp.srcport"])
    packet_data["dstport"] = np.where(is_tcp, packet_data["tcp.dstport"], packet_data["udp.dstport"])
    packet_data["srcport"] = packet_data["srcport"].astype('Int64')
    packet_data["dstport"] = packet_data["dstport"].astype('Int64')
    packet_data = packet_data.drop(["tcp.srcport","tcp.dstport","udp.srcport","udp.dstport"],axis=1)

    # Labelling and row filtering are shared with the training data.
    return label_flows(packet_data, IoT_Device_List)

In [13]:
# Get test data features and labels
def get_xtest_ytest(Test, group_names=None):
    """Split a labelled packet frame into model features and integer group codes.

    Parameters
    ----------
    Test : DataFrame
        Must contain the ten feature columns listed below and ``'Group_Label'``.
    group_names : sequence of str, optional
        Group names; each label is encoded as its position in this sequence.
        Defaults to the notebook-level ``Group_Names``.

    Returns
    -------
    (X_test, y_test)
        ``X_test`` is the feature sub-frame; ``y_test`` is an int64 Series of
        group codes with the same index.

    Raises
    ------
    ValueError
        If ``'Group_Label'`` contains a value that is not in ``group_names``.
    """
    if group_names is None:
        group_names = Group_Names

    feature_columns = ['ip.len', 'tcp.flags.syn', 'tcp.flags.ack', 'tcp.flags.push',
                       'tcp.flags.fin', 'tcp.flags.reset', 'tcp.flags.ece', 'ip.proto',
                       'srcport', 'dstport']
    X_test = Test[feature_columns]

    # Explicit name -> code mapping with an explicit integer dtype, rather than
    # relying on replace() to downcast an object column to integers.
    code_of = {name: code for code, name in enumerate(group_names)}
    y_test = Test['Group_Label'].map(code_of)
    unmapped = y_test.isna()
    if unmapped.any():
        unknown = sorted(set(Test['Group_Label'][unmapped].astype(str)))
        raise ValueError("Group_Label values not found in group_names: %s" % unknown)

    return X_test, y_test.astype('int64')

In [14]:
# Get train data features and labels
def get_xtrain_ytrain(Test, group_names=None):
    """Split a labelled packet frame into model features and integer group codes.

    Parameters
    ----------
    Test : DataFrame
        The labelled training frame (the parameter keeps its historical name).
        Must contain the ten feature columns listed below and ``'Group_Label'``.
    group_names : sequence of str, optional
        Group names; each label is encoded as its position in this sequence.
        Defaults to the notebook-level ``Group_Names``.

    Returns
    -------
    (X_train, y_train)
        ``X_train`` is the feature sub-frame; ``y_train`` is an int64 Series of
        group codes with the same index.

    Raises
    ------
    ValueError
        If ``'Group_Label'`` contains a value that is not in ``group_names``.
    """
    if group_names is None:
        group_names = Group_Names

    feature_columns = ['ip.len', 'tcp.flags.syn', 'tcp.flags.ack', 'tcp.flags.push',
                       'tcp.flags.fin', 'tcp.flags.reset', 'tcp.flags.ece', 'ip.proto',
                       'srcport', 'dstport']
    X_train = Test[feature_columns]

    # Explicit name -> code mapping with an explicit integer dtype, rather than
    # relying on replace() to downcast an object column to integers.
    code_of = {name: code for code, name in enumerate(group_names)}
    y_train = Test['Group_Label'].map(code_of)
    unmapped = y_train.isna()
    if unmapped.any():
        unknown = sorted(set(Test['Group_Label'][unmapped].astype(str)))
        raise ValueError("Group_Label values not found in group_names: %s" % unknown)

    return X_train, y_train.astype('int64')

In [15]:
# Performs a grid search on the depth of the trees and the max number of trees using different combinations of features
def analyze_models_features(depths, n_trees, X_train, y_train, X_test, y_test, max_feats, f1_threshold=0.99):
    """Print feature rankings and per-feature-count F1 scores for a grid of forests.

    For every (depth, number of trees) pair the features are ranked with
    ``get_feature_importance``; ``get_fewest_features`` then evaluates the top
    1..``max_feats`` of them, printing one line per attempt.

    Parameters
    ----------
    depths, n_trees : iterables of int
        Values of ``max_depth`` and ``n_estimators`` to combine.
    X_train, y_train, X_test, y_test : train / test features and labels.
    max_feats : int
        Only the ``max_feats`` highest-ranked features are tried.
    f1_threshold : float, optional
        Macro F1 at which the search over feature counts stops early for one
        grid point. It is compared with scores expressed as fractions in
        [0, 1]; a value above 1 can never be reached and therefore forces
        every feature count to be evaluated.

    Returns
    -------
    None; results are only printed.
    """
    for depth in depths:
        for n_tree in n_trees:
            evaluation = [None, depth, n_tree]
            print("Depth, Tree:", depth, n_tree)
            importance = get_feature_importance(evaluation, X_train, y_train)
            print(importance)
            get_fewest_features(evaluation, importance[0:max_feats], X_train, y_train,
                                X_test, y_test, f1_threshold, 'macro')

In [16]:
# Use max_leaf_nodes hyperparameter to set a limit on the number of leaves
def prune_model(X_train, y_train, X_test, y_test, depth, trees, max_leaves,
                target_names=None, min_f1=75, max_bits=512):
    """Fit a leaf-limited random forest, score it and report its per-tree size.

    Parameters
    ----------
    X_train, y_train, X_test, y_test : train / test features and labels.
    depth, trees, max_leaves :
        Passed to the forest as ``max_depth``, ``n_estimators`` and
        ``max_leaf_nodes``.
    target_names : sequence of str, optional
        Class names for the classification report. Defaults to the
        notebook-level ``Group_Names``.
    min_f1 : float, optional
        Macro F1 in percent (0-100) above which the summary is printed.
    max_bits : int, optional
        Per-tree budget; a tree whose count exceeds it triggers the error line.

    Returns
    -------
    (model, c_report, y_pred_rf)
        The fitted forest, the classification report as a dict, and the
        predictions for X_test.
    """
    if target_names is None:
        target_names = Group_Names

    model = RandomForestClassifier(max_depth = depth, n_estimators = trees, max_leaf_nodes=max_leaves, random_state=42, bootstrap=False)
    model = model.fit(X_train, y_train)
    y_pred_rf = model.predict(X_test)

    # The macro F1 is computed twice on purpose: both values are printed below.
    F1score = 100*f1_score(y_test, y_pred_rf, average='macro')
    c_report = classification_report(y_test, y_pred_rf, target_names=target_names, output_dict = True)
    macro_f1 = 100*c_report['macro avg']['f1-score']

    if F1score > min_f1:
        print("####")
        print("Accuracy:",100*accuracy_score(y_test, y_pred_rf))
        print("Macro F1-score:", F1score)
        print('Macro F1-score (from c. rep.): ', macro_f1)

        for estimator in model.estimators_:
            # Nodes that are not leaves, i.e. the split nodes of this tree.
            nbitsp4 = estimator.tree_.node_count - estimator.tree_.n_leaves
            print("Number of bits in P4: ", nbitsp4)
            if nbitsp4 > max_bits:
                print("#### ERRROR - String ####")

    return model, c_report, y_pred_rf

### Model training and testing

In [17]:
# Read the pickled training packets, then label every packet with its device and its device group
with open("../Data/IOT_Train_Pickle.pkl", 'rb') as fp:
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source
    IoT_Train = pickle.load(fp)

IoT_Train = assign_new_labels(
    label_flows(IoT_Train, IoT_Device_List),
    Groups,
    Group_Names,
)

In [18]:
# Non-null value count of every column, per device group, in the training data
IoT_Train.groupby(by="Group_Label").count()

Unnamed: 0_level_0,ip.len,tcp.flags.syn,tcp.flags.ack,tcp.flags.push,tcp.flags.fin,tcp.flags.reset,tcp.flags.ece,ip.proto,eth.src,eth.dst,srcport,dstport,label,Label_New
Group_Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Appliances,1412830,1412830,1412830,1412830,1412830,1412830,1412830,1412830,1412830,1412830,1412830,1412830,1412830,1412830
Computers,8465399,8465399,8465399,8465399,8465399,8465399,8465399,8465399,8465399,8465399,8465399,8465399,8465399,8465399
Plugs,514260,514260,514260,514260,514260,514260,514260,514260,514260,514260,514260,514260,514260,514260
Sensors,746777,746777,746777,746777,746777,746777,746777,746777,746777,746777,746777,746777,746777,746777
Video,5136123,5136123,5136123,5136123,5136123,5136123,5136123,5136123,5136123,5136123,5136123,5136123,5136123,5136123


In [19]:
# Split the labelled training packets into the feature matrix and the encoded group labels
X_train, y_train = get_xtrain_ytrain(Test=IoT_Train)

In [20]:
# Load and process test data
IoT_Test_csv = pd.read_csv("../Data/Test_Data.txt", sep="|")
IoT_Test_csv = prepare_test(IoT_Test_csv, IoT_Device_List)

# Assign group labels, then drop every row that still has a missing value.
# (A dropna() result computed before labelling was immediately overwritten, so it is not computed any more.)
IoT_Test = assign_new_labels(IoT_Test_csv, Groups, Group_Names).dropna(axis=0)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [21]:
# Non-null value count of every column, per device group, in the test data
IoT_Test.groupby(by="Group_Label").count()

Unnamed: 0_level_0,frame.time_relative,ip.src,ip.dst,ip.len,tcp.flags.syn,tcp.flags.ack,tcp.flags.push,tcp.flags.fin,tcp.flags.reset,tcp.flags.ece,ip.proto,eth.src,eth.dst,srcport,dstport,Label_New
Group_Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Appliances,81328,81328,81328,81328,81328,81328,81328,81328,81328,81328,81328,81328,81328,81328,81328,81328
Computers,477075,477075,477075,477075,477075,477075,477075,477075,477075,477075,477075,477075,477075,477075,477075,477075
Plugs,43939,43939,43939,43939,43939,43939,43939,43939,43939,43939,43939,43939,43939,43939,43939,43939
Sensors,55986,55986,55986,55986,55986,55986,55986,55986,55986,55986,55986,55986,55986,55986,55986,55986
Video,526996,526996,526996,526996,526996,526996,526996,526996,526996,526996,526996,526996,526996,526996,526996,526996


In [22]:
# Split the labelled test packets into the feature matrix and the encoded group labels
X_test, y_test = get_xtest_ytest(Test=IoT_Test)

In [None]:
# Run the grid search over tree depths 6-14 and 2-6 trees, using at most the 10 best features.
# The best model is then picked from the printed scores according to the desired F1 score.
analyze_models_features(list(range(6, 15)), list(range(2, 7)),
                        X_train, y_train, X_test, y_test, 10)

In [24]:
# Feature subset of the selected model (max depth 10, 3 trees)
feats_per_packet = [
    'srcport',
    'dstport',
    'ip.len',
    'tcp.flags.push',
    'ip.proto',
    'tcp.flags.ack',
]

In [25]:
# Train the selected configuration (depth 10, 3 trees) on the chosen feature subset
final_model_packet = fit_final_model(few=feats_per_packet, evaluation=[None, 10, 3],
                                     X_train=X_train, y_train=y_train)

In [26]:
# For every tree, print node_count - n_leaves (its non-leaf nodes), reported as "bits in P4".
# The budget we do not want to exceed in P4 is 512 bits.
for estimator in final_model_packet.estimators_:
    nbitsp4 = estimator.tree_.node_count - estimator.tree_.n_leaves
    print("Number of bits in P4: ", nbitsp4)

Number of bits in P4:  666
Number of bits in P4:  583
Number of bits in P4:  546


In [None]:
# Sweep max_leaf_nodes from 200 to 510 in steps of 10 to find a leaf limit that keeps
# the model performance while staying beneath the bit limit
for leaf in range(200, 511, 10):
    print("leaves:", leaf)
    prune_model(X_train[feats_per_packet], y_train,
                X_test[feats_per_packet], y_test,
                depth=10, trees=3, max_leaves=leaf)

In [28]:
# Refit with the chosen number of leaves (480 in this case) and keep the model, its report and its predictions
final_10_3_21_480_model_packet_comp, cl_report_21_480_packet_comp, y_pred_data = prune_model(
    X_train[feats_per_packet], y_train,
    X_test[feats_per_packet], y_test,
    depth=10, trees=3, max_leaves=480,
)

####
Accuracy: 90.3152218296432
Macro F1-score: 84.63678791356926
Macro F1-score (from c. rep.):  84.63678791356926
Number of bits in P4:  479
Number of bits in P4:  479
Number of bits in P4:  479


In [29]:
# Persist the final model so it can later be converted into M/A entries
save_model(RF=final_10_3_21_480_model_packet_comp, filename="2stage_1stStage_model.sav")