In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
import pickle
pd.options.mode.chained_assignment = None
from IPython.display import display, HTML

In [2]:
# List of 21 IoT devices and their MAC addresses for labeling the data
IoT_Device_List = pd.read_csv("../Data/iot_device_list.csv")

In [3]:
# List of 21 devices for device identification task
classes = ['Belkin Wemo switch', 'iHome', 'TP-Link Smart plug','Light Bulbs LiFX Smart Bulb', "Withings Aura smart sleep sensor", \
    "Belkin wemo motion sensor", "NEST Protect smoke alarm", "Withings Smart Baby Monitor", "Insteon Camera", "TP-Link Day Night Cloud camera", \
        "Samsung SmartCam", "Dropcam", "Netatmo Welcome", "Amazon Echo", "Triby Speaker", 'Netatmo weather station','Withings Smart scale','Smart Things',\
            'PIX-STAR Photo-frame', 'Laptop', 'MacBook']

classes_df = pd.DataFrame(classes, columns=['class'])
IoT_Device_List = IoT_Device_List[IoT_Device_List["List of Devices"].isin(classes)]
IoT_Device_List = IoT_Device_List.reset_index(drop=True)

In [4]:
# Grouping of 21 devices into 5 groups for the first stage of the hierarchy
Plugs = ['Belkin Wemo switch', 'iHome', 'TP-Link Smart plug','Light Bulbs LiFX Smart Bulb']
Sensors = ["Withings Aura smart sleep sensor", "Belkin wemo motion sensor", "NEST Protect smoke alarm"]
Video = ["Withings Smart Baby Monitor", "Insteon Camera", "TP-Link Day Night Cloud camera", "Samsung SmartCam", "Dropcam", "Netatmo Welcome"]
Appliances = ["Amazon Echo", "Triby Speaker", 'Netatmo weather station','Withings Smart scale','Smart Things','PIX-STAR Photo-frame']
Computers = ['Laptop', 'MacBook']

Group_Names = ["Plugs", "Sensors", "Video", "Appliances", "Computers" ]
Groups = [Plugs, Sensors, Video, Appliances, Computers]

### Functions for the model analysis

In [5]:
# Get feature importance for a decision tree
def get_feature_importance_DT(depth, X_train, y_train):
    dt_opt = DecisionTreeClassifier(max_depth = depth, random_state=42)
    dt_opt.fit(X_train, y_train)
    Feature_importance = pd.DataFrame(dt_opt.feature_importances_)
    Feature_importance.index = X_train.columns
    Feature_importance = Feature_importance.sort_values(by=list(Feature_importance.columns),axis=0,ascending=False)
    return Feature_importance

In [6]:
# Get fewest features required to provide an f1 score beyond a given threshold
def get_fewest_features_DT(depth, importance, X_train, y_train, X_test, y_test, f1_threshold, mode):    
    sorted_feature_names = importance.index
    for f in range(1,len(sorted_feature_names)+1):
        dt_try = DecisionTreeClassifier(max_depth=depth, random_state=42)
        dt_try.fit(X_train[sorted_feature_names[0:f]], y_train)
        y_pred = dt_try.predict(X_test[sorted_feature_names[0:f]])
        # scores
        class_report = classification_report(y_test, y_pred, output_dict = True)
        macro_score = class_report['macro avg']['f1-score']
        weighted_score = class_report['weighted avg']['f1-score']
        tuple_feat = [f, macro_score, weighted_score, sorted_feature_names[0:f]]
        print(tuple_feat)
        if(macro_score >= f1_threshold):
            return tuple_feat
        else:
            continue

In [7]:
# Performs a grid search on the depth of the tree using different combinations of features
def analyze_models_features_DT(depths, X_train, y_train, X_test, y_test, max_feats):
    for depth in depths:
        print("Depth:", depth)
        importance = get_feature_importance_DT(depth, X_train, y_train)
        print(importance)
        get_fewest_features_DT(depth, importance[0:max_feats], X_train, y_train, X_test, y_test, 99, 'macro')

In [8]:
# Function to fit the final models which we will translate to P4
def fit_final_model_DT(few, depth, X_train, y_train):    
    dt_final = DecisionTreeClassifier(max_depth= depth, random_state=42)
    dt_final.fit(X_train[few], y_train)
    return dt_final

In [9]:
# Get classification report of a given model
def show_performance(X_test, y_test, model, features, target_names):
    # testing with out-of-sample data
    X_tested = X_test[features]
    y_pred = model.predict(X_tested)
    class_report = classification_report(y_test, y_pred, target_names=target_names, output_dict = True)
    return class_report

In [10]:
# Dump model to file for later conversion to M/A
def save_model(RF, filename):
    pickle.dump(RF, open(filename, 'wb'))

In [11]:
# Assign labels based on MAC addresses
def label_flows(IoT_Train, IoT_Device_List):
    IoT_Train['Label_New'] = len(IoT_Train)*[0]
    for i in range(len(IoT_Device_List)):
        IoT_Train['Label_New'] = np.where((IoT_Train['eth.src']==IoT_Device_List["MAC ADDRESS"][i]), 
                                          IoT_Device_List["List of Devices"][i], IoT_Train['Label_New'])

    for i in range(len(IoT_Device_List)):
        IoT_Train['Label_New'] = np.where((IoT_Train['eth.dst']==IoT_Device_List["MAC ADDRESS"][i]) & 
                                       (IoT_Train['eth.src']=="14:cc:20:51:33:ea"), 
                                      IoT_Device_List["List of Devices"][i], IoT_Train['Label_New'])

    IoT_Train = IoT_Train[IoT_Train['Label_New']!="TPLink Router Bridge LAN (Gateway)"]
    IoT_Train = IoT_Train[IoT_Train['Label_New']!="0"]
    IoT_Train = IoT_Train[IoT_Train['Label_New']!="Nest Dropcam"]
    IoT_Train = IoT_Train[IoT_Train['Label_New']!="MacBook/Iphone"]

    return IoT_Train

In [12]:
# Load and clean test data
def prepare_test(packet_data, IoT_Device_List):

    packet_data.columns = ["frame.time_relative","ip.src","ip.dst","tcp.srcport","tcp.dstport","ip.len",
                       "tcp.flags.syn","tcp.flags.ack","tcp.flags.push","tcp.flags.fin",
                       "tcp.flags.reset","tcp.flags.ece","ip.proto","udp.srcport","udp.dstport",
                       "eth.src","eth.dst"]
    packet_data = packet_data[(packet_data["ip.proto"] != "1,17") & (packet_data["ip.proto"] != "1,6")].reset_index(drop=True)
    packet_data = packet_data.dropna(subset=['ip.proto'])
    packet_data["ip.src"] = packet_data["ip.src"].astype(str)
    packet_data["ip.dst"] = packet_data["ip.dst"].astype(str)
    packet_data["ip.proto"] = packet_data["ip.proto"].astype('int')
    packet_data["ip.len"] = packet_data["ip.len"].astype("int")
    packet_data["tcp.srcport"] = packet_data["tcp.srcport"].astype('Int64').fillna(0)
    packet_data["tcp.dstport"] = packet_data["tcp.dstport"].astype('Int64').fillna(0)
    packet_data["udp.srcport"] = packet_data["udp.srcport"].astype('Int64').fillna(0)
    packet_data["udp.dstport"] = packet_data["udp.dstport"].astype('Int64').fillna(0)
    packet_data["tcp.flags.syn"] = packet_data["tcp.flags.syn"].astype('Int64').fillna(0)
    packet_data["tcp.flags.ack"] = packet_data["tcp.flags.ack"].astype('Int64').fillna(0)
    packet_data["tcp.flags.push"] = packet_data["tcp.flags.push"].astype('Int64').fillna(0)
    packet_data["tcp.flags.fin"] = packet_data["tcp.flags.fin"].astype('Int64').fillna(0)
    packet_data["tcp.flags.reset"] = packet_data["tcp.flags.reset"].astype('Int64').fillna(0)
    packet_data["tcp.flags.ece"] = packet_data["tcp.flags.ece"].astype('Int64').fillna(0)
    packet_data["srcport"] = np.where(packet_data["ip.proto"] == 6, packet_data["tcp.srcport"], packet_data["udp.srcport"])
    packet_data["dstport"] = np.where(packet_data["ip.proto"] == 6, packet_data["tcp.dstport"], packet_data["udp.dstport"])
    packet_data["srcport"] = packet_data["srcport"].astype('Int64')
    packet_data["dstport"] = packet_data["dstport"].astype('Int64')
    packet_data = packet_data.drop(["tcp.srcport","tcp.dstport","udp.srcport","udp.dstport"],axis=1)
    packet_data["label"] = [0] * len(packet_data)
    for i in range(len(IoT_Device_List)):
        packet_data["label"] = np.where((packet_data["eth.src"]==IoT_Device_List["MAC ADDRESS"][i]), 
                                          IoT_Device_List["List of Devices"][i], packet_data["label"])
    for i in range(len(IoT_Device_List)):
        packet_data["label"] = np.where((packet_data["eth.dst"] ==IoT_Device_List["MAC ADDRESS"][i]) & 
                                       (packet_data["eth.src"]=="14:cc:20:51:33:ea"), 
                                      IoT_Device_List["List of Devices"][i], packet_data["label"])
    packet_data = packet_data[packet_data['label']!="TPLink Router Bridge LAN (Gateway)"]
    packet_data = packet_data[packet_data['label']!="0"]
    packet_data = packet_data[packet_data['label']!="Nest Dropcam"]
    packet_data = packet_data[packet_data['label']!="MacBook/Iphone"]
    
    return packet_data

In [13]:
# Get test data features and labels
def get_xtest_ytest(Test):
    X_test = Test[['ip.len', 'tcp.flags.syn', 'tcp.flags.ack', 'tcp.flags.push','tcp.flags.fin', \
        'tcp.flags.reset', 'tcp.flags.ece', 'ip.proto','srcport', 'dstport']]
    y_test = Test['label'].replace(classes, range(len(classes)))
    return X_test, y_test

In [14]:
# Get train data features and labels
def get_xtrain_ytrain(Test):
    X_train = Test[['ip.len', 'tcp.flags.syn', 'tcp.flags.ack', 'tcp.flags.push',
                   'tcp.flags.fin', 'tcp.flags.reset', 'tcp.flags.ece', 'ip.proto','srcport', 'dstport']]
    y_train = Test['Label_New'].replace(classes, range(len(classes)))
    return X_train, y_train

In [15]:
# Assign new labels based on groups
def assign_new_labels(IoT_Train, Groups, Group_Names):
    IoT_Train['Group_Label'] = len(IoT_Train)*[0]
    for group, group_name in zip(Groups, Group_Names):
        IoT_Train['Group_Label'] = np.where((IoT_Train['Label_New'].isin(group)), group_name, IoT_Train['Group_Label'])
    IoT_Train = IoT_Train[IoT_Train['Group_Label']!="0"]
    return IoT_Train

In [None]:
# Analyze models for a group
def analyze_group_models(IoT_Train,  IoT_Test, Group, depths):
    IoT_Train_group = IoT_Train[IoT_Train['Label_New'].isin(Group)]
    IoT_Test_group  = IoT_Test[IoT_Test['label'].isin(Group)]
    print("Group members:", IoT_Train_group["Label_New"].unique())
    # get train and test for the group
    X_train_group, y_train_group = get_xtrain_ytrain(IoT_Train_group)
    X_test_group, y_test_group = get_xtest_ytest(IoT_Test_group)
    # analyze model
    analyze_models_features_DT(depths, X_train_group, y_train_group, X_test_group, y_test_group, 10)
    # uncomment line below if you want to run evaluaton for RFs too
    #analyze_models_features([7,8,9,10,11], [2,3,4,5], X_train_group, y_train_group, X_test_group, y_test_group, 10)
    

In [16]:
# final models for each group, outputs the model and the classification report
def get_final_model(IoT_Train, IoT_Test, group_name, feats, depth, name):    
    
    IoT_Train_group = IoT_Train[IoT_Train['Label_New'].isin(group_name)]
    IoT_Test_group  = IoT_Test[IoT_Test['label'].isin(group_name)]
    
    print("Group members:", IoT_Train_group["Label_New"].unique())

    # get train and test for the group
    X_train_group, y_train_group = get_xtrain_ytrain(IoT_Train_group)
    X_test_group, y_test_group = get_xtest_ytest(IoT_Test_group)

    # fit model for the group
    model_gr = fit_final_model_DT(feats, depth, X_train_group, y_train_group)

    # save model as .sav file for later conversion in M/A entries
    save_model(model_gr, "2nd_stage_"+name+'_model.sav')
    
    #Prune check
    nbitsp4 = model_gr.tree_.node_count - model_gr.tree_.n_leaves
    print("Number of bits required to encode final code words in P4: ", nbitsp4)

    c_report = show_performance(X_test_group, y_test_group, model_gr, feats, "macro", group_name)

    return c_report, model_gr

### Model training and testing

In [17]:
# Load and process train data
with open("../Data/IOT_Train_Pickle.pkl", 'rb') as fp:
    IoT_Train = pickle.load(fp)
IoT_Train = label_flows(IoT_Train, IoT_Device_List)   

In [18]:
# Load and process test data
IoT_Test_csv = pd.read_csv("../Data/Test_Data.txt", sep="|")
IoT_Test_csv = prepare_test(IoT_Test_csv, IoT_Device_List)
IoT_Test = IoT_Test_csv.dropna(axis=0)

  exec(code_obj, self.user_global_ns, self.user_ns)


### Feature and model selection per group of devices

In [None]:
## Feature and model selection for each group of devices
analyze_group_models(IoT_Train,  IoT_Test, Plugs,      [4,5,6,7,8,9,10])
analyze_group_models(IoT_Train,  IoT_Test, Appliances, [4,5,6,7,8,9,10])
analyze_group_models(IoT_Train,  IoT_Test, Sensors,    [4,5,6,7,8,9,10])
analyze_group_models(IoT_Train,  IoT_Test, Video,      [4,5,6,7,8,9,10])
analyze_group_models(IoT_Train,  IoT_Test, Video,      [4,5,6,7,8,9,10])

In [19]:
# selected features for each group's model
feats_plugs = ['srcport', 'dstport']
feats_appliances = ['srcport', 'dstport', 'ip.len', 'ip.proto', 'tcp.flags.push']
feats_sensors = ['srcport', 'dstport']
feats_video = ['ip.len', 'srcport', 'tcp.flags.ack', 'dstport']
feats_computers_DT = ['ip.len', 'dstport', 'srcport', 'ip.proto']

### Fit models of individual groups with selected features and depths - a .sav file is generated for each model

In [20]:
# 6 is the selected depth from model analysis
c_report_plugs, final_model_plugs = get_final_model(IoT_Train, IoT_Test, Plugs, feats_plugs, 6, "Plugs")

Group members: ['Belkin Wemo switch' 'TP-Link Smart plug' 'Light Bulbs LiFX Smart Bulb'
 'iHome']
Number of bits required to encode final code words in P4:  29


In [21]:
c_report_video, final_model_video = get_final_model(IoT_Train, IoT_Test, Plugs, feats_video, 9, "Video")

Group members: ['Belkin Wemo switch' 'TP-Link Smart plug' 'Light Bulbs LiFX Smart Bulb'
 'iHome']
Number of bits required to encode final code words in P4:  47


In [22]:
c_report_app, final_model_app = get_final_model(IoT_Train, IoT_Test, Plugs, feats_appliances, 8, "Appliances")

Group members: ['Belkin Wemo switch' 'TP-Link Smart plug' 'Light Bulbs LiFX Smart Bulb'
 'iHome']
Number of bits required to encode final code words in P4:  45


In [23]:
c_report_sens, final_model_sens = get_final_model(IoT_Train, IoT_Test, Sensors, feats_sensors, 4, "Sensors")

Group members: ['Belkin wemo motion sensor' 'NEST Protect smoke alarm'
 'Withings Aura smart sleep sensor']
Number of bits required to encode final code words in P4:  11


In [24]:
c_report_comp, final_model_comp = get_final_model(IoT_Train, IoT_Test, Computers, feats_computers_DT, 10, "Computers")

Group members: ['MacBook' 'Laptop']
Number of bits required to encode final code words in P4:  471
