In [None]:
#import useful libraries for analysis and modeling
import pandas as pd
import numpy as np
from sklearn import tree
from scipy import stats
import os
import pickle
import sys
import tempfile
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.tree import export_graphviz, DecisionTreeClassifier
pd.options.mode.chained_assignment = None
from IPython.display import display, HTML
import warnings

# Filter all warnings
warnings.filterwarnings("ignore")


In [None]:
# Names of the 26 device classes in the dataset, in a fixed order.
classes = [
    'Dropcam', 'HP Printer', 'Netatmo Welcome', 'Withings Smart Baby Monitor',
    'Netatmo weather station', 'Smart Things', 'Amazon Echo', 'Samsung SmartCam',
    'TP-Link Day Night Cloud camera', 'Triby Speaker', 'Belkin Wemo switch',
    'TP-Link Smart plug', 'PIX-STAR Photo-frame', 'Belkin wemo motion sensor',
    'Samsung Galaxy Tab', 'NEST Protect smoke alarm', 'Withings Smart scale',
    'IPhone', 'MacBook', 'Withings Aura smart sleep sensor',
    'Light Bulbs LiFX Smart Bulb', 'Blipcare Blood Pressure meter', 'iHome',
    'Insteon Camera', 'Android Phone', 'Laptop',
]
# One-column frame over the same names; its row index is the position in `classes`.
classes_df = pd.DataFrame({'class': classes})

# Every extracted feature (per-packet header fields followed by flow statistics).
feats_all = [
    "ip.len", "ip.ttl",
    "tcp.flags.syn", "tcp.flags.ack", "tcp.flags.push", "tcp.flags.fin", "tcp.flags.rst", "tcp.flags.ece",
    "ip.proto", "srcport", "dstport", "ip.hdr_len",
    "tcp.window_size_value", "tcp.hdr_len", "udp.length",
    "Min Packet Length", "Max Packet Length", "Packet Length Mean", "Packet Length Total",
    "UDP Len Min", "UDP Len Max",
    "Flow IAT Min", "Flow IAT Max", "Flow IAT Mean", "Flow Duration",
    "SYN Flag Count", "ACK Flag Count", "PSH Flag Count",
    "FIN Flag Count", "RST Flag Count", "ECE Flag Count",
]

# Features that are easy to compute online: `feats_all` without the two means
# ("Packet Length Mean" and "Flow IAT Mean").
feats_easy = [
    "ip.len", "ip.ttl",
    "tcp.flags.syn", "tcp.flags.ack", "tcp.flags.push", "tcp.flags.fin", "tcp.flags.rst", "tcp.flags.ece",
    "ip.proto", "srcport", "dstport", "ip.hdr_len",
    "tcp.window_size_value", "tcp.hdr_len", "udp.length",
    "Min Packet Length", "Max Packet Length", "Packet Length Total",
    "UDP Len Min", "UDP Len Max",
    "Flow IAT Min", "Flow IAT Max", "Flow Duration",
    "SYN Flag Count", "ACK Flag Count", "PSH Flag Count",
    "FIN Flag Count", "RST Flag Count", "ECE Flag Count",
]

# `feats_easy` without "ip.hdr_len", the UDP length extrema and the
# inter-arrival-time / duration features.
feats_no_time = [
    "ip.len", "ip.ttl",
    "tcp.flags.syn", "tcp.flags.ack", "tcp.flags.push", "tcp.flags.fin", "tcp.flags.rst", "tcp.flags.ece",
    "ip.proto", "srcport", "dstport",
    "tcp.window_size_value", "tcp.hdr_len", "udp.length",
    "Min Packet Length", "Max Packet Length", "Packet Length Total",
    "SYN Flag Count", "ACK Flag Count", "PSH Flag Count",
    "FIN Flag Count", "RST Flag Count", "ECE Flag Count",
]

 ### Helper Functions

In [None]:
""" Function to save trained model to pickle"""
def save_model(RF, filename):
    """Serialize a trained model to ``filename`` using pickle.

    Parameters
    ----------
    RF : object
        The model to store (any picklable object).
    filename : str or path-like
        Destination file; it is opened in binary write mode, so an existing
        file is overwritten.

    Returns
    -------
    None
    """
    # Use a context manager so the handle is flushed and closed deterministically,
    # even if pickling raises part-way through.
    with open(filename, 'wb') as model_file:
        pickle.dump(RF, model_file)

"""
Function to get labels and indices of the Test set.
"""
def get_test_labels(IoT_Test):
    """Return the distinct labels of a test set and their positions in ``classes_df``.

    Parameters
    ----------
    IoT_Test : pandas.DataFrame
        Frame with a "Label" column.

    Returns
    -------
    tuple
        ``(unique_labels, array_of_indices)`` where ``unique_labels`` is the
        result of ``IoT_Test["Label"].unique()`` and ``array_of_indices`` is a
        list holding, for each of those labels in the same order, the index of
        the first matching row of the module-level ``classes_df``.

    Raises
    ------
    IndexError
        If a label has no matching row in ``classes_df``.
    """
    unique_labels = IoT_Test["Label"].unique()
    array_of_indices = [
        classes_df.loc[classes_df['class'] == label_name].index.values[0]
        for label_name in unique_labels
    ]
    return unique_labels, array_of_indices

"""
Function to Fit model based on optimal values of depth and number of estimators and use it
to compute feature importance for all the features.
"""
def get_feature_importance(depth, n_tree, max_leaf, X_train, y_train, weight_of_samples):
    """Fit a random forest on all columns of ``X_train`` and rank the features.

    Parameters
    ----------
    depth : int
        ``max_depth`` of the forest.
    n_tree : int
        ``n_estimators`` of the forest.
    max_leaf : int
        ``max_leaf_nodes`` of the forest.
    X_train : pandas.DataFrame
        Training features; its column names become the index of the result.
    y_train : array-like
        Training targets.
    weight_of_samples : array-like
        Per-sample weights passed to ``fit``.

    Returns
    -------
    pandas.DataFrame
        A single-column frame of ``feature_importances_`` indexed by feature
        name and sorted from most to least important.
    """
    forest = RandomForestClassifier(
        n_estimators=n_tree,
        max_depth=depth,
        max_leaf_nodes=max_leaf,
        bootstrap=False,
        random_state=42,
        n_jobs=10,
    )
    forest.fit(X_train, y_train, sample_weight=weight_of_samples)

    ranking = pd.DataFrame(forest.feature_importances_, index=X_train.columns)
    return ranking.sort_values(by=list(ranking.columns), axis=0, ascending=False)

"""
Function to build the nested candidate feature sets (top-1, top-2, ..., top-k) from the feature
importance ranking, used to search for the fewest features that reach the desired score
"""
def get_fewest_features(depth, n_tree, max_leaf, importance):
    """Build the nested prefixes of an importance-ranked feature index.

    Parameters
    ----------
    depth, n_tree, max_leaf
        Accepted for signature compatibility; not used by this function.
    importance : pandas.DataFrame
        Frame whose index lists feature names in ranked order.

    Returns
    -------
    list
        ``[index[0:1], index[0:2], ..., index[0:k]]`` where ``k`` is the number
        of rows in ``importance``; an empty list when there are no rows.
    """
    ranked_names = importance.index
    return [ranked_names[0:count] for count in range(1, len(ranked_names) + 1)]


def get_x_y_flow(Dataset, feats):
    """Split a dataset into features, encoded labels and the sample-nature column.

    Parameters
    ----------
    Dataset : pandas.DataFrame
        Frame containing the ``feats`` columns plus "Label" and "sample_nature".
    feats : list
        Names of the feature columns to select.

    Returns
    -------
    tuple
        ``(X, y, sample_nature)`` where ``X`` is ``Dataset[feats]``, ``y`` is
        the "Label" column with every name found in the module-level ``classes``
        list replaced by its position in that list, and ``sample_nature`` is
        the "sample_nature" column.
    """
    class_ids = range(len(classes))
    return (
        Dataset[feats],
        Dataset['Label'].replace(classes, class_ids),
        Dataset['sample_nature'],
    )


"""
Function to calculate the score of the joint model
"""
def get_scores(classes, depth, n_tree, feats, max_leaf, X_train, y_train, X_test, y_test, unique_labels,array_of_indices,weight_of_samples):
    """Train a random forest on ``feats`` and score it on the test set.

    Parameters
    ----------
    classes
        Not used by this function.
    depth, n_tree, max_leaf : int
        ``max_depth``, ``n_estimators`` and ``max_leaf_nodes`` of the forest.
    feats : list-like
        Columns of ``X_train`` / ``X_test`` to use.
    X_train, X_test : pandas.DataFrame
        Feature frames.
    y_train : array-like
        Training targets.
    y_test : pandas.Series
        Test targets; values are converted with ``int()``.
    unique_labels
        Passed as ``labels`` to ``classification_report``. The callers in this
        notebook pass the integer class ids here.
    array_of_indices
        Passed as ``target_names`` to ``classification_report``. The callers
        in this notebook pass the device names here.
    weight_of_samples : array-like
        Per-sample weights for ``fit``.

    Returns
    -------
    tuple
        ``(model, class_report, macro_score, weighted_score, y_pred)``: the
        fitted forest, the report dictionary, its macro and weighted average
        F1 scores, and the predictions as a list of ``int``.
    """
    model = RandomForestClassifier(
        n_estimators=n_tree,
        max_depth=depth,
        max_leaf_nodes=max_leaf,
        bootstrap=False,
        random_state=42,
        n_jobs=10,
    )
    model.fit(X_train[feats], y_train, sample_weight=weight_of_samples)
    raw_predictions = model.predict(X_test[feats])

    # Plain Python ints on both sides keep the report's label matching uniform.
    true_ids = list(map(int, y_test.values))
    predicted_ids = list(map(int, raw_predictions))

    class_report = classification_report(
        true_ids,
        predicted_ids,
        labels=unique_labels,
        target_names=array_of_indices,
        output_dict=True,
    )
    macro_score = class_report['macro avg']['f1-score']
    weighted_score = class_report['weighted avg']['f1-score']

    return model, class_report, macro_score, weighted_score, predicted_ids

"""
Function to calculate the score of the joint model in terms of Packet-Level metric
"""
def expand_rows_and_get_scores(y_true, y_pred, sample_nature, multiply,unique_labels,array_of_indices):
    """Compute packet-level F1 scores by repeating flow rows.

    Each row whose nature is ``'flw'`` is counted ``mult + 1`` times, where
    ``mult`` is its entry in ``multiply``; every other row is counted once.

    Parameters
    ----------
    y_true, y_pred : iterable
        True and predicted labels, row-aligned; values are converted with ``int()``.
    sample_nature : iterable of str
        Per-row nature marker (``'flw'`` triggers the expansion).
    multiply : iterable of int
        Per-row repeat count used for ``'flw'`` rows.
    unique_labels, array_of_indices
        Forwarded as ``labels`` and ``target_names`` to the report used for
        the macro score.

    Returns
    -------
    tuple
        ``(num_samples, macro_f1, weighted_f1)`` where ``num_samples`` is the
        number of rows after expansion. The weighted score comes from a second
        report that is built without ``labels`` / ``target_names``.
    """
    expanded_y_true = []
    expanded_y_pred = []

    for true_label, pred_label, nature, mult in zip(y_true, y_pred, sample_nature, multiply):
        copies = (mult + 1) if nature == 'flw' else 1
        expanded_y_true.extend([true_label] * copies)
        expanded_y_pred.extend([pred_label] * copies)

    num_samples = len(expanded_y_true)

    expanded_y_true = list(map(int, expanded_y_true))
    expanded_y_pred = list(map(int, expanded_y_pred))

    labelled_report = classification_report(
        expanded_y_true,
        expanded_y_pred,
        labels=unique_labels,
        target_names=array_of_indices,
        output_dict=True,
    )
    unlabelled_report = classification_report(expanded_y_true, expanded_y_pred, output_dict=True)

    macro_f1 = labelled_report['macro avg']['f1-score']
    weighted_f1 = unlabelled_report['weighted avg']['f1-score']

    return num_samples, macro_f1, weighted_f1

"""
Function to calculate the score of the joint model in classifying the flow and classifying the first N-1 packets
"""
def compute_flow_pkt_scores(y_pred, y_test, sample_nature,unique_labels,array_of_indices):
    """Compute macro and weighted F1 separately for 'pkt' rows and 'flw' rows.

    Parameters
    ----------
    y_pred : sequence
        Predicted labels.
    y_test : sequence or pandas.Series
        True labels.
    sample_nature : sequence or pandas.Series
        Per-row marker; rows equal to ``'pkt'`` and ``'flw'`` are scored separately.
    unique_labels, array_of_indices
        Forwarded as ``labels`` and ``target_names`` to ``classification_report``.

    Returns
    -------
    tuple
        ``(pkt_macro_f1, pkt_weighted_f1, flw_macro_f1, flw_weighted_f1)``.
    """
    # Gather the three columns so that rows can be filtered together.
    df = pd.DataFrame({'y_pred': y_pred, 'y_test': y_test, 'sample_nature': sample_nature})

    def _macro_and_weighted_f1(subset):
        """Build one report for ``subset`` and read both averaged F1 scores from it."""
        subset_y_true = [int(label) for label in subset['y_test'].values]
        subset_y_pred = [int(label) for label in subset['y_pred']]
        # A single report already contains both averages, so it is computed once
        # rather than once per score.
        report = classification_report(
            subset_y_true,
            subset_y_pred,
            labels=unique_labels,
            target_names=array_of_indices,
            output_dict=True,
        )
        return report['macro avg']['f1-score'], report['weighted avg']['f1-score']

    pkt_macro_f1, pkt_weighted_f1 = _macro_and_weighted_f1(df[df['sample_nature'] == 'pkt'])
    flw_macro_f1, flw_weighted_f1 = _macro_and_weighted_f1(df[df['sample_nature'] == 'flw'])

    return pkt_macro_f1, pkt_weighted_f1, flw_macro_f1, flw_weighted_f1


In [None]:
'''
Function to check the conditions and detect whether the packet is the Nth packet or
comes before the Nth packet, and assign a value to classify it accordingly
'''
def assign_sample_nature(row):
    """Label a row as ``"pkt"`` or ``"flw"`` from its flow-statistic fields.

    Parameters
    ----------
    row : mapping-like
        Must provide "Min Packet Length", "Max Packet Length", "Flow IAT Min"
        and "Flow IAT Max" (fields are read in that order, stopping at the
        first one that is not -1).

    Returns
    -------
    str
        ``"pkt"`` when all four fields equal -1, otherwise ``"flw"``.
    """
    flow_stat_fields = (
        "Min Packet Length",
        "Max Packet Length",
        "Flow IAT Min",
        "Flow IAT Max",
    )
    no_flow_stats = all(row[field] == -1 for field in flow_stat_fields)
    return "pkt" if no_flow_stats else "flw"

 ### Function for grid search on hyperparameters (depth, number of trees, number of features, maximum number of leaves), features and models with a given N value

In [None]:
# Analyze models
def analyze_models(classes, model_type, depths, n_trees, X_train, y_train, X_test, y_test, samples_nature, y_multiply, max_leaf,  test_labels, test_indices, max_feats, filename, weight_of_samples):
    """Grid-search random forests and write one result row per configuration.

    For every ``(depth, n_tree, leaf)`` combination the features are ranked
    with ``get_feature_importance``, the top ``max_feats`` are kept, and a
    model is trained and scored for each nested prefix of that ranking.
    Results are written to ``filename`` as ``;``-separated rows under a header
    line. Nothing but the header is written when ``model_type`` is not ``'RF'``.

    Parameters
    ----------
    classes
        Forwarded to ``get_scores``.
    model_type : str
        Only ``'RF'`` triggers the search.
    depths, n_trees, max_leaf : iterable of int
        Candidate values for tree depth, number of trees and maximum leaves.
    X_train, y_train, X_test, y_test
        Training and test features / targets.
    samples_nature
        Per-row 'pkt' / 'flw' markers of the test set.
    y_multiply
        Per-row repeat counts used for the packet-level scores.
    test_labels, test_indices
        Outputs of ``get_test_labels``. ``test_indices`` is passed to the
        scoring helpers as their ``unique_labels`` argument and ``test_labels``
        as their ``array_of_indices`` argument.
    max_feats : int
        Number of top-ranked features to consider.
    filename : str
        Output file, overwritten on each call.
    weight_of_samples : array-like
        Per-sample training weights.
    """
    header = 'depth;tree;no_feats;pkt_macro_f1;pkt_weighted_f1;flw_macro_f1;flw_weighted_f1;F1_macro;F1_weighted;num_samples;PL_Macro_F1;PL_Weighted_F1;N_Leaves;feats'

    # Open the file that stores the output of the analysis.
    with open(filename, "w") as res_file:
        print(header, file=res_file)
        if model_type == 'RF':
            for depth in depths:
                for n_tree in n_trees:
                    for leaf in max_leaf:
                        # Rank the features for this configuration and keep the best ones.
                        ranking = get_feature_importance(depth, n_tree, leaf, X_train, y_train, weight_of_samples)
                        ranking = ranking[0:max_feats]
                        candidate_sets = get_fewest_features(depth, n_tree, leaf, ranking)

                        for feats in candidate_sets:
                            # Overall scores of the model trained on this feature set.
                            _model, _report, macro_f1, weight_f1, y_pred = get_scores(
                                classes, depth, n_tree, feats, leaf,
                                X_train, y_train, X_test, y_test,
                                test_indices, test_labels, weight_of_samples,
                            )
                            # Scores split by sample nature ('pkt' vs 'flw').
                            split_scores = compute_flow_pkt_scores(
                                y_pred, y_test, samples_nature, test_indices, test_labels,
                            )
                            pkt_macro_f1, pkt_weighted_f1, flw_macro_f1, flw_weighted_f1 = split_scores
                            # Packet-level scores after expanding the flow rows.
                            num_samples, PL_macro_f1, PL_weighted_f1 = expand_rows_and_get_scores(
                                y_test, y_pred, samples_nature, y_multiply, test_indices, test_labels,
                            )

                            # Save the model analysis results.
                            row = [
                                depth, n_tree, len(feats),
                                pkt_macro_f1, pkt_weighted_f1, flw_macro_f1, flw_weighted_f1,
                                macro_f1, weight_f1,
                                num_samples, PL_macro_f1, PL_weighted_f1,
                                leaf, list(feats),
                            ]
                            print(';'.join(str(value) for value in row), file=res_file)
    print("Analysis Complete. Check output file.")


 ### Model Analysis - Flows with the first n packets

In [None]:
# Takes desired number of packets (N), the output file to store values, and the feature list to use in the model analysis
def analyze_model_n_packets(npkts, outfile, feats_to_use):
    """Run the model grid search for a given N and write the results to ``outfile``.

    Reads ``train_data_<npkts>_pkts.csv``, ``test_data_<npkts>_pkts.csv`` and
    ``test_data_flow_packet_counts.csv`` from the working directory, prepares
    the sample-nature markers, training weights and packet multipliers, and
    calls ``analyze_models`` over a fixed search space.

    Parameters
    ----------
    npkts : int
        N, the rank of the packet for which the flow-level inference is triggered.
    outfile : str
        File that receives the analysis results.
    feats_to_use : list
        Feature columns made available to the models.
    """
    # Load Train and Test data
    train_data = pd.read_csv(f"train_data_{npkts}_pkts.csv")
    test_data = pd.read_csv(f"test_data_{npkts}_pkts.csv")

    # Look up the number of packets of each flow in the test data and attach
    # it to the test rows through their "Flow ID".
    packets_per_flow = (
        pd.read_csv("test_data_flow_packet_counts.csv")
        .set_index("flow.id")["count"]
        .to_dict()
    )
    test_data["pkt_count"] = test_data["Flow ID"].map(packets_per_flow)

    # Rows whose three packet-length statistics are all -1 get a multiplier of 1;
    # every other row gets the flow's packet count minus N.
    stat_columns = ['Min Packet Length', 'Max Packet Length', 'Packet Length Mean']
    all_minus_one = (test_data[stat_columns] == -1).all(axis=1)
    test_data['multiply'] = np.where(all_minus_one, 1, test_data['pkt_count'] - npkts)

    test_labels, test_indices = get_test_labels(test_data)
    print("Num Labels: ", len(test_labels))

    # Shuffle with a fixed seed, then drop rows that lack a source or destination port.
    port_columns = ['srcport', 'dstport']
    train_data = train_data.sample(frac=1, random_state=42).dropna(subset=port_columns)
    test_data = test_data.sample(frac=1, random_state=42).dropna(subset=port_columns)

    # Assign 'pkt' to all the packets before the Nth packet and 'flw' to all Nth packets.
    train_data['sample_nature'] = train_data.apply(assign_sample_nature, axis=1)
    test_data['sample_nature'] = test_data.apply(assign_sample_nature, axis=1)

    # Training weights: N for 'flw' rows, 1 for all other rows.
    train_data['weight'] = np.where(train_data['sample_nature'] == 'flw', npkts, 1)
    weight_of_samples = list(train_data['weight'])

    # Get Variables and Labels
    y_multiply = test_data['multiply'].astype(int)
    X_train, y_train, sample_nat_train = get_x_y_flow(train_data, feats_to_use)
    X_test, y_test, sample_nat_test = get_x_y_flow(test_data, feats_to_use)

    # Search space: maximum leaves, tree depth, number of trees, number of features.
    leaves = [41, 85, 129, 173, 217, 261, 305, 349, 393, 437, 481, 500]
    depths = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
    trees = [1, 2, 3, 4, 5]
    max_feat = 10

    # Run model analysis with the given search space and the value of 'N'.
    analyze_models(
        classes, "RF", depths, trees,
        X_train, y_train, X_test, y_test,
        sample_nat_test, y_multiply, leaves,
        test_labels, test_indices, max_feat,
        outfile, weight_of_samples,
    )
    


In [None]:
#### Run for different N values (N=2 to 10) - (N: the rank of the packet for which the Flow-Level inference is triggered.)
# range() excludes its stop value, so the stop must be 11 for N=10 to be part of the sweep.
for nd in range(2, 11):
    print("Number of Packets for Flow Features: ", nd)
    f_name = "unsw_models_results_"+str(nd)+"pkts.csv"
    analyze_model_n_packets(nd, f_name, feats_no_time)

#### Generate the chosen model and save it for later conversion into M/A entries

In [None]:
# Settings of the chosen model: its size, the N value, and the features it uses.
depth = 14           # maximum tree depth
n_of_trees = 3       # number of trees in the forest
max_n_leaves = 481   # maximum number of leaves per tree
npkts = 3            # N value
feats_to_use = [
    'tcp.window_size_value',
    'ip.len',
    'srcport',
    'dstport',
    'ip.ttl',
    'tcp.hdr_len',
]

In [None]:
# Load Train and Test data for the chosen N value
train_data = pd.read_csv("train_data_"+str(npkts)+"_pkts.csv")
test_data = pd.read_csv("test_data_"+str(npkts)+"_pkts.csv")
#
#
# Generate a dataframe with the information of number of packets of each flow in Test Data.
# The file name carries the ".csv" extension, matching the one read in analyze_model_n_packets.
flow_pkt_counts = pd.read_csv("test_data_flow_packet_counts.csv")
flow_count_dict = flow_pkt_counts.set_index("flow.id")["count"].to_dict()
# Map the values from flow_pkt_counts to test_data based on the "Flow ID" column
test_data["pkt_count"] = test_data["Flow ID"].map(flow_count_dict)
#
# Rows whose three packet-length statistics are all -1
all_minus_one = (test_data['Min Packet Length'] == -1) & (test_data['Max Packet Length'] == -1) & (test_data['Packet Length Mean'] == -1)
# Assign values to the multiply column: 1 for the rows flagged above,
# otherwise the flow's packet count minus N
test_data['multiply'] = np.where(all_minus_one, 1, test_data['pkt_count'] - npkts)
#
test_labels, test_indices = get_test_labels(test_data)
print("Num Labels: ", len(test_labels))
#
# Shuffle both sets with a fixed seed
train_data = train_data.sample(frac=1, random_state=42)
test_data  = test_data.sample(frac=1, random_state=42)

# Drop rows without a source or destination port
train_data = train_data.dropna(subset=['srcport', 'dstport']) 
test_data  = test_data.dropna(subset=['srcport', 'dstport'])
#
# Assign 'pkt' to all the packets before Nth packet and 'flw' to all Nth packets to classify them accordingly.
train_data['sample_nature'] = train_data.apply(assign_sample_nature, axis=1)
test_data['sample_nature']  = test_data.apply(assign_sample_nature, axis=1)

# Assign weights to the packets for training. 
## 1: if the packet is before Nth  N: If the packet is the Nth packets (N: the rank of the packet for which the Flow-Level inference is triggered.)
train_data['weight'] = np.where(train_data['sample_nature'] == 'flw', npkts, 1)
weight_of_samples = list(train_data['weight'])

# Get Variables and Labels
y_multiply = test_data['multiply'].astype(int)
X_train, y_train, sample_nat_train = get_x_y_flow(train_data, feats_to_use)
X_test,  y_test, sample_nat_test  = get_x_y_flow(test_data, feats_to_use)

In [None]:
# Generate the model: train it on the chosen features and score it on the test set.
# get_scores names these two parameters `unique_labels` / `array_of_indices`;
# they receive the integer class indices and the label names respectively.
(model_unsw_jewel_14_3_6_N3,
 c_report,
 macro_f1,
 weight_f1,
 y_pred) = get_scores(
    classes, depth, n_of_trees, feats_to_use, max_n_leaves,
    X_train, y_train, X_test, y_test,
    unique_labels=test_indices,
    array_of_indices=test_labels,
    weight_of_samples=weight_of_samples,
)

In [None]:
# Persist the trained model so that it can later be converted into M/A entries.
save_model(
    RF=model_unsw_jewel_14_3_6_N3,
    filename='model_unsw_jewel_14_3_6_n3.sav',
)