In [1]:
import glob
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
pd.options.mode.chained_assignment = None
from IPython.display import display, HTML

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Functions for model analysis

# Save model function
def save_model(RF, filename):
    """Serialize a model object to disk with pickle.

    Parameters
    ----------
    RF : object
        Any picklable object; in this notebook, the fitted random forest.
    filename : str or path-like
        Destination path. The file is opened in 'wb' mode, so an existing
        file at that path is overwritten.

    Returns
    -------
    None
    """
    # The context manager flushes and closes the handle even if pickling
    # raises, instead of leaving an open file for the garbage collector.
    with open(filename, 'wb') as model_file:
        pickle.dump(RF, model_file)

# Feature Importance
def get_feature_importance(depth, n_tree, max_leaf, X_train, y_train):
    """Rank every training feature by random-forest importance.

    A RandomForestClassifier is fitted on all columns of ``X_train`` with
    ``max_depth=depth``, ``n_estimators=n_tree`` and
    ``max_leaf_nodes=max_leaf``; bootstrap is disabled and ``random_state``
    is fixed to 42.

    Returns
    -------
    pandas.DataFrame
        A single column holding the fitted model's ``feature_importances_``,
        indexed by the column names of ``X_train`` and sorted in descending
        order of importance.
    """
    forest = RandomForestClassifier(
        max_depth=depth,
        n_estimators=n_tree,
        max_leaf_nodes=max_leaf,
        random_state=42,
        bootstrap=False,
    )
    forest.fit(X_train, y_train)

    # One row per feature, labelled with the feature's column name.
    ranking = pd.DataFrame(forest.feature_importances_, index=X_train.columns)
    return ranking.sort_values(by=ranking.columns.tolist(), axis=0, ascending=False)

"""
Function to build the candidate feature subsets from a feature-importance ranking:
the top 1, top 2, ..., top N features. It fits no model itself; depth, n_tree and max_leaf are unused.
"""
def get_fewest_features(depth, n_tree, max_leaf, importance):
    """Return every leading slice of the importance ranking's index.

    Parameters
    ----------
    depth, n_tree, max_leaf
        Accepted but not used by this function.
    importance
        Object whose ``index`` lists feature names in ranked order.

    Returns
    -------
    list
        Entry ``k`` (0-based) is ``importance.index[0:k + 1]``, i.e. the first
        ``k + 1`` names; the list is empty when the index is empty.
    """
    ranked_names = importance.index
    return [ranked_names[0:count] for count in range(1, len(ranked_names) + 1)]

## Get Scores of model with given parameters
def get_scores(classes, depth, n_tree, feats, max_leaf, X_train, y_train, X_test, y_test):
    """Fit a random forest on the chosen features and score it on the test set.

    The classifier uses ``max_depth=depth``, ``n_estimators=n_tree`` and
    ``max_leaf_nodes=max_leaf``, with ``n_jobs=4``, ``random_state=42`` and
    bootstrap disabled. Only the ``feats`` columns of ``X_train`` / ``X_test``
    are used. ``classes`` is passed to ``classification_report`` as
    ``target_names``.

    Returns
    -------
    tuple
        ``(model, class_report, macro_score, weighted_score, y_pred)`` where
        ``class_report`` is the report as a dict and the two scores are the
        F1 values of its 'macro avg' and 'weighted avg' entries.
    """
    forest_settings = dict(
        max_depth=depth,
        n_estimators=n_tree,
        max_leaf_nodes=max_leaf,
        n_jobs=4,
        random_state=42,
        bootstrap=False,
    )
    model = RandomForestClassifier(**forest_settings)

    train_subset = X_train[feats]
    test_subset = X_test[feats]
    model.fit(train_subset, y_train)
    y_pred = model.predict(test_subset)

    class_report = classification_report(
        y_test, y_pred, target_names=classes, output_dict=True
    )
    macro_score = class_report['macro avg']['f1-score']
    weighted_score = class_report['weighted avg']['f1-score']

    return model, class_report, macro_score, weighted_score, y_pred

# Get X and Y from Dataset
def get_x_y_flow(Dataset, classes):
    """Split a flow dataset into its feature matrix and integer-coded labels.

    Parameters
    ----------
    Dataset : pandas.DataFrame
        Must contain the fifteen flow feature columns listed below and a
        'Label' column.
    classes : sequence
        Class names; a name's position in this sequence is its integer code.

    Returns
    -------
    X : pandas.DataFrame
        The fifteen feature columns, in the order listed below.
    y : pandas.Series
        The 'Label' column with every value found in ``classes`` replaced by
        its position. Values not present in ``classes`` are passed through
        unchanged.
    """
    feature_columns = [
        'Min Packet Length', 'Max Packet Length',
        'Packet Length Total', 'Packet Count',
        'Current Packet Length', 'Flow IAT Min', 'Flow IAT Max',
        'Flow Duration', 'SYN Flag Count', 'ACK Flag Count',
        'PSH Flag Count', 'FIN Flag Count',
        'Source Port', 'Destination Port', 'Protocol',
    ]
    X = Dataset[feature_columns]

    # setdefault keeps the first position if a class name is repeated.
    label_codes = {}
    for code, name in enumerate(classes):
        label_codes.setdefault(name, code)

    # An explicit lookup replaces Series.replace(list, range): replace relies
    # on an implicit object -> integer downcast that recent pandas releases
    # deprecate, after which the labels would stay object dtype. Mapping
    # through a function lets pandas infer an integer dtype directly when
    # every label is known.
    y = Dataset['Label'].map(lambda label: label_codes.get(label, label))
    return X, y

# Analyze Models to find best model
def analyze_models(classes, model_type, depths, n_trees, X_train, y_train, X_test, y_test, max_leaf,outfile):
    """Sweep random-forest settings and log the F1 scores of each to a file.

    ``outfile`` is (over)written with a header line followed by one
    ';'-separated row per (depth, n_tree, feature subset):
    ``depth;tree;n_feat;macro;weighted;feats``. For every depth / tree-count
    pair the features are first ranked with ``get_feature_importance``; the
    subsets scored are the ones returned by ``get_fewest_features`` for that
    ranking, each evaluated with ``get_scores``.

    Only ``model_type == 'RF'`` triggers the sweep; for any other value the
    file contains just the header. The function prints a completion message
    and always returns an empty list; the results live in ``outfile`` only.
    """
    with open(outfile, "w") as res_file:
        print('depth;tree;n_feat;macro;weighted;feats', file=res_file)

        # Lazily enumerate the (depth, n_tree) grid; empty for non-RF types.
        if model_type == 'RF':
            grid = ((d, t) for d in depths for t in n_trees)
        else:
            grid = ()

        for depth, n_tree in grid:
            # Rank the features for this configuration, then score each
            # subset derived from that ranking.
            importance = get_feature_importance(depth, n_tree, max_leaf, X_train, y_train)
            for feats in get_fewest_features(depth, n_tree, max_leaf, importance):
                _, _, macro_f1, weight_f1, _ = get_scores(
                    classes, depth, n_tree, feats, max_leaf,
                    X_train, y_train, X_test, y_test,
                )
                row = [depth, n_tree, len(feats), macro_f1, weight_f1, list(feats)]
                print(';'.join(map(str, row)), file=res_file)

    print("Analysis Complete. Check output file.")
    return []

In [3]:
# Traffic classes to recognise. A class's position in this list is the integer
# code its label is replaced with in get_x_y_flow, and the list is also passed
# to classification_report as target_names.
classes = ['ssl', 'bittorrent', 'http', 'edonkey', 'pop3', 'skype', 'imap', 'smtp']

In [4]:
# Load Train and Test data - this data is for the first 3 packets of each flow
train_data = pd.read_csv("unibs2009_train_3_pkt.csv")
test_data = pd.read_csv("unibs2009_test_3_pkt.csv")

# Get Variables and Labels: X holds the flow feature columns, y the 'Label'
# column encoded as positions in `classes`
X_train, y_train = get_x_y_flow(train_data, classes)
X_test,  y_test  = get_x_y_flow(test_data, classes)

In [5]:
# Run model analysis: sweep tree depths 5-10 and forests of 2, 3 or 5 trees,
# each tree capped at 500 leaf nodes. One ';'-separated row per
# (depth, trees, feature subset) is written to Unibs_models.csv.
# Note: analyze_models always returns an empty list, so all_results holds no data.
all_results = analyze_models(classes, "RF", [5,6,7,8,9,10], [2,3,5], X_train, y_train, X_test, y_test, 500, "Unibs_models.csv")

Analysis Complete. Check output file.


In [6]:
# Check model analysis results: highest macro F1 first, weighted F1 as tie-breaker
results_analysis = (
    pd.read_csv("Unibs_models.csv", sep=";")
    .sort_values(by=['macro', 'weighted'], ascending=False)
    .reset_index(drop=True)
)

In [7]:
# Ten best-scoring configurations
results_analysis.head(10)

Unnamed: 0,depth,tree,n_feat,macro,weighted,feats
0,8,3,5,0.971553,0.996094,"['Destination Port', 'Max Packet Length', 'Pac..."
1,9,2,6,0.970211,0.995532,"['Destination Port', 'Max Packet Length', 'Pac..."
2,8,2,6,0.969207,0.994943,"['Destination Port', 'Max Packet Length', 'Pac..."
3,8,2,5,0.968745,0.99622,"['Destination Port', 'Max Packet Length', 'Pac..."
4,9,3,6,0.968511,0.995738,"['Destination Port', 'Max Packet Length', 'Cur..."
5,8,3,6,0.967768,0.996078,"['Destination Port', 'Max Packet Length', 'Pac..."
6,7,2,5,0.96629,0.99604,"['Destination Port', 'Max Packet Length', 'Pac..."
7,10,3,5,0.966145,0.995863,"['Destination Port', 'Max Packet Length', 'Cur..."
8,9,3,5,0.965914,0.995253,"['Destination Port', 'Max Packet Length', 'Cur..."
9,8,5,5,0.965042,0.996078,"['Destination Port', 'Current Packet Length', ..."


In [8]:
# Get features of the best model (row 0 after sorting and re-indexing)
results_analysis.loc[0, 'feats']

"['Destination Port', 'Max Packet Length', 'Packet Length Total', 'Current Packet Length', 'ACK Flag Count']"

In [9]:
# Feature subset of the top-ranked configuration, copied by hand from the
# string displayed by the previous cell; update it if the analysis is re-run
# and the ranking changes.
select_feats = ['Destination Port', 'Max Packet Length', 'Packet Length Total', 'Current Packet Length', 'ACK Flag Count']

In [11]:
# Retrain the best model and get its scores: depth 8, 3 trees, at most 500
# leaf nodes, restricted to select_feats (the settings of the top row of
# results_analysis shown above)
model, class_report, macro_score, weighted_score, y_pred =  get_scores(classes, 8, 3, select_feats, 500, X_train, y_train, X_test, y_test)

In [12]:
# Display the fitted estimator to confirm the hyperparameters it was built with
model

RandomForestClassifier(bootstrap=False, max_depth=8, max_leaf_nodes=500,
                       n_estimators=3, n_jobs=4, random_state=42)

In [13]:
# Save the best model for future use (pickled via save_model)
# NOTE(review): the file name presumably encodes depth_trees_features (8, 3, 5) - confirm
save_model(model, "model_unibs_8_3_5.sav")