In [1]:
import pickle
import os

from scapy.layers.inet import TCPOptions

from math import log, ceil

import numpy             as np
import pandas            as pd
import tensorflow        as tf
import matplotlib.pyplot as plt

from sklearn                 import svm
from sklearn.compose         import make_column_transformer
from sklearn.preprocessing   import MinMaxScaler, OneHotEncoder, LabelEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics         import plot_confusion_matrix, ConfusionMatrixDisplay, top_k_accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# opt_to_num = {opt:i  for i,opt  in enumerate(TCPOptions[1].keys())}
# opt_to_num["NONE"] = -1

def get_os_label(nmap_output):
    """Build an OS label of the form "<osfamily>_<osgen>".

    Only the first OS class of the first entry of ``nmap_output`` is used,
    i.e. ``nmap_output[0]['osclass'][0]``. A missing entry or key propagates
    as IndexError / KeyError.

    (Returning only the 'osfamily' field is a coarser labelling alternative.)
    """
    first_entry = nmap_output[0]
    first_class = first_entry['osclass'][0]
    family, generation = first_class['osfamily'], first_class['osgen']
    return f"{family}_{generation}"


def preprocess_curr_addr_features(curr_addr_features, curr_addr_label):
    """Attach the label to one address' feature dict and normalise it.

    The dictionary is modified in place and is also the return value:
      * 'label' is set to ``curr_addr_label``;
      * 'ttl' is replaced by the smallest power of two that is >= the
        original value;
      * 'mss' is removed (KeyError if it is not present).
    """
    features = curr_addr_features
    features['label'] = curr_addr_label

    # NOTE(review): the exponent is computed with a floating-point log;
    # confirm it is exact for every ttl value that occurs in the data.
    ttl_exponent = ceil(log(features['ttl'], 2))
    features['ttl'] = 2 ** ttl_exponent

    del features['mss']
    return features

In [3]:
# Load the pickled per-file {address: features} and {address: label} dictionaries
# and flatten them into one list of labelled feature dicts.
# NOTE: these are machine-specific absolute paths; point them at the local data copy.
label_dir   = 'C:\\Users\\ofir\\OneDrive - mail.tau.ac.il\\personal\\my_p0f\\data\\ips_label_dict'
feature_dir = 'C:\\Users\\ofir\\OneDrive - mail.tau.ac.il\\personal\\my_p0f\\data\\ips_feature_dict'
MAX_FILES   = 7000   # only the first MAX_FILES label files (sorted by name) are read
all_data    = []

for filename in sorted(os.listdir(label_dir))[:MAX_FILES]:
    # Every label file is paired with the feature file of the same name.
    # `with` closes both handles even when unpickling raises.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(os.path.join(feature_dir, filename), 'rb') as feature_file, \
         open(os.path.join(label_dir, filename), 'rb') as label_file:
        ip_to_features = pickle.load(feature_file)
        ip_to_label    = pickle.load(label_file)

    for addr, curr_addr_features in ip_to_features.items():
        # Skip addresses that have no features, no label entry, or an empty label entry.
        if curr_addr_features is None or addr not in ip_to_label or ip_to_label[addr] == []:
            continue

        curr_addr_label = get_os_label(ip_to_label[addr])
        if len(curr_addr_label) == 0:
            continue

        try:
            all_data.append(preprocess_curr_addr_features(curr_addr_features, curr_addr_label))
        except Exception:
            # Report which file contained the offending record, then re-raise
            # with the original traceback intact.
            print(f'error on {filename}')
            raise

all_data_df = pd.DataFrame.from_dict(all_data)
# Keep the original 'opts' values in 'opts_str' before replacing them with integer codes.
all_data_df['opts_str'] = all_data_df['opts'].copy(deep=True)
all_data_df['opts'] = LabelEncoder().fit_transform(all_data_df['opts'])

In [None]:
# Samples per OS label, printed with the row display limit raised to 1000.
label_counts = all_data_df['label'].value_counts()
with pd.option_context("display.max_rows", 1000):
    print(label_counts)

In [4]:
# Keep only frequent OS labels, balance them and build the feature matrix.
# NOTE: this cell overwrites all_data_df. Once it has run, every label has exactly
# SAMPLES_PER_CLASS rows, so running it again (strict '>' below) leaves an empty
# frame -- re-run the loading cell first.
SAMPLES_PER_CLASS = 4000
SHUFFLE_SEED      = 11235   # same value the train/test splits use as random_state

# get rid of datapoints with a rare OS
os_counts   = all_data_df.groupby('label').size().to_dict()
common_os   = [os_label  for os_label, count in os_counts.items()  if  count > SAMPLES_PER_CLASS]
all_data_df = all_data_df.loc[all_data_df['label'].isin(common_os)]

# balance the dataset: keep the first SAMPLES_PER_CLASS rows of every label
all_data_df = all_data_df.groupby('label', sort=False).head(SAMPLES_PER_CLASS)

# shuffle dataframe (seeded, so the row order and hence the later splits are reproducible)
all_data_df = all_data_df.sample(frac=1, random_state=SHUFFLE_SEED)

# create feature matrix
NUM_CLASSES = all_data_df['label'].nunique()

# 'label' is the target and 'opts_str' is the un-encoded copy of 'opts'; neither is a feature
X = all_data_df.drop(columns=['label', 'opts_str'])

In [12]:
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)
# print(all_data_df.sort_values('label'))

In [13]:
# Samples per OS label in the current all_data_df (row display limit: 1000).
counts_per_label = all_data_df.loc[:, 'label'].value_counts()
with pd.option_context("display.max_rows", 1000):
    print(counts_per_label)

Linux_2.6.X      4000
Windows_2016     4000
Linux_5.X        4000
Linux_None       4000
OpenBSD_4.X      4000
Windows_2012     4000
Linux_3.X        4000
FreeBSD_6.X      4000
embedded_None    4000
Linux_4.X        4000
Name: label, dtype: int64


In [None]:
# Print, for every column of all_data_df, its name followed by its value counts.
with pd.option_context("display.max_rows", 1000):
    for column, values in all_data_df.items():
        print(column)
        print(values.value_counts())

In [None]:
# SVM baselines: fit one SVC per kernel and report top-1 and top-2 test accuracy.
y      = LabelEncoder().fit_transform(all_data_df['label'])
labels = sorted(np.unique(y))


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11235)

# The listed columns are one-hot encoded; any other column is passed through unchanged.
# The encoder is fitted on the training split only.
ct = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), ['w_size', 'df', 'df+', 'df-', 'fo+', 'fo-', 'ecn', 'seq0', 'ttl', 'w_scale', 'opts']),
    remainder='passthrough'
)
ct.fit(X_train)

X_train_normal, X_test_normal = ct.transform(X_train), ct.transform(X_test)


# All kernels use the same regularisation strength C=1 (the poly run previously
# referenced an undefined name `c`, which fails on a fresh kernel).
# probability=True is required for predict_proba, which feeds the top-2 score.
for kernel in ('linear', 'rbf', 'poly'):
    svm_classifier = svm.SVC(kernel=kernel, probability=True, C=1, decision_function_shape='ovo').fit(X_train_normal, y_train)
    y_preds = svm_classifier.predict_proba(X_test_normal)
    print(f'{kernel} score (accuracy)\t: {svm_classifier.score(X_test_normal, y_test)}')
    print(f'\t{kernel} score (top-2)\t: {top_k_accuracy_score(y_test, y_preds, k=2, labels=labels)}')

In [6]:
# Dense network: binarize the labels, encode the features, then build and train the model.
y = LabelBinarizer().fit_transform(all_data_df['label'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11235
)

# One-hot encode the listed columns (encoder fitted on the training split only);
# any remaining column is passed through unchanged.
categorical_columns = ['w_size', 'df', 'df+', 'df-', 'fo+', 'fo-', 'ecn', 'seq0', 'ttl', 'w_scale', 'opts']
ct = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    remainder='passthrough',
)
ct.fit(X_train)

X_train_normal = ct.transform(X_train)
X_test_normal  = ct.transform(X_test)


tf.random.set_seed(11235)

# Four ReLU hidden layers (50 -> 30 -> 20 -> 10 units) followed by a softmax
# output with one unit per label column.
hidden_options = dict(activation='relu', kernel_initializer='he_uniform')

model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(50, input_dim=X_train_normal.shape[1], **hidden_options))
for units in (30, 20, 10):
    model.add(tf.keras.layers.Dense(units, **hidden_options))
model.add(tf.keras.layers.Dense(len(y_train[0]), activation='softmax', kernel_initializer='he_uniform'))

# Track top-1 and top-2 categorical accuracy during training.
model_metrics = [
    'categorical_accuracy',
    tf.metrics.TopKCategoricalAccuracy(k=2),
]

model.compile(
    loss='categorical_crossentropy',
    optimizer='sgd',
    metrics=model_metrics,
    steps_per_execution=50,
)

history = model.fit(X_train_normal, y_train, epochs=5000, batch_size=256, verbose=2)

Epoch 1/5000
125/125 - 1s - loss: 2.2039 - categorical_accuracy: 0.0937 - top_k_categorical_accuracy: 0.3063 - 1s/epoch - 10ms/step
Epoch 2/5000
125/125 - 0s - loss: 2.0707 - categorical_accuracy: 0.1627 - top_k_categorical_accuracy: 0.3323 - 266ms/epoch - 2ms/step
Epoch 3/5000
125/125 - 0s - loss: 1.9240 - categorical_accuracy: 0.2710 - top_k_categorical_accuracy: 0.4416 - 268ms/epoch - 2ms/step
Epoch 4/5000
125/125 - 0s - loss: 1.7623 - categorical_accuracy: 0.3574 - top_k_categorical_accuracy: 0.5759 - 288ms/epoch - 2ms/step
Epoch 5/5000
125/125 - 0s - loss: 1.6290 - categorical_accuracy: 0.3627 - top_k_categorical_accuracy: 0.6699 - 279ms/epoch - 2ms/step
Epoch 6/5000
125/125 - 0s - loss: 1.5460 - categorical_accuracy: 0.4333 - top_k_categorical_accuracy: 0.6880 - 270ms/epoch - 2ms/step
Epoch 7/5000
125/125 - 0s - loss: 1.4856 - categorical_accuracy: 0.4803 - top_k_categorical_accuracy: 0.6916 - 295ms/epoch - 2ms/step
Epoch 8/5000
125/125 - 0s - loss: 1.4352 - categorical_accuracy:

In [None]:
# Final training metrics, then the top-2 accuracy and loss curves over the epochs.
top1_history = history.history['categorical_accuracy']
top2_history = history.history['top_k_categorical_accuracy']
loss_history = history.history['loss']

print(top1_history[-1], '\t', top2_history[-1])

# The first few epochs are left out of the plots. The x values are the real
# 1-based epoch numbers, so the 'epoch' axis is not shifted by the skipped epochs.
SKIP_EPOCHS = 3
epochs = range(SKIP_EPOCHS + 1, len(loss_history) + 1)

fig, axs = plt.subplots(2)
axs[0].plot(epochs, top2_history[SKIP_EPOCHS:])
axs[0].set_ylabel('top-2 accuracy')
axs[1].plot(epochs, loss_history[SKIP_EPOCHS:])
axs[1].set_ylabel('loss')
axs[1].set_xlabel('epoch')
plt.show()

In [None]:
class estimator:
    """Thin classifier-style wrapper around a model whose predict() returns per-class scores.

    Instances expose ``_estimator_type == 'classifier'`` and a ``classes_``
    attribute, plus a ``predict`` method that returns class indices.
    """

    # Class-level defaults; both are overridden per instance in __init__.
    _estimator_type = ''
    classes_ = []

    def __init__(self, model, classes):
        """Store the wrapped model and the class names to expose as ``classes_``."""
        self.model = model
        self.classes_ = classes
        self._estimator_type = 'classifier'

    def predict(self, X):
        """Return, for every row of X, the index of the largest model output."""
        return self.model.predict(X).argmax(axis=1)

# Row-normalised confusion matrix of the network on the test split.
# The class names must be in sorted order: the label columns of y (and therefore
# the argmax indices used below) follow the sorted unique labels, whereas
# unique() alone returns them in order of appearance and would put the wrong
# names on the ticks.
class_names = sorted(all_data_df['label'].unique())
classifier  = estimator(model, class_names)

figsize = (12,12)
fig, ax = plt.subplots(figsize=figsize)
tmp = ConfusionMatrixDisplay.from_estimator(estimator=classifier,
                                            X=X_test_normal,
                                            y=y_test.argmax(axis=1),   # one-hot rows -> class indices
                                            cmap='Blues',
                                            normalize='true',
                                            ax=ax)