### Imports & small functions

In [1]:
import pickle
import os
from math import log, ceil

from scapy.layers.inet import TCPOptions

import numpy             as np
import pandas            as pd

from sklearn.compose         import make_column_transformer
from sklearn.preprocessing   import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
# opt_to_num = {opt:i  for i,opt  in enumerate(TCPOptions[1].keys())}
# opt_to_num["NONE"] = -1

def get_os_label(nmap_output):
    """Build the class label for one host from its OS-detection result.

    Only the first entry of ``nmap_output`` is consulted, and within it only
    the first record of its ``'osclass'`` list.

    Returns:
        A string of the form ``"<osfamily>_<osgen>"``.

    Raises:
        IndexError, KeyError: if the expected entries are missing.
    """
    first_match = nmap_output[0]
    first_class = first_match['osclass'][0]

    family = first_class['osfamily']
    generation = first_class['osgen']

    # Family + generation gives a finer-grained label than the family alone.
    return '{}_{}'.format(family, generation)


def preprocess_curr_addr_features(curr_addr_features, curr_addr_label):
    """Attach the label to a feature record and normalise its fields in place.

    The mapping passed in is modified and returned (no copy is made):

    * ``'label'`` is set to ``curr_addr_label``;
    * ``'ttl'`` is rounded up to the nearest power of two;
    * ``'mss'`` is removed.

    Raises:
        KeyError: if ``'ttl'`` or ``'mss'`` is missing.
        ValueError: if ``'ttl'`` is not positive (``log`` domain error).
    """
    curr_addr_features['label'] = curr_addr_label

    # Bucket the TTL: smallest power of two that is >= the observed value.
    exponent = ceil(log(curr_addr_features['ttl'], 2))
    curr_addr_features['ttl'] = 2 ** exponent

    # MSS is not used as a feature.
    del curr_addr_features['mss']

    return curr_addr_features

### Load information from files & create the dataset 

In [3]:
# Build one row per address that has both a feature record and a non-empty
# OS-detection result, then assemble the rows into a DataFrame.
#
# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# Only point these directories at files you generated yourself.
label_dir   = 'C:\\Users\\ofir\\OneDrive - mail.tau.ac.il\\personal\\my_p0f\\data\\ips_label_dict'
feature_dir = 'C:\\Users\\ofir\\OneDrive - mail.tau.ac.il\\personal\\my_p0f\\data\\ips_feature_dict'
all_data    = []

# File names are taken from label_dir; a file with the same name is expected
# in feature_dir. Only the first 7000 names (in sorted order) are read.
for filename in sorted(os.listdir(label_dir))[:7000]:
    # print(f'working on {filename}...')
    # The context manager closes both handles even if loading fails, and
    # releases them before the (possibly failing) per-address processing.
    with open(os.path.join(feature_dir, filename), 'rb') as feature_file, \
         open(os.path.join(label_dir, filename), 'rb') as label_file:
        ip_to_features = pickle.load(feature_file)
        ip_to_label    = pickle.load(label_file)

    for addr, curr_addr_features in ip_to_features.items():
        # Skip addresses without features, without a label entry, or whose
        # label entry is an empty list.
        if curr_addr_features is None:
            continue
        if addr not in ip_to_label or ip_to_label[addr] == []:
            continue

        curr_addr_label = get_os_label(ip_to_label[addr])
        if not curr_addr_label:
            continue

        try:
            curr_addr_features = preprocess_curr_addr_features(curr_addr_features, curr_addr_label)
        except Exception:
            # Report which file held the bad record, then propagate the
            # original exception with its traceback intact.
            print(f'error on {filename}')
            raise
        all_data.append(curr_addr_features)

# all_data is a list of per-address dicts; the plain constructor is the
# documented way to build a frame from a list of records.
all_data_df = pd.DataFrame(all_data)

# Keep the original option strings next to their integer encoding.
all_data_df['opts_str'] = all_data_df['opts'].copy(deep=True)
all_data_df['opts'] = LabelEncoder().fit_transform(all_data_df['opts'])

In [4]:
# with pd.option_context("display.max_rows", 1000):
#     print(all_data_df['label'].value_counts())

### Balance, shuffle, split & encode

In [5]:
# Number of rows kept per OS label. A label must have MORE than this many
# rows to be kept at all, so every surviving class ends up with exactly
# SAMPLES_PER_OS rows.
SAMPLES_PER_OS = 4000
# Same value the train/test split passes as random_state, so the whole
# pipeline (shuffle + split) is reproducible from run to run.
SHUFFLE_SEED = 11235

# get rid of datapoints with a rare OS
os_counts   = all_data_df.groupby('label').size().to_dict()
common_os   = [os_label  for os_label, count in os_counts.items()  if  count > SAMPLES_PER_OS]
all_data_df = all_data_df.loc[all_data_df['label'].isin(common_os)]

# balance the dataset: keep the first SAMPLES_PER_OS rows of every label
# (one pass instead of re-filtering and re-concatenating the frame per label)
all_data_df = all_data_df.groupby('label').head(SAMPLES_PER_OS)

# shuffle dataframe
all_data_df = all_data_df.sample(frac=1, random_state=SHUFFLE_SEED)

In [6]:
# Feature matrix: everything except the target and the raw option strings.
X = all_data_df.drop(columns=['label', 'opts_str'])

# Target: OS label strings mapped to integer class ids.
y = LabelEncoder().fit_transform(all_data_df['label'])

NUM_CLASSES = all_data_df['label'].nunique()
labels      = sorted(np.unique(y))


X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11235,
)

# Columns that are one-hot encoded; any column not listed here is passed
# through unchanged.
categorical_columns = [
    'w_size', 'df', 'df+', 'df-', 'fo+', 'fo-',
    'ecn', 'seq0', 'ttl', 'w_scale', 'opts',
]

ct = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    remainder='passthrough',
)

# Fit on the training split only, then apply the same encoding to both splits.
ct.fit(X_train)

X_train_normal = ct.transform(X_train)
X_test_normal  = ct.transform(X_test)