In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; PROJECT_PATH below points inside this mount.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading the Data

In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc

In [4]:
# Root folder of the project on the mounted Drive.
PROJECT_PATH = '/content/drive/My Drive/Datasets/ML Project'
# Selects which model sub-folder ("with" / "without" preprocessing) MODEL_PATH points to.
WITH_PREPROCESSING = False

# Saved models: <project>/models/<with|without>
MODELS_BASE_PATH = '/'.join([PROJECT_PATH, 'models'])
MODEL_PATH = '/'.join([MODELS_BASE_PATH, 'with' if WITH_PREPROCESSING else 'without'])

# Dataset CSV files: <project>/data/recommended/{training,test}
DATASETS_PATH = '/'.join([PROJECT_PATH, 'data/recommended'])
TRAINING_PATH = '/'.join([DATASETS_PATH, 'training/training.csv'])
TESTING_PATH = '/'.join([DATASETS_PATH, 'test/test.csv'])

# Column used as the DataFrame index when the CSV files are read.
INDEX_COL = 'pkSeqID'

# Columns kept from the CSV files, in this order; the last three are the label columns.
COLUMNS_ORDERED = (
    ['min', 'max', 'mean', 'stddev']
    + ['saddr', 'sport', 'daddr', 'dport']
    + ['srate', 'drate']
    + ['N_IN_Conn_P_SrcIP', 'N_IN_Conn_P_DstIP']
    + ['state_number', 'proto']
    + ['seq']
    + ['attack', 'category', 'subcategory']
)

In [5]:
def get_model_path(name, with_pre: bool | None):
  if with_pre is None:
    return f'{MODELS_BASE_PATH}/{name}.joblib'
  return f'{MODELS_BASE_PATH}/{"with" if with_pre else "without"}/{name}.joblib'

In [6]:
import joblib

def save_model(model, name, with_pre):
    """Dump `model` with joblib to the path given by get_model_path(name, with_pre)."""
    destination = get_model_path(name, with_pre)
    joblib.dump(model, destination)

In [7]:
# Read the training CSV with INDEX_COL as the index, keeping only COLUMNS_ORDERED (in that order).
training = pd.read_csv(TRAINING_PATH, index_col=INDEX_COL)[COLUMNS_ORDERED]
# Show the first rows as the cell output.
training.head()

Unnamed: 0_level_0,min,max,mean,stddev,saddr,sport,daddr,dport,srate,drate,N_IN_Conn_P_SrcIP,N_IN_Conn_P_DstIP,state_number,proto,seq,attack,category,subcategory
pkSeqID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3142762,0.0,4.031619,2.687519,1.900363,192.168.100.150,6551,192.168.100.3,80,0.494549,0.0,100,100,4,udp,251984,1,DDoS,UDP
2432264,3.85693,4.012924,3.934927,0.078003,192.168.100.150,5532,192.168.100.3,80,0.256493,0.0,38,100,3,tcp,256724,1,DDoS,TCP
1976315,2.9741,3.609205,3.341429,0.268666,192.168.100.147,27165,192.168.100.3,80,0.29488,0.0,100,100,3,tcp,62921,1,DDoS,TCP
1240757,0.0,4.942302,3.222832,1.823185,192.168.100.150,48719,192.168.100.3,80,0.461435,0.0,63,63,4,udp,99168,1,DoS,UDP
3257991,2.979995,4.994452,3.983222,0.822418,192.168.100.147,22461,192.168.100.3,80,1.002999,0.0,100,100,4,udp,105063,1,DDoS,UDP


In [8]:
# Read the test CSV with INDEX_COL as the index, keeping only COLUMNS_ORDERED (in that order).
testing = pd.read_csv(TESTING_PATH, index_col=INDEX_COL)[COLUMNS_ORDERED]

# Final Pipeline

We tried a range of preprocessing techniques, but they only made the scores worse.

So a simple pipeline is all that is needed for this particular dataset.

## Handling Target

In [9]:
def process_categories(cat):
  """
  Collapse a combined "<category> <subcategory>" label into its final class name.

  - anything starting with "theft" (case-insensitive)  -> 'Theft'
  - anything starting with "normal" (case-insensitive) -> 'Normal'
  - 'DoS HTTP' / 'DDoS HTTP'                           -> 'DoS TCP' / 'DDoS TCP'
  - every other value is returned unchanged
  """
  lowered = str(cat).lower()
  for prefix, merged_label in (('theft', 'Theft'), ('normal', 'Normal')):
    if lowered.startswith(prefix):
      return merged_label

  # HTTP attacks are folded into the matching TCP class.
  for http_label, tcp_label in (('DoS HTTP', 'DoS TCP'), ('DDoS HTTP', 'DDoS TCP')):
    if cat == http_label:
      return tcp_label
  return cat


In [10]:
from sklearn.preprocessing import LabelEncoder

# Label encoder for the merged category names; it is fitted on the training labels
# and then reused to transform the test labels.
cat_encoder_label = LabelEncoder()

In [11]:
# Features: every column except the three label columns.
X_train = training.drop(columns=['attack', 'category', 'subcategory'])
# Target: "<category> <subcategory>" reduced to the final class names, kept as a one-column frame.
y_train_cat = (
    (training['category'] + ' ' + training['subcategory'])
    .map(process_categories)
    .to_frame(name='category')
)
# Integer-encoded target; this call also fits the shared encoder.
y_train_label = cat_encoder_label.fit_transform(y_train_cat['category'])

In [12]:
# Features: every column except the three label columns.
X_test = testing.drop(columns=['attack', 'category', 'subcategory'])
# Target: "<category> <subcategory>" reduced to the final class names, kept as a one-column frame.
y_test_cat = (
    (testing['category'] + ' ' + testing['subcategory'])
    .map(process_categories)
    .to_frame(name='category')
)
# Integer-encoded target, using the encoder already fitted on the training labels.
y_test_label = cat_encoder_label.transform(y_test_cat['category'])

In [13]:
# The raw frames have been split into features/targets above; drop them and reclaim memory.
del training, testing
gc.collect()

0

## Preprocessing Pipeline

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer, StandardScaler

In [15]:
def process_port(p):
  """
  Convert a single port value to an int.

  Values whose text starts with '0x' are parsed as hexadecimal; the prefix check
  ignores letter case and surrounding whitespace, so '0X1F' and ' 0x50 ' are
  accepted as well. Any other value is passed to int() unchanged.

  Raises ValueError / TypeError if the value cannot be converted.
  """
  text = str(p).strip().lower()
  if text.startswith('0x'):
    return int(text, 16)
  # Non-hex values go through int() as-is so numeric inputs behave as before.
  return int(p)

def process_ports(ports: pd.DataFrame):
  """
  Apply process_port to every cell of `ports` and return the converted frame.

  DataFrame.map was only added in pandas 2.1; on older pandas the same
  element-wise operation is called applymap, so fall back to it when needed.
  """
  elementwise = getattr(ports, 'map', None)
  if elementwise is None:
    elementwise = ports.applymap
  return elementwise(process_port)


In [16]:
# Offset added to a rate before log10 is taken (see shift_and_log); with 1.1 a zero
# rate maps to log10(1.1), which is finite and non-zero.
DEFAULT_RATE_SHIFT = 1.1

In [17]:
def shift_and_log(data, shift=DEFAULT_RATE_SHIFT):
  """
  Return log10(data + shift).

  The default shift is 1.1 so that the output has no zero values, and a small
  change is not that significant.
  """
  shifted = data + shift
  return np.log10(shifted)

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positions of the 'srate' and 'drate' columns within X_train.
srate_idx, drate_idx = (list(X_train.columns).index(rate_col) for rate_col in ('srate', 'drate'))
class CombinedFeatureAdder(BaseEstimator, TransformerMixin):
  """
  Transformer that appends a `srate_to_drate` column to the input frame.

  The value is log10(srate + DEFAULT_RATE_SHIFT) / log10(drate + DEFAULT_RATE_SHIFT);
  when `normalize` is true, log1p is applied to that ratio before it is stored.
  """

  def __init__(self, normalize=True) -> None:
    super().__init__()
    self.normalize = normalize

  def fit(self, X, y=None):
    """Nothing is learned from the data; returns self."""
    return self

  def transform(self, X: pd.DataFrame, y=None):
    """Return a copy of X with the extra `srate_to_drate` column."""
    log_srate = np.log10(X['srate'] + DEFAULT_RATE_SHIFT)
    log_drate = np.log10(X['drate'] + DEFAULT_RATE_SHIFT)
    ratio = log_srate / log_drate
    new_column = np.log1p(ratio) if self.normalize else ratio
    return X.assign(srate_to_drate=new_column)

In [49]:
class BaseModelPipeline:
  def __init__(self, name: str, with_pre: bool | None, has_transformer: bool | None = None, clustering: bool = False) -> None:
    self.name = name
    self.with_pre = with_pre
    self.model = joblib.load(get_model_path(self.name, self.with_pre))
    self.pipeline_ = None
    self.transformer_ = has_transformer
    self.is_clustering_ = clustering
    self.cluster_mapping_ = None

  def init_cluster_mapping(self, X, y):
    """
    Loads/Saves mapping from cluster to category
    """
    if not self.is_clustering_:
      raise ValueError('Can only use init_cluster_mapping with clustering algorithms')
    if self.cluster_mapping_ is not None:
      return self
    df = X.copy()
    df['cluster'] = self.predict(X)
    df['category'] = y
    pivot = (
        df
        .groupby(['cluster', 'category'])
        .size()
        .groupby(level=0)
        .apply(lambda x: 100 * x / x.sum())
        .unstack(fill_value=0)
    )
    pivot['top'] = pivot.idxmax(axis=1)
    out = {}
    for i in pivot.index:
      out[i[0]] = pivot.loc[i,'top']
    self.cluster_mapping_ = out
    return self

  def fit_transform(self, X):
    """
    Transforms input data to proper format, must output a dataframe with correct columns for model
    """
    pass

  def pipeline(self):
    """
    Return complete pipeline with preprocessing, mutually exclusive with fit_transform
    """
    return None

  def category(self, encoded):
    """
    Returns predictions but as a category (without encoding of target)
    """
    if self.is_clustering_:
      if self.cluster_mapping_ is not None:
        return np.array(list(map(lambda e: self.cluster_mapping_[e], encoded)))
    return ''

  def predict_label(self, X):
    """
    Utility function: Returns predictions for input but as a category, not a number
    """
    return self.category(self.predict(X))

  def encode_label(self, y):
    """
    Encodes label to a number as per the model definition, can be useful for calculating scores
    """
    pass

  def predict(self, X):
    """
    Output integer output of the model, or final prediction of the full pipeline
    """
    if self.pipeline_ is None:
      self.pipeline_ = self.pipeline()
      if self.pipeline_ is not None:
        return self.pipeline_.predict()
    if self.transformer_ is not None:
      X_prep = self.fit_transform(X)
      return self.model.predict(X_prep)
    raise ValueError('Either pipeline or transformer function need to be defined and return a non-None value.')


In [20]:
%%script true
# Columns removed by the 'drop' step below.
TO_DROP = ['saddr', 'daddr', 'seq']

# Per-column cleaning: drop TO_DROP, one-hot encode 'proto', convert the port
# columns with process_ports, apply shift_and_log to the rate columns, and pass
# every remaining column through unchanged.
data_cleaner = ColumnTransformer([
  ('drop', 'drop', TO_DROP),
  ('encode', OneHotEncoder(), ['proto']),
  ('port', FunctionTransformer(func=process_ports), ['sport', 'dport']),
  ('rate', FunctionTransformer(func=shift_and_log), ['srate', 'drate']),
], remainder='passthrough')

# Full preprocessing: add the combined rate feature first, then clean the
# columns, then standardise everything.
preprocessing = Pipeline([
    ('augment', CombinedFeatureAdder()),
    ('clean', data_cleaner),
    ('std', StandardScaler()),
])

In [21]:
%%script true
# Fit the preprocessing pipeline on the training features and keep the transformed result.
X_train_cleaned = preprocessing.fit_transform(X_train)

# Predictors definition

## Utils

In [22]:
# Columns removed by the 'drop' step below.
TO_DROP = ['saddr', 'daddr', 'seq']

# Baseline (name, transformer, columns) steps in the format ColumnTransformer takes:
# drop TO_DROP, one-hot encode 'proto', convert the port columns with process_ports.
# NOTE(review): the name is misspelled ("TRANSFROM"); it is left unchanged here in
# case other cells reference it.
DEFAULT_TRANSFROM_STEPS = [
  ('drop', 'drop', TO_DROP),
  ('encode', OneHotEncoder(), ['proto']),
  ('port', FunctionTransformer(func=process_ports), ['sport', 'dport']),
]


In [23]:
def get_pipeline_path(with_pre: bool, name: str):
  """
  Build the path of the saved pipeline called `name`:
  <MODELS_BASE_PATH>/<with|without>/pipelines/<name>.joblib
  """
  variant = 'with' if with_pre else 'without'
  return f'{MODELS_BASE_PATH}/{variant}/pipelines/{name}.joblib'

In [24]:
def save_pipeline(model, with_pre: bool, name: str):
  """Dump `model` with joblib to the path given by get_pipeline_path(with_pre, name)."""
  destination = get_pipeline_path(with_pre, name)
  joblib.dump(model, destination)

In [47]:
def save_and_validate(model, with_pre: bool, name: str, X):
  """
  Save `model` as a pipeline, load it back, and compare predictions on X.

  Returns the result of `==` between the predictions made before saving and
  those made by the reloaded copy.
  """
  predicted_before = model.predict(X)
  save_pipeline(model, with_pre, name)

  reloaded = joblib.load(get_pipeline_path(with_pre, name))
  predicted_after = reloaded.predict(X)

  # Release the reloaded copy before returning.
  del reloaded
  gc.collect()
  return predicted_before == predicted_after

In [46]:
def save_and_validate_cluster(model, with_pre: bool, name: str, X, X_train=X_train, y_train=y_train_cat):
  """
  Clustering variant of save_and_validate.

  The cluster-to-category mapping is initialised from (X_train, y_train), the
  model is saved and loaded back, and the category predictions on X made before
  and after the round trip are compared with `==`.
  """
  labels_before = model.init_cluster_mapping(X_train, y_train).predict_label(X)
  save_pipeline(model, with_pre, name)

  reloaded = joblib.load(get_pipeline_path(with_pre, name))
  labels_after = reloaded.init_cluster_mapping(X_train, y_train).predict_label(X)

  # Release the reloaded copy before returning.
  del reloaded
  gc.collect()
  return labels_before == labels_after

## Basic Processing Models

### Random Forest

In [27]:
class RandomForestNoPreprocPipeline(BaseModelPipeline):
  """
  Wrapper for the saved 'random_forest' model (no-preprocessing variant).

  The categorical columns sport/dport/proto are encoded with a separately saved
  encoder ('random_forest_encoder'), which is loaded on first use.
  """

  def __init__(self) -> None:
    # Category name -> integer code used by this model.
    self.categories_map = {'DDoS UDP': 0, 'DDoS TCP': 1, 'DoS UDP': 2, 'DoS TCP': 3, 'Reconnaissance OS_Fingerprint': 4, 'Reconnaissance Service_Scan': 5, 'Normal': 6, 'Theft': 7}
    self.encoder_ = None
    super().__init__('random_forest', False, True)

  def fit_transform(self, X):
    """Drop the address/seq columns, encode sport/dport/proto, and order columns as the model expects."""
    out = X.drop(['saddr', 'daddr', 'seq'], axis=1)

    if self.encoder_ is None:
      self.encoder_ = joblib.load(get_model_path('random_forest_encoder', False))

    feature_cat_cols = ["sport","dport","proto"]
    # The encoder is given string values, hence the astype(str).
    out[feature_cat_cols] = self.encoder_.transform(out[feature_cat_cols].astype(str))
    return out[self.model.feature_names_in_]

  def encode_label(self, y):
    """
    Map category names to the integer codes of this model.

    y may be a frame with a 'category' column or a Series / list of category
    names. A frame without a 'category' column yields None (as before).
    Raises KeyError for a category that is not in `categories_map`.
    """
    labels = y
    if isinstance(y, pd.DataFrame):
      if 'category' not in y.columns:
        return None
      labels = y['category']
    return pd.Series(labels).map(lambda l: self.categories_map[l])

  def category(self, encoded):
    """Map integer codes back to category names; raises KeyError for an unknown code."""
    # Build the reverse lookup once instead of scanning the dict for every element.
    code_to_category = {code: label for label, code in self.categories_map.items()}
    return np.array([code_to_category[e] for e in encoded])

In [28]:
%%script true
# Build the random-forest wrapper; its constructor loads the saved model with joblib.
rf_no_pre = RandomForestNoPreprocPipeline()

In [29]:
%%script true
# Save the wrapper, reload it, and compare its predictions on the test features.
save_and_validate(rf_no_pre, False, 'random_forest', X_test)

### XGBoost

In [30]:
class XGBoostNoPreprocPipeline(BaseModelPipeline):
  """
  Wrapper for the saved 'xgboost_model_final' model (no-preprocessing variant).

  The integer code of a category is its position in `self.categories`.
  """

  def __init__(self) -> None:
    self.categories = ['DDoS TCP', 'DDoS UDP', 'DoS TCP', 'DoS UDP', 'Normal',
       'Reconnaissance OS_Fingerprint', 'Reconnaissance Service_Scan',
       'Theft']
    super().__init__('xgboost_model_final', False, True)

  def fit_transform(self, X: pd.DataFrame):
    """
    Select the model's input columns, make the ports numeric, and one-hot encode 'proto'.

    When the model exposes `feature_names_in_`, the encoded frame is reindexed to
    exactly those columns. This keeps the layout stable for batches in which some
    protocol values are absent; `drop_first=True` on such a batch would drop a
    different column than it did on the full data.
    """
    cols_processed = [
        'proto', 'sport', 'dport', 'state_number',
        'mean', 'stddev', 'min', 'max', 'srate', 'drate',
        'N_IN_Conn_P_SrcIP', 'N_IN_Conn_P_DstIP'
    ]
    out = X[cols_processed].copy()
    for col in ['sport', 'dport']:
        # Values that are not plain numbers become NaN and are then replaced by 0.
        out[col] = pd.to_numeric(out[col], errors='coerce').fillna(0)

    expected = getattr(self.model, 'feature_names_in_', None)
    if expected is None:
      # No recorded feature names: keep the original encoding.
      return pd.get_dummies(out, columns=['proto'], drop_first=True)
    # NOTE(review): assumes the model was fitted on columns produced by this same
    # get_dummies naming ('proto_<value>') - confirm against the training notebook.
    encoded = pd.get_dummies(out, columns=['proto'])
    return encoded.reindex(columns=list(expected), fill_value=0)

  def encode_label(self, y):
    """
    Map the 'category' column of `y` to integer codes (position in `self.categories`).

    Returns None when `y` has no 'category' column; raises KeyError for an
    unknown category.
    """
    if 'category' in y.columns:
      code_of = {label: code for code, label in enumerate(self.categories)}
      return y['category'].map(lambda label: code_of[label])

  def category(self, encoded):
    """Map integer codes back to category names."""
    return np.array(list(map(lambda e: self.categories[e], encoded)))

In [31]:
%%script true
# Build the XGBoost wrapper; its constructor loads the saved model with joblib.
xgb_no_pre = XGBoostNoPreprocPipeline()

In [32]:
%%script true
# Save the wrapper, reload it, and compare its predictions on the test features.
save_and_validate(xgb_no_pre, False, 'xgboost', X_test)

### KNN

In [70]:
class KNNNoPreprocPipeline(BaseModelPipeline):
  """
  Wrapper for the saved 'knn' model (no-preprocessing variant).

  `category` is not overridden, so `predict_label` falls back to the base-class
  behaviour for non-clustering models.
  """

  def __init__(self) -> None:
    super().__init__('knn', False, True)

  def fit_transform(self, X: pd.DataFrame):
    """
    Return the model's input columns, in the model's order, with the port columns
    converted to integers.

    The ports are converted in place within the frame. Passing the frame through
    a ColumnTransformer would move 'sport'/'dport' to the front of the output, so
    relabelling that output with `features` would attach the wrong names to the values.
    """
    features = list(self.model.feature_names_in_)
    # reset_index(drop=False) exposes the index as a regular column and leaves a
    # fresh positional index on the result.
    out = X.reset_index(drop=False)[features].copy()
    port_cols = [col for col in ('sport', 'dport') if col in features]
    if port_cols:
      out[port_cols] = process_ports(out[port_cols])
    # NOTE(review): if 'proto' is among the features it is passed through as-is;
    # confirm how the model was trained on that column.
    return out

  def encode_label(self, y):
    """Labels are returned unchanged."""
    return y

In [71]:
%%script true
# Build the KNN wrapper; its constructor loads the saved model with joblib.
knn_no_pre = KNNNoPreprocPipeline()

In [72]:
%%script true
# Save the wrapper, reload it, and compare its predictions on the first 5000 test rows only.
save_and_validate(knn_no_pre, False, 'knn', X_test[:5000])

['min', 'max', 'mean', 'stddev', 'N_IN_Conn_P_SrcIP', 'N_IN_Conn_P_DstIP', 'drate', 'srate', 'sport', 'dport', 'proto', 'state_number']


KeyboardInterrupt: 

### GMM

In [50]:
class GMMNoPreprocPipeline(BaseModelPipeline):
  """
  Wrapper for the saved 'gmm_model' clustering model (no-preprocessing variant).

  The features are scaled with the separately saved 'gmm_scaler' before they
  reach the model.
  """

  def __init__(self) -> None:
    super().__init__('gmm_model', False, True, True)
    self.scaler = joblib.load(get_model_path('gmm_scaler', False))

  def fit_transform(self, X: pd.DataFrame):
    """
    Drop the address/seq columns and any row with missing values, put the
    features in a fixed order, make ports and protocol numeric, and scale.
    Returns whatever `self.scaler.transform` returns.
    """
    feature_order = [
      "proto",
      "sport", "dport",
      "min", "max", "mean", "stddev",
      "state_number",
      "N_IN_Conn_P_SrcIP", "N_IN_Conn_P_DstIP",
      "srate", "drate"
    ]
    # to remove
    proto_map = {"tcp": 0, "udp": 1, "icmp": 2, "arp": 3, "ipv6-icmp": 4}

    complete_rows = X.drop(['saddr', 'daddr', 'seq'], axis=1).dropna()
    selected = complete_rows[feature_order]
    numeric = selected.assign(
      sport=selected['sport'].map(process_port),
      dport=selected['dport'].map(process_port),
      # Protocols missing from proto_map are encoded as -1.
      proto=selected['proto'].map(lambda pr: proto_map.get(pr, -1)),
    )
    return self.scaler.transform(numeric)


In [51]:
%%script true
# Build the GMM wrapper; its constructor loads the saved model and the saved scaler with joblib.
gmm_no_pre = GMMNoPreprocPipeline()

In [52]:
%%script true
# Initialise the cluster mapping, save the wrapper, reload it, and compare the
# category predictions on the test features.
save_and_validate_cluster(gmm_no_pre, False, 'gmm', X_test)

array([ True,  True,  True, ...,  True,  True,  True])

### KMeans

In [79]:
class KMeansNoPreprocPipeline(BaseModelPipeline):
  """Wrapper for the saved 'kmeans' clustering model (no-preprocessing variant)."""

  def __init__(self) -> None:
    super().__init__('kmeans', False, True, True)
    # NOTE(review): this loads the *GMM* scaler and fit_transform never applies it.
    # Confirm whether k-means was trained on scaled features; the attribute is
    # kept so existing users of `scaler` are unaffected.
    self.scaler = joblib.load(get_model_path('gmm_scaler', False))

  def fit_transform(self, X: pd.DataFrame):
    """
    Return the model's input columns with the port columns converted to integers.

    Works on an explicit copy, so the caller's frame is never touched and no
    SettingWithCopyWarning is raised; plain column assignment lets the converted
    ports take an integer dtype.
    """
    out = X[list(self.model.feature_names_in_)].copy()
    for port_col in ('sport', 'dport'):
      out[port_col] = out[port_col].map(process_port)
    return out

In [80]:
# Build the k-means wrapper; its constructor loads the saved model and a saved scaler with joblib.
kmeans_no_pre = KMeansNoPreprocPipeline()

In [None]:
# Map every k-means cluster to its most frequent training category, then predict
# a category for each test row.
kmeans_pred = kmeans_no_pre.init_cluster_mapping(X_train, y_train_cat).predict_label(X_test)
# Compare by position: the predictions are a plain array in X_test row order,
# while y_test_cat is indexed by pkSeqID, so index-based alignment would not
# match the rows up.
n_mismatched = int((y_test_cat['category'].to_numpy() != kmeans_pred).sum())
print(n_mismatched)