In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading the Data

In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc

In [4]:
# Root folder of the project on the mounted Google Drive.
PROJECT_PATH = '/content/drive/My Drive/Datasets/ML Project'
# Default model name: part of FIGURES_PATH and the default `name` of get_model_path / save_model.
MODEL_NAME = 'linear_regression'
# Selects the "with" (True) or "without" (False) sub-folder in the paths below.
WITH_PREPROCESSING = False
# Folder under which get_figure_path builds figure file names.
FIGURES_PATH = f'{PROJECT_PATH}/figures/{MODEL_NAME}/{"with" if WITH_PREPROCESSING else "without"}'
# Base folder of the serialized models; get_model_path and get_pipeline_path build on it.
MODELS_BASE_PATH = f'{PROJECT_PATH}/models'
# Model folder for the preprocessing variant selected by WITH_PREPROCESSING.
MODEL_PATH = f'{MODELS_BASE_PATH}/{"with" if WITH_PREPROCESSING else "without"}'
def get_figure_path(name: str):
  """Return the path of the PNG file called `name` inside FIGURES_PATH."""
  return '{}/{}.png'.format(FIGURES_PATH, name)

# Folder holding the dataset CSV files used by this notebook.
DATASETS_PATH = f'{PROJECT_PATH}/data/recommended'
TRAINING_PATH = f'{DATASETS_PATH}/training/training.csv'
TESTING_PATH = f'{DATASETS_PATH}/test/test.csv'
# Column used as the DataFrame index when the CSV files are read.
INDEX_COL = 'pkSeqID'
# Columns kept from the CSV files, in the order they are selected after loading.
# The last three ('attack', 'category', 'subcategory') are later dropped from the
# features and used to build the target labels.
COLUMNS_ORDERED = [
    'min', 'max', 'mean', 'stddev',
    'saddr', 'sport', 'daddr', 'dport',
    'srate', 'drate',
    'N_IN_Conn_P_SrcIP', 'N_IN_Conn_P_DstIP',
    'state_number', 'proto',
    'seq',
    'attack', 'category', 'subcategory'
]

In [5]:
def get_model_path(name = MODEL_NAME, with_pre: bool | None = WITH_PREPROCESSING ):
  """Build the path of the `.joblib` file that stores the model called `name`.

  with_pre=None  -> the file sits directly in MODELS_BASE_PATH.
  with_pre=True  -> the file sits in the "with" sub-folder.
  with_pre=False -> the file sits in the "without" sub-folder.
  """
  file_name = f'{name}.joblib'
  if with_pre is None:
    return f'{MODELS_BASE_PATH}/{file_name}'
  variant = 'with' if with_pre else 'without'
  return f'{MODELS_BASE_PATH}/{variant}/{file_name}'

In [6]:
import joblib

def save_model(model, name = MODEL_NAME, with_pre = WITH_PREPROCESSING):
    """Serialize `model` with joblib to the path given by get_model_path(name, with_pre)."""
    target_path = get_model_path(name, with_pre)
    joblib.dump(model, target_path)

In [7]:
# Read the training CSV indexed by INDEX_COL and keep the COLUMNS_ORDERED columns, in that order.
training = (
    pd.read_csv(TRAINING_PATH, index_col=INDEX_COL)
    .loc[:, COLUMNS_ORDERED]
)
training.head()

Unnamed: 0_level_0,min,max,mean,stddev,saddr,sport,daddr,dport,srate,drate,N_IN_Conn_P_SrcIP,N_IN_Conn_P_DstIP,state_number,proto,seq,attack,category,subcategory
pkSeqID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3142762,0.0,4.031619,2.687519,1.900363,192.168.100.150,6551,192.168.100.3,80,0.494549,0.0,100,100,4,udp,251984,1,DDoS,UDP
2432264,3.85693,4.012924,3.934927,0.078003,192.168.100.150,5532,192.168.100.3,80,0.256493,0.0,38,100,3,tcp,256724,1,DDoS,TCP
1976315,2.9741,3.609205,3.341429,0.268666,192.168.100.147,27165,192.168.100.3,80,0.29488,0.0,100,100,3,tcp,62921,1,DDoS,TCP
1240757,0.0,4.942302,3.222832,1.823185,192.168.100.150,48719,192.168.100.3,80,0.461435,0.0,63,63,4,udp,99168,1,DoS,UDP
3257991,2.979995,4.994452,3.983222,0.822418,192.168.100.147,22461,192.168.100.3,80,1.002999,0.0,100,100,4,udp,105063,1,DDoS,UDP


In [8]:
# Read the test CSV indexed by INDEX_COL and keep the COLUMNS_ORDERED columns, in that order.
testing = (
    pd.read_csv(TESTING_PATH, index_col=INDEX_COL)
    .loc[:, COLUMNS_ORDERED]
)

# Final Pipeline

Every additional preprocessing technique we tried only made the scores worse.

So a simple pipeline is all that this particular dataset needs.

## Handling Target

In [9]:
def process_categories(cat):
  """Collapse a combined "<category> <subcategory>" label into its final class name.

  - Labels whose text starts with "theft" or "normal" (case-insensitive) become
    'Theft' / 'Normal'.
  - 'DoS HTTP' and 'DDoS HTTP' are folded into 'DoS TCP' and 'DDoS TCP'.
  - Any other value is returned unchanged.
  """
  lowered = str(cat).lower()
  for prefix, merged_label in (('theft', 'Theft'), ('normal', 'Normal')):
    if lowered.startswith(prefix):
      return merged_label

  for http_label, tcp_label in (('DoS HTTP', 'DoS TCP'), ('DDoS HTTP', 'DDoS TCP')):
    if cat == http_label:
      return tcp_label

  return cat


In [10]:
from sklearn.preprocessing import LabelEncoder

# Encoder for the combined category labels: fitted on the training labels and
# then reused, already fitted, to transform the test labels.
cat_encoder_label = LabelEncoder()

In [11]:
# Features: every column except the three label columns.
X_train = training.drop(columns=['attack', 'category', 'subcategory'])

# Target: "<category> <subcategory>" merged into the final classes by process_categories.
y_train_cat = (
    (training['category'] + ' ' + training['subcategory'])
    .map(process_categories)
    .to_frame(name='category')
)

# Integer-encoded target; this call is what fits cat_encoder_label.
y_train_label = cat_encoder_label.fit_transform(y_train_cat['category'])

In [12]:
# Features: every column except the three label columns.
X_test = testing.drop(columns=['attack', 'category', 'subcategory'])

# Target: "<category> <subcategory>" merged into the final classes by process_categories.
y_test_cat = (
    (testing['category'] + ' ' + testing['subcategory'])
    .map(process_categories)
    .to_frame(name='category')
)

# Integer-encoded target, using the encoder as already fitted (transform only).
y_test_label = cat_encoder_label.transform(y_test_cat['category'])

In [13]:
# The raw frames are no longer needed once features and targets are extracted;
# drop both names and run a garbage collection pass.
del training, testing
gc.collect()

5

## Preprocessing Pipeline

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer, StandardScaler

In [15]:
def process_port(p):
  """Convert a single port value to int.

  Values whose text form starts with '0x' are parsed as hexadecimal; anything
  else goes through plain int().
  """
  if str(p).startswith('0x'):
    return int(p, 16)
  return int(p)

def process_ports(ports: pd.DataFrame):
  """Convert every port value in `ports` to an integer with process_port.

  Args:
    ports: DataFrame whose cells are port values (decimal or '0x…' hex text).

  Returns:
    A new object of the same shape holding the converted integers.

  `DataFrame.map` only exists from pandas 2.1 onwards (it is the renamed
  `DataFrame.applymap`). On older pandas versions the original call raised
  AttributeError, so fall back to `applymap` there.
  """
  elementwise = getattr(ports, 'map', None)
  if elementwise is None:
    # pandas < 2.1: DataFrame has no element-wise `map`.
    elementwise = ports.applymap
  return elementwise(process_port)


In [16]:
# Constant added to the rate columns before taking log10 (see shift_and_log and CombinedFeatureAdder).
DEFAULT_RATE_SHIFT = 1.1

In [17]:
def shift_and_log(data, shift=DEFAULT_RATE_SHIFT):
  """Return log10(data + shift).

  Author's rationale for the default shift of 1.1: the output then has no zero
  values, and a small change is not that significant.
  """
  shifted = data + shift
  return np.log10(shifted)

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional indices of the 'srate' and 'drate' columns in X_train.
# NOTE(review): neither index is referenced in the visible part of the notebook — confirm before removing.
srate_idx, drate_idx = [list(X_train.columns).index('srate'), list(X_train.columns).index('drate')]
class CombinedFeatureAdder(BaseEstimator, TransformerMixin):
  """Transformer that appends a `srate_to_drate` column to a DataFrame.

  The column is log10(srate + DEFAULT_RATE_SHIFT) / log10(drate + DEFAULT_RATE_SHIFT).
  When `normalize` is True, log1p is applied to that ratio before it is stored.
  """

  def __init__(self, normalize=True) -> None:
    super().__init__()
    self.normalize = normalize

  def fit(self, X, y=None):
    """Nothing is learned from the data; returns self."""
    return self

  def transform(self, X: pd.DataFrame, y=None):
    """Return a new DataFrame: X plus the `srate_to_drate` column."""
    log_srate = np.log10(X.loc[:, 'srate'] + DEFAULT_RATE_SHIFT)
    log_drate = np.log10(X.loc[:, 'drate'] + DEFAULT_RATE_SHIFT)
    ratio = log_srate / log_drate
    new_column = np.log1p(ratio) if self.normalize else ratio
    return X.assign(srate_to_drate=new_column)

In [102]:
class BaseModelPipeline:
  def __init__(self, name: str, with_pre: bool | None, has_transformer: bool | None = None) -> None:
    self.name = name
    self.with_pre = with_pre
    self.model = joblib.load(get_model_path(self.name, self.with_pre))
    self.pipeline_ = None
    self.transformer_ = has_transformer

  def fit_transform(self, X):
    pass

  def pipeline(self):
    return None

  def category(self, encoded):
    return ''

  def predict_label(self, X):
    return self.category(self.predict(X))

  def encode_label(self, y):
    pass

  def predict(self, X):
    if self.pipeline_ is None:
      self.pipeline_ = self.pipeline()
      if self.pipeline_ is not None:
        return self.pipeline_.predict()
    if self.transformer_ is not None:
      X_prep = self.fit_transform(X)
      return self.model.predict(X_prep)
    raise ValueError('Either pipeline or transformer function need to be defined and return a non-None value.')


In [44]:
%%script true
# NOTE: the `%%script true` cell magic above hands this cell to the shell command
# `true`, so none of the code below runs as Python.

# Columns removed before modelling.
TO_DROP = ['saddr', 'daddr', 'seq']

# Per-column cleaning: drop TO_DROP, one-hot encode 'proto', convert the port
# columns with process_ports, log-shift the rate columns, pass the rest through.
data_cleaner = ColumnTransformer([
  ('drop', 'drop', TO_DROP),
  ('encode', OneHotEncoder(), ['proto']),
  ('port', FunctionTransformer(func=process_ports), ['sport', 'dport']),
  ('rate', FunctionTransformer(func=shift_and_log), ['srate', 'drate']),
], remainder='passthrough')

# Full preprocessing: add the srate_to_drate feature, clean the columns, then standardize.
preprocessing = Pipeline([
    ('augment', CombinedFeatureAdder()),
    ('clean', data_cleaner),
    ('std', StandardScaler()),
])

In [45]:
%%script true
# Fit the preprocessing pipeline on X_train and keep the transformed features
# (this cell is also disabled by the `%%script true` magic above).
X_train_cleaned = preprocessing.fit_transform(X_train)

# Predictors definition

In [46]:
# Columns removed before modelling.
TO_DROP = ['saddr', 'daddr', 'seq']

# ColumnTransformer-style steps (name, transformer, columns): drop TO_DROP,
# one-hot encode 'proto', convert the port columns with process_ports.
# NOTE(review): the name is spelled "TRANSFROM"; it is left unchanged because code
# outside the visible part of the notebook may reference it. It is not used in
# the visible cells — confirm it is still needed.
DEFAULT_TRANSFROM_STEPS = [
  ('drop', 'drop', TO_DROP),
  ('encode', OneHotEncoder(), ['proto']),
  ('port', FunctionTransformer(func=process_ports), ['sport', 'dport']),
]


## Basic Processing Models

In [66]:
# Folder of the serialized pipelines for the "without preprocessing" variant; the
# same folder get_pipeline_path(False, ...) points into.
# NOTE(review): not referenced in the visible cells — confirm it is still needed.
NO_PREPROCESSING_PIPELINE_PATH = f'{MODELS_BASE_PATH}/without/pipelines'

In [72]:
def get_pipeline_path(with_pre: bool, name: str):
  """Build the path of the `.joblib` file storing the pipeline called `name`.

  The file sits in the "with" or "without" sub-folder of MODELS_BASE_PATH
  (chosen by `with_pre`), inside its `pipelines` folder.
  """
  variant = 'with' if with_pre else 'without'
  return f'{MODELS_BASE_PATH}/{variant}/pipelines/{name}.joblib'

In [73]:
def save_pipeline(model, with_pre: bool, name: str):
  """Serialize `model` with joblib to the path given by get_pipeline_path(with_pre, name)."""
  target_path = get_pipeline_path(with_pre, name)
  joblib.dump(model, target_path)

### Random Forest

In [47]:
from sklearn.preprocessing import OrdinalEncoder

In [107]:
class RandomForestNoPreprocPipeline(BaseModelPipeline):
  """Wrapper around the stored 'random_forest' model of the "without" variant.

  Input preparation drops the address/sequence columns and runs the stored
  'random_forest_encoder' over the port and protocol columns. Labels are
  translated to and from integers through `categories_map`.
  """

  def __init__(self) -> None:
    # Category name -> integer code used for this model's labels.
    self.categories_map = {
        'DDoS UDP': 0,
        'DDoS TCP': 1,
        'DoS UDP': 2,
        'DoS TCP': 3,
        'Reconnaissance OS_Fingerprint': 4,
        'Reconnaissance Service_Scan': 5,
        'Normal': 6,
        'Theft': 7,
    }
    # Loaded on first use in fit_transform.
    self.encoder_ = None
    super().__init__('random_forest', False, True)

  def fit_transform(self, X):
    """Return the model-ready feature frame for `X`, with columns in the model's order."""
    features = X.drop(columns=['saddr', 'daddr', 'seq'])

    if self.encoder_ is None:
      self.encoder_ = joblib.load(get_model_path('random_forest_encoder', False))

    encoded_cols = ["sport", "dport", "proto"]
    as_text = features[encoded_cols].astype(str)
    features[encoded_cols] = self.encoder_.transform(as_text)
    return features[self.model.feature_names_in_]

  def encode_label(self, y):
    """Map the 'category' column of `y` to the integer codes in `categories_map`."""
    def to_code(label):
      return self.categories_map[label]

    return y['category'].map(to_code)

  def category(self, encoded):
    """Map integer codes back to category names, returned as a NumPy array."""
    def to_label(code):
      # First category whose code equals `code`.
      matches = [label for label, value in self.categories_map.items() if value == code]
      return matches[0]

    return np.array([to_label(code) for code in encoded])

In [108]:
# Instantiating the wrapper loads the stored 'random_forest' model from disk.
predictor = RandomForestNoPreprocPipeline()

In [109]:
# Show the integer codes that categories_map assigns to the test labels.
print(predictor.encode_label(y_test_cat))

pkSeqID
792371     2
2056418    1
2795650    0
2118009    1
303688     3
          ..
1571905    2
2787099    0
2255382    1
588946     3
2577420    1
Name: category, Length: 733705, dtype: int64


In [101]:
# NOTE(review): the recorded output below holds label strings, while in the class
# definitions shown in this notebook the decoding to names is done by
# `predict_label`, not `predict`. This cell's execution count (101) is lower than
# those of the class cells (102, 107), so the output presumably predates them —
# re-run to confirm.
predictor.predict(X_test)

array(['DoS UDP', 'DDoS TCP', 'DDoS UDP', ..., 'DDoS TCP', 'DoS TCP',
       'DDoS TCP'], dtype='<U29')

### XGBoost

In [63]:
class XGBoostNoPreprocPipeline(BaseModelPipeline):
  """Wrapper around the stored 'xgboost_model_final' model.

  Input preparation selects a fixed set of columns, coerces the port columns to
  numbers and one-hot encodes 'proto'. Predicted codes are decoded by position
  in `categories`.
  """

  def __init__(self) -> None:
    # Category names; the list position is the encoded class value.
    self.categories = [
        'DDoS TCP', 'DDoS UDP', 'DoS TCP', 'DoS UDP', 'Normal',
        'Reconnaissance OS_Fingerprint', 'Reconnaissance Service_Scan',
        'Theft',
    ]
    super().__init__('xgboost_model_final', None, True)

  def fit_transform(self, X: pd.DataFrame):
    """Return the model-ready feature frame for `X`."""
    feature_columns = [
        'proto', 'sport', 'dport', 'state_number',
        'mean', 'stddev', 'min', 'max', 'srate', 'drate',
        'N_IN_Conn_P_SrcIP', 'N_IN_Conn_P_DstIP'
    ]
    features = X[feature_columns].copy()

    # Ports that cannot be parsed as numbers become NaN and are then replaced by 0.
    # NOTE(review): '0x…' hex ports would presumably end up as 0 here, unlike in
    # process_port — confirm this matches how the model was trained.
    for port_column in ('sport', 'dport'):
      numeric_port = pd.to_numeric(features[port_column], errors='coerce')
      features[port_column] = numeric_port.fillna(0)

    # NOTE(review): the dummy columns depend on the 'proto' values present in X,
    # so a small or single-protocol input may yield different columns — verify
    # against the columns the model expects.
    return pd.get_dummies(features, columns=['proto'], drop_first=True)

  def category(self, encoded):
    """Map encoded class values to category names, returned as a NumPy array."""
    return np.array([self.categories[code] for code in encoded])

In [74]:
# Rebinds `predictor` (previously the random-forest wrapper) to the XGBoost wrapper;
# instantiating it loads the stored 'xgboost_model_final' model from disk.
predictor = XGBoostNoPreprocPipeline()

In [75]:
# Run the wrapper on the training features.
# NOTE(review): the recorded output holds label strings, which in the class
# definitions shown here come from `predict_label`, not `predict` — the output may
# predate the current BaseModelPipeline definition; re-run to confirm.
predictor.predict(X_train)

array(['DDoS UDP', 'DDoS TCP', 'DDoS TCP', ..., 'DoS UDP', 'DoS TCP',
       'DoS TCP'], dtype='<U29')

In [76]:
# Persist the whole wrapper object with joblib at get_pipeline_path(False, 'xgboost').
save_pipeline(predictor, False, 'xgboost')

In [77]:
# Load the wrapper back from the file written by the previous cell.
# NOTE: joblib.load unpickles the file; only load files from a trusted location.
pipe = joblib.load(get_pipeline_path(False, 'xgboost'))

In [78]:
# Round-trip check: the reloaded wrapper is run on the same input as `predictor` above.
pipe.predict(X_train)

array(['DDoS UDP', 'DDoS TCP', 'DDoS TCP', ..., 'DoS UDP', 'DoS TCP',
       'DoS TCP'], dtype='<U29')