In [27]:
# Mount Google Drive at /content/drive so the project files referenced below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading the Data

In [28]:
%matplotlib inline
import gc
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier

In [29]:
# Root folder of the project inside the mounted Google Drive.
PROJECT_PATH = '/content/drive/My Drive/MLProject'
# Name used for this model's figure folder and for the saved .joblib file.
# NOTE(review): "neirest" looks like a misspelling of "nearest"; it is left as-is
# because the string is part of the on-disk paths built below.
MODEL_NAME = 'K-neirest-neighbors'
# Selects the "with" / "without" sub-folder used for figures and saved models.
WITH_PREPROCESSING = True
FIGURES_PATH = f'{PROJECT_PATH}/figures/{MODEL_NAME}/{"with" if WITH_PREPROCESSING else "without"}'
MODEL_PATH = f'{PROJECT_PATH}/models/{"with" if WITH_PREPROCESSING else "without"}'
def get_figure_path(name: str):
  """Build the PNG file path for a figure stored under ``FIGURES_PATH``.

  Args:
    name: Figure name without extension.

  Returns:
    The string ``<FIGURES_PATH>/<name>.png``.
  """
  return '{}/{}.png'.format(FIGURES_PATH, name)

# Locations of the training and test CSV files.
DATASETS_PATH = f'{PROJECT_PATH}/data/recommended'
TRAINING_PATH = f'{DATASETS_PATH}/training/training.csv'
TESTING_PATH = f'{DATASETS_PATH}/test/test.csv'
# Column used as the DataFrame index when the CSV files are read.
INDEX_COL = 'pkSeqID'
# Columns kept from the CSV files, in this order. The last three
# ('attack', 'category', 'subcategory') are target columns that are
# dropped from the feature matrix later in the notebook.
COLUMNS_ORDERED = [
    'min', 'max', 'mean', 'stddev',
    'saddr', 'sport', 'daddr', 'dport',
    'srate', 'drate',
    'N_IN_Conn_P_SrcIP', 'N_IN_Conn_P_DstIP',
    'state_number', 'proto',
    'seq',
    'attack', 'category', 'subcategory'
]

In [30]:
import joblib

def save_model(model, name = MODEL_NAME):
    """Serialize ``model`` with joblib to ``<MODEL_PATH>/<name>.joblib``.

    Args:
        model: Object to persist (for example a fitted pipeline).
        name: File name without extension; defaults to ``MODEL_NAME``.
    """
    target = f'{MODEL_PATH}/{name}.joblib'
    # Create the destination folder first so that a fresh project layout
    # does not make the dump fail after an expensive training run.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    joblib.dump(model, target)

In [31]:
# Read the training CSV and keep only the columns used by this notebook.
training = pd.read_csv(TRAINING_PATH, index_col=INDEX_COL)[COLUMNS_ORDERED]
# Work on a reproducible random subset of at most 500000 rows. The size is capped
# at the number of available rows because DataFrame.sample (without replacement)
# raises ValueError when n is larger than the frame.
training = training.sample(n=min(500000, len(training)), random_state=42)
training.head()

Unnamed: 0_level_0,min,max,mean,stddev,saddr,sport,daddr,dport,srate,drate,N_IN_Conn_P_SrcIP,N_IN_Conn_P_DstIP,state_number,proto,seq,attack,category,subcategory
pkSeqID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1467749,0.0,4.909888,3.848087,1.924755,192.168.100.150,10141,192.168.100.3,80,0.605122,0.0,100,100,4,udp,64006,1,DoS,UDP
3624793,0.003553,0.003553,0.003553,0.0,192.168.100.150,2969,192.168.100.3,981,281.452301,0.0,100,100,1,tcp,1005,1,Reconnaissance,Service_Scan
1183354,0.0,4.880579,2.330006,1.979408,192.168.100.149,56228,192.168.100.5,80,0.220999,0.0,52,52,4,udp,41765,1,DoS,UDP
661039,0.010402,3.719587,2.723464,1.567404,192.168.100.148,14896,192.168.100.6,80,0.231343,0.0,66,66,4,udp,43762,1,DoS,UDP
2363723,0.0,3.350417,2.151402,1.5246,192.168.100.148,31041,192.168.100.3,80,0.314317,0.0,87,100,3,tcp,188183,1,DDoS,TCP


In [32]:
# Read the full test CSV (no sub-sampling) and keep the same ordered columns as the training frame.
testing = pd.read_csv(TESTING_PATH, index_col=INDEX_COL)[COLUMNS_ORDERED]

# Final Pipeline

After experimenting with several additional preprocessing techniques, the scores only got worse.

So a simple pipeline is all that this particular dataset needs.

## Handling Target

In [33]:
def process_categories(cat):
  """Collapse a combined "category subcategory" label into its final class name.

  Rules, applied in order:
    * labels whose string form starts with "theft" (case-insensitive) -> 'Theft'
    * labels whose string form starts with "normal" (case-insensitive) -> 'Normal'
    * 'DoS HTTP' -> 'DoS TCP' and 'DDoS HTTP' -> 'DDoS TCP'
    * anything else is returned unchanged.
  """
  lowered = str(cat).lower()
  for prefix, merged_label in (('theft', 'Theft'), ('normal', 'Normal')):
    if lowered.startswith(prefix):
      return merged_label
  # HTTP floods are folded into the matching TCP class.
  if cat == 'DoS HTTP':
    return 'DoS TCP'
  return 'DDoS TCP' if cat == 'DDoS HTTP' else cat


In [34]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Target encoders for the combined category label: a one-hot version and an
# integer-label version. Both are fitted on the training targets further down.
cat_encoder_hot = OneHotEncoder()
cat_encoder_label = LabelEncoder()

In [35]:
# Features: every column except the three target columns.
X_train = training.drop(columns=['attack', 'category', 'subcategory'])
# Target: "category subcategory" joined by a space, then collapsed by process_categories.
y_train_cat = (
    (training['category'] + ' ' + training['subcategory'])
    .map(process_categories)
    .to_frame(name='category')
)
# Both encoders are fitted here, on the training targets.
y_train_hot = cat_encoder_hot.fit_transform(y_train_cat)
y_train_label = cat_encoder_label.fit_transform(y_train_cat['category'])

In [36]:
# Features: every column except the three target columns.
X_test = testing.drop(columns=['attack', 'category', 'subcategory'])
# Target: "category subcategory" joined by a space, then collapsed by process_categories.
y_test_cat = (
    (testing['category'] + ' ' + testing['subcategory'])
    .map(process_categories)
    .to_frame(name='category')
)
# Only transform here: the encoders were already fitted on the training targets.
y_test_hot = cat_encoder_hot.transform(y_test_cat)
y_test_label = cat_encoder_label.transform(y_test_cat['category'])

In [37]:
# The raw frames are not needed once features and targets have been extracted;
# drop the references and ask the garbage collector to reclaim the memory.
del training, testing
gc.collect()

37

## Preprocessing Pipeline

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer, StandardScaler

In [39]:
def process_port(p):
  """Convert a single port value to ``int``.

  Values whose string form starts with '0x' are parsed as hexadecimal;
  everything else is passed to ``int`` directly.
  """
  if str(p).startswith('0x'):
    return int(p, 16)
  return int(p)

def process_ports(ports: pd.DataFrame):
  """Apply ``process_port`` to every cell of ``ports``.

  Args:
    ports: Frame holding the port columns.

  Returns:
    A frame of the same shape with each cell converted by ``process_port``.
  """
  # DataFrame.map was introduced in pandas 2.1; older releases expose the same
  # element-wise operation as DataFrame.applymap. The check is done on the class
  # so that a column named "map" cannot be mistaken for the method.
  if hasattr(type(ports), 'map'):
    return ports.map(process_port)
  return ports.applymap(process_port)


In [40]:
# Constant added to the rate columns before taking log10 (see shift_and_log and
# CombinedFeatureAdder), so that a rate of zero does not produce log10(0).
DEFAULT_RATE_SHIFT = 1.1

In [41]:
def shift_and_log(data, shift=DEFAULT_RATE_SHIFT):
  """Return ``log10(data + shift)``.

  The default shift (1.1) was chosen so that the output has no zero values
  and a small change in the input is not that significant.
  """
  shifted = data + shift
  return np.log10(shifted)

In [42]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional indices of the 'srate' and 'drate' columns in X_train.
# NOTE(review): these are not referenced anywhere in the visible notebook
# (CombinedFeatureAdder selects the columns by name) - confirm whether they are still needed.
srate_idx, drate_idx = [list(X_train.columns).index('srate'), list(X_train.columns).index('drate')]
class CombinedFeatureAdder(BaseEstimator, TransformerMixin):
  """Transformer that appends a ``srate_to_drate`` column to a DataFrame.

  The new column is ``log10(srate + DEFAULT_RATE_SHIFT) / log10(drate + DEFAULT_RATE_SHIFT)``.
  When ``normalize`` is true, ``log1p`` is applied to that ratio before it is stored.
  """

  def __init__(self, normalize=True) -> None:
    super().__init__()
    self.normalize = normalize

  def fit(self, X, y=None):
    """Nothing is learned from the data; returns ``self``."""
    return self

  def transform(self, X: pd.DataFrame, y=None):
    """Return a copy of ``X`` with the extra ``srate_to_drate`` column."""
    log_srate = np.log10(X['srate'] + DEFAULT_RATE_SHIFT)
    log_drate = np.log10(X['drate'] + DEFAULT_RATE_SHIFT)
    ratio = log_srate / log_drate
    feature = np.log1p(ratio) if self.normalize else ratio
    return X.assign(srate_to_drate=feature)

In [43]:
# Columns removed from the feature matrix before modelling.
TO_DROP = ['saddr', 'daddr', 'seq']

# Per-column cleaning. Columns that are not listed in any step are passed
# through unchanged (remainder='passthrough').
data_cleaner = ColumnTransformer([
  ('drop', 'drop', TO_DROP),
  # One-hot encode the protocol; categories not seen during fit are ignored.
  ('encode', OneHotEncoder(handle_unknown="ignore"), ['proto']),
  # Convert port values (hexadecimal or decimal) to integers.
  ('port', FunctionTransformer(func=process_ports), ['sport', 'dport']),
  # log10 of the shifted source / destination rates.
  ('rate', FunctionTransformer(func=shift_and_log), ['srate', 'drate']),
], remainder='passthrough')

# Complete preprocessing: add the srate/drate ratio feature, clean the columns,
# then standardise the resulting features.
preprocessing = Pipeline([
    ('augment', CombinedFeatureAdder()),
    ('clean', data_cleaner),
    ('std', StandardScaler()),
])

In [44]:
%%script true
# Disabled cell: the `%%script true` magic hands the cell body to the `true` command,
# so the Python below is not executed. Remove the magic to inspect the preprocessed features.
X_train_cleaned = preprocessing.fit_transform(X_train)

## Full Pipeline

In [45]:
from cuml.linear_model import LogisticRegression

In [46]:
# End-to-end model: preprocessing followed by the classifier.
pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('model', KNeighborsClassifier(n_neighbors=5)), # k-nearest neighbours with k=5
])

In [47]:
# Fit the preprocessing steps and the classifier on the training features and the integer-encoded labels.
pipeline.fit(X_train, y_train_label)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [48]:
# Predict the encoded class label for every row of the test features.
predictions = pipeline.predict(X_test)

In [49]:
from sklearn.metrics import accuracy_score
# Share of test rows whose predicted label equals the encoded ground truth.
score = accuracy_score(y_true=y_test_label, y_pred=predictions)
print(f'Model accuracy: {score}')

Model accuracy: 0.9962246406934667


In [50]:
# Persist the fitted pipeline under MODEL_PATH, using the default file name (MODEL_NAME).
save_model(pipeline)

# Verify Model Usage

In [51]:
# Reload the pipeline from disk and predict again to check that the saved file is usable.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code - only load files you trust.
# NOTE(review): the pipeline uses process_ports, shift_and_log and CombinedFeatureAdder, which are
# defined in this notebook; presumably they must be defined again in any other session that
# loads this file - confirm before reusing the artifact elsewhere.
model = joblib.load(f'{MODEL_PATH}/{MODEL_NAME}.joblib')
new_preds = model.predict(X_test)

In [52]:
# Accuracy of the reloaded pipeline on the same test labels.
score = accuracy_score(y_true=y_test_label, y_pred=new_preds)
print(f'Model accuracy: {score}')

Model accuracy: 0.9962246406934667
