In [83]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading the data

In [84]:
%matplotlib inline

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Root of the project on the mounted Google Drive; every other path below is derived from it.
PROJECT_PATH = '/content/drive/My Drive/Datasets/ML Project'
DATASETS_PATH = f'{PROJECT_PATH}/data/recommended'
FIGURE_PATH = f'{PROJECT_PATH}/figures/cleaning'  # where this notebook saves its figures
TRAINING_PATH = f'{DATASETS_PATH}/training/training.csv'
TESTING_PATH = f'{DATASETS_PATH}/test/test.csv'
INDEX_COL = 'pkSeqID'  # CSV column used as the DataFrame index when loading
# Columns selected from the raw CSVs, in the order they are arranged right after loading.
BEST_FEATURE_ORDER = [
  'seq', 'min', 'max', 'mean', 'stddev',
  'N_IN_Conn_P_SrcIP', 'N_IN_Conn_P_DstIP',
  'drate', 'srate',
  'saddr', 'sport', 'daddr', 'dport', 'proto',
  'state_number',
  # label columns, examined in the target-analysis sections further down
  'attack', 'category', 'subcategory'
]

In [85]:
# Parse only the index column plus the selected features. Without `usecols`
# pandas reads every column of the CSV and most of them are thrown away by the
# selection that follows, which costs load time and peak memory.
# `usecols` does not control column order, so the frame is still re-ordered
# with BEST_FEATURE_ORDER afterwards.
training = pd.read_csv(
    TRAINING_PATH,
    index_col=INDEX_COL,
    usecols=[INDEX_COL, *BEST_FEATURE_ORDER],
)[BEST_FEATURE_ORDER]
training.head()

KeyboardInterrupt: 

In [None]:
# Parse only the index column plus the selected features instead of reading the
# whole CSV and discarding columns afterwards. `usecols` does not control
# column order, hence the explicit re-ordering with BEST_FEATURE_ORDER.
testing = pd.read_csv(
    TESTING_PATH,
    index_col=INDEX_COL,
    usecols=[INDEX_COL, *BEST_FEATURE_ORDER],
)[BEST_FEATURE_ORDER]

# General Analysis

In [None]:
print(f'Training Shape: {training.shape}')
print(f'Testing Shape: {testing.shape}')

In [None]:
training.info()

In [None]:
import numpy as np

# Partition the column names by dtype: object ('O') columns are treated as
# non-numerical, everything else as numerical.
numerical, non_numerical = [], []
for column_name, column_dtype in zip(training.columns, training.dtypes):
    if column_dtype == 'O':
        non_numerical.append(column_name)
    else:
        numerical.append(column_name)

print(f'There are {len(numerical)} numerical features, and {len(non_numerical)} non numerical features')

In [None]:
training.isna().sum(axis=0)

In [None]:
# Number of fully duplicated rows. The vectorised Series.sum() replaces the
# builtin sum(), which would walk the boolean Series one element at a time in Python.
training.duplicated().sum()

The dataset has no duplicated rows and no missing values.

In [None]:
training.head()

# Handling Unnecessary Data

## saddr Column

In [None]:
training['saddr'].value_counts()

Most of the source addresses belong to a local network, so they are not very useful in a real-world scenario.

**Conclusion:**
`saddr` probably should be dropped

In [None]:
TO_DROP = ['saddr']

## daddr Column

In [None]:
training['daddr'].value_counts()

In [None]:
# Flag destination addresses inside the 192.168.x.x range, then keep the rest.
is_local_daddr = training['daddr'].astype(str).str.startswith('192.168')
non_local_addr = training.loc[~is_local_daddr, 'daddr']

l_non_local = len(non_local_addr)
l_local = len(training) - l_non_local
print(f"Non-Local to Local ratio: {l_non_local} / {l_local} = {l_non_local / l_local}")

# Frequency of each non-local destination address.
non_local_addr.value_counts()

A large share of the destination addresses belong to a local network and are unlikely to occur in a real-world scenario.

**Conclusion:** `daddr` is to be dropped

In [None]:
TO_DROP += ['daddr']
print(TO_DROP)

## seq Column

In [None]:
training.head()

In [None]:
training.describe()

The `seq` column is defined as the Argus sequence number (Argus is an open-source tool used as a data source for this **ML for anomaly detection** use case).

Since this model is meant to be general and the `seq` doesn't carry any useful information it should be dropped.

In [None]:
TO_DROP += ['seq']

In [None]:
training.head()

## attack Column

In [None]:
training['category'].value_counts()

In [None]:
# Cross-check the binary `attack` flag against the `category` label in both
# directions; both counts being 0 means the two columns agree on what is normal traffic.
categories_of_benign = training.loc[training['attack'] == 0, 'category']
attack_flags_of_normal = training.loc[training['category'] == 'Normal', 'attack']

# Vectorised .sum() instead of the builtin sum(), which iterates the Series in Python.
n_benign_not_normal = (categories_of_benign != 'Normal').sum()
n_normal_flagged = (attack_flags_of_normal != 0).sum()

print(f"# Normal traffic (using attack features) not classified as 'Normal' category: {n_benign_not_normal}")
print(f"# Normal traffic (using category features) not equal to 0 for attack: {n_normal_flagged}")

The `category` column already carries the information stored in the `attack` column, so `attack` can be dropped.

In [None]:
TO_DROP += ['attack']
print(TO_DROP)

## Removing Unnecessary Information

In [None]:
# Remove the columns collected in TO_DROP from both splits.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the columns are already gone is a no-op
# rather than a KeyError. Trade-off: a misspelled name in TO_DROP is silently
# skipped, so check the remaining columns if in doubt.
training = training.drop(columns=TO_DROP, errors='ignore')
testing = testing.drop(columns=TO_DROP, errors='ignore')

# Analysis of Target Features

## Selecting a New Target

In [None]:
training['category'].value_counts()

In [None]:
training['subcategory'].value_counts()

The `subcategory` column also contains useful information about the type of attack, so it is worth keeping as part of the target.
It is therefore reasonable to have a single column combining both `category` and `subcategory`.

In [None]:
# Build the combined target "<category> <subcategory>" on both splits.
for frame in (training, testing):
    frame['category'] = frame['category'] + ' ' + frame['subcategory']

In [None]:
# `subcategory` is now folded into `category`, so remove it from both splits.
for frame in (training, testing):
    frame.drop(columns='subcategory', inplace=True)

In [None]:
training['category'].value_counts()

In [None]:
# Size of every class expressed as a percentage of the largest class.
category_counts = training['category'].value_counts()
category_counts.div(category_counts.max()).mul(100)

In [None]:
# Pie chart of the class distribution before the target is cleaned.
category_counts = training['category'].value_counts()
fig, ax = plt.subplots(figsize=(8, 14))
colors = plt.cm.Set3(range(len(category_counts)))  # one Set3 colour per category
ax.pie(category_counts, labels=None, colors=colors, autopct='%1.1f%%')
ax.legend(category_counts.index, loc='lower right', bbox_to_anchor=(1.2, 0))
ax.set_title('Pie Chart of Attack Categories (Before)')
# The legend is anchored outside the axes (x = 1.2). bbox_inches='tight' grows
# the saved canvas to include it; without it the PNG on disk crops the legend.
fig.savefig(f'{FIGURE_PATH}/category_pie_before.png', bbox_inches='tight')

We also notice that the data is heavily skewed across attack types (`Theft Data_Exfiltration` is only 0.0007% of the number of `DDoS UDP` attack instances).
This needs to be addressed, or the results might be misleading.

One quick fix is to merge the `Theft` subcategories into a single category: since there are not many `Theft Data_Exfiltration` instances, the category can be generalized to `Theft`

Also the `Normal Normal` category can be changed for presentation purposes

## Cleaning the Target

In [None]:
training['category'].value_counts()

In [None]:
def clean_target(cat):
    """Collapse a combined target label to its top-level class where needed.

    Any label whose string form starts with "theft" (case-insensitive) becomes
    'Theft', any label starting with "normal" becomes 'Normal'; every other
    label is returned unchanged.
    """
    label = str(cat).lower()
    if label.startswith('theft'):
        return 'Theft'
    if label.startswith('normal'):
        return 'Normal'
    return cat


training['category'] = training['category'].map(clean_target)
training['category'].value_counts()

In [None]:
testing['category'] = testing['category'].map(clean_target)

Another change that could be made is merging HTTP and TCP traffic, since HTTP builds on top of TCP

so `DoS HTTP` can be merged with `DoS TCP`, and `DDoS HTTP` with `DDoS TCP`

In [None]:
# NOTE: this rebinds `clean_target`, the name already used by the Theft/Normal
# cleaning step earlier in the notebook, so the cells must be run in order.
def clean_target(cat):
    """Fold the HTTP flood labels into their TCP counterparts; pass other labels through."""
    if cat == 'DoS HTTP':
        return 'DoS TCP'
    if cat == 'DDoS HTTP':
        return 'DDoS TCP'
    return cat


training['category'] = training['category'].map(clean_target)
training['category'].value_counts()

In [None]:
testing['category'] = testing['category'].map(clean_target)

In [None]:
# Pie chart of the class distribution after the target has been cleaned.
category_counts = training['category'].value_counts()
fig, ax = plt.subplots(figsize=(8, 14))
colors = plt.cm.Set3(range(len(category_counts)))  # one Set3 colour per category
ax.pie(category_counts, labels=None, colors=colors, autopct='%1.1f%%')
ax.legend(category_counts.index, loc='lower right', bbox_to_anchor=(1.2, 0))
ax.set_title('Pie Chart of Attack Categories (After)')
# The legend is anchored outside the axes (x = 1.2). bbox_inches='tight' grows
# the saved canvas to include it; without it the PNG on disk crops the legend.
fig.savefig(f'{FIGURE_PATH}/category_pie_after.png', bbox_inches='tight')

# Checkpointing

In [None]:
# Checkpoint files for the cleaned splits, stored next to the raw CSVs.
# NOTE(review): the two names are inconsistent (`TESTING_CLEANED` has no `_PATH`
# suffix); both are referenced by the checkpoint cells below, so rename with care.
TESTING_CLEANED = f'{DATASETS_PATH}/test/test_cleaned.csv'
TRAINING_CLEANED_PATH = f'{DATASETS_PATH}/training/training_cleaned.csv'

## Creation

In [None]:
# Move the index back into a regular column so it is written as the first CSV
# column, save the training checkpoint, and preview what was written.
tmp = training.reset_index(drop=False)
tmp.to_csv(TRAINING_CLEANED_PATH, index=False)
tmp.head()

In [None]:
# Same as for the training split: keep the index as a regular column and save the testing checkpoint.
tmp = testing.reset_index(drop=False)
tmp.to_csv(TESTING_CLEANED, index=False)

## Recovery

In [None]:
training = pd.read_csv(TRAINING_CLEANED_PATH, index_col=INDEX_COL)

In [None]:
testing = pd.read_csv(TESTING_CLEANED, index_col=INDEX_COL)

In [None]:
training.head()

# Categorical Features

## sport and dport

In [None]:
training['sport'].value_counts()

In [None]:
training['sport'][training['sport'].map(lambda port: str(port).startswith('0x'))]

In [None]:
training['dport'][training['dport'].map(lambda port: str(port).startswith('0x'))]

As we can see some of the port numbers are written in hexadecimal when it comes to `sport` and `dport`, so they need to be all converted to decimal, and the column type converted to int64

In [None]:
training.info()

In [None]:
def convert_hex(p):
    """Return the port value `p` as an int.

    Accepts ints, decimal strings and `0x`-prefixed hexadecimal strings. The
    hex prefix is matched case-insensitively and surrounding whitespace is
    ignored, so '0X1F' and ' 0x1f ' are parsed as hex instead of raising.

    Raises:
        ValueError: if `p` is neither a valid decimal nor a valid hex value.
    """
    text = str(p).strip()
    if text.lower().startswith('0x'):
        return int(text, base=16)
    # Convert the original object rather than its string form, so numeric
    # inputs (e.g. an int already parsed by pandas) keep working.
    return int(p)

In [None]:
training['sport'] = training['sport'].map(convert_hex)
training['sport']

In [None]:
training['dport'] = training['dport'].map(convert_hex)
training['dport']

In [None]:
# Apply the same port normalisation to the testing split.
for port_col in ('sport', 'dport'):
    testing[port_col] = testing[port_col].map(convert_hex)

In [None]:
training.info()

## Real Categorical Features

In [None]:
training['proto'].value_counts()

In [None]:
training['category'].value_counts()

Some of the columns are categorical and can be handled with `OneHotEncoder`, for example.

# Checkpointing

## Creation

In [None]:
# Second checkpoint: writes to the same TRAINING_CLEANED_PATH as the earlier
# checkpoint, replacing that file with the current state of `training`.
tmp = training.reset_index(drop=False)
tmp.to_csv(TRAINING_CLEANED_PATH, index=False)
tmp.head()

In [None]:
# Second checkpoint for the testing split; writes to the same TESTING_CLEANED file as the earlier checkpoint.
tmp = testing.reset_index(drop=False)
tmp.to_csv(TESTING_CLEANED, index=False)

## Recovery

In [None]:
training = pd.read_csv(TRAINING_CLEANED_PATH, index_col=INDEX_COL)

In [None]:
testing = pd.read_csv(TESTING_CLEANED, index_col=INDEX_COL)

In [None]:
training['category'].value_counts()

# Data Exploration

In [None]:
training['category'].value_counts()