# Outlier Detection

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import sys
import math
import matplotlib.pyplot as plt
import datetime

In [None]:
# Path of the Parquet file that is read into `df` by pd.read_parquet below.
inputFile = "data/two-hour-sample.parquet"

In [None]:
# Load the Parquet sample into a DataFrame, then show its dimensions and
# first rows as a quick sanity check.
df = pd.read_parquet(inputFile)

print(df.shape, df.head(), sep="\n")

In [None]:
# Keep only the columns used in this analysis and relabel them with
# snake_case names. `colnames` lists the original labels in the order kept;
# the new labels below are positional, so the two lists must stay in step.
# NOTE(review): "SrcBytes" is relabelled 'bytes_toclient' - confirm that the
# direction implied by the new name is intended.
colnames = [
    "StartTime", "Dur", "Proto", "SrcAddr", "Sport", "Dir",
    "DstAddr", "Dport", "TotPkts", "TotBytes", "SrcBytes",
]
df = df[colnames].set_axis(
    [
        'timestamp', 'duration', 'proto', 'src_ip', 'src_port', 'direction',
        'dest_ip', 'dest_port', 'tot_pkts', 'tot_bytes', 'bytes_toclient',
    ],
    axis=1,
)
# Keep the index values as an explicit column so rows can still be
# identified after the merges performed later on.
df['row_id'] = df.index

In [None]:
# Clean up missing ports: substitute 0 for NaN in both port columns.
# fillna() returns a new object rather than modifying in place, so the result
# must be assigned back; the per-column dict restricts the fill to the two
# port columns and leaves missing values in every other column untouched.
df = df.fillna({'src_port': 0, 'dest_port': 0})

In [None]:
# Placeholder for this example: every row gets day = 1. Normally the day
# would be extracted from the timestamp. Because the column is never null,
# the feature cells below also use it as the column to count per group.
df['day'] = 1

## Feature Creation

In [None]:
#### Add Total Counts (How much overall traffic to this IP?)
# Total number of rows; used as the denominator of the IP percentages.
totalCount = len(df)

# Rows per source address (non-null `proto` values are what gets counted).
srcDf = (
    df.groupby('src_ip', as_index=False)['proto']
    .count()
    .rename(columns={"proto": "src_count"})
)
print(srcDf.head())

# Rows per destination address, computed the same way.
destDf = (
    df.groupby('dest_ip', as_index=False)['proto']
    .count()
    .rename(columns={"proto": "dest_count"})
)
print(destDf.head())

# Attach both counts to every row with left joins on the address columns.
src_joined = pd.merge(
    df, srcDf, on='src_ip', how='left', suffixes=('', '_count'))
df2 = pd.merge(
    src_joined, destDf, on=['dest_ip'], how='left', suffixes=('', '_count'))

In [None]:
##### Compute IP percentages
# Column positions of the two count columns, printed for reference only.
srcCol = df2.columns.get_loc('src_count')
destCol = df2.columns.get_loc('dest_count')
print(f"{srcCol} {destCol}")

# Per-address share of all rows. Despite the `_pct` suffix these are plain
# ratios (count / total), not values multiplied by 100.
dfa = df2.assign(src_pct=df2['src_count'] / totalCount)
dfb = dfa.assign(dest_pct=dfa['dest_count'] / totalCount)

In [None]:
#### Compute Protocol Percentages

# Rows per (source address, protocol) pair, counted on the `day` column.
srcDf = (
    dfb.groupby(['src_ip', 'proto'], as_index=False)['day']
    .count()
    .rename(columns={"day": "src_proto_count"})
)

# Rows per (destination address, protocol) pair.
destDf = (
    dfb.groupby(['dest_ip', 'proto'], as_index=False)['day']
    .count()
    .rename(columns={"day": "dest_proto_count"})
)

# Left-join both pair counts back onto the row-level frame.
src_joined = pd.merge(
    dfb, srcDf, on=['src_ip', 'proto'], how='left', suffixes=('', '_count'))
df3 = pd.merge(
    src_joined, destDf, on=['dest_ip', 'proto'], how='left',
    suffixes=('', '_count'))

# Ratio of the address+protocol count to the address-only count.
df4 = df3.assign(
    src_proto_pct=df3['src_proto_count'] / df3['src_count'])
df5 = df4.assign(
    dest_proto_pct=df3['dest_proto_count'] / df3['dest_count'])

In [None]:
#### Compute Protocol Port Percentages

# Step 1: rows per protocol overall (non-null `src_port` values are counted).
protoDf = (
    df5.groupby('proto', as_index=False)['src_port']
    .count()
    .rename(columns={"src_port": "proto_count"})
)
df6 = pd.merge(
    df5, protoDf, on='proto', how='left', suffixes=('', '_count'))

# Step 2: rows per (protocol, source port), expressed as a ratio of the
# protocol total from step 1.
protoSPortDf = (
    df6.groupby(['proto', 'src_port'], as_index=False)['day']
    .count()
    .rename(columns={"day": "proto_src_port_count"})
)
df7 = pd.merge(
    df6, protoSPortDf, on=['proto', 'src_port'], how='left',
    suffixes=('', '_count'))
df8 = df7.assign(
    proto_src_port_pct=df7['proto_src_port_count'] / df7['proto_count'])

print(df8.head())

# Step 3: the same ratio for (protocol, destination port).
protoDPortDf = (
    df8.groupby(['proto', 'dest_port'], as_index=False)['day']
    .count()
    .rename(columns={"day": "proto_dest_port_count"})
)
df9 = pd.merge(
    df8, protoDPortDf, on=['proto', 'dest_port'], how='left',
    suffixes=('', '_count'))
df10 = df9.assign(
    proto_dest_port_pct=df9['proto_dest_port_count'] / df9['proto_count'])

In [None]:
# Compute standardized counts for number based features
# One StandardScaler instance is re-fitted for each column in turn, so every
# feature is standardized independently of the others.
scaler = preprocessing.StandardScaler()

df10['pkts_scaled'] = scaler.fit_transform(df10[['tot_pkts']])
df10['bytes_scaled'] = scaler.fit_transform(df10[['tot_bytes']])
df10['duration_scaled'] = scaler.fit_transform(df10[['duration']])

# Absolute values of the standardized columns: magnitude of the deviation
# from the mean, regardless of direction.
df = df10.assign(
    abs_pkts=abs(df10['pkts_scaled']),
    abs_bytes=abs(df10['bytes_scaled']),
    abs_dur=abs(df10['duration_scaled']),
)

In [None]:
# Columns fed to the outlier detector: the six ratio features plus the
# absolute standardized packet count. `abs_bytes` and `abs_dur` are computed
# above but are not part of this list.
featureList = ['src_pct', 'dest_pct', 'src_proto_pct', 'dest_proto_pct',
                    'proto_src_port_pct', 'proto_dest_port_pct', 'abs_pkts']

In [None]:
# Check the shape (rows, columns) of the full data, including every
# engineered feature column added so far.
print(df.shape)

In [None]:
# Restrict the data to the model's feature columns and preview the result.
trainDf = df.loc[:, featureList]
print(trainDf.shape, trainDf.head(), sep="\n")

In [None]:
# Import Outlier Math
from scipy import stats
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# Seeded random state; in this file it is referenced only by the
# commented-out IsolationForest entry of `classifiers`.
rng = np.random.RandomState(42)

# Example settings
# n_samples is referenced only by the commented-out IsolationForest entry.
n_samples = 100000
# Used as the detector's `contamination` and as the percentile at which the
# score threshold is taken in the model loop.
outliers_fraction = 0.01  # TODO: Tweak this parameter
# Not referenced anywhere else in the visible part of this file.
clusters_separation = [0, 1, 2]

In [None]:
# Set up the possibility to run multiple outlier detectors
# For the purposes of time we will only run Local Outlier Factor
# Isolation Forest is another quick and easy one to try
# Maps a short detector name to an unfitted estimator. The name doubles as
# the column prefix for that detector's results (`<name>` / `<name>_pred`).
# The commented-out entries are alternative detectors that can be enabled.
classifiers = {
    # "svm": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
    #                                  kernel="rbf", gamma=0.1),
    # "rc": EllipticEnvelope(contamination=outliers_fraction),
    # "iso": IsolationForest(max_samples=n_samples,
    #                                     contamination=outliers_fraction,
    #                                     random_state=rng),
    "lof": LocalOutlierFactor(
        n_neighbors=25,
        contamination=outliers_fraction)
}

In [None]:
## Run the Model
# Fit every configured detector on the training features. For each detector
# the raw score is stored in column `<name>` and the predicted label in
# column `<name>_pred` of `df`.
for clf_name, clf in classifiers.items():
    now = datetime.datetime.now()
    print(f"Starting {clf_name} {now}")
    # fit the data and tag outliers
    if clf_name == "lof":
        # LocalOutlierFactor is handled separately: labels come from
        # fit_predict and scores from the fitted negative_outlier_factor_
        # attribute rather than from decision_function / predict.
        y_pred = clf.fit_predict(trainDf)
        scores_pred = clf.negative_outlier_factor_
    else:
        clf.fit(trainDf)
        scores_pred = clf.decision_function(trainDf)
        y_pred = clf.predict(trainDf)
    # Score value at the `outliers_fraction` percentile; it is only printed
    # for reference, the labels above come from the estimator itself.
    threshold = stats.scoreatpercentile(scores_pred,
                                        100 * outliers_fraction)

    print(clf_name)
    print(threshold)
    print(scores_pred)

    df[clf_name] = scores_pred
    df[clf_name + "_pred"] = y_pred

# Summary of the augmented frame once all detectors have run.
print(df.head())
print(df.shape)
print(df.size)

now = datetime.datetime.now()
print(f"Complete {now}")

In [None]:
# Number of rows per predicted label. NOTE(review): scikit-learn documents
# fit_predict as returning -1 for outliers and 1 for inliers - confirm
# against the installed version.
df.groupby("lof_pred").size()

In [None]:
# Histogram of the predicted labels: a visual counterpart to the per-label
# row counts.
plt.hist(df["lof_pred"])