#  Exploratory Data Analysis
This notebook walks through some simple, yet invaluable, exploratory data analysis techniques, applied to a CSV export of Argus network flow records.

In [None]:
# Numpy and Pandas are data science heavy lifters
import numpy as np
import pandas as pd

# Plotting helpers used by the visualisation cells further down
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

In [38]:
# Read CSV Argus output from a file into a DataFrame; `df` is the frame that
# every later cell in this notebook works on.
# The path is relative, so it resolves against the kernel's working directory.
filename = "data/two-hour-sample.csv"
df = pd.read_csv(filename)

In [None]:
# `shape` is a (rows, columns) tuple giving the size of the dataframe
df.shape

In [None]:
# `head(20)` shows the first 20 rows of the dataframe
df.head(20)

In [39]:
# List the column labels available in the dataframe
df.columns

Index([u'Seq', u'Cause', u'StartTime', u'Proto', u'SrcAddr', u'Sport', u'Dir',
       u'DstAddr', u'Dport', u'State', u'LastTime', u'Dur', u'TotBytes',
       u'SrcBytes', u'TotAppByte', u'SAppBytes', u'TotPkts', u'SrcPkts',
       u'Load', u'Rate', u'Flgs', u'SIntPkt', u'DIntPkt', u'SIntPktMax',
       u'SIntPktMin', u'DIntPktMax', u'DIntPktMin', u'SrcJitter', u'DstJitter',
       u'SrcWin', u'DstWin', u'TcpRtt', u'StdDev', u'RelTime', u'sMaxPktSz',
       u'sMinPktSz', u'dMaxPktSz', u'dMinPktSz', u'PCRatio'],
      dtype='object')

In [None]:
# `describe` summarises each numerical field: count, mean, standard deviation,
# minimum, quartiles (25% / 50% / 75%) and maximum -- a superset of the
# classic "5-number" summary
df.describe()

In [None]:
# Get the distinct destination-port values, in order of first appearance
df["Dport"].unique()

In [None]:
# Plot a Degree Distribution: histogram of how many rows (flows) each distinct
# destination address appears in.
flows_per_dst = df.groupby("DstAddr").size()

# Explicit figure/axes instead of the implicit pyplot state machine, with a
# title and axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(flows_per_dst)
ax.set(
    title="Degree distribution of destination addresses",
    xlabel="Flows per destination address",
    ylabel="Number of destination addresses",
)
plt.show()

In [None]:
# Select only DNS flows (destination port 53); the box plots are drawn from
# `dns` in the next cell.
# Dport is coerced to numeric before comparing: if read_csv parsed the column
# as strings (which happens when any row holds a non-numeric port value), a
# plain `== 53` would silently match nothing. Unparseable entries become NaN
# and are excluded. When the column is already numeric this is a no-op.
dport_numeric = pd.to_numeric(df["Dport"], errors="coerce")
dns = df[dport_numeric == 53]
dns.shape

In [None]:
# Side-by-side box plots (one row, two columns) of the packet and byte totals
# of the DNS flows; the axes are not shared, so each panel keeps its own scale.
box_columns = ["TotPkts", "TotBytes"]
dns[box_columns].plot.box(subplots=True, layout=(1, 2), sharex=False, sharey=False)
plt.show()

In [None]:
# Pairwise scatter plots of flow duration, packet count and byte count across
# all flows. `scatter_matrix` is imported in the notebook's import cell at the
# top rather than here, so the dependency is visible up front.
flow_metrics = df[["Dur", "TotPkts", "TotBytes"]]
scatter_matrix(flow_metrics)
plt.show()