# Exploratory Data Analysis
This notebook demonstrates a few simple yet invaluable exploratory data analysis techniques.

In [None]:
# NumPy and pandas are the heavy lifters of data science
import numpy as np
import pandas as pd

In [None]:
# Load the CSV file of Argus output into a pandas DataFrame.
# The path is relative, so it is resolved against the current working directory.
filename = "data/two-hour-sample.csv"
df = pd.read_csv(filename)

In [None]:
# `shape` is a (rows, columns) tuple giving the dimensions of the dataframe
df.shape

In [None]:
# `head(20)` returns the first 20 rows of the dataframe for a quick look at the data
df.head(20)

In [None]:
# `columns` holds the column labels of the dataframe
df.columns

In [None]:
# `describe` computes summary statistics of the numerical fields: count, mean,
# standard deviation, min, the 25th/50th/75th percentiles, and max
df.describe()

In [None]:
# Get the distinct values found in the destination port ("Dport") column
df["Dport"].unique()

In [None]:
# Plot a degree distribution: count the rows recorded for each distinct
# destination address ("DstAddr"), then histogram those per-address counts
import matplotlib.pyplot as plt

flows_per_dst = df.groupby("DstAddr").size()
plt.hist(flows_per_dst)
plt.show()

In [None]:
# Keep only the DNS flows, i.e. the rows whose destination port is 53, and
# report the size of that subset (the box plots are drawn in the next cell)
# NOTE(review): the match relies on Dport having been parsed as numbers — confirm
dns = df.loc[df["Dport"].eq(53)]
dns.shape

In [None]:
# Box plots of the TotPkts and TotBytes columns of the DNS subset, drawn side
# by side (one row, two columns), each on its own independent axes
dns[["TotPkts", "TotBytes"]].plot(
    kind="box",
    subplots=True,
    layout=(1, 2),
    sharex=False,
    sharey=False,
)
plt.show()

In [None]:
from pandas.plotting import scatter_matrix

# Matrix of pairwise scatter plots for the Dur, TotPkts and TotBytes columns
scatter_matrix(df.loc[:, ["Dur", "TotPkts", "TotBytes"]])
plt.show()