In [1]:
import boto3
import pandas as pd
from io import BytesIO
import numpy as np
import matplotlib.pyplot as plt

# Open an S3 resource handle and print the name of every bucket it returns,
# one per line.
s3 = boto3.resource('s3')
all_buckets = s3.buckets.all()
for bucket in all_buckets:
    print(bucket.name)


manifolddata


In [None]:
# Fetch week1.csv from the 'manifolddata' bucket and parse it into a DataFrame.
client = boto3.client('s3')
obj = client.get_object(Bucket='manifolddata', Key='week1.csv')
# Read the whole response body into memory, then hand pandas an in-memory
# binary buffer to parse.
raw_bytes = obj['Body'].read()
df = pd.read_csv(BytesIO(raw_bytes))

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
### Data exploration
# Keep columns 0, 1 and 3-8 (positional column 2 is dropped) and give them short names.
# NOTE(review): the names assume the raw file's column order -- confirm against df.head().
# .copy() makes the subset an independent frame, so the column assignments made
# in later cells do not raise SettingWithCopyWarning.
# This cell overwrites df: re-run the load cell before re-running it, otherwise
# position 8 no longer exists.
df = df.iloc[:, [0, 1, 3, 4, 5, 6, 7, 8]].copy()
df.columns = ['Date', 'Duration', 'Src_IP', 'Src_pt', 'Dst_IP', 'Dst_pt', 'Packets', 'Bytes']

In [None]:
# Add a date column rounded to the nearest hour. It is used as the timestep for
# counting how frequently IP pairs occur.
# errors='coerce' turns any timestamp that does not match the format into NaT
# instead of raising.
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d %H:%M:%S.%f", errors='coerce')
# Lowercase "h" is the hour alias; the uppercase "H" spelling is deprecated
# from pandas 2.2 onwards.
df['date_hr'] = df['Date'].dt.round("h")

In [None]:
# Check the parsed Date column and the new date_hr column.
df.head()

In [None]:
# Create a pair column: the source and destination IP of each row, sorted.
# It does not matter which side came first; we only want to know which pair
# occurs most frequently, so (a, b) and (b, a) must map to the same key.

df['pairs'] = list(zip(df.Src_IP, df.Dst_IP))
df['pairs'] = df['pairs'].apply(sorted)
# sorted() returns lists, which are unhashable and therefore unusable with
# nunique() or groupby(). Convert each row's list into a tuple.
# (tuple(df['pairs']) would wrap the whole column in a single tuple and leave
# every cell a list.)
df['pairs2'] = df['pairs'].apply(tuple)

In [None]:
# Export the processed dataframe for other use.
# to_csv is called with its defaults, so the row index is written as the first column.

df.to_csv('week1processed.csv')

In [None]:
# Inspect the frame again, now including the pairs and pairs2 columns.
df.head()

How many unique pairs of IP addresses are there in the entire dataset?

In [None]:
# Number of distinct values in the pairs2 column.
df['pairs2'].nunique()

How often does each pair occur in the entire datast?

In [None]:
# Count the rows with a non-null Date for every pairs2 value, then rename the
# two result columns positionally.
pair_groups = df.groupby('pairs2', as_index=False)
pairs_count = pair_groups['Date'].count()
pairs_count.columns = ['pairs', 'count']

In [None]:
# Histogram of how many times each IP pair occurs, using 35 bins.
fig, ax = plt.subplots()
ax.hist(pairs_count['count'], bins=35)
ax.set_ylabel('frequency')
ax.set_xlabel('Number of times IP pair is repeated')

Group the data by the hour timestep, count how many times each pair occurs in that hour

In [None]:
# Count the rows with a non-null Date for every (hour, IP pair) combination.
hour_group = df.groupby(['date_hr', 'pairs2'], as_index=False)['Date'].count()
# Use a flat list of names: a nested list ([[...]]) would turn the columns into
# a one-level MultiIndex, so hour_group['count'] would no longer be a plain Series.
hour_group.columns = ['date_hr', 'pairs', 'count']

In [None]:
# Preview the per-hour, per-pair counts.
hour_group.head()

Group the data by hour timestep, count how many unique pairs occur in that hour

In [None]:
# Step 1: one row per (hour, IP pair) combination present in the data.
hour_group2 = (
    df
    .groupby(['date_hr', 'pairs2'], as_index=False)['Date']
    .count()
)
# Step 2: each pair appears once per hour above, so counting rows per hour
# gives the number of distinct pairs seen in that hour.
hour_group_distinct_pairs = (
    hour_group2
    .groupby(['date_hr'], as_index=False)['pairs2']
    .count()
)


In [None]:
# Preview the number of distinct pairs per hour.
hour_group_distinct_pairs.head()

In [None]:
# Line plot of the number of distinct IP pairs seen in each hourly timestep.
fig, ax = plt.subplots()
ax.plot(hour_group_distinct_pairs['date_hr'], hour_group_distinct_pairs['pairs2'])
ax.set_ylabel('Number of Distinct IP pairs')


In [None]:
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter

# Distinct IP pairs per hour, with the weekend marked on the time axis.
fig, ax = plt.subplots()
ax.plot(hour_group_distinct_pairs['date_hr'], hour_group_distinct_pairs['pairs2'])
ax.set_ylabel('Number of Distinct IP pairs')

# Show tick labels as month-day.
myFmt = DateFormatter("%m-%d")
ax.xaxis.set_major_formatter(myFmt)

# Anchor the annotation with a real datetime rather than the string '2017-8-6':
# how a string coordinate is interpreted depends on the active date converter,
# and that non-ISO spelling is not guaranteed to parse.
# The y values (100 for the arrow tip, 500 for the label) are in data coordinates.
weekend = datetime(2017, 8, 6)
ax.annotate('Weekend', xy=(weekend, 100), xytext=(weekend, 500),
            arrowprops=dict(facecolor='black', shrink=0.05),
            )

## Rotate date labels automatically
fig.autofmt_xdate()
plt.show()

How many rows are in 1 day of data?

In [None]:
df