Attempt to forecast the frequency of each IP pair using the Facebook Prophet model, adapted to a hierarchical or "grouped" structure. We will treat each pair as a "group", but allow them all to inform each other.


Import data and transform it into correct time-series format.

For our purposes, to get a count or frequency of each pair, we need some timestep to group these data on. For simplicity we will use a 1-hr timestep.

In [21]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import BytesIO
from mlxtend.preprocessing import TransactionEncoder
import random
from fbprophet import Prophet
import htsprophet
from htsprophet.hts import hts, orderHier, makeWeekly
from htsprophet.htsPlot import plotNode, plotChild, plotNodeComponents

In [2]:
#load data
# Fetch the object 'week1.csv' from the 'manifolddata' S3 bucket, read the whole
# response body into memory, and parse those bytes as CSV into `df`.

client = boto3.client('s3')
obj = client.get_object(Bucket='manifolddata', Key='week1.csv')
# obj['Body'].read() pulls the full payload at once; BytesIO wraps the bytes so
# read_csv can treat them as a file-like object.
df = pd.read_csv(BytesIO(obj['Body'].read()))


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Keep the eight columns used below (positions 0, 1, 3-8) and give them readable names.
df = df.iloc[:, [0, 1, 3, 4, 5, 6, 7, 8]]
df.columns = ['Date', 'Duration', 'Src_IP', 'Src_pt', 'Dst_IP', 'Dst_pt', 'Packets', 'Bytes']

# Parse the timestamp (unparseable values become NaT) and add a copy rounded to
# the nearest hour; that rounded value is the timestep used to count how often
# each IP pair occurs.
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d %H:%M:%S.%f", errors='coerce')
df['date_hr'] = df['Date'].dt.round("H")

# Build an order-independent key for each (source, destination) IP pair.
# It does not matter which side initiated the flow; we only want to know which
# pair occurs most frequently, so the two addresses are sorted.
df['pairs'] = list(zip(df.Src_IP, df.Dst_IP))
df['pairs'] = df['pairs'].apply(sorted)   # sorted() returns a list per row

# Convert each row's sorted list into a tuple so it is hashable and can be used
# as a groupby key. (tuple(df['pairs']) would instead wrap the *whole column* in
# a single tuple and leave the conversion of the inner lists to pandas.)
df['pairs2'] = df['pairs'].apply(tuple)


In [4]:
def convert_si_to_number(x):
    """Convert a number written with an optional magnitude suffix to an int.

    Recognised suffixes are 'k' (thousand), 'M' (million) and 'B' (billion),
    checked in that order. Surrounding whitespace around the numeric part is
    tolerated (e.g. ``'  1.2 M'``).

    Parameters
    ----------
    x : str
        Text such as ``'512'``, ``'3.4k'`` or ``'1.2 M'``.

    Returns
    -------
    int
        The scaled value, rounded to the nearest integer. A string consisting
        of only a suffix character (e.g. ``'k'``) yields 0.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed.
    """
    multipliers = (('k', 1000), ('M', 1000000), ('B', 1000000000))

    for suffix, factor in multipliers:
        if suffix in x:
            if len(x) <= 1:
                # A bare suffix carries no magnitude; treat it as zero for
                # every suffix (previously 'B' alone raised instead).
                return 0
            scaled = float(x.replace(suffix, '')) * factor
            # Round rather than truncate: binary floating point can land just
            # below the intended integer (1.001 * 1000 -> 1000.9999999999999).
            return int(round(scaled))

    return int(x)  # no suffix: plain integer below 1000

In [5]:
# Bytes mixes plain integers with suffixed strings, so normalise the column to
# text first and then parse every entry back into an integer.
df.Bytes = df.Bytes.astype('str')

# Debug aid: list the positions of entries carrying an 'M' suffix.
#[i for i, s in enumerate(df.Bytes) if 'M' in s]

test_list = list(map(convert_si_to_number, df.Bytes))
df.Bytes = test_list  # write the parsed integers back into the dataframe

In [26]:
# Count, for every hourly timestep, how many flows each IP pair produced.
# The count lands in the 'Date' column of the result.
hr_group = df.groupby(
    ['date_hr', 'pairs2'],
    as_index=False,
)['Date'].count()

# The hts function needs the group label as a string rather than a tuple.
hr_group['pairs2'] = hr_group['pairs2'].astype(str)

hr_group.head()  # 'pairs2' is the grouping column

Unnamed: 0,date_hr,pairs2,Date
0,2017-08-02,"('10000_34', '192.168.210.54')",6
1,2017-08-02,"('10001_213', '192.168.210.54')",53
2,2017-08-02,"('10002_162', '192.168.210.57')",10
3,2017-08-02,"('10002_174', '192.168.210.54')",20
4,2017-08-02,"('10002_174', '192.168.210.57')",30


In [27]:
# Print a summary of the hourly pair-count frame: row count, column dtypes and memory use.
hr_group.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158239 entries, 0 to 158238
Data columns (total 3 columns):
date_hr    158239 non-null datetime64[ns]
pairs2     158239 non-null object
Date       158239 non-null int64
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 4.8+ MB


In [24]:
# Print a summary of the full flow-level frame: row count, column dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8185992 entries, 0 to 8185991
Data columns (total 11 columns):
Date        datetime64[ns]
Duration    float64
Src_IP      object
Src_pt      int64
Dst_IP      object
Dst_pt      float64
Packets     int64
Bytes       int64
date_hr     datetime64[ns]
pairs       object
pairs2      object
dtypes: datetime64[ns](2), float64(2), int64(3), object(4)
memory usage: 687.0+ MB


In [None]:
#tell prophet what is our grouping variable
# hr_group's columns are (date_hr, pairs2, Date), so position 1 is 'pairs2'.
# NOTE(review): presumably the second argument is the column index of the
# grouping variable and the call returns the reshaped data plus the hierarchy's
# node structure - confirm against the htsprophet documentation.
data2, nodes = orderHier(hr_group, 1)

In [None]:
#run time series model

# Pass `nodes` by keyword: the previous form hts(data2, freq='H', nodes) is a
# SyntaxError because a positional argument may not follow a keyword argument.
# freq='H' matches the 1-hour timestep the data was grouped on.
myDict = hts(data2, nodes=nodes, freq='H')
#the default method
#OLS" - optimal combination by Ordinary Least Squares (Default), 