Attempt to forecast the frequency of each IP pair using the Facebook Prophet model, adjusted with a hierarchical or "grouped" structure. We will treat each pair as a "group", but allow all of the groups to inform each other.


Import data and transform it into correct time-series format.

For our purposes, to get a count (frequency) of each pair, we need a timestep over which to group the data. For simplicity we will use a 1-hour timestep.

In [1]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import BytesIO
from mlxtend.preprocessing import TransactionEncoder
import random
from fbprophet import Prophet
import htsprophet
from htsprophet.hts import hts, orderHier, makeWeekly
from htsprophet.htsPlot import plotNode, plotChild, plotNodeComponents

In [2]:
# Load the first week of flow records from S3 into a DataFrame.
# boto3 resolves the credentials itself; nothing is hard-coded here.
client = boto3.client('s3')
obj = client.get_object(Bucket='manifolddata', Key='week1.csv')

# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The truncated warning in this cell's recorded
# output looks like pandas' mixed-type DtypeWarning, which chunked inference
# produces when a column mixes plain numbers with suffixed strings.
df = pd.read_csv(BytesIO(obj['Body'].read()), low_memory=False)


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Keep only the columns we need and give them readable names.
# .copy() makes df an independent frame, so the column assignments below do
# not write into a slice of the raw frame (avoids SettingWithCopyWarning).
df = df.iloc[:, [0, 1, 3, 4, 5, 6, 7, 8]].copy()
df.columns = ['Date', 'Duration', 'Src_IP', 'Src_pt', 'Dst_IP', 'Dst_pt', 'Packets', 'Bytes']

# Parse the timestamps (values that do not match the format become NaT) and
# add a column rounded to the nearest hour; that column is the timestep used
# to count how often each IP pair occurs.
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d %H:%M:%S.%f", errors='coerce')
df['date_hr'] = df['Date'].dt.round("H")

# Build a pair column from the source and destination IPs, sorted so that the
# direction of the call does not matter: (a, b) and (b, a) are the same pair.
df['pairs'] = list(zip(df.Src_IP, df.Dst_IP))
df['pairs'] = df['pairs'].apply(sorted)

# groupby needs hashable keys, so convert each sorted list to a tuple row by
# row. (tuple(df['pairs']) would build ONE tuple holding every list and leave
# the per-row conversion to pandas' internal coercion of the assigned value.)
df['pairs2'] = df['pairs'].apply(tuple)


In [4]:
def convert_si_to_number(x):
    """Convert a count such as ``'512'``, ``'  1.2 M'`` or ``'3k'`` to an int.

    Recognised suffixes are ``k`` (thousand), ``M`` (million) and ``B``
    (billion), checked in that order. Surrounding whitespace is ignored.

    Parameters
    ----------
    x : str
        Textual number, optionally carrying one of the suffixes above.
        Non-string values are converted with ``str()`` first.

    Returns
    -------
    int
        The value scaled by its suffix and truncated toward zero. A suffix
        with no number attached (e.g. ``'k'``) yields 0.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed or is not finite (e.g. ``'nan'``).
    """
    # Imported locally so this cell does not depend on the import cell.
    from decimal import Decimal, InvalidOperation

    text = str(x).strip()
    multiplier = 1
    for suffix, factor in (('k', 10 ** 3), ('M', 10 ** 6), ('B', 10 ** 9)):
        if suffix in text:
            text = text.replace(suffix, '').strip()
            if not text:
                # Bare suffix with no digits: treated as zero for every suffix.
                return 0
            multiplier = factor
            break

    # Decimal keeps the scaling exact. With binary floats, 1.005 * 1000 is
    # 1004.9999999999999 and int() would drop a whole unit.
    try:
        magnitude = Decimal(text)
    except InvalidOperation:
        raise ValueError("cannot parse %r as a number" % (x,))
    if not magnitude.is_finite():
        raise ValueError("cannot convert non-finite value %r to an integer" % (x,))

    return int(magnitude * multiplier)

In [5]:
# Normalise the Bytes column to plain integers: render every entry as text,
# then let convert_si_to_number expand any suffix shorthand.
# Debug aid -- positions of the entries that carry an 'M' suffix:
#   [i for i, s in enumerate(df['Bytes'].astype('str')) if 'M' in s]
test_list = [convert_si_to_number(raw) for raw in df['Bytes'].astype('str')]
df['Bytes'] = test_list  # write the converted values back into the frame

In [19]:
# Count, for every hourly timestep, how many records each IP pair produced.
hr_group = (
    df.groupby(['date_hr', 'pairs2'], as_index=False)['Date']
      .count()
      .rename(columns={'Date': 'frequency'})
)
# The hts function needs the group label as a string, not a tuple.
hr_group['pairs2'] = hr_group['pairs2'].astype(str)
hr_group.head()  # the pairs2 column is our 'group'

Unnamed: 0,date_hr,pairs2,frequency
0,2017-08-02,"('10000_34', '192.168.210.54')",6
1,2017-08-02,"('10001_213', '192.168.210.54')",53
2,2017-08-02,"('10002_162', '192.168.210.57')",10
3,2017-08-02,"('10002_174', '192.168.210.54')",20
4,2017-08-02,"('10002_174', '192.168.210.57')",30


In [35]:
# Number of distinct IP pairs (groups) across all hourly timesteps.
hr_group['pairs2'].nunique()

44751

In [20]:
# Work on the first 2000 rows only (rows of the grouped table, not 2000 pairs).
hr_group2 = hr_group.iloc[:2000]

In [37]:
# Tell the hierarchical Prophet wrapper which column is our grouping variable.
# orderHier returns the reshaped data plus `nodes`, the list of lists that
# describes the hierarchy.
# NOTE(review): the argument 1 is presumably the position of the pairs2 column
# in hr_group2 -- confirm against the htsprophet documentation.
data2, nodes = orderHier(hr_group2, 1)

In [53]:
# Number of columns in the reshaped frame.
data2.shape[1]

1217

In [38]:
# Inspect the hierarchy description returned by orderHier.
# NOTE(review): the execution counts in this notebook are out of order, so the
# value displayed here may come from a different run than the column count
# shown for data2 -- re-run top to bottom to confirm.
nodes

[[120]]

In [54]:
# Vector of 1215 ones.
# NOTE(review): 1215 is presumably the number of pair columns in data2 (its
# column count minus the time and Total columns) -- confirm.
node1 = np.full(1215, 1)
# Sanity check: the entries add up to the vector length.
sum(node1)

1215

In [None]:
# NOTE(review): this cell has no execution count and none of the names passed
# as argument values (growth, changepoints1, n_changepoints1, holidays, ...)
# are defined in any cell shown, so running it as-is raises NameError.
# It is presumably a reference copy of how Prophet gets constructed, kept
# while investigating the `holidays` argument -- TODO confirm, then define the
# values or move this into a markdown cell.
Prophet(growth=growth, changepoints=changepoints1, n_changepoints=n_changepoints1, yearly_seasonality=yearly_seasonality, weekly_seasonality=weekly_seasonality, holidays=holidays, seasonality_prior_scale=seasonality_prior_scale, \
                            holidays_prior_scale=holidays_prior_scale, changepoint_prior_scale=changepoint_prior_scale, mcmc_samples=mcmc_samples, interval_width=interval_width, uncertainty_samples=uncertainty_samples)

In [61]:
# Run the hierarchical time-series model on the hourly pair counts.
# Earlier variant, kept for reference:
#myDict = hts(data2, 52, nodes, holidays = holidays, method = "FP", transform = "BoxCox")
# NOTE(review): the recorded output of this cell is
#   ValueError: holidays must be a DataFrame with 'ds' and 'holiday' columns.
# so the call below did not complete as written -- TODO resolve before relying
# on myDict.
# NOTE(review): nodes is hard-coded to [[1215]] instead of using the `nodes`
# value returned by orderHier -- confirm the two agree.
myDict = hts(data2,1, nodes=[[1215]], freq='H', holidays = None) 


# No method argument is passed, so the default method applies:
# "OLS" - optimal combination by Ordinary Least Squares (Default)

ValueError: holidays must be a DataFrame with 'ds' and 'holiday' columns.

In [15]:
#%% Random demo data (change this to whatever data you want)
# A dedicated, seeded generator makes the cell reproducible on every run and
# leaves NumPy's global random state untouched.
rng = np.random.RandomState(42)

# Ten rows per calendar day over the sample period.
date = pd.date_range("2015-04-02", "2017-07-17")
date = np.repeat(date, 10)
n_rows = len(date)

# Category levels for the three grouping columns.
medium = ["Air", "Land", "Sea"]
businessMarket = ["Birmingham","Auburn","Evanston"]
platform = ["Stone Tablet","Car Phone"]

# One random level per row for each grouping column.
mediumDat = rng.choice(medium, n_rows)
busDat = rng.choice(businessMarket, n_rows)
platDat = rng.choice(platform, n_rows)

# Session counts in [1000, 10000). Drawn as a 1-D array so it maps directly
# onto a single DataFrame column.
sessions = rng.randint(1000, 10000, size=n_rows)

# Assemble the frame; the column order matters because orderHier is later
# called with positional column numbers.
data = pd.DataFrame(date, columns = ["day"])
data["medium"] = mediumDat
data["platform"] = platDat
data["businessMarket"] = busDat
data["sessions"] = sessions

In [16]:
# Preview the first rows of the synthetic demo data.
data.head()

Unnamed: 0,day,medium,platform,businessMarket,sessions
0,2015-04-02,Land,Stone Tablet,Evanston,4542
1,2015-04-02,Air,Stone Tablet,Auburn,4460
2,2015-04-02,Sea,Car Phone,Evanston,6009
3,2015-04-02,Sea,Stone Tablet,Birmingham,5908
4,2015-04-02,Air,Stone Tablet,Evanston,1472


In [28]:
# NOTE(review): makeWeekly presumably aggregates the data to weekly steps, but
# data1 is not referenced again in the cells shown; orderHier below is given
# the original `data` -- confirm which input was intended.
data1 = makeWeekly(data)
##
# Put the data in the format to run HTS, and get the nodes input (a list of lists that describes the hierarchical structure)
# NOTE(review): 1, 2, 3 are presumably the positions of the medium, platform
# and businessMarket columns of `data` -- confirm against the htsprophet docs.
##
data3, nodes3 = orderHier(data, 1, 2, 3)

In [34]:
# Inspect the hierarchy description returned by orderHier for the demo data.
nodes3

[[3], [2, 2, 2], [3, 3, 3, 3, 3, 3]]

In [30]:
# Preview the reshaped demo frame returned by orderHier.
data3.head()

Unnamed: 0,time,Total,Land,Air,Sea,Land_Stone Tablet,Land_Car Phone,Air_Stone Tablet,Air_Car Phone,Sea_Stone Tablet,...,Air_Stone Tablet_Birmingham,Air_Car Phone_Evanston,Air_Car Phone_Auburn,Air_Car Phone_Birmingham,Sea_Stone Tablet_Evanston,Sea_Stone Tablet_Auburn,Sea_Stone Tablet_Birmingham,Sea_Car Phone_Evanston,Sea_Car Phone_Auburn,Sea_Car Phone_Birmingham
0,2015-04-02,51722,10065.0,12861.0,28796.0,4542.0,5523.0,5932.0,6929.0,5908.0,...,1.0,1.0,6929.0,1.0,1.0,1.0,5908.0,14210.0,8678.0,1.0
1,2015-04-03,63599,20020.0,23203.0,20376.0,7754.0,12266.0,8507.0,14696.0,8228.0,...,1.0,8560.0,1.0,6136.0,8228.0,1.0,1.0,1.0,7586.0,4562.0
2,2015-04-04,59830,19390.0,39071.0,1369.0,17043.0,2347.0,24928.0,14143.0,1369.0,...,10377.0,1.0,5865.0,8278.0,1369.0,1.0,1.0,1.0,1.0,1.0
3,2015-04-05,35307,12694.0,8772.0,13841.0,4160.0,8534.0,5319.0,3453.0,12075.0,...,2184.0,1.0,1.0,3453.0,1.0,12075.0,1.0,1766.0,1.0,1.0
4,2015-04-06,43560,20756.0,10114.0,12690.0,20756.0,1.0,10114.0,1.0,1.0,...,2107.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,12690.0,1.0
