## Setup

In [1]:
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go


In [2]:
df = pd.read_csv('../data/computer_security.csv')

In [3]:
df.head()

Unnamed: 0,date,l_ipn,r_asn,f
0,2006-07-01,0,701,1
1,2006-07-01,0,714,1
2,2006-07-01,0,1239,1
3,2006-07-01,0,1680,1
4,2006-07-01,0,2514,1


## Assigning outliers using IQR method 

In [4]:
def iqr_outliers(obs):
    """
    Method determines if observation is an outlier or not 
    """
    q1, q3 = np.percentile(obs, [25, 75])
    iqr = q3 - q1
    upper = q3 + (iqr * 1.5)
    lower = q1 - (iqr * 1.5)
    
    return [1 if (i > upper) or (i < lower) else 0 for i in obs]
    

In [5]:
def row_outliers(vals):
    res = iqr_outliers(vals)
    return res

In [6]:
df['outlier'] = np.nan

for n in range(10):
    num = n
    mask = (df['l_ipn'] == num)
    df_valid = df[mask]
    df.loc[mask, 'outlier'] = row_outliers(df_valid['f'].values)
    

In [7]:
df.head()

Unnamed: 0,date,l_ipn,r_asn,f,outlier
0,2006-07-01,0,701,1,0.0
1,2006-07-01,0,714,1,0.0
2,2006-07-01,0,1239,1,0.0
3,2006-07-01,0,1680,1,0.0
4,2006-07-01,0,2514,1,0.0


## Preparing data 

In [8]:
# working on the assumption that compromised ips would have increased number of outlier r_asn's over time
df_agg = df.groupby(['date', 'l_ipn'])[['date','f','outlier']].sum().reset_index()

In [9]:
df_agg.head()

Unnamed: 0,date,l_ipn,f,outlier
0,2006-07-01,0,106,3.0
1,2006-07-01,1,640,3.0
2,2006-07-01,2,1677,4.0
3,2006-07-01,3,22,2.0
4,2006-07-01,4,184,1.0


In [10]:
df_agg['out_f_prop'] = df_agg['outlier'] / df_agg['f']

In [11]:
df_agg.head()

Unnamed: 0,date,l_ipn,f,outlier,out_f_prop
0,2006-07-01,0,106,3.0,0.028302
1,2006-07-01,1,640,3.0,0.004687
2,2006-07-01,2,1677,4.0,0.002385
3,2006-07-01,3,22,2.0,0.090909
4,2006-07-01,4,184,1.0,0.005435


In [12]:
df_agg['out_total_out_prop'] = df_agg['outlier'] / df_agg['f']

In [13]:
df_agg2 = df_agg.copy()

In [14]:
df_agg2 = df_agg2.groupby('date')['outlier'].sum().reset_index()
df_agg2.rename(columns={'outlier':'daily_total_outliers'}, inplace=True)

In [15]:
df_agg_total = df_agg.copy()

In [16]:
df_agg_total = df_agg_total.merge(df_agg2, on="date")

In [17]:
df_agg_total.head()

Unnamed: 0,date,l_ipn,f,outlier,out_f_prop,out_total_out_prop,daily_total_outliers
0,2006-07-01,0,106,3.0,0.028302,0.028302,33.0
1,2006-07-01,1,640,3.0,0.004687,0.004687,33.0
2,2006-07-01,2,1677,4.0,0.002385,0.002385,33.0
3,2006-07-01,3,22,2.0,0.090909,0.090909,33.0
4,2006-07-01,4,184,1.0,0.005435,0.005435,33.0


In [18]:
df_agg_total['out_total_out_prop'] = df_agg_total['outlier']/df_agg_total['daily_total_outliers']

In [19]:
df_agg_total.head()

Unnamed: 0,date,l_ipn,f,outlier,out_f_prop,out_total_out_prop,daily_total_outliers
0,2006-07-01,0,106,3.0,0.028302,0.090909,33.0
1,2006-07-01,1,640,3.0,0.004687,0.090909,33.0
2,2006-07-01,2,1677,4.0,0.002385,0.121212,33.0
3,2006-07-01,3,22,2.0,0.090909,0.060606,33.0
4,2006-07-01,4,184,1.0,0.005435,0.030303,33.0


## Plot

In [20]:
## Offline mode
# from plotly.offline import init_notebook_mode, iplot
# init_notebook_mode(connected=True)

### Plotting Number of Outliers to Number of Flows per IP

In [23]:
# Create the data object
num = 0
ip_0_data = go.Scatter(x=df_agg[df_agg['l_ipn'] == num]['date'].values,
                        y=df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values,
                        line=go.scatter.Line(color='red', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/F Ratio' for x in df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values])

num = 1
ip_1_data = go.Scatter(x=df_agg[df_agg['l_ipn'] == num]['date'].values,
                        y=df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values,
                        line=go.scatter.Line(color='blue', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/F Ratio' for x in df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values])


num = 2
ip_2_data = go.Scatter(x=df_agg[df_agg['l_ipn'] == num]['date'].values,
                        y=df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values,
                        line=go.scatter.Line(color='green', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/F Ratio' for x in df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values])

num = 3
ip_3_data = go.Scatter(x=df_agg[df_agg['l_ipn'] == num]['date'].values,
                        y=df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values,
                        line=go.scatter.Line(color='purple', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/F Ratio' for x in df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values])

num = 4
ip_4_data = go.Scatter(x=df_agg[df_agg['l_ipn'] == num]['date'].values,
                        y=df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values,
                        line=go.scatter.Line(color='gray', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/F Ratio' for x in df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values])

num = 5
ip_5_data = go.Scatter(x=df_agg[df_agg['l_ipn'] == num]['date'].values,
                        y=df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values,
                        line=go.scatter.Line(color='orange', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/F Ratio' for x in df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values])

num = 6
ip_6_data = go.Scatter(x=df_agg[df_agg['l_ipn'] == num]['date'].values,
                        y=df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values,
                        line=go.scatter.Line(color='yellow', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/F Ratio' for x in df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values])

num = 7
ip_7_data = go.Scatter(x=df_agg[df_agg['l_ipn'] == num]['date'].values,
                        y=df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values,
                        line=go.scatter.Line(color='pink', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/F Ratio' for x in df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values])

num = 8
ip_8_data = go.Scatter(x=df_agg[df_agg['l_ipn'] == num]['date'].values,
                        y=df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values,
                        line=go.scatter.Line(color='turquoise', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/F Ratio' for x in df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values])

num = 9
ip_9_data = go.Scatter(x=df_agg[df_agg['l_ipn'] == num]['date'].values,
                        y=df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values,
                        line=go.scatter.Line(color='brown', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/F Ratio' for x in df_agg[df_agg['l_ipn'] == num]['out_f_prop'].values])



# # Create a layout with two yaxes
layout = go.Layout(height=600, width=1000, font=dict(size=18),
                   title='Number of Outliers to Number of Flows per IP',
                   xaxis=dict(title='Date', type='date'),
                   yaxis=dict(title='Outlier/F Ratio'),                   
                   yaxis2=dict(title='IP 0', color='red'),
                   # Add a second yaxis to the right of the plot
                   yaxis3=dict(title='IP 1', color='blue',
                                          overlaying='y',
                                          side='right'),
                   yaxis4=dict(title='IP 2', color='green',
                                          overlaying='y',
                                          side='right'),
                   yaxis5=dict(title='IP 3', color='purple',
                                          overlaying='y',
                                          side='right'),
                   yaxis6=dict(title='IP 4', color='gray',
                                          overlaying='y',
                                          side='right'),
                   yaxis7=dict(title='IP 5', color='orange',
                                          overlaying='y',
                                          side='right'),
                   yaxis8=dict(title='IP 6', color='yellow',
                                          overlaying='y',
                                          side='right'),
                   yaxis9=dict(title='IP 7', color='pink',
                                          overlaying='y',
                                          side='right'),
                   yaxis10=dict(title='IP 8', color='turquoise',
                                          overlaying='y',
                                          side='right'),
                   yaxis11=dict(title='IP 9', color='brown',
                                          overlaying='y',
                                          side='right')
                   )

fig = go.Figure(data=[ip_0_data, ip_1_data, ip_2_data, ip_3_data,
                      ip_4_data, ip_5_data, ip_6_data, ip_7_data,
                      ip_8_data, ip_9_data], layout=layout)
# iplot(fig)
py.iplot(fig, filename = "Number of Outliers to Number of Flows per IP")



Consider using IPython.display.IFrame instead



### Plotting Number of Outliers to Daily Total Outliers per IP

In [24]:
# Create the data object

# red, blue, green, purple, gray, orange, yellow, pink, turquoise, brown

num = 0
ip_0_data = go.Scatter(x=df_agg_total[df_agg_total['l_ipn'] == num]['date'].values,
                        y=df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values,
                        line=go.scatter.Line(color='red', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/Total Outliers' 
                                 for x in df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values])

num = 1
ip_1_data = go.Scatter(x=df_agg_total[df_agg_total['l_ipn'] == num]['date'].values,
                        y=df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values,
                        line=go.scatter.Line(color='blue', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/Total Outliers' 
                                 for x in df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values])


num = 2
ip_2_data = go.Scatter(x=df_agg_total[df_agg_total['l_ipn'] == num]['date'].values,
                        y=df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values,
                        line=go.scatter.Line(color='green', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/Total Outliers' 
                                 for x in df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values])

num = 3
ip_3_data = go.Scatter(x=df_agg_total[df_agg_total['l_ipn'] == num]['date'].values,
                        y=df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values,
                        line=go.scatter.Line(color='purple', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/Total Outliers' 
                                 for x in df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values])

num = 4
ip_4_data = go.Scatter(x=df_agg_total[df_agg_total['l_ipn'] == num]['date'].values,
                        y=df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values,
                        line=go.scatter.Line(color='gray', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/Total Outliers' 
                                 for x in df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values])

num = 5
ip_5_data = go.Scatter(x=df_agg_total[df_agg_total['l_ipn'] == num]['date'].values,
                        y=df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values,
                        line=go.scatter.Line(color='orange', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/Total Outliers' 
                                 for x in df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values])

num = 6
ip_6_data = go.Scatter(x=df_agg_total[df_agg_total['l_ipn'] == num]['date'].values,
                        y=df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values,
                        line=go.scatter.Line(color='yellow', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/Total Outliers' 
                                 for x in df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values])

num = 7
ip_7_data = go.Scatter(x=df_agg_total[df_agg_total['l_ipn'] == num]['date'].values,
                        y=df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values,
                        line=go.scatter.Line(color='pink', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/Total Outliers' 
                                 for x in df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values])

num = 8
ip_8_data = go.Scatter(x=df_agg_total[df_agg_total['l_ipn'] == num]['date'].values,
                        y=df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values,
                        line=go.scatter.Line(color='turquoise', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/Total Outliers' 
                                 for x in df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values])

num = 9
ip_9_data = go.Scatter(x=df_agg_total[df_agg_total['l_ipn'] == num]['date'].values,
                        y=df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values,
                        line=go.scatter.Line(color='brown', width = 0.8),
                           opacity=0.8,
                           name= f'IP {num}',
                           text=[f'IP {num}: {x:.3f} Outlier/Total Outliers' 
                                 for x in df_agg_total[df_agg_total['l_ipn'] == num]['out_total_out_prop'].values])



# # Create a layout with two yaxes
layout = go.Layout(height=600, width=1000, font=dict(size=18),
                   title='Number of Outliers to Daily Total Outliers per IP',
                   xaxis=dict(title='Date', type='date'),
                   yaxis=dict(title='Outlier/Total Outliers Ratio'),                   
                   yaxis2=dict(title='IP 0', color='red'),
                   # Add a second yaxis to the right of the plot
                   yaxis3=dict(title='IP 1', color='blue',
                                          overlaying='y',
                                          side='right'),
                   yaxis4=dict(title='IP 2', color='green',
                                          overlaying='y',
                                          side='right'),
                   yaxis5=dict(title='IP 3', color='purple',
                                          overlaying='y',
                                          side='right'),
                   yaxis6=dict(title='IP 4', color='gray',
                                          overlaying='y',
                                          side='right'),
                   yaxis7=dict(title='IP 5', color='orange',
                                          overlaying='y',
                                          side='right'),
                   yaxis8=dict(title='IP 6', color='yellow',
                                          overlaying='y',
                                          side='right'),
                   yaxis9=dict(title='IP 7', color='pink',
                                          overlaying='y',
                                          side='right'),
                   yaxis10=dict(title='IP 8', color='turquoise',
                                          overlaying='y',
                                          side='right'),
                   yaxis11=dict(title='IP 9', color='brown',
                                          overlaying='y',
                                          side='right')
                   )

fig = go.Figure(data=[ip_0_data, ip_1_data, ip_2_data, ip_3_data,
                      ip_4_data, ip_5_data, ip_6_data, ip_7_data,
                      ip_8_data, ip_9_data], layout=layout)
# iplot(fig)
py.iplot(fig, filename = "Number of Outliers to Daily Total Outliers per IP")
