In [253]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import pylab
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
import re
import random
from keras.models import Sequential
from keras.layers import Dense
import itertools

%matplotlib inline

pd.set_option('display.max_columns', None)

In [269]:
## Read Data, Normalize Columns ##


def read_clean(data_loc):
    """Load a CSV and normalise its column names.

    Column names have spaces removed and are lower-cased; a column called
    '__time' (after normalisation) is renamed to 'time'.

    Parameters
    ----------
    data_loc : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame with the cleaned column names.
    """
    frame = pd.read_csv(data_loc)
    frame.columns = [name.replace(' ', '').lower() for name in frame.columns]
    return frame.rename(columns={'__time': 'time'})
    
# Parse the raw flow export once; `data_meta` starts as an independent deep
# copy so later in-place edits to one frame never leak into the other.
data = read_clean(r'C:\Users\mattd\Documents\embeddings\raw_flow_data/jan01.csv')
data_meta = data.copy()

In [266]:
data.columns

Index(['time', 'apphttphost', 'appid', 'appname', 'applicationtimemax',
       'avcdelayapplicationsum', 'averageapplicationdelay',
       'averageclientnetworkdelay', 'averagenetworkdelay',
       'averageresponsedelay', 'averageservernetworkdelay',
       'averagetxnduration', 'biflowdirection', 'businessrelevance',
       'clientbytes', 'clientip', 'clientnetworkdelay', 'clientnetworktimemax',
       'clientpackets', 'clienttoservernetworkdelay',
       'clienttoserverresponsedelay', 'count', 'destip', 'destport', 'egrintf',
       'ingrintf', 'newconnectionscount', 'parentaccountid', 'protocol',
       'retransmittedclientpackets', 'serverbytes', 'serverip',
       'servernetworkdelay', 'servernetworktimemax', 'serverresponsecount',
       'sourceip', 'sourceport', 'trafficclass', 'txncount', 'txndurationsum',
       'vendor', 'vhid', 'vlid'],
      dtype='object')

In [267]:
data_meta.columns

Index(['time', 'apphttphost', 'appid', 'appname', 'applicationtimemax',
       'avcdelayapplicationsum', 'averageapplicationdelay',
       'averageclientnetworkdelay', 'averagenetworkdelay',
       'averageresponsedelay', 'averageservernetworkdelay',
       'averagetxnduration', 'biflowdirection', 'businessrelevance',
       'clientbytes', 'clientip', 'clientnetworkdelay', 'clientnetworktimemax',
       'clientpackets', 'clienttoservernetworkdelay',
       'clienttoserverresponsedelay', 'count', 'destip', 'destport', 'egrintf',
       'ingrintf', 'newconnectionscount', 'parentaccountid', 'protocol',
       'retransmittedclientpackets', 'serverbytes', 'serverip',
       'servernetworkdelay', 'servernetworktimemax', 'serverresponsecount',
       'sourceip', 'sourceport', 'trafficclass', 'txncount', 'txndurationsum',
       'vendor', 'vhid', 'vlid'],
      dtype='object')

In [268]:
## Slice Data

def slice_data(data, pkt_cnt_cols=None, min_packets=50):
    """Keep only rows whose combined packet count exceeds ``min_packets``.

    Parameters
    ----------
    data : pandas.DataFrame containing the packet-count columns.
    pkt_cnt_cols : list of str, optional
        Columns summed per row. Defaults to the server and client packet
        counters. (The previous default listed the server counter twice,
        which double counted it and ignored client packets.)
    min_packets : number, optional
        Rows are kept when their packet total is strictly greater than this.

    Returns
    -------
    pandas.DataFrame with the qualifying rows; the original index is kept.
    """
    if pkt_cnt_cols is None:
        pkt_cnt_cols = ['connserverpacketscnt', 'connclientpacketscnt']
    total_packets = data[pkt_cnt_cols].sum(axis=1)
    return data[total_packets > min_packets]

#data = slice_data(data)
#data_meta = slice_data(data_meta)

In [240]:

def inspect_cols(data):
    """Print the frame's row/column counts and how many columns have each dtype."""
    n_rows, n_cols = len(data), len(data.columns)
    print("## Data Has {0} Rows And {1} Columns ##".format(n_rows, n_cols))
    print('## Counts Of Column Dtypes ##')
    dtype_counts = data.dtypes.value_counts()
    print(dtype_counts)
    
# Print the shape/dtype summary for both frames, separated by blank lines.
inspect_cols(data)
print('\n')
inspect_cols(data_meta)

## Data Has 150010 Rows And 43 Columns ##
## Counts Of Column Dtypes ##
int64     28
object    15
dtype: int64


## Data Has 150010 Rows And 43 Columns ##
## Counts Of Column Dtypes ##
int64     28
object    15
dtype: int64


In [270]:
## Drop Columns With A Single Unique Value And Other Columns As Needed ##

def drop_cols(data,cols_to_drop):
    """Drop, in place, every column that is constant or explicitly listed.

    A column is removed when it has exactly one unique value (NaN counts as a
    value) or when its name appears in ``cols_to_drop``. Names in
    ``cols_to_drop`` that are not columns of ``data`` are ignored.

    Returns the same (mutated) DataFrame object that was passed in.
    """
    doomed = [
        name for name in data.columns
        if len(data[name].unique()) == 1 or name in cols_to_drop
    ]
    data.drop(doomed, inplace=True, axis=1)
    return data
        
# Columns removed by name, in addition to any single-valued columns.
# drop_cols only tests membership, so names missing from the frame are harmless.
cols_to_drop = ['ipttl', 'apphttphost', 'appid', 'txncount', 'txndurationsum', 'serverresponsecount', 'servernetworktimemax'
               , 'servernetworkdelay', 'count', 'clientpackets', 'destip', 'clientnetworktimemax']
# The same drop list is applied to both frames.
data = drop_cols(data, cols_to_drop)
data_meta = drop_cols(data_meta, cols_to_drop)

In [271]:
## Correct DateTime

# Hour of day (0-23, as produced by `.dt.hour`) -> coarse time-of-day label.
# NOTE(review): the buckets look shifted (e.g. 8-15 -> 'am'), presumably to
# compensate for timestamps recorded in another timezone -- confirm.
hour_dict = {
    0: 'afternoon', 1: 'afternoon',
    2: 'evening', 3: 'evening', 4: 'evening', 5: 'evening', 6: 'evening', 7: 'evening',
    8: 'am', 9: 'am', 10: 'am', 11: 'am', 12: 'am', 13: 'am', 14: 'am', 15: 'am',
    16: 'morn', 17: 'morn', 18: 'morn', 19: 'morn', 20: 'morn',
    21: 'afternoon', 22: 'afternoon', 23: 'afternoon',
}
# `.dt.weekday` value (Monday == 0) -> day name.
day_dict = {0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4: 'friday', 5: 'saturday', 6: 'sunday'}


def to_datetime(data,time_col):
    """Replace a timestamp column with categorical hour/day labels.

    Parameters
    ----------
    data : pandas.DataFrame holding the timestamp column.
    time_col : str
        Name of the column to parse with ``pd.to_datetime``.

    Returns
    -------
    A new DataFrame without ``time_col`` and with two appended columns:
    'time_hour' (label from ``hour_dict``) and 'time_day' (label from
    ``day_dict``). The input frame is left unmodified.
    """
    timestamps = pd.to_datetime(data[time_col])
    # `drop` returns a new frame, so the assignments below never touch the
    # caller's object. The keyword form is used because pandas 2.x no longer
    # accepts the positional `axis` argument.
    result = data.drop(columns=[time_col])
    result['time_hour'] = timestamps.dt.hour.map(hour_dict)
    result['time_day'] = timestamps.dt.weekday.map(day_dict)
    return result
        
# 'time' is the name read_clean gives the original '__time' column.
time_col = 'time'
data = to_datetime(data, time_col)
data_meta = to_datetime(data_meta, time_col)

In [248]:
data.head()

Unnamed: 0,appname,avcdelayapplicationsum,averageapplicationdelay,averageclientnetworkdelay,averagenetworkdelay,averageresponsedelay,averageservernetworkdelay,averagetxnduration,biflowdirection,businessrelevance,clientbytes,clientip,clientnetworkdelay,clienttoservernetworkdelay,clienttoserverresponsedelay,egrintf,ingrintf,newconnectionscount,retransmittedclientpackets,serverbytes,serverip,sourceip,trafficclass,time_hour,time_day
0,cisco_ms-rpc,110,6,0,245,250,244,249,1,y,6808,10.16.15.124,4,1960,3959,2,1,8,0,6544,10.193.25.23,10.16.15.124,transactional-data,afternoon,tuesday
1,cisco_ms-rpc,467,14,0,58,72,58,77,1,y,14692,10.16.15.124,5,879,2457,2,1,15,4,5124,10.32.112.145,10.16.15.124,transactional-data,afternoon,tuesday
2,cisco_ms-rpc,1125,9,0,37,46,37,339,1,y,28377,10.16.15.124,1,187,7932,2,1,5,1,33225,10.32.112.64,10.16.15.124,transactional-data,afternoon,tuesday
3,cisco_ms-rpc,34,1,0,80,80,79,80,1,y,8895,10.16.15.124,1,160,2255,2,1,2,0,4626,10.45.62.222,10.16.15.124,transactional-data,afternoon,tuesday
4,cisco_ms-rpc,230,10,0,248,258,248,321,1,y,16561,10.16.15.124,1,2239,5824,2,1,9,0,7819,172.21.128.31,10.16.15.124,transactional-data,afternoon,tuesday


In [257]:
data.shape

(150010, 25)

In [259]:
data_meta.shape

(150010, 25)

In [89]:
def inspect_col_labels(data):
    """Compute per-column summary figures (reporting is currently disabled).

    For object columns the number of unique labels is computed; for every
    other column the min, max, mean and median are computed and truncated to
    int. Nothing is printed or returned.
    """
    for name in data.columns:
        column = data[name]
        if column.dtypes == 'object':
            label_count = len(column.unique())
            # print("Categorical Feature '{0}' has {1} unique labels".format(name, label_count))
            continue
        low = int(column.min())
        high = int(column.max())
        average = int(column.mean())
        middle = int(column.median())
        # print("Numerical Feature '{0}' has min:{1} max:{2} mean:{3} median:{4}".format(
        #     name, low, high, average, middle))
            
# Only `data` is inspected; the data_meta call is disabled.
inspect_col_labels(data)
#inspect_col_labels(data_meta)

In [40]:
## Calculate AVC Stats, Map Per-Server Info Back To Each Flow
def avc_stats(data):
    """Attach per-server average delay statistics to every flow row.

    Flows are grouped by 'connipv4responderaddr' and the relevant counters
    are summed per server. Three ratios of those sums are then mapped back
    onto each row, in place, as new columns:

    * 'avc_csd_srv_avg' = conntoclientnetwdelaysum / conncountnew
    * 'avc_ssd_srv_avg' = conntoservernetwdelaysum / conncountnew
    * 'avc_ad_srv_avg'  = connappdelaysum / connserverrespcnt

    Returns the same (mutated) DataFrame.

    The previous version aggregated a different set of column names than the
    ones it subsequently read, so it raised KeyError; the aggregated columns
    are now derived from the ratio definitions.
    """
    server_col = 'connipv4responderaddr'
    # (output column, numerator sum, denominator sum)
    ratios = [
        ('avc_csd_srv_avg', 'conntoclientnetwdelaysum', 'conncountnew'),
        ('avc_ssd_srv_avg', 'conntoservernetwdelaysum', 'conncountnew'),
        ('avc_ad_srv_avg', 'connappdelaysum', 'connserverrespcnt'),
    ]
    sum_cols = sorted({col for _, num, den in ratios for col in (num, den)})
    avc_df = data.pivot_table(sum_cols, index=server_col, aggfunc='sum')

    for out_col, num, den in ratios:
        # 0/0 yields NaN and is reported as 0. NOTE: a non-zero numerator over
        # a zero denominator yields inf, which fillna leaves untouched.
        per_server = (avc_df[num] / avc_df[den]).fillna(0)
        data[out_col] = data[server_col].map(per_server.to_dict())
    return data

#data = avc_stats(data)
#data_meta = avc_stats(data_meta)

In [153]:
data.columns

Index(['appname', 'avcdelayapplicationsum', 'averageapplicationdelay',
       'averageclientnetworkdelay', 'averagenetworkdelay',
       'averageresponsedelay', 'averageservernetworkdelay',
       'averagetxnduration', 'biflowdirection', 'businessrelevance',
       'clientbytes', 'clientip', 'clientnetworkdelay',
       'clienttoservernetworkdelay', 'clienttoserverresponsedelay', 'egrintf',
       'ingrintf', 'newconnectionscount', 'retransmittedclientpackets',
       'serverbytes', 'serverip', 'sourceip', 'trafficclass', 'time_col_hour',
       'time_col_day'],
      dtype='object')

In [41]:
## Map Per-Server Data Back To Each Flow


def map_sums(data):
    """Attach per-server totals of selected counters to every flow row.

    For each column in ``map_sums_cols`` the values are summed per
    'connipv4responderaddr' and written back, in place, as
    '<column>_for_srv'. Returns the same (mutated) DataFrame.
    """
    map_sums_cols = ['connhistogramlate', 'connclientretriescnt']
    server_col = 'connipv4responderaddr'
    # The aggregation does not depend on the loop variable, so build it once
    # instead of re-pivoting the whole frame for every column.
    per_server = data.pivot_table(map_sums_cols, index=server_col, aggfunc='sum')
    for col in map_sums_cols:
        data[col+'_for_srv'] = data[server_col].map(per_server[col].to_dict())
    return data

#data = map_sums(data)
#data_meta = map_sums(data_meta)

In [42]:
data_meta.head()

Unnamed: 0,connipv4initiatoraddr,connipv4responderaddr,connresponderport,appname,ipv4srcaddr,ipv4dstaddr,intfinput,intfoutput,conninitiator,conncountnew,connsumdur,connresponderoctets,connserverpacketscnt,conninitiatoroctets,connclientpacketscnt,connservernetworkbytescnt,connclientnetworkbytescnt,httphost,ipdscp,conntoserverrespdelaysum,connserverrespcnt,connhistogramlate,conntoservernetwdelaysum,conntoservernetwdelaymax,conntoclientnetwdelaysum,conntoclientnetwdelaymax,connclientretriescnt,connclientservernetwdelaysum,connappdelaysum,connappdelaymax,connclientserverrespdelaysum,conntotaltransactiondurationsum,conntotaltransactiondurationmin,conntotaltransactiondurationmax,conntransactioncnt,avc_csd_srv_avg,avc_ssd_srv_avg,avc_ad_srv_avg,connhistogramlate_for_srv,connclientretriescnt_for_srv
6,10.19.167.16,10.32.167.111,5308.0,layer7 ssl,10.32.167.111,10.19.167.16,Gi0/0/1,Gi0/0/0,Reverse initiator,8.0,42.0,13274.0,165.0,21743.0,210.0,21918.0,32739.0,,0x00,4209.0,82.0,0.0,404.0,67.0,4.0,1.0,1.0,408.0,316.0,20.0,4271.0,7430.0,41.0,479.0,82.0,0.397059,51.397059,3.488487,0.0,5.0
16,10.16.15.3,10.200.250.16,60979.0,layer7 unknown,10.200.250.16,10.16.15.3,Gi0/0/1,Gi0/0/0,Reverse initiator,0.0,15.0,1521.0,38.0,49100.0,46.0,3041.0,50940.0,,0x00,6545.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6545.0,578.0,6545.0,7801.0,533.0,1256.0,13.0,0.0,0.0,535.741935,1.0,0.0
82,10.33.194.45,10.19.165.13,445.0,layer7 cifs,10.33.194.45,10.19.165.13,Gi0/0/1,Gi0/0/0,Initiator,0.0,15.0,7433.0,50.0,6488.0,62.0,9433.0,9004.0,,0x00,14.0,31.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,14.0,1.0,2959.0,901.0,0.0,888.0,30.0,212.0,0.0,0.262136,0.0,8.0
83,10.33.195.190,10.19.112.169,3389.0,port ms-wbt,10.33.195.190,10.19.112.169,Gi0/0/1,Gi0/0/0,Initiator,0.0,15.0,952.0,52.0,6010.0,72.0,3032.0,8890.0,,0x22,57.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.0,28.0,375.0,7455.0,394.0,6577.0,3.0,0.0,0.0,6.625,0.0,0.0
84,10.0.4.201,10.19.4.201,7800.0,NBAR riverbed_appliance,10.0.4.201,10.19.4.201,Gi0/0/1,Gi0/0/0,Initiator,3.0,47.0,9324.0,57.0,6461.0,50.0,12296.0,9069.0,,0x00,4.0,14.0,0.0,0.0,0.0,88.0,88.0,1.0,88.0,4.0,1.0,180.0,5319.0,1.0,4627.0,13.0,38.709459,0.108108,1.362468,16.0,10.0


In [272]:
## Map Categorical Variables Whose Labels Occur Less Than 5% Of The Time To Single "Rare" Label ##
## Map Numerical Variables Considered Categories Whose Labels Occur Less Than 5% Of The Time To Single Arbitrary Number ##

# Numeric columns to be treated as categories; the loop that consumes this
# list further down in this cell currently has its rare_labeling call disabled.
special_numerical_vars = ['connresponderport']

def rare_labeling(data, variable, rare_threshold, rare_label):
    """Collapse infrequent labels of one column into a single label, in place.

    Parameters
    ----------
    data : pandas.DataFrame, modified in place.
    variable : str
        Column whose labels are examined.
    rare_threshold : float
        A label is rare when its share of all rows (rows with a missing
        value count towards the total) is strictly below this fraction.
    rare_label : scalar
        Replacement written over every rare label.

    Returns None.
    """
    # Plain `float` replaces `np.float`, which NumPy removed in 1.24.
    label_share = data.groupby([variable])[variable].count() / float(len(data))
    rare_cat = list(label_share[label_share < rare_threshold].index)
    data[variable] = np.where(data[variable].isin(rare_cat), rare_label, data[variable])
    
    #labels_dict = {k:i for i, k in enumerate(data[variable].unique(), 0)}
    #data.loc[:, variable] = data.loc[:, variable].map(labels_dict)

# The object-typed column names are taken from `data` and applied to both frames.
# NOTE(review): the threshold used here is 0.02 (2%), while this cell's header
# comment says 5% -- confirm which one is intended.
for variable in data.select_dtypes(include=['object']).columns:
    rare_labeling(data, variable, 0.02, 'Rare')
    rare_labeling(data_meta, variable, 0.02, 'Rare')
    
# Currently a no-op: the numeric rare-labeling call is commented out.
for variable in special_numerical_vars:
    #rare_labeling(data, variable, 0.05, 65535)
    pass

In [261]:
data.head()

Unnamed: 0,appname,avcdelayapplicationsum,averageapplicationdelay,averageclientnetworkdelay,averagenetworkdelay,averageresponsedelay,averageservernetworkdelay,averagetxnduration,biflowdirection,businessrelevance,clientbytes,clientip,clientnetworkdelay,clienttoservernetworkdelay,clienttoserverresponsedelay,egrintf,ingrintf,newconnectionscount,retransmittedclientpackets,serverbytes,serverip,sourceip,trafficclass,time_hour,time_day
0,cisco_ms-rpc,110,6,0,245,250,244,249,1,y,6808,10.16.15.124,4,1960,3959,2,1,8,0,6544,10.193.25.23,10.16.15.124,transactional-data,afternoon,tuesday
1,cisco_ms-rpc,467,14,0,58,72,58,77,1,y,14692,10.16.15.124,5,879,2457,2,1,15,4,5124,10.32.112.145,10.16.15.124,transactional-data,afternoon,tuesday
2,cisco_ms-rpc,1125,9,0,37,46,37,339,1,y,28377,10.16.15.124,1,187,7932,2,1,5,1,33225,10.32.112.64,10.16.15.124,transactional-data,afternoon,tuesday
3,cisco_ms-rpc,34,1,0,80,80,79,80,1,y,8895,10.16.15.124,1,160,2255,2,1,2,0,4626,10.45.62.222,10.16.15.124,transactional-data,afternoon,tuesday
4,cisco_ms-rpc,230,10,0,248,258,248,321,1,y,16561,10.16.15.124,1,2239,5824,2,1,9,0,7819,172.21.128.31,10.16.15.124,transactional-data,afternoon,tuesday


In [158]:
## Show Numbers Of Outliers ##

def show_outliers(data, outlier_cols):
    """Count extreme values per column (reporting is currently disabled).

    For each column, fences are placed 10 interquartile ranges below the
    25th percentile and above the 75th percentile, and the values strictly
    outside them are counted. Nothing is printed or returned.
    """
    for name in outlier_cols:
        series = data[name]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        margin = (q3 - q1) * 10.0
        lower_fence = q1 - margin
        upper_fence = q3 + margin
        upper_outliers = len(series[series > upper_fence])
        lower_outliers = len(series[series < lower_fence])
        #print("Feature '{0}' has {1} upper outliers".format(col, upper_outliers))
        #print("Feature '{0}' has {1} lower outliers\n".format(col, lower_outliers))
    
    
# Outlier inspection covers the float64/int64/int32/float32 columns of `data` only.
outlier_cols = data.select_dtypes(include=['float64', 'int64', 'int32', 'float32']).columns
show_outliers(data, outlier_cols)
#show_outliers(data_meta, outlier_cols)

In [46]:
data_meta.head()

Unnamed: 0,connipv4initiatoraddr,connipv4responderaddr,connresponderport,appname,ipv4srcaddr,ipv4dstaddr,intfinput,intfoutput,conninitiator,conncountnew,connsumdur,connresponderoctets,connserverpacketscnt,conninitiatoroctets,connclientpacketscnt,connservernetworkbytescnt,connclientnetworkbytescnt,httphost,ipdscp,conntoserverrespdelaysum,connserverrespcnt,connhistogramlate,conntoservernetwdelaysum,conntoservernetwdelaymax,conntoclientnetwdelaysum,conntoclientnetwdelaymax,connclientretriescnt,connclientservernetwdelaysum,connappdelaysum,connappdelaymax,connclientserverrespdelaysum,conntotaltransactiondurationsum,conntotaltransactiondurationmin,conntotaltransactiondurationmax,conntransactioncnt,avc_csd_srv_avg,avc_ssd_srv_avg,avc_ad_srv_avg,connhistogramlate_for_srv,connclientretriescnt_for_srv
6,Rare,10.32.167.111,5308.0,layer7 ssl,10.32.167.111,Rare,Gi0/0/1,Gi0/0/0,Reverse initiator,8.0,42.0,13274.0,165.0,21743.0,210.0,21918.0,32739.0,,0x00,4209.0,82.0,0.0,404.0,67.0,4.0,1.0,1.0,408.0,316.0,20.0,4271.0,7430.0,41.0,479.0,82.0,0.397059,51.397059,3.488487,0.0,5.0
16,Rare,Rare,65535.0,layer7 unknown,Rare,Rare,Gi0/0/1,Gi0/0/0,Reverse initiator,0.0,15.0,1521.0,38.0,49100.0,46.0,3041.0,50940.0,,0x00,6545.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6545.0,578.0,6545.0,7801.0,533.0,1256.0,13.0,0.0,0.0,535.741935,1.0,0.0
82,Rare,10.19.165.13,445.0,layer7 cifs,Rare,10.19.165.13,Gi0/0/1,Gi0/0/0,Initiator,0.0,15.0,7433.0,50.0,6488.0,62.0,9433.0,9004.0,,0x00,14.0,31.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,14.0,1.0,2959.0,901.0,0.0,888.0,30.0,212.0,0.0,0.262136,0.0,8.0
83,Rare,Rare,65535.0,Rare,Rare,Rare,Gi0/0/1,Gi0/0/0,Initiator,0.0,15.0,952.0,52.0,6010.0,72.0,3032.0,8890.0,,Rare,57.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.0,28.0,375.0,7455.0,394.0,6577.0,3.0,0.0,0.0,6.625,0.0,0.0
84,Rare,10.19.4.201,7800.0,NBAR riverbed_appliance,Rare,10.19.4.201,Gi0/0/1,Gi0/0/0,Initiator,3.0,47.0,9324.0,57.0,6461.0,50.0,12296.0,9069.0,,0x00,4.0,14.0,0.0,0.0,0.0,88.0,88.0,1.0,88.0,4.0,1.0,180.0,5319.0,1.0,4627.0,13.0,38.709459,0.108108,1.362468,16.0,10.0


In [47]:
## Optional ##
## Replace Outliers With Mean  ##

def replace_outliers(data, lower_quant, upper_quant, iqr_val, outlier_cols, outlier_exclude_cols):
    """Overwrite outliers with the column mean, in place.

    Parameters
    ----------
    data : pandas.DataFrame, modified in place.
    lower_quant, upper_quant : float
        Quantiles (0-1) that bound the inter-quantile range.
    iqr_val : float
        Fence distance as a multiple of the inter-quantile range.
    outlier_cols : iterable of str
        Candidate columns.
    outlier_exclude_cols : container of str
        Columns in ``outlier_cols`` that must be left untouched.

    Returns
    -------
    The same (mutated) DataFrame.

    A value is an outlier only when it lies strictly beyond a fence. The
    earlier inclusive comparison overwrote every value of a column whose
    inter-quantile range is 0 (both fences collapse onto the quantile).
    The mean is computed before replacement, so it includes the outliers.
    """
    for col in outlier_cols:
        if col in outlier_exclude_cols:
            continue
        lower_q = data[col].quantile(lower_quant)
        upper_q = data[col].quantile(upper_quant)
        margin = (upper_q - lower_q) * iqr_val
        lower_fence = lower_q - margin
        upper_fence = upper_q + margin
        is_outlier = (data[col] > upper_fence) | (data[col] < lower_fence)
        data[col] = np.where(is_outlier, data[col].mean(), data[col])
    return data

        
# Candidate columns for outlier replacement, plus the ones to leave untouched.
outlier_cols = data.select_dtypes(include=['float64', 'int64', 'int32', 'float32']).columns
outlier_exclude_cols = ['connresponderport']

# Replacement is optional and currently disabled for both frames.
#data = replace_outliers(data, 0.25, 0.75, 3.0, outlier_cols, outlier_exclude_cols)
#data_meta = replace_outliers(data_meta, 0.25, 0.75, 3.0, outlier_cols, outlier_exclude_cols)

In [48]:
data_meta.head()

Unnamed: 0,connipv4initiatoraddr,connipv4responderaddr,connresponderport,appname,ipv4srcaddr,ipv4dstaddr,intfinput,intfoutput,conninitiator,conncountnew,connsumdur,connresponderoctets,connserverpacketscnt,conninitiatoroctets,connclientpacketscnt,connservernetworkbytescnt,connclientnetworkbytescnt,httphost,ipdscp,conntoserverrespdelaysum,connserverrespcnt,connhistogramlate,conntoservernetwdelaysum,conntoservernetwdelaymax,conntoclientnetwdelaysum,conntoclientnetwdelaymax,connclientretriescnt,connclientservernetwdelaysum,connappdelaysum,connappdelaymax,connclientserverrespdelaysum,conntotaltransactiondurationsum,conntotaltransactiondurationmin,conntotaltransactiondurationmax,conntransactioncnt,avc_csd_srv_avg,avc_ssd_srv_avg,avc_ad_srv_avg,connhistogramlate_for_srv,connclientretriescnt_for_srv
6,Rare,10.32.167.111,5308.0,layer7 ssl,10.32.167.111,Rare,Gi0/0/1,Gi0/0/0,Reverse initiator,8.0,42.0,13274.0,165.0,21743.0,210.0,21918.0,32739.0,,0x00,4209.0,82.0,0.0,404.0,67.0,4.0,1.0,1.0,408.0,316.0,20.0,4271.0,7430.0,41.0,479.0,82.0,0.397059,51.397059,3.488487,0.0,5.0
16,Rare,Rare,65535.0,layer7 unknown,Rare,Rare,Gi0/0/1,Gi0/0/0,Reverse initiator,0.0,15.0,1521.0,38.0,49100.0,46.0,3041.0,50940.0,,0x00,6545.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6545.0,578.0,6545.0,7801.0,533.0,1256.0,13.0,0.0,0.0,535.741935,1.0,0.0
82,Rare,10.19.165.13,445.0,layer7 cifs,Rare,10.19.165.13,Gi0/0/1,Gi0/0/0,Initiator,0.0,15.0,7433.0,50.0,6488.0,62.0,9433.0,9004.0,,0x00,14.0,31.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,14.0,1.0,2959.0,901.0,0.0,888.0,30.0,212.0,0.0,0.262136,0.0,8.0
83,Rare,Rare,65535.0,Rare,Rare,Rare,Gi0/0/1,Gi0/0/0,Initiator,0.0,15.0,952.0,52.0,6010.0,72.0,3032.0,8890.0,,Rare,57.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.0,28.0,375.0,7455.0,394.0,6577.0,3.0,0.0,0.0,6.625,0.0,0.0
84,Rare,10.19.4.201,7800.0,NBAR riverbed_appliance,Rare,10.19.4.201,Gi0/0/1,Gi0/0/0,Initiator,3.0,47.0,9324.0,57.0,6461.0,50.0,12296.0,9069.0,,0x00,4.0,14.0,0.0,0.0,0.0,88.0,88.0,1.0,88.0,4.0,1.0,180.0,5319.0,1.0,4627.0,13.0,38.709459,0.108108,1.362468,16.0,10.0


In [49]:
## Visualize Dist Plots For Various Normalization  Methods ##

# for col in data.select_dtypes(include=['float64', 'int64', 'int32', 'float32']).columns:
#     scaler = RobustScaler(copy=True, quantile_range=(25, 75), with_centering=True,with_scaling=True)
#     data1 = data[col]
#     data2 = pd.DataFrame(scaler.fit_transform(data[[col]]))

#     data3 = data[col] = np.log(data[col].replace([np.inf, -np.inf], np.nan).fillna(.000001).replace(0, .000001))

#     data4 = (data[col] - data[col].mean())/data[col].std()

#     data5 = (data[col]-data[col].min())/(data[col].max()-data[col].min())

#     data1.reset_index(drop=True, inplace=True)
#     data2.reset_index(drop=True, inplace=True)
#     data3.reset_index(drop=True, inplace=True)
#     data4.reset_index(drop=True, inplace=True)
#     data5.reset_index(drop=True, inplace=True)


#     df = pd.concat([data1, data2, data3, data4, data5],axis=1, ignore_index=True)
#     df.columns = ['original', 'robust_scaler', 'basic_log', 'z_scale', 'min_max']
#     fig,ax=plt.subplots(2, 2, figsize=(16,8))

#     sns.distplot(df['original'], ax=ax[0, 0], axlabel='{0}_{1}'.format('original', col))
#     sns.distplot(df['basic_log'], ax=ax[1, 0], axlabel='{0}_{1}'.format('basic_log', col))
#     sns.distplot(df['robust_scaler'], ax=ax[1, 1], axlabel='{0}_{1}'.format('robust_scaler', col))
#     sns.distplot(df['z_scale'], ax=ax[0, 1], axlabel='{0}_{1}'.format('z_scale', col))
#     fig.show()

In [262]:
data.shape

(150010, 25)

In [263]:
data_meta.shape

(150010, 25)

In [273]:
# Report how many distinct labels each object column of `data` currently holds.
for col in data.select_dtypes(include=['object']).columns:
    print("Feature '{0}' has {1} unique labels".format(col, len(data[col].unique())))

Feature 'appname' has 6 unique labels
Feature 'businessrelevance' has 3 unique labels
Feature 'clientip' has 4 unique labels
Feature 'serverip' has 6 unique labels
Feature 'sourceip' has 5 unique labels
Feature 'trafficclass' has 4 unique labels
Feature 'time_hour' has 4 unique labels
Feature 'time_day' has 1 unique labels


In [274]:
## Normalize Data

def normalize_data(data):
    """Log-transform every float64/int64/int32/float32 column, in place.

    Before taking the natural log, +/-inf, NaN and exact zeros are replaced
    by 1e-6 so they map to a finite value. Other columns are left untouched.
    Returns the same (mutated) DataFrame.
    """
    numeric_cols = data.select_dtypes(include=['float64', 'int64', 'int32', 'float32']).columns
    for name in numeric_cols:
        # inf, -inf and 0 become the sentinel directly; remaining NaNs follow.
        cleaned = data[name].replace([np.inf, -np.inf, 0], .000001).fillna(.000001)
        data[name] = np.log(cleaned)
    return data
        
# Only `data` is normalised; data_meta is not passed through normalize_data here.
data = normalize_data(data)

In [160]:
data.head()

Unnamed: 0,appname,avcdelayapplicationsum,averageapplicationdelay,averageclientnetworkdelay,averagenetworkdelay,averageresponsedelay,averageservernetworkdelay,averagetxnduration,biflowdirection,businessrelevance,clientbytes,clientip,clientnetworkdelay,clienttoservernetworkdelay,clienttoserverresponsedelay,egrintf,ingrintf,newconnectionscount,retransmittedclientpackets,serverbytes,serverip,sourceip,trafficclass,time_col_hour,time_col_day
0,cisco_ms-rpc,4.70048,1.791759,-13.815511,5.501258,5.521461,5.497168,5.517453,0.0,y,8.825854,10.16.15.124,1.386294,7.5807,8.283747,0.693147,0.0,2.079442,-13.815511,8.786304,Rare,10.16.15.124,transactional-data,-13.815511,0.0
1,cisco_ms-rpc,6.146329,2.639057,-13.815511,4.060443,4.276666,4.060443,4.343805,0.0,y,9.595058,10.16.15.124,1.609438,6.778785,7.806696,0.693147,0.0,2.70805,1.386294,8.541691,Rare,10.16.15.124,transactional-data,-13.815511,0.0
2,cisco_ms-rpc,7.025538,2.197225,-13.815511,3.610918,3.828641,3.610918,5.826,0.0,y,10.253334,10.16.15.124,0.0,5.231109,8.97866,0.693147,0.0,1.609438,0.0,10.411058,Rare,10.16.15.124,transactional-data,-13.815511,0.0
3,cisco_ms-rpc,3.526361,0.0,-13.815511,4.382027,4.382027,4.369448,4.382027,0.0,y,9.093245,10.16.15.124,0.0,5.075174,7.720905,0.693147,0.0,0.693147,-13.815511,8.439448,Rare,10.16.15.124,transactional-data,-13.815511,0.0
4,cisco_ms-rpc,5.438079,2.302585,-13.815511,5.513429,5.55296,5.513429,5.771441,0.0,y,9.714806,10.16.15.124,0.0,7.713785,8.669743,0.693147,0.0,2.197225,-13.815511,8.964312,Rare,10.16.15.124,transactional-data,-13.815511,0.0


Feature 'appname' has 6 unique labels
Feature 'businessrelevance' has 3 unique labels
Feature 'clientip' has 4 unique labels
Feature 'serverip' has 6 unique labels
Feature 'sourceip' has 5 unique labels
Feature 'trafficclass' has 4 unique labels


In [275]:
data.shape

(150010, 25)

In [276]:
data_meta.shape

(150010, 25)

In [280]:
## Create Dummy Variables For Remaining Categorical Labels ##

def create_dummies(data):
    """One-hot encode every object-typed column.

    Each object column is replaced by indicator columns named
    '<column>_<label>' (missing values get no indicator). The name of each
    encoded column is printed as it is processed.

    Returns a new DataFrame: the non-object columns in their original order,
    followed by the indicator columns in the order the source columns
    appeared. The input frame is not modified.
    """
    object_cols = list(data.select_dtypes(include=['object']).columns)
    dummy_frames = []
    for col in object_cols:
        print(col)
        dummy_frames.append(pd.get_dummies(data[col], prefix=col, dummy_na=False))
    # Keyword `columns=` because pandas 2.x rejects the positional axis
    # argument; a single concat avoids re-copying the frame per column.
    return pd.concat([data.drop(columns=object_cols)] + dummy_frames, axis=1)
        
# Both frames are one-hot encoded, so data_meta also loses its raw label columns.
data = create_dummies(data)
data_meta = create_dummies(data_meta)

appname
businessrelevance
clientip
serverip
sourceip
trafficclass
time_hour
time_day
appname
businessrelevance
clientip
serverip
sourceip
trafficclass
time_hour
time_day


In [283]:
data.shape

(150010, 50)

In [284]:
data_meta.shape

(150010, 50)

In [32]:
## Change As-Type ##
# Cast the indicator columns to float64 so the whole frame is floating point.
# pd.get_dummies emits uint8 on older pandas and bool on pandas >= 2.0, so
# both dtypes are selected; selecting uint8 alone makes this a silent no-op
# on current pandas.
for col in data.select_dtypes(include=['uint8', 'bool']).columns:
    data[col] = data[col].astype('float64')

In [163]:
data.shape

(150010, 39)

In [281]:
# Feature matrix: tab-separated, with neither a header row nor the index column.
data.to_csv(r'C:\Users\mattd\Documents\embeddings\current_test2/tensors.csv', sep='\t', header=False, index=False)

In [282]:
# Metadata file: tab-separated, header row kept, index column omitted.
data_meta.to_csv(r'C:\Users\mattd\Documents\embeddings\current_test2/tensors_meta.csv', sep='\t', index=False)

In [285]:
data.columns

Index(['avcdelayapplicationsum', 'averageapplicationdelay',
       'averageclientnetworkdelay', 'averagenetworkdelay',
       'averageresponsedelay', 'averageservernetworkdelay',
       'averagetxnduration', 'biflowdirection', 'clientbytes',
       'clientnetworkdelay', 'clienttoservernetworkdelay',
       'clienttoserverresponsedelay', 'egrintf', 'ingrintf',
       'newconnectionscount', 'retransmittedclientpackets', 'serverbytes',
       'appname_Rare', 'appname_cisco_cifs', 'appname_cisco_ms-rpc',
       'appname_cisco_mysql', 'appname_cisco_riverbed-appliance',
       'appname_cisco_ssl', 'businessrelevance_Rare', 'businessrelevance_d',
       'businessrelevance_y', 'clientip_10.16.15.124',
       'clientip_10.19.167.210', 'clientip_10.32.113.46', 'clientip_Rare',
       'serverip_10.16.15.136', 'serverip_10.19.165.13',
       'serverip_10.19.4.201', 'serverip_10.32.167.111',
       'serverip_10.39.255.10', 'serverip_Rare', 'sourceip_10.16.15.124',
       'sourceip_10.19.167.210', 

In [297]:
# Input features for the regression: server-IP and time-of-day/day indicators.
data_cols = [ 'serverip_10.16.15.136', 'serverip_10.19.165.13',
       'serverip_10.19.4.201', 'serverip_10.32.167.111',
       'serverip_10.39.255.10', 'serverip_Rare', 'time_hour_afternoon',
       'time_hour_am', 'time_hour_evening', 'time_hour_morn',
       'time_day_tuesday']

target_col = 'averageapplicationdelay'
# Alternative: use every column except the target as input.
# data_cols = data.loc[:, data.columns != target_col].columns

# Single dense unit with no activation, trained with MSE via SGD (a linear
# regression). `units=` is the Keras 2 name for the layer width; the old
# `output_dim=` spelling triggers a legacy-API warning.
model = Sequential()
model.add(Dense(units=1, input_dim=len(data_cols)))
model.compile(loss='mse', optimizer='sgd')

  if sys.path[0] == '':


In [296]:
print('Training -----------')

# Split the frame into model inputs and the regression target.
data_less_target = data[data_cols]
data_target = data[target_col]
# Every step passes the complete frame to train_on_batch (no mini-batching);
# the returned loss is printed every 100th step.
for step in range(1000):
    cost = model.train_on_batch(data_less_target, data_target)
    if step % 100 == 0:
        print('train cost: ', cost)

Training -----------
train cost:  55.134975
train cost:  43.03091
train cost:  39.410473
train cost:  37.14876
train cost:  35.60746
train cost:  34.48488
train cost:  33.638252
train cost:  32.982666
train cost:  32.468426
train cost:  32.059734


In [298]:
len(data_cols)

11

In [61]:
# Walk every unordered pair of column names and print the pair made of
# 'connappdelaysum' and 'ipdscp_Rare' when both columns are present.
for pair in itertools.combinations(data.columns, 2):
    if 'connappdelaysum' in pair and 'ipdscp_Rare' in pair:
        print(pair)
#itertools.chain(itertools.combinations(['A','B','C'], 2), itertools.combinations(['A', 'B', 'C,'], 1))


('connappdelaysum', 'ipdscp_Rare')
