In [1]:
# %matplotlib notebook
from sklearn.preprocessing import StandardScaler,QuantileTransformer,RobustScaler,Normalizer,MaxAbsScaler,MinMaxScaler,PowerTransformer
from sklearn.decomposition import PCA, KernelPCA, FactorAnalysis
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,f_classif
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from scipy import stats as sps
from ipwhois import IPWhois
from pprint import pprint
from sklearn.compose import ColumnTransformer
from sklearn.manifold import Isomap
import time
from functools import reduce
import swifter

from pandas.plotting import scatter_matrix
import seaborn as sns; sns.set()
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import dask.dataframe as dd
import dask.array as da
from dask.diagnostics import ProgressBar
ProgressBar().register()

import socket

from sklearn.cluster import KMeans

from matplotlib.colors import ListedColormap
from netaddr import IPNetwork, IPAddress

from multiprocessing import Pool

# Custom palette used to colour-code the classes in the 2-D scatter plots:
# five colours from seaborn's 'Set1' followed by one grey (xkcd 'greyish') entry.
customer_palette = sns.color_palette('Set1', 5)
customer_palette.append(sns.xkcd_palette(['greyish'])[0])
# Discrete six-colour colormap passed to the 3-D scatter plot (pca3d_scatter_plot).
cmap = ListedColormap(['silver','crimson', 'deepskyblue','green','purple','orange'])
# NOTE(review): presumably the intended hue order of the class labels (0 last);
# the plotting helpers visible in this notebook hard-code the same list
# instead of reading this variable -- confirm before relying on it.
target_order = [1,2,3,4,5,0]


In [2]:
def transformer_bundle(X_train, is_get_instance=False, subset = False):
    """Apply a fixed set of scikit-learn scalers to ``X_train`` and run PCA on each result.

    Parameters
    ----------
    X_train : DataFrame-like
        Feature matrix; must expose ``.values`` (used for the untransformed entry).
    is_get_instance : bool
        When true, the fitted ``PCA`` objects are returned as a third element.
    subset : bool
        When false every scaler sees all columns.  When true each scaler is
        wrapped in a ``ColumnTransformer`` that applies it to ``slice(8, 56)``
        and passes the remaining columns through.

    Returns
    -------
    (X_train_trans_dict, X_train_pca_dict) or, with ``is_get_instance``,
    (X_train_trans_dict, X_train_pca_dict, instance_pca_dict).  All dicts are
    keyed by scaler name, with ``'NoTransformer'`` added last.
    """
    # (dict key, ColumnTransformer step name, scaler class) in output order.
    scaler_specs = [
        ('StandardScaler', 'standard', StandardScaler),
        ('QuantileTransformer', 'quantile', QuantileTransformer),
        ('RobustScaler', 'robust', RobustScaler),
        ('Normalizer', 'norm', Normalizer),
        ('MaxAbsScaler', 'maxabs', MaxAbsScaler),
        ('MinMaxScaler', 'minmax', MinMaxScaler),
        ('PowerTransformer', 'power', PowerTransformer),
    ]

    transformer_dict = {}
    for key, step_name, scaler_cls in scaler_specs:
        if subset:
            transformer_dict[key] = ColumnTransformer(
                [(step_name, scaler_cls(), slice(8,56))],
                remainder='passthrough')
        else:
            transformer_dict[key] = scaler_cls()

    # Scaled copies of the data, plus the raw values under 'NoTransformer'.
    X_train_trans_dict = {}
    for key, transformer in transformer_dict.items():
        X_train_trans_dict[key] = transformer.fit_transform(X_train)
    X_train_trans_dict['NoTransformer'] = X_train.values

    # PCA projections; one PCA object is refitted for each scaled matrix.
    embedding = PCA()
    X_train_pca_dict = {}
    for key in transformer_dict:
        X_train_pca_dict[key] = embedding.fit_transform(X_train_trans_dict[key])
    X_train_pca_dict['NoTransformer'] = PCA().fit_transform(X_train)

    # Independent fitted PCA instances (e.g. for explained-variance plots).
    instance_pca_dict = {}
    for key in transformer_dict:
        instance_pca_dict[key] = PCA().fit(X_train_trans_dict[key])
    instance_pca_dict['NoTransformer'] = PCA().fit(X_train)

    if is_get_instance:
        return X_train_trans_dict, X_train_pca_dict, instance_pca_dict
    return X_train_trans_dict, X_train_pca_dict
    
def pca_scatter_plot(X_pca_dict, y, highlight_list=None):
    """Draw one 2-D scatter panel (first vs. second component) per entry of ``X_pca_dict``.

    Parameters
    ----------
    X_pca_dict : dict
        Maps a transformer name to a 2-D array whose first two columns are plotted.
    y : array-like
        Class label per row; used as the hue (hue order 1,2,3,4,5,0).
    highlight_list : index-like, optional
        Rows to draw with a different marker style ('syn_oli' instead of
        'original').  No style distinction is made when omitted.

    The panels are ordered so that the last key of the dict is drawn first.
    """
    y = np.array(y)
    names = list(X_pca_dict.keys())
    n_panels = len(names)

    fig, axes = plt.subplots(nrows=n_panels, ncols=1, figsize=(12,6*n_panels), sharey='row')
    # plt.subplots returns a bare Axes rather than an array for a single panel;
    # normalise so that axes[i] works for any number of entries.
    axes = np.atleast_1d(axes)

    for i in range(n_panels):
        # (i-1) % n maps panel 0 to the last key, so the entry added last
        # (the untransformed data in transformer_bundle) comes first.
        ind = (i-1) % n_panels
        trs = names[ind]
        X = X_pca_dict[trs]

        # Optional per-point marker style to highlight selected rows.
        if highlight_list is not None:
            style_list = np.array(['original']*X.shape[0])
            style_list[highlight_list] = 'syn_oli'
        else:
            style_list = None

        this_ax = axes[i]
        # plot the first 2 components
        sns.scatterplot(data=None, x=X[:,0], y=X[:,1],
                        hue=y, hue_order=[1,2,3,4,5,0],
                        style=style_list,
                        alpha=0.6, palette=customer_palette, ax=this_ax)

        # set axis labels and title
        this_ax.set_xlabel('First component')
        this_ax.set_ylabel('Second component')
        this_ax.set_title(trs+'\nno labels', loc='left', fontsize='large')
        this_ax.legend(loc=2)
    plt.tight_layout(pad=0.5, )
    plt.show()
    
def pca3d_scatter_plot(X_pca_dict, y, highlight_list=None):
    """Draw one 3-D scatter panel (first three components) per entry of ``X_pca_dict``.

    Parameters
    ----------
    X_pca_dict : dict
        Maps a transformer name to a 2-D array with at least three columns.
    y : array-like
        Class label per row; mapped to colours through the module-level ``cmap``.
    highlight_list : optional
        Accepted for signature parity with ``pca_scatter_plot``; not used here.

    The panels are ordered so that the last key of the dict is drawn first.
    """
    y = np.array(y)
    names = list(X_pca_dict.keys())
    n_panels = len(names)

    fig, axes = plt.subplots(subplot_kw=dict(projection='3d'), nrows=n_panels, ncols=1, figsize=(12,6*n_panels), sharey='row')
    # A single panel comes back as a bare Axes; make indexing uniform.
    axes = np.atleast_1d(axes)

    for i in range(n_panels):
        # (i-1) % n maps panel 0 to the last key of the dict.
        ind = (i-1) % n_panels
        trs = names[ind]
        X = X_pca_dict[trs]

        this_ax = axes[i]
        # plot the first 3 components
        this_ax.scatter(X[:,0], X[:,1], X[:,2],
                        alpha=0.6, c=y, edgecolor='w',cmap=cmap)

        # set axis labels and title
        this_ax.set_xlabel('First component')
        this_ax.set_ylabel('Second component')
        this_ax.set_zlabel('Third component')
        # Hide tick labels through the public axis attributes; the older
        # w_xaxis / w_yaxis / w_zaxis aliases are deprecated and absent from
        # recent Matplotlib releases.
        this_ax.xaxis.set_ticklabels([])
        this_ax.yaxis.set_ticklabels([])
        this_ax.zaxis.set_ticklabels([])
        this_ax.set_title(trs+'\nno labels', loc='left', fontsize='large')
    plt.tight_layout(pad=0.5, )
    plt.show()

def pca_plot_cumsum(instance_pca_dict):
    """Plot the cumulative explained-variance ratio of every fitted PCA.

    ``instance_pca_dict`` maps a transformer name to a fitted object exposing
    ``explained_variance_ratio_``.  One line is drawn per entry, starting at 0
    for zero components; the last key of the dict is listed first.
    """
    names = list(instance_pca_dict.keys())
    # Rotate so the last entry leads the legend.
    ordered_names = names[-1:] + names[:-1]

    cumsum_list = []
    for name in ordered_names:
        ratios = instance_pca_dict[name].explained_variance_ratio_
        # Prepend 0 so every curve starts at the origin.
        cumsum_list.append(np.concatenate([[0], ratios.cumsum()]))

    # One column per transformer, one row per number of components.
    df_cumsum = pd.DataFrame(np.array(cumsum_list).T)
    df_cumsum.columns = ordered_names

    fig, ax = plt.subplots(1,1,figsize=(12,6), sharex=True, sharey=True)
    df_cumsum.plot(ax=ax, color=sns.color_palette('Accent'), kind='line', marker='o')

    ax.grid()
    ax.legend(loc='lower right')
    ax.set_xlabel('PCA components')
    ax.set_ylabel('explained_variance_cumsum')
    ax.set_xlim([0,len(cumsum_list[0])-0.5])
    ax.set_ylim([0,1.05])
    plt.show()
    
def pca_cluster_contour_plot(X_pca_dict, labels, kmeans):
    """Cluster the first two PCA components of each entry and plot them against the true labels.

    Parameters
    ----------
    X_pca_dict : dict
        Maps a transformer name to a 2-D array of PCA scores.
    labels : object with a ``target_original`` attribute
        Ground-truth labels, concatenated column-wise with the scores.
    kmeans : estimator exposing ``fit_predict``
        Refitted on the first two components of every entry.

    For each panel the cluster ids 0-2 are replaced by the most frequent true
    label in that cluster, a density contour is drawn per relabelled class,
    mismatching points are marked, and the mismatch count goes in the title.
    """
    n_panels = len(X_pca_dict.keys())
    fig, axes = plt.subplots(nrows=n_panels, ncols=1,figsize=(8*1,6*n_panels))

    for panel in range(n_panels):
        # the modulo makes the last key of the dict come first
        ind = (panel-1)%len(X_pca_dict)
        trs = list(X_pca_dict.keys())[ind]
        X = X_pca_dict[trs]

        pred = kmeans.fit_predict(X[:,:2])

        # relabel clusters with the most frequent ground-truth label they contain
        df_tmp = pd.concat([pd.DataFrame(X), labels.target_original, pd.DataFrame({'pred':pred})],axis = 1)
        cluster_modes = [df_tmp[df_tmp.pred==cluster].target_original.mode().values
                         for cluster in range(3)]
        df_tmp['pred'] = np.choose(pred, cluster_modes).astype(np.int64)

        ax = axes[panel]
        # density contour per relabelled class
        for cls in range(3):
            in_class = df_tmp.pred==cls
            sns.kdeplot(data=df_tmp.iloc[:,0][in_class], data2=df_tmp.iloc[:,1][in_class],
                        shade=True,
                        color=sns.color_palette('Set1', desat=0.5)[cls],
                        shade_lowest=False,
                        label='pred_'+str(cls),
                        alpha=0.5,
                        ax=ax)

        # rows whose relabelled prediction disagrees with the ground truth
        mismatched = df_tmp[df_tmp.target_original!=df_tmp.pred]
        sns.scatterplot(x=mismatched.iloc[:,0],
                        y=mismatched.iloc[:,1],
                        color = 'orange',
                        marker= 'X',
                        s=150,
                        ax=ax, label='mismatch')

        # all points coloured by ground truth
        sns.scatterplot(data=df_tmp, x=df_tmp.columns[0], y=df_tmp.columns[1], hue='target_original', palette='Set1',ax=ax,
                       )

        ax.set_title(trs+'\nmismatch: {} (out of {})'.format(len(mismatched), len(df_tmp)),
                     loc='left', fontsize='x-large')
        ax.set_xlabel("1st PCA component")
        ax.set_ylabel("2nd PCA component")
        ax.legend(bbox_to_anchor=(1.01, 1.0), loc='upper left')
    plt.tight_layout()
    plt.show()


In [3]:
# MULTIMEDIA
def camera_subnets():
    """Return the camera subnets as a list of ``IPNetwork`` objects.

    Each base address is paired with a number of host bits; the CIDR prefix
    length is ``32 - host_bits``.  Several bases are written with only three
    octets (e.g. ``172.20.18``).  They are padded with ``.0`` here so the
    result does not depend on the address parser expanding abbreviated
    addresses implicitly.
    """
    bases = ['172.20.6.0','172.20.18','172.20.22','172.20.70','172.20.78','172.20.94','172.20.97.96','172.20.125.128','172.20.126','172.20.137','172.20.142','172.20.152.128','172.20.153','172.20.156','172.20.157','172.20.161','172.20.163.32','172.20.166.0','172.22.24.64','172.22.192.64','172.20.7.128','172.22.155.64']
    host_bits = [7,8,8,8,8,8,5,7,8,8,8,4,8,8,8,8,5,8,5,6,7,5]

    subnets = []
    for base, bits in zip(bases, host_bits):
        octets = base.split('.')
        # Pad abbreviated bases to a full dotted quad.
        octets += ['0'] * (4 - len(octets))
        subnets.append(IPNetwork('.'.join(octets) + '/' + str(32 - bits)))
    return subnets


In [4]:
def printer_subnets():
    """Return the printer subnets: a one-element list holding 128.138.72.192/27."""
    return [IPNetwork('128.138.72.192/27')]

In [5]:
def phone_subnets():
    """Return the phone subnets as a list of ``IPNetwork`` objects.

    Every base is written with three octets and paired with 8 host bits, so
    the prefix length is ``32 - 8``.  The bases are padded with ``.0`` so the
    result does not depend on the address parser expanding abbreviated
    addresses implicitly.
    """
    bases = ['10.2.15','10.2.65','10.2.254','10.4.48','10.4.80','10.4.128']
    host_bits = [8,8,8,8,8,8]

    subnets = []
    for base, bits in zip(bases, host_bits):
        octets = base.split('.')
        # Pad abbreviated bases to a full dotted quad.
        octets += ['0'] * (4 - len(octets))
        subnets.append(IPNetwork('.'.join(octets) + '/' + str(32 - bits)))
    return subnets

In [6]:
def sensor_subnets():
    """Return the sensor subnets as ``IPNetwork`` objects (prefix = 32 - host bits)."""
    bases = ['128.138.47.248','128.138.99.80','128.138.178.72','128.138.235.248']
    host_bits = [3,3,3,3]
    return [IPNetwork(base + '/' + str(32 - bits))
            for base, bits in zip(bases, host_bits)]

In [7]:
def server_subnets():
    """Return the server subnets: a one-element list built from 128.138.74.32 with 4 host bits."""
    base = '128.138.74.32'
    host_bits = 4
    network = IPNetwork(base + '/' + str(32 - host_bits))
    return [network]


In [8]:
def check_in_subnets(ipstr, subnets):
    """Return True if the address ``ipstr`` lies in any network of ``subnets``.

    ``ipstr`` is converted with ``str()`` before being parsed as an
    ``IPAddress``; ``subnets`` is an iterable of objects supporting ``in``.
    """
    ip = IPAddress(str(ipstr))
    return any(ip in subnet for subnet in subnets)

def private_subnets():
    """Return 10.0.0.0/8, 172.16.0.0/12 and 192.168.0.0/16 as ``IPNetwork`` objects.

    Unlike the other subnet helpers, the numbers here are prefix lengths,
    not host-bit counts.
    """
    blocks = (('10.0.0.0', 8), ('172.16.0.0', 12), ('192.168.0.0', 16))
    return [IPNetwork(address + '/' + str(prefix)) for address, prefix in blocks]


def cu_subnets():
    """Load subnet records from ``hosts.txt`` in the working directory.

    Each line is split on ``" | "``; the first field is parsed as an
    ``IPNetwork`` and the remaining fields are kept as strings.  The records
    are returned as a list of lists in reverse file order.
    """
    entries = []
    with open('hosts.txt') as handle:
        for raw_line in handle:
            fields = raw_line.rstrip().split(" | ")
            fields[0] = IPNetwork(fields[0])
            entries.append(fields)
    # Last line of the file first.
    return entries[::-1]

def check_in_subnets_ret_name(ipstr, subnets):
    """Return the name of the first record whose network contains ``ipstr``.

    ``subnets`` is an iterable of records where element 0 is a network and
    element 1 is the value to return.  ``False`` is returned when no record
    matches.
    """
    ip = IPAddress(str(ipstr))
    return next((entry[1] for entry in subnets if ip in entry[0]), False)

# Module-level lookup tables used by whois(): the subnet records loaded from
# hosts.txt (extended at runtime with WHOIS results) and the private ranges.
c = cu_subnets()
p = private_subnets()
def whois(x):
    """Return a descriptive name for the IP address ``x``.

    Resolution order:
    1. the name of the first matching record in ``c`` (local subnet records
       plus networks cached from earlier WHOIS lookups);
    2. ``"private"`` if the address lies in one of the ranges in ``p``;
    3. a WHOIS lookup, returning ``"<net name> - <net description>"`` of the
       first returned net and caching the ASN CIDR with its ASN description
       in ``c``;
    4. ``"n/a"`` if the lookup or the processing of its result fails.

    NOTE(review): a cached hit returns the ASN description whereas the first
    lookup returns the net name/description, so repeated queries inside the
    same ASN block can yield differently formatted names.
    """
    s = check_in_subnets_ret_name(str(x), c)
    # Identity check: only the literal False sentinel means "no match".
    if s is not False:
        return s
    if check_in_subnets(str(x), p):
        return "private"
    try:
        results = IPWhois(str(x)).lookup_whois()
        c.append([IPNetwork(results['asn_cidr']), results['asn_description']])
        return results['nets'][0]['name'] + " - " + results['nets'][0]['description']
    except Exception:
        # Best-effort lookup: network errors, missing fields or None values
        # all map to "n/a".  KeyboardInterrupt/SystemExit are no longer
        # swallowed, so a long batch of lookups can be interrupted.
        return "n/a"

In [9]:
def getmax(x):
    """Return ``x.max()``."""
    largest = x.max()
    return largest
def getmin(x):
    """Return ``x.min()``."""
    smallest = x.min()
    return smallest
def getmean(x):
    """Return the arithmetic mean of ``x`` computed with ``np.mean``."""
    average = np.mean(x)
    return average
def getvar(x):
    """Return the variance of ``x`` computed with ``np.var`` (default arguments)."""
    variance = np.var(x)
    return variance
def getskew(x):
    """Return the skewness of ``x`` computed with ``scipy.stats.skew``."""
    skewness = sps.skew(x)
    return skewness
def getkur(x):
    """Return the kurtosis of ``x`` computed with ``scipy.stats.kurtosis`` (default arguments)."""
    kurt = sps.kurtosis(x)
    return kurt
def logic(x):
    """Return False when ``x`` is a multiple of 2000 and True otherwise.

    The only reference in this notebook is a disabled ``skiprows`` callable
    for ``pd.read_csv``.
    """
    return x % 2000 != 0

def getstats(feature, include_mean=False):
    """Per-IP summary statistics of one flow feature, as source and as destination.

    The module-level frame ``df`` is grouped by ``srcaddr`` and by
    ``destaddr`` and, for each grouping, the max, min, variance (pandas
    ``var``), skewness (``scipy.stats.skew``) and kurtosis
    (``scipy.stats.kurtosis``) of ``feature`` are computed.

    This replaces two earlier definitions of the same name, the first of
    which was silently shadowed by the second; the behaviour kept here is
    that of the second (effective) one.

    Parameters
    ----------
    feature : str
        Column of ``df`` to summarise.
    include_mean : bool, default False
        The previous implementation computed the per-group mean but never
        merged it into the result.  The default keeps that column set
        (without the wasted computation); pass True to add
        ``src avg``/``dest avg`` columns after the ``min`` columns.

    Returns
    -------
    DataFrame indexed by ``ip`` with columns named
    ``<src|dest><stat><feature>``, in the order max, min, [avg,] var, skew,
    kur, with the src column before the dest column for each statistic.
    Frames are outer-merged, so an IP seen only as source (or only as
    destination) has NaN in the other direction's columns.
    """
    # (column tag, aggregation applied to the grouped feature column)
    stat_specs = [
        ('max', lambda grouped: grouped.max()),
        ('min', lambda grouped: grouped.min()),
        ('avg', lambda grouped: grouped.mean()),
        ('var', lambda grouped: grouped.var()),
        ('skew', lambda grouped: grouped.apply(sps.skew)),
        ('kur', lambda grouped: grouped.apply(sps.kurtosis)),
    ]

    frames = []
    for stat_name, aggregate in stat_specs:
        if stat_name == 'avg' and not include_mean:
            continue
        for addr_col, prefix in (('srcaddr', 'src'), ('destaddr', 'dest')):
            grouped = df.groupby([addr_col], sort=False)[feature]
            frame = aggregate(grouped).reset_index()
            frame.columns = ['ip', prefix + stat_name + feature]
            frames.append(frame.set_index('ip'))

    # 'ip' is the index name of every frame; merging on it keeps it as index.
    return reduce(lambda left, right: pd.merge(left, right, how='outer', on='ip'), frames)

In [10]:
# Disabled one-off preprocessing step, kept as a bare string literal so it is
# never executed: it read newest.csv, mapped every address to an integer id,
# sorted the flows by the source id and wrote newest2.csv.
# NOTE(review): if this is ever re-enabled, the df.drop([...]) call below has
# no axis/columns argument, so it would try to drop row labels -- confirm and
# fix before running.
'''start = time.time()
inputfile = '/data/maxim/data/newest.csv'
columns = ['packets', 'bytes', 'start','end', 'srcaddr',
            'destaddr','srcport','destport', 'protocol']

df = pd.read_csv(inputfile)#, skiprows = lambda x: logic(x))#1000000)
list1 = pd.concat([df["srcaddr"].drop_duplicates(),df['destaddr'].drop_duplicates()], axis=0).drop_duplicates()
d = dict([(y,x) for x,y in enumerate(set(list1))])
df['srchash'] = df['srcaddr'].swifter.apply(lambda x: d[x])
df['desthash'] = df['destaddr'].swifter.apply(lambda x: d[x])
df = df.sort_values('srchash')
df = df.drop(['srchash','desthash'])
df.to_csv('/data/maxim/data/newest2.csv',index=False)''';



In [17]:
# Load the first 100000 flow records into the global `df` used by getstats()
# and the feature-building cells, then save that sample for reuse.
# NOTE(review): absolute, machine-specific input path; the relative output
# directory 'newdf/' must already exist.
inputfile = '/data/maxim/data/newest2.csv'
df = pd.read_csv(inputfile, nrows = 100000)
df.to_csv('newdf/test.csv', index=False)

In [23]:
# One row per distinct address that appears as source or destination.
# NOTE(review): this cell's execution count (23) is later than the
# feature-building cell below (19); re-running it overwrites `new` with this
# single-column frame -- confirm the intended cell order.
new =  pd.concat([df["srcaddr"].drop_duplicates(),df['destaddr'].drop_duplicates()], axis=0).drop_duplicates().to_frame()
new.shape

(7303, 1)

In [19]:
# Build the per-IP feature table `new` from the flow records in `df`.
#new.drop(new.index, inplace=True)
start = time.time()
# Skeleton frame: an index of every distinct address (source or destination), no columns yet.
new =  pd.concat([df["srcaddr"].drop_duplicates(),df['destaddr'].drop_duplicates()], axis=0).drop_duplicates().to_frame()
new.columns = ['ip']
new.set_index('ip', inplace=True)
print(new.shape)

# out_req / in_req: number of flow rows in which the address is the source / destination.
src = pd.DataFrame(df.srcaddr.value_counts(sort=False).reset_index())
src.columns = ["ip", "out_req"]
src.set_index('ip', inplace=True)
dest = pd.DataFrame(df.destaddr.value_counts(sort=False).reset_index())
dest.columns = ["ip", "in_req"]
dest.set_index('ip', inplace=True)

# uniq_srcport: number of distinct source ports used by each source address.
sport = df.groupby(['srcaddr']).agg({'srcport': pd.Series.nunique}).reset_index()
sport.columns = ["ip", "uniq_srcport"]
sport.set_index('ip', inplace=True)

# uniq_destport: number of distinct destination ports seen by each destination address.
dport = df.groupby(['destaddr']).agg({'destport': pd.Series.nunique}).reset_index()
dport.columns = ["ip", "uniq_destport"]
dport.set_index('ip', inplace=True)

# pop_srcport: most frequent source port per source address (first entry of value_counts).
popsrcport = df.groupby(['srcaddr'],sort=False)['srcport'].apply(lambda x: x.value_counts().index[0]).reset_index()
popsrcport.columns = ['ip','pop_srcport']
popsrcport.set_index('ip', inplace=True)

# pop_destport: most frequent destination port per destination address.
popdestport = df.groupby(['destaddr'],sort=False)['destport'].apply(lambda x: x.value_counts().index[0]).reset_index()
popdestport.columns = ['ip','pop_destport']
popdestport.set_index('ip', inplace=True)

# Disabled dominant-protocol feature, kept as a string literal (never executed).
'''
srcprot = df.groupby(['srcaddr'])['protocol'].apply(lambda x: x.value_counts()).reset_index()
srcprot.columns = ['ip','protocol','count']
destprot = df.groupby(['destaddr'])['protocol'].apply(lambda x: x.value_counts()).reset_index()
destprot.columns = ['ip','protocol','count']
prot = srcprot.merge(destprot,on='ip', how="outer")
print('prot',prot.shape)
#prot.assign(protocol = da.where(prot.count_x >= prot.count_y, prot.protocol_x, prot.protocol_y))
prot['protocol'] = np.where(prot['count_x'] >= prot['count_y'], prot['protocol_x'], prot['protocol_y'])
prot = prot.drop(['protocol_x', 'protocol_y', 'count_x','count_y'], axis=1)
new = new.merge(prot, on="ip", how="outer")
'''


# Per-feature summary statistics from getstats(), then left-join everything
# onto the address skeleton by index.
# NOTE(review): `d` is also the name of the address->id dict in the disabled
# preprocessing cell; here it is rebound to a DataFrame.
features = ['bytes','packets','duration','rate']
frames = [new, src,dest,sport,dport,popsrcport,popdestport]
for feature in features:
    d = getstats(feature)
    frames.append(d)
new = reduce(lambda  left,right: pd.merge(left,right,
                                            how='left',left_index=True,right_index=True), frames)
print(new.shape)
# Incoming/outgoing request ratio; addresses missing either count give NaN
# and a zero out_req gives inf -- both end up as 0 after the next line.
new['ratio'] = (new['in_req']/new['out_req'])
new = new.replace([np.inf, -np.inf], np.nan).fillna(0)

print(new.shape)
end = time.time()
# Elapsed wall-clock seconds for the feature build.
print((end - start))
new

(7303, 0)
(7303, 46)
(7303, 47)
29.374951362609863


Unnamed: 0_level_0,out_req,in_req,uniq_srcport,uniq_destport,pop_srcport,pop_destport,srcmaxbytes,destmaxbytes,srcminbytes,destminbytes,...,destmaxrate,srcminrate,destminrate,srcvarrate,destvarrate,srcskewrate,destskewrate,srckurrate,destkurrate,ratio
ip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10.4.180.54,303.0,0.0,2.0,0.0,51782.0,0.0,6.987,0.000,3.951,0.000,...,0.00,34.667,0.00,221621.740116,0.0,0.140175,0.0,-1.939930,0.0,0.0
71.223.169.49,3.0,0.0,1.0,0.0,63028.0,0.0,8.940,0.000,8.806,0.000,...,0.00,0.108,0.00,0.000103,0.0,-0.347974,0.0,-1.500000,0.0,0.0
202.102.214.4,2.0,0.0,2.0,0.0,19415.0,0.0,4.369,0.000,4.234,0.000,...,0.00,69.000,0.00,50.000000,0.0,0.000000,0.0,-2.000000,0.0,0.0
96.19.116.43,15.0,0.0,1.0,0.0,40344.0,0.0,8.640,0.000,4.533,0.000,...,0.00,0.043,0.00,1530.257787,0.0,2.126060,0.0,2.952292,0.0,0.0
165.232.46.164,1.0,0.0,1.0,0.0,32767.0,0.0,3.689,0.000,3.689,0.000,...,0.00,40.000,0.00,0.000000,0.0,0.000000,0.0,-3.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52.55.158.175,0.0,1.0,0.0,1.0,0.0,443.0,0.000,3.951,0.000,3.951,...,52.00,0.000,52.00,0.000000,0.0,0.000000,0.0,0.000000,-3.0,0.0
23.48.12.74,0.0,1.0,0.0,1.0,0.0,443.0,0.000,3.689,0.000,3.689,...,40.00,0.000,40.00,0.000000,0.0,0.000000,0.0,0.000000,-3.0,0.0
23.37.116.26,0.0,1.0,0.0,1.0,0.0,443.0,0.000,3.951,0.000,3.951,...,52.00,0.000,52.00,0.000000,0.0,0.000000,0.0,0.000000,-3.0,0.0
23.48.12.75,0.0,1.0,0.0,1.0,0.0,443.0,0.000,8.408,0.000,8.408,...,48.75,0.000,48.75,0.000000,0.0,0.000000,0.0,0.000000,-3.0,0.0


In [22]:
# Sanity check: list the distinct index values of `new`; the length can be
# compared with the row count to confirm each address appears once.
new.index.drop_duplicates()

Index(['10.4.180.54', '71.223.169.49', '202.102.214.4', '96.19.116.43',
       '165.232.46.164', '128.138.163.211', '10.4.145.143', '47.74.174.54',
       '46.227.176.49', '128.138.55.24',
       ...
       '23.73.29.244', '54.204.176.31', '54.205.60.132', '23.67.242.9',
       '23.201.219.32', '52.55.158.175', '23.48.12.74', '23.37.116.26',
       '23.48.12.75', '3.218.170.188'],
      dtype='object', name='ip', length=7303)

In [20]:
# Widen the pandas display limits (global options, affect all later cells),
# then show the feature row for one specific address.
pd.options.display.min_rows = pd.options.display.max_rows
pd.options.display.max_columns = 60
new.loc[new.index=='1.1.1.1']

Unnamed: 0_level_0,out_req,in_req,uniq_srcport,uniq_destport,pop_srcport,pop_destport,srcmaxbytes,destmaxbytes,srcminbytes,destminbytes,srcvarbytes,destvarbytes,srcskewbytes,destskewbytes,srckurbytes,destkurbytes,srcmaxpackets,destmaxpackets,srcminpackets,destminpackets,srcvarpackets,destvarpackets,srcskewpackets,destskewpackets,srckurpackets,destkurpackets,srcmaxduration,destmaxduration,srcminduration,destminduration,srcvarduration,destvarduration,srcskewduration,destskewduration,srckurduration,destkurduration,srcmaxrate,destmaxrate,srcminrate,destminrate,srcvarrate,destvarrate,srcskewrate,destskewrate,srckurrate,destkurrate,ratio
ip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1


In [33]:

# Re-merge the count/port frames with an outer join instead of a left join;
# the getstats() frames are left out (that loop is disabled in the string below).
# NOTE(review): `new` is itself the first frame, so if it already carries
# out_req/in_req/... from the feature-building cell the merge produces
# suffixed duplicate columns -- confirm this exploratory cell is still wanted.
frames = [new, src,dest,sport,dport,popsrcport,popdestport]
'''for feature in features:
    d = getstats(feature)
    frames.append(d)'''

new = reduce(lambda  left,right: pd.merge(left,right,
                                            how='outer',left_index=True,right_index=True), frames)

In [37]:
# Show the raw flow records whose destination is 1.1.1.1.
df.loc[df.destaddr=='1.1.1.1']

Unnamed: 0,packets,bytes,srcaddr,destaddr,srcport,destport,protocol,duration,rate
8390,1.0,4.094,128.138.185.179,1.1.1.1,47575,33440,17,0,60.0
9226,1.0,4.094,128.138.185.179,1.1.1.1,33290,33448,17,0,60.0
10197,1.0,4.094,128.138.185.179,1.1.1.1,36872,33453,17,0,60.0
51757,1.0,4.094,128.138.185.179,1.1.1.1,54058,33459,17,0,60.0
82998,1.0,4.094,128.138.185.179,1.1.1.1,37634,33443,17,0,60.0
91202,1.0,4.263,128.138.37.217,1.1.1.1,12651,53,17,0,71.0
121299,1.0,4.094,128.138.185.179,1.1.1.1,54360,33458,17,0,60.0
140234,1.0,4.344,128.138.56.21,1.1.1.1,53784,53,17,0,77.0
155011,1.0,4.094,128.138.185.179,1.1.1.1,50124,33460,17,0,60.0
184035,1.0,4.094,128.138.185.179,1.1.1.1,52993,33439,17,0,60.0


In [None]:
# NOTE(review): this binds a second name to the same DataFrame object, it
# does not copy it; in-place changes to `new` (such as adding the 'label'
# column in the next cell) are visible through `old` too -- use new.copy()
# if a snapshot was intended.
old = new
old

In [None]:

# Assign a device-class label to every address (0 = unlabelled).  Later rules
# overwrite earlier ones: 1 camera subnets, 2 printers (most frequent
# destination port 9100), 3 phone subnets, 4 server subnets, 5 sensor subnets.

# The feature-building cell leaves the addresses in the index, not in an 'ip'
# column, so new['ip'] raises KeyError there; accept either layout.
ip_values = new['ip'] if 'ip' in new.columns else new.index.to_series()

def _in_any(subnets):
    """Positional boolean mask of the rows whose address lies in ``subnets``."""
    return ip_values.apply(check_in_subnets, args=(subnets,)).to_numpy(dtype=bool)

cams = camera_subnets()
new['label'] = 0
new.loc[_in_any(cams), 'label'] = 1

# printer port = 9100
new.loc[new['pop_destport'] == 9100, 'label'] = 2

phones = phone_subnets()
new.loc[_in_any(phones), 'label'] = 3

servers = server_subnets()
new.loc[_in_any(servers), 'label'] = 4

sensors = sensor_subnets()
new.loc[_in_any(sensors), 'label'] = 5

In [None]:
# Univariate feature ranking (ANOVA F-score) of the per-IP features against the labels.
# NOTE(review): when the addresses live in the index, drop_duplicates() also
# removes distinct addresses that happen to share identical feature values --
# confirm that is intended.
new = new.drop_duplicates()
X= new
Y=X.label
# No cell visible in this notebook creates 'ip' or 'host' columns (the
# addresses are in the index after the feature build), so a plain drop
# raises KeyError.  errors='ignore' removes the non-feature columns that are
# present and skips the ones that are not.
X = X.drop(columns=['label', 'ip', 'host', 'pop_destport','pop_srcport'], errors='ignore')

bestfeatures = SelectKBest(score_func=f_classif, k=10)
fit = bestfeatures.fit(X,Y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
# Put feature names and scores side by side for display.
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  # naming the dataframe columns
print(featureScores.nlargest(15,'Score'))

#X = X.drop(X.columns.difference(featureScores.nlargest(15,'Score')['Specs']), 1)

In [None]:
# Disabled pair plot of the top-scoring features (string literal, never
# executed); the cell only displays the feature matrix X.
'''g = sns.pairplot(data=pd.concat([X,Y],axis=1), vars=featureScores.nlargest(15,'Score'),
                 corner=True)
'''
X

In [None]:
# Scale X with every transformer, run PCA on each result and keep the fitted PCA objects.
X_trans_dict, X_pca_dict,instance_pca_dict = transformer_bundle(X, is_get_instance=True)
# Cumulative explained variance per transformer.
pca_plot_cumsum(instance_pca_dict)
# Shape of the first transformed row, i.e. the number of feature columns.
np.shape(X_trans_dict['PowerTransformer'][0])

In [None]:
# White figure background (global rc setting), then the 2-D PCA scatter per transformer, coloured by label.
plt.rc('figure', facecolor='w')
pca_scatter_plot(X_pca_dict, Y)

In [None]:
# White figure background (global rc setting), then the 3-D PCA scatter per transformer, coloured by label.
plt.rc('figure', facecolor='w')
pca3d_scatter_plot(X_pca_dict, Y)

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
feature_list = list(X.columns)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state = 42)


In [None]:
# Earlier results noted by the author (context not recorded in the notebook):
#27% for srcport
#23 for destport
# Random-forest baseline on the raw features.
# NOTE(review): the labels are class ids, so a classifier would be the more
# natural model; the regressor is kept to preserve the existing setup.
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
rf.fit(x_train, y_train)
predictions = rf.predict(x_test)
# Round to the nearest class id.  int() truncates toward zero, so e.g. a
# prediction of 1.9 was previously scored as class 1 instead of class 2.
predictions = np.rint(predictions).astype(int)
# accuracy_score expects (y_true, y_pred).
metrics.accuracy_score(y_test, predictions)

In [None]:
# K-means on the first 10 PCA components of every transformer variant.
feature_list = list(X.columns)
#for key in X_trans_dict.keys():
for key in X_pca_dict.keys():
    x_train, x_test, y_train, y_test = train_test_split(X_pca_dict[key][:,:10], Y, test_size = 0.25, random_state = 42)

    # Seeded so the printed scores are reproducible across runs.
    kmeans = KMeans(n_clusters=20, random_state=42)
    kmeans.fit(x_train)
    y_pred = kmeans.predict(x_test)
    # NOTE(review): cluster ids are arbitrary integers, so comparing them
    # directly with the class labels is not a meaningful accuracy; map each
    # cluster to its majority label first if a real score is needed.
    print(key, metrics.accuracy_score(y_test, y_pred))


In [None]:
# K-means (10 clusters) on the first 10 PCA components of the quantile-transformed data.
transformer_name = "QuantileTransformer"
x_train, x_test, y_train, y_test = train_test_split(X_pca_dict[transformer_name][:,:10], Y, test_size = 0.25, random_state = 42)

# Seeded so cluster membership is reproducible across runs.
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(x_train)
y_pred = kmeans.predict(x_test)
# Print the transformer actually used; the previous `key` was whatever value
# the loop variable of the preceding cell happened to hold.
print(transformer_name, metrics.accuracy_score(y_test, y_pred))
# cluster id -> positions (rows of x_train) assigned to that cluster
mydict = {i: np.where(kmeans.labels_ == i)[0] for i in range(kmeans.n_clusters)}

# Transform the dictionary into a list of [cluster id, positions] pairs
dictlist = [[key, value] for key, value in mydict.items()]
print(np.shape(dictlist[0][1]))
dictlist

In [None]:
# Count, for each of the 10 clusters, how many training rows carry each label.
d_counts = [ { 0:0,1:0, 2:0, 3:0, 4:0, 5:0, 6:0,7:0, 8:0,9:0} for i in range(10) ]
for x,l in enumerate(dictlist):
    for i in l[1]:
        # The positions in dictlist index the rows of x_train (they come from
        # kmeans.labels_), which is a shuffled subset of the data; y_train
        # holds the labels in that same order.  Indexing `new` by these
        # positions would read the labels of unrelated rows.
        d_counts[x][int(y_train.iloc[i])] +=1
d_counts

In [None]:
# Show the feature rows of the addresses assigned to cluster 1.
pd.options.display.max_rows = 50
pd.options.display.min_rows = pd.options.display.max_rows
pd.options.display.max_columns = 60
# The cluster positions refer to rows of x_train; translate them to index
# labels through y_train (split in the same order) before looking them up in
# `new`, instead of using them as positions in `new`.
idx = y_train.index[dictlist[1][1]]
new.loc[idx]


In [None]:
# Final (rows, columns) of the per-IP feature table.
new.shape
