In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt

#make plots inline using jupyter magic
%matplotlib inline

import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn import datasets, linear_model, metrics


import matplotlib as mpl
import seaborn as sns

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.neighbors import KNeighborsClassifier
#Balanced RF Classifier
from imblearn.ensemble import BalancedRandomForestClassifier as BRF

from IPython.display import Markdown as md  #enable markdown within code cell
from IPython.display import display, Math, Latex

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import confusion_matrix
import time

from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, precision_recall_curve, make_scorer,f1_score
from sklearn.metrics import precision_recall_curve as PRC

from scipy.spatial.distance import pdist
from scipy.spatial.distance import cosine
from scipy.spatial.distance import cdist

from scipy.stats import norm
import scipy

import metrics

data_dir = '/run/media/mnewlin/_userdata/uhnds/'
original_netflow_data_dir = data_dir + 'network/extracted/'
original_netflow_file = 'netflow_day-02'
fake_dir = '/run/media/mnewlin/_userdata/uhnds/network/converted/fake/'
real_dir = '/run/media/mnewlin/_userdata/uhnds/network/converted/real/'
real_file = 'netflow_day-02'

In [2]:
"""
    Generates probabilities for matrices X and Y, assuming given distribution
    distribution defaults to normal (may add other distributions later)
    
"""
def generate_probs(X,Y, dist='norm'):
    X = np.array(X)
    Y = np.array(Y)
    num_rows = X.shape[0]
    num_cols = X.shape[1]
    norm_x = np.zeros((num_rows, num_cols))
    norm_y = np.zeros((num_rows, num_cols))
    if dist == 'norm':
        for j in range(num_cols):
            xj = X[:,j]

            prob_xj = norm.pdf(xj, loc=xj.mean(), scale=xj.var())
            norm_x[:,j] = prob_xj

            yj = Y[:,j]

            prob_yj = norm.pdf(yj, loc=yj.mean(), scale=yj.var())
            norm_y[:,j] = prob_yj
    return norm_x, norm_y

## $l_p$ distance

In [3]:
def l_p_distance(X,Y,p=2,r=2):
    X = np.array(X)
    Y = np.array(Y)
    if (X.shape != Y.shape):
        print("Usage: Matrices must be the same shape.")
        return -1
    num_rows = X.shape[0]
    distances = np.zeros((num_rows,1))
    for i in range(num_rows):
        x = X[i]
        y = Y[i]
        distances[i] = np.power(np.sum(np.power(np.abs(x-y),p)),(1/r))
    
    return np.mean(distances)

In [4]:
X = np.array([[1.,2.],
              [3.,4.]])
Y = 2*X

print(l_p_distance(X,Y,p=0.5,r=0.5))


9.87831517751085


## Cosine Similarity

In [5]:
def cosine_similarity(X,Y):
    X = np.array(X)
    Y = np.array(Y)
    num_rows = X.shape[0]
    cos_sims = np.zeros((num_rows,1))
    for i in range(num_rows):
        x = X[i]
        y = Y[i]
        cos_sims[i] = cosine(x,y)
        return np.mean(cos_sims)
        

In [6]:
X = np.array([[1.,2.],
              [3.,4.]])
Y = X+5

print(cosine_similarity(X,Y))


X = np.array([[1.,2.],
              [3.,4.]])
Y = X
print(cosine_similarity(X,Y))


0.014928749927334062
0.0


## Mahalanobis Distance

In [7]:
def mahalanobis_distance(X,Y):
    X = np.array(X)
    Y = np.array(Y)
    mahalanobis = cdist(X, Y, 'mahalanobis')
    print(mahalanobis)
    return np.mean(mahalanobis)
    
        

In [8]:
X = np.array([[5580.,364445],
              [6976.,450942.]])
Y = np.array([[5197.,894353],
              [7069.,680679.]])

print(mahalanobis_distance(X,Y))




[[2.2336325  2.41220277]
 [2.31937267 1.03749336]]
2.000675323741658


## $\chi^2$-distance 

In [9]:
def chi_squared_dist(X,Y):
    X = np.array(X)
    Y = np.array(Y)
    prob_x, prob_y = generate_probs(X,Y)
    chi_squares = np.ones(X.shape[1])
    for j in range(chi_squares.size):
        chi_squares[j] = np.sum(np.divide(np.power(prob_x-prob_y, 2), prob_y))
    return np.mean(chi_squares)  

In [10]:
X = np.array([[5580.,364445],
              [6976.,450942.]])
Y = np.array([[5197.,894353],
              [7069.,680679.]])

print(chi_squared_dist(X,Y))

5.820811263338073e-07


In [11]:
a = np.array([[1,2],
              [3,4],
              [5,6],
              [7,8],
              [9,10],
              [11,12]])
new_a = np.reshape(a,(3,2,2))
print(new_a)

[[[ 1  2]
  [ 3  4]]

 [[ 5  6]
  [ 7  8]]

 [[ 9 10]
  [11 12]]]


In [12]:
def score_samples(data, sample_length, num_samples, metric='lp', p=2, r=2):
    # Reshape data into 3d array of n*l*w from 2d array of nl*w
    sample_list = np.reshape(np.array(data), (num_samples, sample_length, data.shape[1]))
    dist_matrix = np.zeros((num_samples, num_samples))
    if metric == 'lp':
        # Do pairwise metrics
        for i in range(num_samples):
            for j in range(num_samples):
                d = l_p_distance(sample_list[i], sample_list[j], p=p, r=r)
                dist_matrix[i,j] = d
        return np.mean(dist_matrix)

    return -1

In [13]:
score = score_samples(a, sample_length=2, num_samples=3, p=0.75, r=0.75)
print(score)

8.959438577030209


In [14]:
a0_pdf = norm.pdf(a[:,0], loc=a[:,0].mean(), scale = a[:,0].var())
print(a0_pdf)

[0.03119458 0.03308301 0.03406967 0.03406967 0.03308301 0.03119458]


In [15]:
X = np.random.randint(0,10, (5,3))
Y = np.random.randint(0,10, (5,3))
print(X)
print(Y)
norm_x,norm_y = metrics.generate_probs(X,Y)
print(norm_x)
print(norm_y)

print(metrics.chi_squared_dist(X, Y))

[[1 9 9]
 [5 9 7]
 [1 8 4]
 [4 5 4]
 [1 7 8]]
[[4 9 5]
 [2 5 8]
 [1 4 8]
 [0 7 4]
 [6 8 1]]
[[0.11802742 0.14650043 0.07796357]
 [0.09103297 0.14650043 0.09315279]
 [0.11802742 0.17528216 0.08016236]
 [0.11425755 0.09080479 0.08016236]
 [0.11802742 0.17182339 0.0876239 ]]
[[0.08215302 0.09091913 0.05729563]
 [0.0852631  0.10408196 0.0528636 ]
 [0.08101622 0.08715759 0.0528636 ]
 [0.07348705 0.11519022 0.05647364]
 [0.06573522 0.10675433 0.04777786]]
0.12593307173841073


In [16]:
X = np.random.randint(0,10, (5,3)).astype(np.float64)
Y = np.random.randint(0,10, (5,3)).astype(np.float64)


print(metrics.fid(X,Y))

0.037416389712263874


In [17]:
a = np.array([[-2,-1],
              [-1, -2]])

print(scipy.linalg.sqrtm(a))
print(scipy.linalg.sqrtm(np.abs(a)))

[[0.+1.3660254j 0.+0.3660254j]
 [0.+0.3660254j 0.+1.3660254j]]
[[1.3660254 0.3660254]
 [0.3660254 1.3660254]]


In [18]:
X = np.random.randint(0,10, (5,3)).astype(np.float64)
Y = np.random.randint(0,10, (5,3)).astype(np.float64)

print(metrics.calc_entropy(X,Y,sample_length=5))
print(metrics.calc_perplexity(X,Y,sample_length=5))

0.0013007343458958376
0.0017993448672211088


In [19]:
XXT = np.matmul(X, X.T)
YYT = np.matmul(Y,Y.T)

mah = metrics.mahalanobis_distance(X,Y)
print(mah)


2.3913904276564537


In [20]:
a = np.array([1,2,3,0,5,0,6,0])
epsilon = 1e-3
a = np.where(a == 0, epsilon, a)
print(a)

[1.e+00 2.e+00 3.e+00 1.e-03 5.e+00 1.e-03 6.e+00 1.e-03]


In [21]:
a = np.array([1,2,3,0,5,0,6,0])
print(a)
epsilon = 1e-10

a_log = np.where(a==0,a, np.log(a+epsilon))
print(a_log)

[1 2 3 0 5 0 6 0]
[1.00000008e-10 6.93147181e-01 1.09861229e+00 0.00000000e+00
 1.60943791e+00 0.00000000e+00 1.79175947e+00 0.00000000e+00]


In [22]:
X1 = np.random.randint(0,10, (10,4)).astype(np.float64)
Y1 = np.random.randint(10,20, (10,4)).astype(np.float64)
X2 = np.random.randint(0,10, (10,4)).astype(np.float64)
Y2 = np.random.randint(10,20, (10,4)).astype(np.float64)

Mxx = metrics.distance(X1,X2, sqrt=True)
Myy = metrics.distance(Y1,Y2, sqrt=True)
Mxy = metrics.distance(X1,Y1, sqrt=True)

sigma = Mxx.std()

print(metrics.mmd(Mxx,Myy,Mxy,sigma))


0.0


In [23]:
def fid(X,Y):

    X = np.array(X)
    Y = np.array(Y)
    prob_x, prob_y = metrics.generate_probs(X,Y)
    mu_x = np.mean(prob_x, axis=0)
    mu_y = np.mean(prob_y, axis=0)
    
    Cx = np.cov(prob_x,rowvar=False)
    Cy = np.cov(prob_y, rowvar=False)
    ssdiff = np.sum(np.square(mu_x-mu_y))
    covmean = scipy.linalg.sqrtm(Cx.dot(Cy))
    score = ssdiff + np.trace(Cx + Cy - 2.0*covmean)
    return np.abs(score)

In [24]:
X1 = np.random.randint(0,100000000, (100,10)).astype(np.float64)
X2 = np.random.randint(100,1000, (100,10)).astype(np.float64)

f0 = metrics.chi_squared_dist(X1,X2)


#print(f1)
print(f0)

0.0005690449185567254


In [25]:
def alt_mahalanobis(X,Y):
    X = np.array(X)
    Y = np.array(Y)
    mahalanobis = cdist(X, Y, 'mahalanobis')
    return np.mean(np.nan_to_num(mahalanobis))

In [26]:
def mahalanobis_distance(X,Y):
    X = np.array(X)
    Y = np.array(Y)
    stack = np.vstack([X, Y])
    VI = np.linalg.pinv(np.cov(stack, rowvar=False))
    mahalanobis = cdist(X, Y, 'mahalanobis', VI=VI)
    return np.mean(np.nan_to_num(mahalanobis))


In [27]:
X1 = np.random.randint(0,100000000, (100,10)).astype(np.float64)
X2 = np.random.randint(100,1000, (100,10)).astype(np.float64)

d = alt_mahalanobis(X1,X1)
print(d)
d0 = mahalanobis_distance(X1,X1)
print(d0)

4.365225038092692
4.365225038092692


In [133]:
from sklearn.decomposition import PCA

X1 = np.random.randint(0,100000000, (100,10)).astype(np.float64)
X2 = np.random.randint(0,100000000, (100,10)).astype(np.float64)

pca1 = PCA()
pca1.fit(X1)
x = pca1.components_
print(x)

pca1.fit(X2)
y = pca1.components_
print(x)

[[-0.21934199 -0.29170581 -0.0988365  -0.04780687  0.13271962 -0.61140977
   0.02163689 -0.38279516  0.14159024 -0.54429604]
 [-0.30471989  0.69836831  0.25590302 -0.06732126  0.40821972  0.05275839
   0.17602928 -0.27779936  0.26724203  0.02012737]
 [-0.36656236 -0.19905392 -0.30822141 -0.073805    0.42413997  0.3106361
   0.41101609  0.44765473 -0.11645448 -0.25745256]
 [-0.27474518 -0.00831652 -0.22423442  0.31316344 -0.48924904  0.13698992
   0.15252014  0.07391691  0.69939941 -0.00877547]
 [ 0.18162111 -0.12290771 -0.07596215  0.46699547  0.22544084  0.56444945
  -0.24290039 -0.44866545 -0.0199657  -0.31293096]
 [-0.09583462 -0.18025053 -0.3508781  -0.01047965 -0.01477602  0.00925911
   0.45199668 -0.549268   -0.19348801  0.53977977]
 [-0.15994374  0.5004366  -0.57459345 -0.14368454 -0.33911584  0.03022203
  -0.22360574 -0.06744225 -0.36728202 -0.2604253 ]
 [ 0.16126593  0.04824334 -0.53013237  0.09239493  0.47937572 -0.23523815
  -0.4069795   0.15799097  0.31021682  0.33184745]
 

In [46]:
from numpy.fft import fftn

X1 = np.random.randint(0,100, (10,4)).astype(np.float64)
X2 = np.random.randint(0,100, (10,4)).astype(np.float64)

f1 = fftn(X1)
f1_real = f1.real
print(f1_real)
print(f1_real.shape)

[[2070.           76.           10.           76.        ]
 [-137.79179607  261.99645     -42.49186938   81.20517613]
 [ 257.56577809  -50.96092852  -51.89512163   59.09367094]
 [-151.20820393  189.80501655   80.49186938  177.99335732]
 [-122.56577809  132.34304304   21.89512163  -41.47578545]
 [  28.           84.          124.           84.        ]
 [-122.56577809  -41.47578545   21.89512163  132.34304304]
 [-151.20820393  177.99335732   80.49186938  189.80501655]
 [ 257.56577809   59.09367094  -51.89512163  -50.96092852]
 [-137.79179607   81.20517613  -42.49186938  261.99645   ]]


In [28]:
import random
random_state=1
num_samples=20
sample_range=100
random.seed(a=random_state)
sample_list = random.sample(range(sample_range), num_samples)
print(sample_list)

[17, 72, 97, 8, 32, 15, 63, 57, 60, 83, 48, 26, 12, 62, 3, 49, 55, 77, 98, 0]


In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
filename = '/run/media/mnewlin/_userdata/uhnds/host/host_events_sample.txt'
df_host = pd.read_json(filename, orient='records', lines=True, dtype=str)
display(df_host)
#
df_host = df_host.replace(to_replace=[r"[\$]+"], value='', regex=True)
display(df_host)
df_host_array = df_host.values

Unnamed: 0,AuthenticationPackage,Computer,DomainName,EventID,LogonID,LogonType,LogonTypeDescription,ServiceName,Source,Status,Time,UserName
0,,ActiveDirectory,Domain002,4769,,,,Comp883934$,Comp309534,0x0,2,User624729
1,MICROSOFT_AUTHENTICATION_PACKAGE_V1_0,ActiveDirectory,Domain002,4776,,,,,,0x0,2,Scanner
2,,ActiveDirectory,Domain002,4672,0x2e66398d,,,,,,2,ActiveDirectory$
3,Kerberos,ActiveDirectory,Domain002,4624,0x2e66398d,3.0,Network,,,,2,ActiveDirectory$
4,,ActiveDirectory,Domain002,4634,0x2e66398d,3.0,Network,,,,2,ActiveDirectory$
5,Kerberos,Comp966305,Domain002,4624,0x9f17415,3.0,Network,,,,2,User380010
6,,Comp966305,Domain002,4634,0x9f17415,3.0,Network,,,,2,User380010
7,Kerberos,Comp966305,Domain002,4624,0x9f17637,3.0,Network,,,,2,User096622
8,,Comp966305,Domain002,4634,0x9f17637,3.0,Network,,,,2,User096622
9,Kerberos,Comp966305,Domain002,4624,0x9f17fe4,3.0,Network,,,,2,User233472


Unnamed: 0,AuthenticationPackage,Computer,DomainName,EventID,LogonID,LogonType,LogonTypeDescription,ServiceName,Source,Status,Time,UserName
0,,ActiveDirectory,Domain002,4769,,,,Comp883934,Comp309534,0x0,2,User624729
1,MICROSOFT_AUTHENTICATION_PACKAGE_V1_0,ActiveDirectory,Domain002,4776,,,,,,0x0,2,Scanner
2,,ActiveDirectory,Domain002,4672,0x2e66398d,,,,,,2,ActiveDirectory
3,Kerberos,ActiveDirectory,Domain002,4624,0x2e66398d,3.0,Network,,,,2,ActiveDirectory
4,,ActiveDirectory,Domain002,4634,0x2e66398d,3.0,Network,,,,2,ActiveDirectory
5,Kerberos,Comp966305,Domain002,4624,0x9f17415,3.0,Network,,,,2,User380010
6,,Comp966305,Domain002,4634,0x9f17415,3.0,Network,,,,2,User380010
7,Kerberos,Comp966305,Domain002,4624,0x9f17637,3.0,Network,,,,2,User096622
8,,Comp966305,Domain002,4634,0x9f17637,3.0,Network,,,,2,User096622
9,Kerberos,Comp966305,Domain002,4624,0x9f17fe4,3.0,Network,,,,2,User233472


In [41]:
vectorizer = TfidfVectorizer(lowercase=False, preprocessor=None)
X = vectorizer.fit_transform(df_host_array)
print(vectorizer.get_feature_names())
print(X)

TypeError: expected string or bytes-like object

In [None]:
#filename = '/run/media/mnewlin/_userdata/uhnds/host/wls_day-01'
#df_host = pd.read_json(filename, orient='records', lines=True)
#display(df_host.head())