In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt

#make plots inline using jupyter magic
%matplotlib inline

import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn import datasets, linear_model, metrics


import matplotlib as mpl
import seaborn as sns

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.neighbors import KNeighborsClassifier
#Balanced RF Classifier
from imblearn.ensemble import BalancedRandomForestClassifier as BRF

from IPython.display import Markdown as md  #enable markdown within code cell
from IPython.display import display, Math, Latex

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import confusion_matrix
import time

from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, precision_recall_curve, make_scorer,f1_score
from sklearn.metrics import precision_recall_curve as PRC

from scipy.spatial.distance import pdist
from scipy.spatial.distance import cosine
from scipy.spatial.distance import cdist

from scipy.stats import norm
import scipy

import metrics

data_dir = '/run/media/mnewlin/_userdata/uhnds/'
original_netflow_data_dir = data_dir + 'network/extracted/'
original_netflow_file = 'netflow_day-02'
fake_dir = '/run/media/mnewlin/_userdata/uhnds/network/converted/fake/'
real_dir = '/run/media/mnewlin/_userdata/uhnds/network/converted/real/'
real_file = 'netflow_day-02'

In [2]:
"""
    Generates probabilities for matrices X and Y, assuming given distribution
    distribution defaults to normal (may add other distributions later)
    
"""
def generate_probs(X,Y, dist='norm'):
    X = np.array(X)
    Y = np.array(Y)
    num_rows = X.shape[0]
    num_cols = X.shape[1]
    norm_x = np.zeros((num_rows, num_cols))
    norm_y = np.zeros((num_rows, num_cols))
    if dist == 'norm':
        for j in range(num_cols):
            xj = X[:,j]

            prob_xj = norm.pdf(xj, loc=xj.mean(), scale=xj.var())
            norm_x[:,j] = prob_xj

            yj = Y[:,j]

            prob_yj = norm.pdf(yj, loc=yj.mean(), scale=yj.var())
            norm_y[:,j] = prob_yj
    return norm_x, norm_y

## $l_p$ distance

In [3]:
def l_p_distance(X,Y,p=2,r=2):
    X = np.array(X)
    Y = np.array(Y)
    if (X.shape != Y.shape):
        print("Usage: Matrices must be the same shape.")
        return -1
    num_rows = X.shape[0]
    distances = np.zeros((num_rows,1))
    for i in range(num_rows):
        x = X[i]
        y = Y[i]
        distances[i] = np.power(np.sum(np.power(np.abs(x-y),p)),(1/r))
    
    return np.mean(distances)

In [4]:
X = np.array([[1.,2.],
              [3.,4.]])
Y = 2*X

print(l_p_distance(X,Y,p=0.5,r=0.5))


9.87831517751085


## Cosine Similarity

In [5]:
def cosine_similarity(X,Y):
    X = np.array(X)
    Y = np.array(Y)
    num_rows = X.shape[0]
    cos_sims = np.zeros((num_rows,1))
    for i in range(num_rows):
        x = X[i]
        y = Y[i]
        cos_sims[i] = cosine(x,y)
        return np.mean(cos_sims)
        

In [6]:
X = np.array([[1.,2.],
              [3.,4.]])
Y = X+5

print(cosine_similarity(X,Y))


X = np.array([[1.,2.],
              [3.,4.]])
Y = X
print(cosine_similarity(X,Y))


0.014928749927334062
0.0


## Mahalanobis Distance

In [7]:
def mahalanobis_distance(X,Y):
    X = np.array(X)
    Y = np.array(Y)
    mahalanobis = cdist(X, Y, 'mahalanobis')
    print(mahalanobis)
    return np.mean(mahalanobis)
    
        

In [8]:
X = np.array([[5580.,364445],
              [6976.,450942.]])
Y = np.array([[5197.,894353],
              [7069.,680679.]])

print(mahalanobis_distance(X,Y))




[[2.2336325  2.41220277]
 [2.31937267 1.03749336]]
2.000675323741658


## $\chi^2$-distance 

In [9]:
def chi_squared_dist(X,Y):
    X = np.array(X)
    Y = np.array(Y)
    prob_x, prob_y = generate_probs(X,Y)
    chi_squares = np.ones(X.shape[1])
    for j in range(chi_squares.size):
        chi_squares[j] = np.sum(np.divide(np.power(prob_x-prob_y, 2), prob_y))
    return np.mean(chi_squares)  

In [10]:
X = np.array([[5580.,364445],
              [6976.,450942.]])
Y = np.array([[5197.,894353],
              [7069.,680679.]])

print(chi_squared_dist(X,Y))

5.820811263338073e-07


In [11]:
a = np.array([[1,2],
              [3,4],
              [5,6],
              [7,8],
              [9,10],
              [11,12]])
new_a = np.reshape(a,(3,2,2))
print(new_a)

[[[ 1  2]
  [ 3  4]]

 [[ 5  6]
  [ 7  8]]

 [[ 9 10]
  [11 12]]]


In [12]:
def score_samples(data, sample_length, num_samples, metric='lp', p=2, r=2):
    # Reshape data into 3d array of n*l*w from 2d array of nl*w
    sample_list = np.reshape(np.array(data), (num_samples, sample_length, data.shape[1]))
    dist_matrix = np.zeros((num_samples, num_samples))
    if metric == 'lp':
        # Do pairwise metrics
        for i in range(num_samples):
            for j in range(num_samples):
                d = l_p_distance(sample_list[i], sample_list[j], p=p, r=r)
                dist_matrix[i,j] = d
        return np.mean(dist_matrix)

    return -1

In [13]:
score = score_samples(a, sample_length=2, num_samples=3, p=0.75, r=0.75)
print(score)

8.959438577030209


In [14]:
a0_pdf = norm.pdf(a[:,0], loc=a[:,0].mean(), scale = a[:,0].var())
print(a0_pdf)

[0.03119458 0.03308301 0.03406967 0.03406967 0.03308301 0.03119458]


In [15]:
X = np.random.randint(0,10, (5,3))
Y = np.random.randint(0,10, (5,3))
print(X)
print(Y)
norm_x,norm_y = metrics.generate_probs(X,Y)
print(norm_x)
print(norm_y)

print(metrics.chi_squared_dist(X, Y))

[[5 5 2]
 [3 5 6]
 [6 8 1]
 [2 1 3]
 [4 9 9]]
[[1 9 7]
 [3 6 5]
 [0 9 8]
 [8 9 0]
 [2 2 9]]
[[0.17603266 0.0507367  0.04509131]
 [0.17603266 0.0507367  0.04558632]
 [0.12098536 0.04855623 0.04346002]
 [0.12098536 0.04283901 0.0461497 ]
 [0.19947114 0.04631851 0.03982511]]
[[0.05004547 0.05070591 0.03899304]
 [0.05139302 0.05203997 0.03914444]
 [0.04817004 0.05070591 0.03835614]
 [0.04107154 0.05070591 0.03336189]
 [0.05113762 0.04227753 0.03736589]]
0.44158684122165903


In [16]:
X = np.random.randint(0,10, (5,3)).astype(np.float64)
Y = np.random.randint(0,10, (5,3)).astype(np.float64)


print(metrics.fid(X,Y))

4.629445206177525


In [17]:
a = np.array([[-2,-1],
              [-1, -2]])

print(scipy.linalg.sqrtm(a))
print(scipy.linalg.sqrtm(np.abs(a)))

[[0.+1.3660254j 0.+0.3660254j]
 [0.+0.3660254j 0.+1.3660254j]]
[[1.3660254 0.3660254]
 [0.3660254 1.3660254]]


In [18]:
X = np.random.randint(0,10, (5,3)).astype(np.float64)
Y = np.random.randint(0,10, (5,3)).astype(np.float64)

print(metrics.calc_entropy(X,Y,sample_length=5))
print(metrics.calc_perplexity(X,Y,sample_length=5))

0.0837359860874904
0.1070413653941326
