## Anomaly Detection

### Choosing features

In [None]:
from scipy.stats import skewnorm
import matplotlib.pyplot as plt

num_values = 10000
max_value = 100
# The sign of the shape parameter sets the skew direction:
# negative -> left-skewed, positive -> right-skewed.
skewness = 20

# Draw the raw skew-normal sample.
random = skewnorm.rvs(a=skewness, loc=max_value, size=num_values)

# Rescale the sample into [0, max_value]: shift so the minimum is 0,
# divide by the new maximum to land in [0, 1], then stretch by max_value.
random = random - random.min()
random = random / random.max()
random = random * max_value

x = random
plt.hist(x, bins=50)

In [None]:
# Histogram (50 bins) of x raised to the power 0.5, i.e. a square-root transform.
plt.hist(x**0.5, bins=50)

In [None]:
# Histogram (50 bins) of x raised to the power 0.4, a slightly stronger
# compression of large values than the square root.
plt.hist(x**0.4, bins=50)

In [None]:
import numpy as np

# Histogram (50 bins) of log(x + 0.001). x was shifted so its minimum is 0,
# so the small offset keeps the logarithm finite at that point.
plt.hist(np.log(x+0.001), bins=50)

In [None]:
# Same log transform with a larger offset (7), for comparison with the
# 0.001-offset histogram.
plt.hist(np.log(x+7), bins=50)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from utils_ad import *
from public_tests_ad import *

### Dataset

The dataset contains two features for each server:
   * throughput (mb/s), and
   * latency (ms) of its responses.

In [None]:
# load_data is not defined in this file (presumably provided by the utils_ad
# star import). Judging by the names it yields the training features plus the
# validation features and validation labels — confirm against the helper.
x_train, x_val, y_val=load_data()

In [None]:
# Show the first five entries of x_train.
print(x_train[:5])

In [None]:
# Show the first five entries of x_val.
print(x_val[:5])

In [None]:
# Show the first five entries of y_val.
print(y_val[:5])

In [None]:
# Print the shapes of the three arrays obtained from load_data().
print(x_train.shape, x_val.shape, y_val.shape)

In [None]:
# Scatter the first two columns of x_train as blue crosses.
plt.scatter(
    x_train[:, 0],
    x_train[:, 1],
    marker='x',
    c='b',
)

# Fix both axes to the range [0, 30].
plt.axis([0, 30, 0, 30])

# NOTE(review): column 0 is labelled latency and column 1 throughput, while
# the dataset description lists throughput first — confirm the column order.
plt.xlabel('Latency (ms)')
plt.ylabel('Throughput (mb/s)')
plt.title('Training Dataset')

plt.show()

In [None]:
def estimate_gaussian(x):
  """Estimate the per-feature mean and variance of a 2-D data matrix.

  Args:
    x: 2-D NumPy array with one example per row and one feature per column.

  Returns:
    (mean, variance): two 1-D arrays with one entry per column. The variance
    is normalized by the number of rows (not rows minus one).
  """
  # Unpacking two values keeps the requirement that x is exactly 2-D.
  num_examples, _ = x.shape
  inverse_count = 1 / num_examples

  column_means = inverse_count * x.sum(axis=0)
  squared_deviations = (x - column_means) ** 2
  column_variances = inverse_count * squared_deviations.sum(axis=0)

  return column_means, column_variances

In [None]:
# Fit the per-feature Gaussian parameters on the training set.
mean, variance=estimate_gaussian(x_train)

print(f"Mean of each feature: {mean}")
print(f"Variance of each feature: {variance}")

# estimate_gaussian_test is not defined in this file (presumably provided by
# the public_tests_ad star import); it is handed the function itself to check.
estimate_gaussian_test(estimate_gaussian)

In [None]:
# Returns the density
p=multivariate_gaussian(x_train, mean, variance)

visualize_fit(x_train, mean, variance)

In [None]:
def select_threshold(y_val, p_val):
  """Pick the anomaly threshold that maximizes F1 on a labelled validation set.

  An example is predicted anomalous when its density is strictly below the
  candidate threshold. Candidates are evenly spaced from min(p_val) up to
  (but excluding) max(p_val), using a step of one thousandth of that range.

  Args:
    y_val: NumPy array of ground-truth labels (1 = anomaly, 0 = normal).
    p_val: NumPy array of densities, aligned element-wise with y_val.

  Returns:
    (best_threshold, best_F1). Both are 0 when no candidate produces a true
    positive, when p_val is empty, or when all densities are identical.
  """
  best_threshold=0
  best_F1=0

  # Nothing to scan on an empty validation set.
  if np.size(p_val)==0:
    return best_threshold, best_F1

  # Compute the range once instead of re-scanning the array for every use.
  lowest=np.min(p_val)
  highest=np.max(p_val)
  step_size=(highest-lowest)/1000

  # Identical densities give a zero step, which np.arange cannot handle, and
  # no strict "<" threshold inside the range could flag anything anyway.
  if step_size==0:
    return best_threshold, best_F1

  for epsilon in np.arange(lowest, highest, step_size):
    predictions=(p_val<epsilon)
    tp=np.sum((predictions==1) & (y_val==1))

    # Without true positives, precision and/or recall are 0 or undefined
    # (0/0), so F1 cannot beat the current best; skip rather than divide.
    if tp==0:
      continue

    fn=np.sum((predictions==0) & (y_val==1))
    fp=np.sum((predictions==1) & (y_val==0))

    prec=tp/(tp+fp)
    recall=tp/(tp+fn)

    F1=(2*prec*recall)/(prec+recall)

    # Strict comparison keeps the smallest threshold among equal F1 scores.
    if F1>best_F1:
      best_F1=F1
      best_threshold=epsilon

  return best_threshold, best_F1

In [None]:
# Densities of the cross-validation examples. The threshold is tuned against
# y_val, so p_val must be computed from x_val (not x_train) for the labels and
# densities to refer to the same examples.
p_val=multivariate_gaussian(x_val, mean, variance)
epsilon, F1=select_threshold(y_val, p_val)

print(f"Best threshold found using cross validation set: {epsilon}")
print(f"Best F1 score using cross validation set: {F1}")

# select_threshold_test is not defined in this file (presumably provided by
# the public_tests_ad star import); it is handed the function itself to check.
select_threshold_test(select_threshold)

In [None]:
# Boolean mask of training examples whose density p (computed from x_train in
# an earlier cell) is strictly below the selected threshold.
outliers=p<epsilon

# visualize_fit is an external helper — presumably it redraws the data and
# the fitted distribution so the flagged points can be drawn on top.
visualize_fit(x_train, mean, variance)

# Mark the flagged examples with hollow red circles.
plt.plot(x_train[outliers, 0], x_train[outliers, 1], 'ro',
         markerfacecolor='none', markeredgewidth=2)

### Using High Dimensional Dataset

In [None]:
# load_data_multi is not defined in this file (presumably provided by the
# utils_ad star import). Judging by the names it yields training features,
# validation features and validation labels — confirm against the helper.
x_train_high, x_val_high, y_val_high=load_data_multi()

In [None]:
# Print the shapes of the three arrays obtained from load_data_multi().
print(x_train_high.shape, x_val_high.shape, y_val_high.shape)

In [None]:
# Estimate the per-feature Gaussian parameters from the training set
mean_high, variance_high=estimate_gaussian(x_train_high)

# Evaluate the training set under the fitted parameters (multivariate_gaussian
# is an external helper; presumably it returns one density per example)
p_high=multivariate_gaussian(x_train_high, mean_high, variance_high)

# Evaluate the cross validation set under the same parameters
p_val_high=multivariate_gaussian(x_val_high, mean_high, variance_high)

# Find the threshold with the best F1 score on the cross validation labels
epsilon_high, F1_high=select_threshold(y_val_high, p_val_high)

print(epsilon_high, F1_high)
# Summing the boolean comparison counts the training examples whose value is
# strictly below the threshold
print(f"Number of anomalies found: {sum(p_high<epsilon_high)}")