In [None]:
# Widen the notebook container so wide figures use the full browser width.
# `display`/`HTML` are imported from the public `IPython.display` module:
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# 9 - Anomaly Detection

In this session, we will have a look at some techniques which can be used for anomaly detection.

We will work on two datasets:
- a toy dataset, artificially generated with the sklearn methods
- a real Credit Card dataset

We will be working with four models:
- OneClassSVM (`class sklearn.svm.OneClassSVM`)
- EllipticEnvelope (`class sklearn.covariance.EllipticEnvelope`), a.k.a. Robust Covariance
- IsolationForest (`class sklearn.ensemble.IsolationForest`)
- Local Outlier Factor (`class sklearn.neighbors.LocalOutlierFactor`)

## Index

- [9.0 - Imports](#9.0)
- [9.1 - Comparing anomaly detection algorithms for outlier detection on toy datasets](#9.1)
- [9.2 - Anomaly detection in Credit Card usage data](#9.2)
    - [9.2.1 - Visualization](#9.2.1)
    - [9.2.2 - Anomaly Detection](#9.2.2)

## 9.0
## Imports

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import time

# plotting
import matplotlib
import matplotlib.pyplot as plt

# preprocessing and data preparation
from sklearn.datasets import make_moons, make_blobs
from sklearn.preprocessing import StandardScaler

# anomaly detection
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

## 9.1
## Comparing anomaly detection algorithms for outlier detection on toy datasets

This example shows characteristics of different anomaly detection algorithms on 2D datasets.

Datasets contain one or two modes (regions of high density) to illustrate the ability of algorithms to cope with multimodal data.

For each dataset, 15% of samples are generated as random uniform noise. 
This proportion is the value given to the nu parameter of the OneClassSVM and the contamination parameter of the other outlier detection algorithms (which are therefore used in the "optimal" configuration; in reality, it is not easy to find the value of these parameters).

Decision boundaries between inliers and outliers are displayed in black except for Local Outlier Factor (LOF) as it has no predict method to be applied on new data when it is used for outlier detection.

In [None]:
# Seed reused wherever a reproducible random_state is needed.
RANDOM_SEED = 42

# Size of each synthetic dataset and the share of it generated as noise.
n_samples = 300
outliers_fraction = 0.15

# Split the sample budget into noise points and "regular" points.
n_outliers = int(n_samples * outliers_fraction)
n_inliers = n_samples - n_outliers

# Report the resulting sizes, one per line.
print(
    "n_samples  = %3d" % n_samples,
    "n_outliers = %3d" % n_outliers,
    "n_inliers  = %3d" % n_inliers,
    sep="\n",
)

In [None]:
# Outlier/anomaly detectors to be compared, as (display name, estimator) pairs.
# Every detector is told the true share of noise (`outliers_fraction`), via
# `nu` for the One-Class SVM and `contamination` for the others.
# The estimators that accept a `random_state` are seeded with RANDOM_SEED so
# that re-running the notebook reproduces the same figures.
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction, random_state=RANDOM_SEED)),
    ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)),
    ("Isolation Forest", IsolationForest(contamination=outliers_fraction, random_state=RANDOM_SEED)),
    ("Local Outlier Factor", LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction))
]

In [None]:
# Build the five inlier-only 2D datasets (only the coordinates are kept; the
# labels returned by the sklearn generators are discarded with `[0]`).
blobs_params = {"random_state": 0, "n_samples": n_inliers, "n_features": 2}
datasets = [
    # two blobs centred on the same point
    make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5, **blobs_params)[0],
    # two separated blobs with equal spread
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5], **blobs_params)[0],
    # two separated blobs with different spreads
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, 0.3], **blobs_params)[0],
    # two moons, shifted and stretched by a factor of 4
    4.0 * (make_moons(n_samples=n_inliers, noise=0.05, random_state=RANDOM_SEED)[0] - np.array([0.5, 0.25])),
    # uniform points in the square [-7, 7) x [-7, 7)
    14.0 * (np.random.RandomState(RANDOM_SEED).rand(n_inliers, 2) - 0.5),
]

In [None]:
# Preview the five inlier-only datasets, one panel each.
fig, ax = plt.subplots(nrows=1, ncols=5, figsize=(24, 4))
for idx, dataset in enumerate(datasets):
    # dataset.T unpacks into the x column and the y column
    ax[idx].scatter(*dataset.T, s=15, alpha=0.5, c='g')
plt.show()
# be careful with the axis, they are on different scales!

Let's now add the outliers:

In [None]:
# Append `n_outliers` uniform noise points from [-6, 6) x [-6, 6) to every
# dataset. The noise rows come last, so rows [n_inliers:] are the outliers.
# NOTE(review): this generator is seeded exactly like the one that builds the
# uniform dataset, so the noise is drawn from the same pseudo-random stream
# rather than being independent of that dataset.
rng = np.random.RandomState(RANDOM_SEED)
datasets_w_ouliers = []
for X in datasets:
    noise = rng.uniform(low=-6, high=6, size=(n_outliers, 2))
    datasets_w_ouliers.append(np.vstack((X, noise)))

In [None]:
# Show each dataset with its known inliers (green) and injected noise (red),
# all panels on the same axis limits.
fig, ax = plt.subplots(nrows=1, ncols=5, figsize=(24, 4))
for idx, dataset in enumerate(datasets_w_ouliers):
    panel = ax[idx]
    # the first n_inliers rows are the original points, the rest is noise
    for rows, colour in ((slice(None, n_inliers), 'g'), (slice(n_inliers, None), 'r')):
        panel.scatter(dataset[rows, 0], dataset[rows, 1], s=15, alpha=0.5, color=colour)
    panel.set(xlim=(-7.5, 7.5), ylim=(-7.5, 7.5))
plt.show()

In this case, we know which are the anomalies, and therefore we can plot them in a different color, to see where they are in the space.

Let's now run (separately on each dataset) the anomaly detection models.

In [None]:
# Grid of panels: one row per dataset, one column per detector.
fig, ax = plt.subplots(nrows=5, ncols=4, figsize=(19, 24))

# Regular grid over the plotting window, used to trace decision boundaries.
xx, yy = np.meshgrid(np.linspace(-7.5, 7.5, 150), np.linspace(-7.5, 7.5, 150))

for idx1, X in enumerate(datasets_w_ouliers):

    for idx2, (name, algorithm) in enumerate(anomaly_algorithms):
        panel = ax[idx1, idx2]

        # detector names act as column headers, on the first row only
        if idx1 == 0:
            panel.set_title(name, size=12)

        if name == "Local Outlier Factor":
            # LOF does not implement predict: label the training points in one
            # step and draw no decision boundary for it.
            y_pred = algorithm.fit_predict(X)
        else:
            # fit on the data, tag the training points ...
            y_pred = algorithm.fit(X).predict(X)
            # ... then label every grid point and draw the level line between
            # the two predicted labels as the decision boundary
            grid_points = np.column_stack((xx.ravel(), yy.ravel()))
            Z = algorithm.predict(grid_points).reshape(xx.shape)
            panel.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black', alpha=0.5)

        # predictions of -1 map to the first colour, predictions of +1 to the second
        colors = np.array(['#377eb8', '#ff7f00'])
        panel.scatter(X[:, 0], X[:, 1], s=15, color=colors[(y_pred + 1) // 2], alpha=0.5)

        panel.set_xlim(-7.5, 7.5)
        panel.set_ylim(-7.5, 7.5)

plt.show()

The `sklearn.svm.OneClassSVM` is known to be sensitive to outliers and thus does not perform very well for outlier detection.
This estimator is best suited for novelty detection when the training set is not contaminated by outliers. 
That said, outlier detection in high-dimension, or without any assumptions on the distribution of the inlying data is very challenging, and a One-class SVM might give useful results in these situations depending on the value of its hyperparameters.

`sklearn.covariance.EllipticEnvelope` assumes the data is Gaussian and learns an ellipse. 
It thus degrades when the data is not unimodal. 
Notice however that this estimator is robust to outliers.

`sklearn.ensemble.IsolationForest` and `sklearn.neighbors.LocalOutlierFactor` seem to perform reasonably well for multi-modal data sets. 
The advantage of `sklearn.neighbors.LocalOutlierFactor` over the other estimators is shown for the third data set, where the two modes have different densities.
This advantage is explained by the local aspect of LOF, meaning that it only compares the score of abnormality of one sample with the scores of its neighbors.

Finally, for the last data set, it is hard to say that one sample is more abnormal than another sample as they are uniformly distributed in a hypercube. 
Except for the `sklearn.svm.OneClassSVM`, which overfits a little, all estimators present decent solutions for this situation. 
In such a case, it would be wise to look more closely at the scores of abnormality of the samples as a good estimator should assign similar scores to all the samples.

While these examples give some intuition about the algorithms, this intuition might not apply to very high dimensional data.

Finally, note that parameters of the models have been here handpicked but that in practice they need to be adjusted. 
In the absence of labelled data, the problem is completely unsupervised so model selection can be a challenge.

#### Example of simple univariate analysis

Another option to perform anomaly detection is to do some univariate analysis, such as computing the **median** and the **MAD** (Median Absolute Deviation) of each feature and labelling as outliers the points that lie more than `k` times the MAD away from the median, either above or below it (where `k` is a parameter you have to choose, like the k of kMeans).

In [None]:
# Univariate MAD rule: a point is flagged when at least one of its coordinates
# lies more than k MADs away from that coordinate's median.
k = 3

fig, ax = plt.subplots(1, 5, figsize=(24, 4))

for idx, X in enumerate(datasets_w_ouliers):
    # per-feature statistics (axis=0 made explicit for both)
    median = np.median(X, axis=0)
    mad = stats.median_abs_deviation(X, axis=0)
    lower, upper = median - k * mad, median + k * mad

    # vectorised test: one boolean per sample, True when any feature falls
    # outside its [lower, upper] band
    y_pred = ((X < lower) | (X > upper)).any(axis=1)
    print(np.unique(y_pred, return_counts=True))

    # flagged points in red, the others in green
    ax[idx].scatter(X[:, 0], X[:, 1], s=15, alpha=0.5, color=np.where(y_pred, 'r', 'g'))
    ax[idx].set_xlim(-7.5, 7.5)
    ax[idx].set_ylim(-7.5, 7.5)

plt.show()

As you can see, this approach is quite "weak", and it works well only if the data has a round-ish shape.

## 9.2
## Anomaly detection in Credit Card usage data
[Index](#Index)

We will now analyse a credit card usage dataset, available on Kaggle: 
https://www.kaggle.com/arjunbhasin2013/ccdata

We also uploaded the dataset to Beep.

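Follow the link to read about the fields in the dataset; here is a short recap (note that in the data file the column names contain underscores, e.g. `ONEOFF_PURCHASES` or `PRC_FULL_PAYMENT`):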
- `CUSTID` : Identification of Credit Card holder (Categorical)
- `BALANCE` : Balance amount left in their account to make purchases
- `BALANCEFREQUENCY` : How frequently the Balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated)
- `PURCHASES` : Amount of purchases made from account
- `ONEOFFPURCHASES` : Maximum purchase amount done in one-go
- `INSTALLMENTSPURCHASES` : Amount of purchase done in installment
- `CASHADVANCE` : Cash in advance given by the user
- `PURCHASESFREQUENCY` : How frequently the Purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased)
- `ONEOFFPURCHASESFREQUENCY` : How frequently Purchases are happening in one-go (1 = frequently purchased, 0 = not frequently purchased)
- `PURCHASESINSTALLMENTSFREQUENCY` : How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done)
- `CASHADVANCEFREQUENCY` : How frequently the cash in advance being paid
- `CASHADVANCETRX` : Number of Transactions made with "Cash in Advanced"
- `PURCHASESTRX` : Number of purchase transactions made
- `CREDITLIMIT` : Limit of Credit Card for user
- `PAYMENTS` : Amount of Payment done by user
- `MINIMUM_PAYMENTS` : Minimum amount of payments made by user
- `PRCFULLPAYMENT` : Percent of full payment paid by user
- `TENURE` : Tenure of credit card service for user



We will first visualise different aspects of the data and then apply some anomaly detection techniques to it.

### 9.2.1
### Visualisation
[Index](#Index)

<div class="alert alert-block alert-danger">
    <b>Q: Read the data into a pandas dataframe named <code>df</code> and print out the first few lines to have a look at it.</b>
</div>

In [None]:
df =  # TODO

<div class="alert alert-block alert-danger">
    <b>Q: How many rows are there in the data?</b>
</div>

In [None]:
 # TODO

<div class="alert alert-block alert-danger">
    <b>Q: What are the types of the different columns? How many are non-numeric?</b>
</div>

In [None]:
 # TODO

---

Let's try to visualise the data. 

A possible way to visualise data quickly is to use a **boxplot**, which summarises the distribution of values on each dimension. 

Run the code below to generate boxplots. (Assumes that your dataframe is called 'df'.)

In [None]:
# Draw a boxplot of the dataframe's columns on a single set of axes.
# The trailing ';' keeps the returned object's repr out of the cell output.
df.boxplot();

That plot is a bit too small and hard to read! Let's make it a bit bigger and rotate the labels ....

In [None]:
# Same boxplot on a larger canvas, with the column names turned vertical so
# that they do not overlap.
ax = df.boxplot(figsize=(12, 8))
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(90)

The different variables have different scales, so it doesn't make much sense to plot them all together.

<div class="alert alert-block alert-danger">
    <b>Q: Plot the boxplot separately for columns with comparable scales (e.g. remove the _FREQUENCY, _TRX, PRC_FULL_PAYMENT and TENURE columns and plot them separately).</b>
</div>

In [None]:
 # TODO

About the boxplots: 
- The horizontal line within the box is the median value.
- The box in the boxplot shows the quartiles (the 25% and 75% percentiles) of the data, meaning that 50% of the data lies within the range of the box. 
- The whiskers extend up to 1.5 times the interquartile range (the height of the box) above/below the top/bottom of the box.
- Points outside that range are often outliers and are shown on the plot as circles.

Note: To produce a more comparable plot you could also use the `StandardScaler` class to standardise the data on each dimension. We are going to use it later in the notebook.

<div class="alert alert-block alert-danger">
    <b>Q: Compare a boxplot with a histogram for the same feature, e.g. the PURCHASES column.</b>
</div>

In [None]:
 # TODO

In [None]:
 # TODO

### 9.2.2
### Anomaly Detection
[Index](#Index)

We'll now try to run the four types of anomaly detection system on this dataset.

Before you can do that you'll need to do some preprocessing of the data to remove non-numeric columns and any rows containing null ('NaN') values.

<div class="alert alert-block alert-danger">
    <b>Q: First of all, drop the non-numeric column.</b>
</div>

In [None]:
df =  # TODO

Unknown values are often coded as NaN in pandas dataframes. Run the following code to see if there are any in the table:

In [None]:
# Number of missing (NaN) values in each column: any column with a non-zero
# count contains NaNs.
df.isna().sum()

<div class="alert alert-block alert-danger">
    <b>Q: Which columns contain NaN values?</b>
</div>

<div class="alert alert-block alert-success">
ANS
</div>

You can use the following code to show all lines containing a NaN value for the column called 'COLUMN_NAME': 

In [None]:
# df[df['COLUMN_NAME'].isnull()]

In [None]:
 # TODO

<div class="alert alert-block alert-danger">
    <b>Q: Remove all rows containing NaN numbers or replace them with 'default' values.</b>
</div>

In [None]:
 # TODO

<div class="alert alert-block alert-danger">
    <b>Q: Run the Elliptic Envelope (Robust Covariance) anomaly detection routine on the remaining data setting the contamination parameter to 0.01</b>
</div>

Run the Elliptic Envelope (Robust Covariance) anomaly detection routine on the remaining data, setting the contamination parameter to 0.01.

In [None]:
outlier_fraction =  # TODO
y_pred =  # TODO

There is a warning, telling us that it should not happen.

It might have to do with either
- scaling of the data
- some bad columns distribution

I have performed the analysis and observed that the `'ONEOFF_PURCHASES'` causes the issue, thus I will just drop it.

In [None]:
# Remove the column reported above as the cause of the fitting warning.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop would raise a KeyError.
df_noNaN = df_noNaN.drop(columns='ONEOFF_PURCHASES', errors='ignore')

# Refit on the reduced data; a fixed random_state makes the predicted labels
# (-1 = anomaly, as stated in the exercise below) reproducible between runs.
y_pred = EllipticEnvelope(contamination=outlier_fraction, random_state=RANDOM_SEED).fit_predict(df_noNaN)

<div class="alert alert-block alert-danger">
    <b>Q: How many anomalies does it find?</b>
</div>

To find out how many anomalies it's found, we can print the count of the different values of y_pred. 

In [None]:
 # TODO

<div class="alert alert-block alert-danger">
    <b>Q: The predicted anomalies have a value of -1. Print them out.</b>
</div>

In [None]:
 # TODO

<div class="alert alert-block alert-danger">
    <b>Q: Visualise using boxplots the anomalous and non-anomalous datasets separately. Do the distributions look different?</b>
</div>

In [None]:
 # TODO

You can note that the values are generally much smaller for the non-anomalies...

<div class="alert alert-block alert-danger">
    <b>Q: Plot the features BALANCE and PURCHASES against each other using a 2 dimensional scatter plot and color the dots differently based on whether they are considered anomalies or not. What do you conclude? </b>
</div>

Note: you will probably need to set the "alpha" parameter controlling transparency to 0.3

In [None]:
 # TODO

<div class="alert alert-block alert-danger">
    <b>Q: Repeat the analysis (i.e. generate and plot the predictions) with the other 3 anomaly detection techniques: One-class SVM, Isolation Forest and Local Outlier Factor. How well do they do?</b>
</div>

In [None]:
 # TODO

One-class SVM and LOF will be sensitive to scaling of the data, so we can use the `StandardScaler` to rescale the features so that they each have unit variance.
We can then repeat the anomaly detection after scaling and look for any change.

<div class="alert alert-block alert-danger">
    <b>Q: Rescale the data using the StandardScaler.</b>
</div>

In [None]:
df_noNaN_scaled =  # TODO

<div class="alert alert-block alert-danger">
    <b>Q: Generate and plot the predictions.</b>
</div>

In [None]:
 # TODO

Note: to perform the analysis, you could also plot the points labeled as outliers separately from the others.

---