In [2]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import statsmodels.api as sm
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

from cs109.dataloader import build_dataset

%matplotlib inline

### Load the dataset 

In [3]:
# Build the working DataFrame with the project-local cs109.dataloader helper.
# NOTE(review): the sample displayed below suggests a date-indexed frame of
# MSFT price, fundamentals, macro series and SEC filing-type indicator
# columns -- confirm against build_dataset().
msft_data = build_dataset()

In [12]:
# Peek at 10 randomly chosen rows. No random_state is passed, so the rows
# displayed change on every run.
msft_data.sample(n = 10)

Unnamed: 0,Adj Close,Diluted EPS,Gross Margin,Operating Income,Revenue,SP500,FEDFUNDS,DGS10,Acquisition,Investment,...,SC 13D,SC 13D/A,SC 13G,SC 13G/A,SC TO-C,SC TO-I,SC TO-I/A,SC TO-T,SC TO-T/A,SD
2013-12-23,33.772603,0.62,13384.0,6334.0,18529.0,1827.99,0.09,6.22,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010-11-15,22.082167,0.62,13056.0,7116.0,16195.0,1197.75,0.19,6.22,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996-10-01,5.607073,0.06,1923.0,853.0,2405.0,689.08,5.24,6.22,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2014-05-02,36.878373,0.68,14425.0,6974.0,20403.0,1881.14,0.09,6.22,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-06-24,43.834608,0.61,14568.0,6594.0,21729.0,2108.58,0.13,6.22,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997-02-07,8.519356,0.07,2250.0,1035.0,2808.0,789.56,5.19,6.22,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010-11-15,22.082167,0.62,13056.0,7116.0,16195.0,1197.75,0.19,6.22,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2003-06-11,16.947894,0.2,6561.0,2744.0,7835.0,997.48,1.22,6.22,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2006-06-20,17.61325,0.29,8872.0,3888.0,10900.0,1240.12,4.99,6.22,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009-09-21,20.924422,0.34,10513.0,3987.0,13099.0,1064.66,0.15,6.22,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Investigate Model Performance

- try different outlier fractions
- try different variables (first difference, difference between price and moving averages etc)
- try different model parameters 


In [None]:
def show_outliers(df, data_column, outliers_mask, figsize=(12, 6), title = None):
    """Plot one column of ``df`` and mark the rows flagged as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the series to draw; its index is used for the x axis.
    data_column : str
        Name of the column to plot.
    outliers_mask : boolean array-like
        One entry per row of ``df``; True marks a row to highlight.
    figsize : tuple, optional
        Figure size passed to matplotlib.
    title : str or None, optional
        Title of the plot.

    Returns
    -------
    matplotlib.axes.Axes
        The axes holding the line plot and the red outlier markers.
    """
    fig, ax = plt.subplots(figsize=figsize)
    df[data_column].plot(title = title, ax = ax)
    # One .loc selection instead of the chained df[mask][column] lookup.
    outliers = df.loc[outliers_mask, data_column]
    # Skip the overlay when nothing was flagged: some pandas versions raise
    # a TypeError when asked to plot an empty selection.
    if not outliers.empty:
        outliers.plot(linestyle = '', marker = 'o', color = 'r', markersize = 2, ax = ax)
    return ax

In [None]:
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# .values yields the same NumPy array and works on old and new pandas.
X_train = msft_data.values
# Fit the scaler once and keep it so the same transform can be reused later.
scaler = StandardScaler().fit(X_train)
# Standardised feature matrix used by the outlier detectors below.
X = scaler.transform(X_train)

In [None]:
# Sweep the requested outlier fraction and record, for each value, how many
# rows of X a One-Class SVM labels as outliers (prediction == -1).
out_num = []
for outliers_fraction in np.linspace(0.001, 0.1):
    nu = 0.95 * outliers_fraction + 0.05
    clf = OneClassSVM(nu = nu, kernel = "rbf", gamma = 0.1)
    clf.fit(X)
    p = clf.predict(X)
    flagged = (p == -1)
    out_num.append(flagged.sum())

In [None]:
# Detected-outlier count versus the requested outlier fraction for the
# One-Class SVM sweep above. Labels are set so the figure stands alone.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_xlabel('outliers_fraction')
ax.set_ylabel('number of detected outliers')
ax.set_title('One Class SVM: detected outliers vs. outlier fraction')
ax.plot(np.linspace(0.001, 0.1), out_num)

In [None]:
# Sweep the contamination parameter and record, for each value, how many
# rows of X the Isolation Forest labels as outliers (prediction == -1).
out_num = []
for outliers_fraction in np.linspace(0.001, 0.1):
    # Isolation Forest builds randomised trees; a fixed random_state makes
    # the counts reproducible across runs.
    clf = IsolationForest(contamination = outliers_fraction, random_state = 0).fit(X)
    p = clf.predict(X)
    # number of outliers detected
    out_num.append(np.sum(p == -1))

In [None]:
# Detected-outlier count versus the contamination value for the
# Isolation Forest sweep above. Labels are set so the figure stands alone.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_xlabel('outliers_fraction (contamination)')
ax.set_ylabel('number of detected outliers')
ax.set_title('Isolation Forest: detected outliers vs. outlier fraction')
ax.plot(np.linspace(0.001, 0.1), out_num)

In [None]:
# Flag about 1% of the rows with an Isolation Forest and highlight them on
# the adjusted close price.
outliers_fraction = 0.01
# Fixed random_state so the highlighted points do not change between runs.
clf = IsolationForest(contamination = outliers_fraction, random_state = 0).fit(X)
p = clf.predict(X)
# predict() returns -1 for outliers and 1 for inliers.
outliers_mask = p == -1
show_outliers(msft_data, 'Adj Close', outliers_mask, title = 'outliers by Isolation Forest')

In [None]:
# Fit a One-Class SVM and highlight the rows it flags on the adjusted close price.
outliers_fraction = 0.001
# nu is the SVM's bound on the fraction of training errors; with the 0.05
# offset it is about 0.051 here even though outliers_fraction is 0.001.
clf = OneClassSVM(nu = 0.95 * outliers_fraction + 0.05, kernel = "rbf", gamma = 0.5).fit(X)
p = clf.predict(X)
# predict() returns -1 for outliers and 1 for inliers.
outliers_mask = p == -1
# print() with a single argument behaves the same on Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print(np.sum(outliers_mask))
show_outliers(msft_data, 'Adj Close', outliers_mask, title = 'outliers by One Class SVM')