<a href="https://colab.research.google.com/github/narutsoo/tutorial/blob/master/Auto_Outlier_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Topic : 4 Automatic Outlier Detection Algorithms in Python

*Reference*

* https://machinelearningmastery.com/model-based-outlier-detection-and-removal-in-python/


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Download the housing CSV. header=None tells pandas that the first row is
# data rather than column names, so columns get integer labels.
df = pd.read_csv(
    'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv',
    header=None,
)

In [3]:
# Convert the DataFrame into a plain 2-D NumPy array for slicing below.
data = df.to_numpy()

Define the input (`X`) and output (`y`) variables, and print their shapes.

In [4]:
# Every column except the last is an input feature; the last column is the target.
X, y = data[:, :-1], data[:, -1]

print(X.shape, y.shape)

(506, 13) (506,)


Split the data into training and test sets, and print their shapes.

In [5]:
# Hold out 33% of the rows for testing; random_state=1 fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(339, 13) (167, 13) (339,) (167,)


---
## Baseline Model Performance
---

In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [7]:
# Download the housing CSV. header=None tells pandas that the first row is
# data rather than column names, so columns get integer labels.
df = pd.read_csv(
    'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv',
    header=None,
)

In [8]:
# Convert the DataFrame into a plain 2-D NumPy array for slicing below.
data = df.to_numpy()

In [9]:
# Every column except the last is an input feature; the last column is the target.
X, y = data[:, :-1], data[:, -1]

In [10]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)

# Baseline: ordinary least squares fitted on the full, unfiltered training set.
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows and score with mean absolute error.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("MAE = %.3f" % mae)

MAE = 3.417


---
# Automatic Outlier Detection
---

## 1) Isolation Forest
---

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error

In [12]:
# Download the housing CSV. header=None tells pandas that the first row is
# data rather than column names, so columns get integer labels.
df = pd.read_csv(
    'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv',
    header=None,
)

In [13]:
# Convert the DataFrame into a plain 2-D NumPy array for slicing below.
data = df.to_numpy()

In [14]:
# Every column except the last is an input feature; the last column is the target.
X, y = data[:, :-1], data[:, -1]

In [15]:
# Split first so the outlier detector is fitted on training rows only;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# Identify outliers in the training set. contamination=0.1 sets the expected
# share of outliers to 10%. IsolationForest builds its trees from random
# sub-samples and random splits, so random_state is pinned here; without it
# the set of removed rows (and therefore the MAE printed at the end of this
# cell) changes from run to run.
iso = IsolationForest(contamination=0.1, random_state=1)
yhat = iso.fit_predict(X_train)
# fit_predict labels outliers as -1 and inliers as 1; keep only the inliers.
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
print(X_train.shape, y_train.shape)
# Fit linear regression on the filtered training data.
model = LinearRegression()
model.fit(X_train, y_train)
# Predict on the untouched test set so the score is comparable to the baseline.
y_pred = model.predict(X_test)
# Evaluate with mean absolute error and report it.
mae = mean_absolute_error(y_test, y_pred)
print("MAE = %.3f" % mae)

(339, 13) (167, 13) (339,) (167,)
(305, 13) (305,)
MAE = 3.263


## 2) Minimum Covariance Determinant
---

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.covariance import EllipticEnvelope #**********
from sklearn.metrics import mean_absolute_error

In [17]:
# Download the housing CSV. header=None tells pandas that the first row is
# data rather than column names, so columns get integer labels.
df = pd.read_csv(
    'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv',
    header=None,
)

In [18]:
# Convert the DataFrame into a plain 2-D NumPy array for slicing below.
data = df.to_numpy()

In [19]:
# Every column except the last is an input feature; the last column is the target.
X, y = data[:, :-1], data[:, -1]

In [20]:
# Split first so the outlier detector is fitted on training rows only;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# Identify outliers in the training set with EllipticEnvelope (scikit-learn's
# Minimum Covariance Determinant based detector). contamination=0.01 sets the
# expected share of outliers to 1%. The estimator accepts a random_state for
# its internal random number generator; it is pinned here so the rows that
# get removed are the same on every run.
ee = EllipticEnvelope(contamination=0.01, random_state=1)
yhat = ee.fit_predict(X_train)
# fit_predict labels outliers as -1 and inliers as 1; keep only the inliers.
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
print(X_train.shape, y_train.shape)
# Fit linear regression on the filtered training data.
model = LinearRegression()
model.fit(X_train, y_train)
# Predict on the untouched test set so the score is comparable to the baseline.
y_pred = model.predict(X_test)
# Evaluate with mean absolute error and report it.
mae = mean_absolute_error(y_test, y_pred)
print("MAE = %.3f" % mae)

(339, 13) (167, 13) (339,) (167,)
(335, 13) (335,)
MAE = 3.388


## 3) Local Outlier Factor
---

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor #**********
from sklearn.metrics import mean_absolute_error

In [22]:
# Download the housing CSV. header=None tells pandas that the first row is
# data rather than column names, so columns get integer labels.
df = pd.read_csv(
    'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv',
    header=None,
)

In [23]:
# Convert the DataFrame into a plain 2-D NumPy array for slicing below.
data = df.to_numpy()

In [24]:
# Every column except the last is an input feature; the last column is the target.
X, y = data[:, :-1], data[:, -1]

In [25]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Label each training row with Local Outlier Factor (default settings);
# fit_predict returns -1 for outliers and 1 for inliers.
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)

# Drop the flagged rows from both the features and the targets.
mask = yhat != -1
X_train = X_train[mask]
y_train = y_train[mask]
print(X_train.shape, y_train.shape)

# Fit linear regression on the filtered training data, then predict on the
# untouched test set.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score with mean absolute error and report it.
mae = mean_absolute_error(y_test, y_pred)
print("MAE = %.3f" % mae)

(339, 13) (167, 13) (339,) (167,)
(305, 13) (305,)
MAE = 3.356


## 4) One-Class SVM
---

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import OneClassSVM #**********
from sklearn.metrics import mean_absolute_error

In [27]:
# Download the housing CSV. header=None tells pandas that the first row is
# data rather than column names, so columns get integer labels.
df = pd.read_csv(
    'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv',
    header=None,
)

In [28]:
# Convert the DataFrame into a plain 2-D NumPy array for slicing below.
data = df.to_numpy()

In [29]:
# Every column except the last is an input feature; the last column is the target.
X, y = data[:, :-1], data[:, -1]

In [30]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Label each training row with a One-Class SVM (nu=0.01);
# fit_predict returns -1 for outliers and 1 for inliers.
ocs = OneClassSVM(nu=0.01)
yhat = ocs.fit_predict(X_train)

# Drop the flagged rows from both the features and the targets.
mask = yhat != -1
X_train = X_train[mask]
y_train = y_train[mask]
print(X_train.shape, y_train.shape)

# Fit linear regression on the filtered training data, then predict on the
# untouched test set.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score with mean absolute error and report it.
mae = mean_absolute_error(y_test, y_pred)
print("MAE = %.3f" % mae)

(339, 13) (167, 13) (339,) (167,)
(336, 13) (336,)
MAE = 3.431
