# Outlier Detection in Housing Data

In [1]:
# load and summarize the dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

## Reading the Data

In [2]:
url = 'https://goz39a.s3.eu-central-1.amazonaws.com/housing.csv'
df = pd.read_csv(url, header=None)

In [3]:
data = df.values
X, y = data[:, :-1], data[:, -1]
print(X.shape, y.shape)

(506, 13) (506,)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# summarize the shape of the train and test sets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(379, 13) (127, 13) (379,) (127,)


## Building a Regression Model

In [5]:
model = LinearRegression()
model.fit(X_train, y_train)
y_prediction = model.predict(X_test)
mae = mean_absolute_error(y_test, y_prediction)
print('MAE: %.3f' % mae)

MAE: 3.668


## Isolation Forest
This method makes uses of a collection of (random) trees. This is where the name "Forest" comes from. In each of the tree, the data is split by creating a barrier between the maximum $(M)$ and minimum $(m)$ value on a particular dimension. The split is done sampling a splitting value $\sim U(m,M)$
This process is repeated over and over again untill all the datapoints are in a separate node.
<br>
<br>
Outliers are data-points (observations) that are very quickly separated from the rest of the samples.

In [6]:
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.2)
y_prediction = iso.fit_predict(X_train)

In the example above the contamination parameter is set at 0.2. The 

In [7]:
print(X_train.shape)

(379, 13)


In [8]:
# select all rows that are not outliers
mask = y_prediction != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

(303, 13) (303,)


In [10]:
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
y_prediction = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, y_prediction)
print('MAE: %.3f' % mae)

MAE: 4.210


## MinCovDet 

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.2)
y_prediction = ee.fit_predict(X_train)

In [12]:
mask = y_prediction != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

(271, 13) (271,)


In [13]:
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
y_prediction = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, y_prediction)
print('MAE: %.3f' % mae)

MAE: 4.126


## Local Outlier Factor

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [15]:
# identify outliers in the training dataset
lof = LocalOutlierFactor(contamination=0.2)
y_prediction = lof.fit_predict(X_train)

In [16]:
mask = y_prediction != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

(271, 13) (271,)


In [17]:
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
y_prediction = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, y_prediction)
print('MAE: %.3f' % mae)

MAE: 3.654


## One Class SVM

In [17]:
ee = OneClassSVM(nu=0.2)
y_prediction = ee.fit_predict(X_train)

In [18]:
mask = y_prediction != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

(217, 13) (217,)


In [19]:
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
y_prediction = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, y_prediction)
print('MAE: %.3f' % mae)

MAE: 4.133
