In [1]:
#Load and summarize the dataset
from pandas import read_csv
from sklearn.model_selection import train_test_split

# load the dataset
df=read_csv('housing_1.csv')
#retrieve the array
data=df.values
# split into input and output elements
X,y=data[:, :-1],data[: ,-1]
#summarize the shape of the dataset
print(X.shape,y.shape)
#split into train and test sets
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.33, random_state=1)
#summarize the shape of the train and test sets
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(505, 13) (505,)
(338, 13) (167, 13) (338,) (167,)


In [2]:
df.isnull().sum()

0.00632    0
18.00      0
2.310      0
0          0
0.5380     0
6.5750     0
65.20      0
4.0900     0
1          0
296.0      0
15.30      0
396.90     0
4.98       0
24.00      0
dtype: int64

# Baseline Model Performance

In [3]:
#evaluate model on the raw dataset
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_absolute_error
#load dataset
df=read_csv('housing_1.csv')
#retrieve the array
data=df.values
# split into input and output elements
X,y=data[:, :-1],data[: ,-1]
#split into train and test sets
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.33, random_state=1)
#fit the model
model=LinearRegression()
model.fit(X_train,y_train)
#evaluate the model
yhat=model.predict(X_test)
#evaluate predictions
mae=mean_absolute_error(y_test,yhat)
print('MAE: %.3f' %mae)

MAE: 3.656


# Automatic Outlier Detection
# 1. Isolation Forest

In [4]:
#evaluate model performance with outliers removed using isolation forest
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error

#load dataset
df=read_csv('housing_1.csv')

#retrieve the array
data=df.values
# split into input and output elements
X,y=data[:, :-1],data[: ,-1]

#split into train and test sets
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.33, random_state=1)

#summarize the shape of the training dataset
print(X_train.shape,y_train.shape)

#identify outliers in the training dataset
iso=IsolationForest(contamination=0.1)
yhat=iso.fit_predict(X_train)

#select all rows that are not outliers
mask=yhat!=-1
X_train,y_train=X_train[mask,:],y_train[mask]

#summarise the shape of the updated training dataset
print(X_train.shape,y_train.shape)

#fit the model
model=LinearRegression()
model.fit(X_train,y_train)

#evaluate the model
yhat=model.predict(X_test)

#evaluate predictions
mae=mean_absolute_error(y_test,yhat)
print('MAE: %.3f' %mae)


(338, 13) (338,)
(304, 13) (304,)
MAE: 3.483


# 2. minimum covariance determinant


In [5]:
#evaluate model performance with outliers removed using elliptical envelope
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import mean_absolute_error

#load the dataset
df=read_csv('housing_1.csv')

#retrieve the array
data=df.values

# split into input and output elements
X,y=data[:, :-1],data[: ,-1]

#split into train and test sets
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.33, random_state=1)

#summarize the shape of the training dataset
print(X_train.shape,y_train.shape)

#identify outliers in the training dataset
ee=EllipticEnvelope(contamination=0.01)
yhat=ee.fit_predict(X_train)

#select all rows that are not outliers
mask=yhat!=-1
X_train,y_train=X_train[mask,:],y_train[mask]

#summarise the shape of the updated training dataset
print(X_train.shape,y_train.shape)

#fit the model
model=LinearRegression()
model.fit(X_train,y_train)

#evaluate the model
yhat=model.predict(X_test)

#evaluate predictions
mae=mean_absolute_error(y_test,yhat)
print('MAE: %.3f' %mae)

(338, 13) (338,)
(334, 13) (334,)
MAE: 3.652


# 3.Local outlier factor

In [6]:
#evaluate model performance with outliers removed using local outlier factor
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_absolute_error

#load the dataset
df=read_csv('housing_1.csv')

#retrieve the array
data=df.values

# split into input and output elements
X,y=data[:, :-1],data[: ,-1]

#split into train and test sets
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.33, random_state=1)

#summarize the shape of the training dataset
print(X_train.shape,y_train.shape)

#identify outliers in the training dataset
lof=LocalOutlierFactor()
yhat=lof.fit_predict(X_train)

#select all rows that are not outliers
mask=yhat!=-1
X_train,y_train=X_train[mask,:],y_train[mask]

#summarise the shape of the updated training dataset
print(X_train.shape,y_train.shape)

#fit the model
model=LinearRegression()
model.fit(X_train,y_train)

#evaluate the model
yhat=model.predict(X_test)

#evaluate predictions
mae=mean_absolute_error(y_test,yhat)
print('MAE: %.3f' %mae)

(338, 13) (338,)
(291, 13) (291,)
MAE: 3.590


# 4. one class svm

In [7]:
#evaluate model performance with outliers removed using one class svm
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.svm import OneClassSVM
from sklearn.metrics import mean_absolute_error

#load the dataset
df=read_csv('housing_1.csv')

#retrieve the array
data=df.values

# split into input and output elements
X,y=data[:, :-1],data[: ,-1]

#split into train and test sets
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.33, random_state=1)

#summarize the shape of the training dataset
print(X_train.shape,y_train.shape)

#identify outliers in the training dataset
ee=OneClassSVM(nu=0.01)
yhat=ee.fit_predict(X_train)

#select all rows that are not outliers
mask=yhat!=-1
X_train,y_train=X_train[mask,:],y_train[mask]

#summarise the shape of the updated training dataset
print(X_train.shape,y_train.shape)

#fit the model
model=LinearRegression()
model.fit(X_train,y_train)

#evaluate the model
yhat=model.predict(X_test)

#evaluate predictions
mae=mean_absolute_error(y_test,yhat)
print('MAE: %.3f' %mae)

(338, 13) (338,)
(333, 13) (333,)
MAE: 3.682
