In [1]:
from numpy.random import seed
from numpy.random import randn
from numpy import mean
from numpy import std
# seed the random number generator
seed(1)
# generate univariate observations
data = 5 * randn(10000) + 50
# calculate summary statistics
data_mean, data_std = mean(data), std(data)
# define outliers
cut_off = data_std * 3
lower, upper = data_mean - cut_off, data_mean + cut_off
# identify outliers
outliers = [x for x in data if x < lower or x > upper]
print('Identified outliers: %d' % len(outliers))
# remove outliers
outliers_removed = [x for x in data if x >= lower and x <= upper]
print('Non-outlier observations: %d' % len(outliers_removed))

Identified outliers: 29
Non-outlier observations: 9971


In [2]:
# identify outliers with interquartile range
from numpy.random import seed
from numpy.random import randn
from numpy import percentile
# seed the random number generator
seed(1)
# generate univariate observations
data = 5 * randn(10000) + 50
# calculate interquartile range
q25, q75 = percentile(data, 25), percentile(data, 75)
iqr = q75 - q25
print('Percentiles: 25th=%.3f, 75th=%.3f, IQR=%.3f' % (q25, q75, iqr))
# calculate the outlier cutoff
cut_off = iqr * 1.5
lower, upper = q25 - cut_off, q75 + cut_off
# identify outliers
outliers = [x for x in data if x < lower or x > upper]
print('Identified outliers: %d' % len(outliers))
# remove outliers
outliers_removed = [x for x in data if x >= lower and x <= upper]
print('Non-outlier observations: %d' % len(outliers_removed))

Percentiles: 25th=46.685, 75th=53.359, IQR=6.674
Identified outliers: 81
Non-outlier observations: 9919


In [3]:
# load and summarize the dataset
from pandas import read_csv
from sklearn.model_selection import train_test_split
# load the dataset
df = read_csv('HousingData.csv', header=None)
# retrieve the array
data = df.values
# split into input and output elements
X, y = data[:, :-1], data[:, -1]
# summarize the shape of the dataset
print(X.shape, y.shape)
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# summarize the shape of the train and test sets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(507, 13) (507,)
(339, 13) (168, 13) (339,) (168,)


In [27]:
# evaluate model on the raw dataset
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
# load the dataset
dx = read_csv('housing.csv', header=None)
# retrieve the array

In [29]:
#evaluate model on training dataset with outliers removed
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_absolute_error
# load the dataset
df = read_csv('housing.csv', header=None)
# retrieve the array
data = df.values
# split into input and output elements
X, y = data[:, :-1], data[:, -1]
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
# identify outliers in the training dataset
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)

(328, 3) (328,)


ValueError: could not convert string to float: 'RM'

In [9]:
data

array([['CRIM', 'ZN', 'INDUS', ..., 'B', 'LSTAT', 'MEDV'],
       ['0.00632', '18', '2.31', ..., '396.9', '4.98', '24'],
       ['0.02731', '0', '7.07', ..., '396.9', '9.14', '21.6'],
       ...,
       ['0.06076', '0', '11.93', ..., '396.9', '5.64', '23.9'],
       ['0.10959', '0', '11.93', ..., '393.45', '6.48', '22'],
       ['0.04741', '0', '11.93', ..., '396.9', '7.88', '11.9']],
      dtype=object)

In [13]:
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24
2,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
3,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
4,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2
6,0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
7,0.08829,12.5,7.87,,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
8,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
9,0.21124,12.5,7.87,0,0.524,5.631,100,6.0821,5,311,15.2,386.63,29.93,16.5


In [16]:
df[10:30]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
10,0.17004,12.5,7.87,,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9
11,0.22489,12.5,7.87,0.0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45,15.0
12,0.11747,12.5,7.87,0.0,0.524,6.009,82.9,6.2267,5,311,15.2,396.9,13.27,18.9
13,0.09378,12.5,7.87,0.0,0.524,5.889,39.0,5.4509,5,311,15.2,390.5,15.71,21.7
14,0.62976,0.0,8.14,0.0,0.538,5.949,61.8,4.7075,4,307,21.0,396.9,8.26,20.4
15,0.63796,0.0,8.14,,0.538,6.096,84.5,4.4619,4,307,21.0,380.02,10.26,18.2
16,0.62739,0.0,8.14,0.0,0.538,5.834,56.5,4.4986,4,307,21.0,395.62,8.47,19.9
17,1.05393,0.0,8.14,0.0,0.538,5.935,29.3,4.4986,4,307,21.0,386.85,6.58,23.1
18,0.7842,0.0,8.14,0.0,0.538,5.99,81.7,4.2579,4,307,21.0,386.75,14.67,17.5
19,0.80271,0.0,8.14,0.0,0.538,5.456,36.6,3.7965,4,307,21.0,288.99,11.69,20.2


In [17]:
df[30:50]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
30,1.00245,0,8.14,0.0,0.538,6.674,87.3,4.239,4,307,21.0,380.23,11.98,21.0
31,1.13081,0,8.14,0.0,0.538,5.713,94.1,4.233,4,307,21.0,360.17,22.6,12.7
32,1.35472,0,8.14,0.0,0.538,6.072,100.0,4.175,4,307,21.0,376.73,13.04,14.5
33,1.38799,0,8.14,0.0,0.538,5.95,82.0,3.99,4,307,21.0,232.6,27.71,13.2
34,1.15172,0,8.14,0.0,0.538,5.701,95.0,3.7872,4,307,21.0,358.77,18.35,13.1
35,1.61282,0,8.14,0.0,0.538,6.096,96.9,3.7598,4,307,21.0,248.31,20.34,13.5
36,0.06417,0,5.96,0.0,0.499,5.933,68.2,3.3603,5,279,19.2,396.9,,18.9
37,0.09744,0,,0.0,0.499,5.841,61.4,3.3779,5,279,19.2,377.56,11.41,20.0
38,0.08014,0,5.96,0.0,0.499,5.85,41.5,3.9342,5,279,19.2,396.9,8.77,21.0
39,0.17505,0,5.96,0.0,0.499,5.966,30.2,3.8473,5,279,19.2,393.43,10.13,24.7


In [18]:
df[50:70]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
50,0.21977,0.0,6.91,0,0.448,5.602,62.0,6.0877,3,233,17.9,396.9,16.2,19.4
51,0.08873,21.0,5.64,0,0.439,5.963,45.7,6.8147,4,243,16.8,395.56,13.45,19.7
52,0.04337,21.0,,0,0.439,6.115,63.0,6.8147,4,243,16.8,393.97,9.43,20.5
53,0.0536,21.0,5.64,0,0.439,6.511,21.1,6.8147,4,243,16.8,396.9,5.28,25.0
54,,21.0,5.64,0,0.439,5.998,21.4,6.8147,4,243,16.8,396.9,8.43,23.4
55,0.0136,75.0,4.0,0,0.41,5.888,47.6,7.3197,3,469,21.1,396.9,14.8,18.9
56,0.01311,90.0,1.22,0,0.403,7.249,21.9,8.6966,5,226,17.9,395.93,4.81,35.4
57,0.02055,85.0,0.74,0,0.41,6.383,35.7,9.1876,2,313,17.3,396.9,5.77,24.7
58,0.01432,100.0,1.32,0,0.411,6.816,40.5,8.3248,5,256,15.1,392.9,3.95,31.6
59,0.15445,25.0,5.13,0,0.453,6.145,29.2,7.8148,8,284,19.7,390.68,6.86,23.3


In [26]:
df.iloc[54]

0        NaN
1         21
2       5.64
3          0
4      0.439
5      5.998
6       21.4
7     6.8147
8          4
9        243
10      16.8
11     396.9
12      8.43
13      23.4
Name: 54, dtype: object

In [28]:
dx[50:70]

Unnamed: 0,0,1,2,3
50,5.602,16.2,17.9,407400.0
51,5.963,13.45,16.8,413700.0
52,6.115,9.43,16.8,430500.0
53,6.511,5.28,16.8,525000.0
54,5.998,8.43,16.8,491400.0
55,5.888,14.8,21.1,396900.0
56,7.249,4.81,17.9,743400.0
57,6.383,5.77,17.3,518700.0
58,6.816,3.95,15.1,663600.0
59,6.145,6.86,19.7,489300.0


In [30]:
dx[1:30]

Unnamed: 0,0,1,2,3
1,6.575,4.98,15.3,504000.0
2,6.421,9.14,17.8,453600.0
3,7.185,4.03,17.8,728700.0
4,6.998,2.94,18.7,701400.0
5,7.147,5.33,18.7,760200.0
6,6.43,5.21,18.7,602700.0
7,6.012,12.43,15.2,480900.0
8,6.172,19.15,15.2,569100.0
9,5.631,29.93,15.2,346500.0
10,6.004,17.1,15.2,396900.0


In [34]:
dx.isnull().sum()

0    0
1    0
2    0
3    0
dtype: int64

In [35]:
df.isnull().sum() # checking out null values 

0    0
1    0
2    0
3    0
dtype: int64

In [36]:
dx.dropna(axis=0)

#dropping nan values based on rows ; all rows with a nan values are dropped

Unnamed: 0,0,1,2,3
0,RM,LSTAT,PTRATIO,MEDV
1,6.575,4.98,15.3,504000.0
2,6.421,9.14,17.8,453600.0
3,7.185,4.03,17.8,728700.0
4,6.998,2.94,18.7,701400.0
...,...,...,...,...
485,6.593,9.67,21.0,470400.0
486,6.12,9.08,21.0,432600.0
487,6.976,5.64,21.0,501900.0
488,6.794,6.48,21.0,462000.0


In [37]:
#evaluate model on training dataset with outliers removed
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_absolute_error
# load the dataset
#df = read_csv('housing.csv', header=None)
# retrieve the array
data = dx.values
# split into input and output elements
X, y = data[:, :-1], data[:, -1]
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
# identify outliers in the training dataset
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)
#the error below is cause of row 0 which includes strings

(328, 3) (328,)


ValueError: could not convert string to float: 'RM'

In [39]:
dx.isna().sum()
#checking out the presence of nan values on tthe dataset so as to drop them

0    0
1    0
2    0
3    0
dtype: int64

In [43]:
dx[1:60]

Unnamed: 0,0,1,2,3
1,6.575,4.98,15.3,504000.0
2,6.421,9.14,17.8,453600.0
3,7.185,4.03,17.8,728700.0
4,6.998,2.94,18.7,701400.0
5,7.147,5.33,18.7,760200.0
6,6.43,5.21,18.7,602700.0
7,6.012,12.43,15.2,480900.0
8,6.172,19.15,15.2,569100.0
9,5.631,29.93,15.2,346500.0
10,6.004,17.1,15.2,396900.0


In [45]:
dx.head()

Unnamed: 0,0,1,2,3
0,RM,LSTAT,PTRATIO,MEDV
1,6.575,4.98,15.3,504000.0
2,6.421,9.14,17.8,453600.0
3,7.185,4.03,17.8,728700.0
4,6.998,2.94,18.7,701400.0


In [51]:
nudx=dx.drop([0])
#dropping a row

In [52]:
nudx

Unnamed: 0,0,1,2,3
1,6.575,4.98,15.3,504000.0
2,6.421,9.14,17.8,453600.0
3,7.185,4.03,17.8,728700.0
4,6.998,2.94,18.7,701400.0
5,7.147,5.33,18.7,760200.0
...,...,...,...,...
485,6.593,9.67,21.0,470400.0
486,6.12,9.08,21.0,432600.0
487,6.976,5.64,21.0,501900.0
488,6.794,6.48,21.0,462000.0


In [55]:
#evaluate model on training dataset with outliers removed
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_absolute_error
# load the dataset
#df = read_csv('housing.csv', header=None)
# retrieve the array
data = nudx.values
# split into input and output elements
X, y = data[:, :-1], data[:, -1]
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
# identify outliers in the training dataset
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)
#cannot understand why the value of MAE value is so large ???????? Why?

(327, 3) (327,)
(315, 3) (315,)
MAE: 64126.177


In [81]:
df.rename(columns=df.iloc[0],inplace =True)
 
'''#To rename the header without reassign df:

df.rename(columns=df.iloc[0], inplace = True)
To drop the row without reassign df:

df.drop(df.index[0], inplace = True)'''

'#To rename the header without reassign df:\n\ndf.rename(columns=df.iloc[0], inplace = True)\nTo drop the row without reassign df:\n\ndf.drop(df.index[0], inplace = True)'

In [65]:
#po=df.drop([0],inplace = True)

In [82]:
df.head()

Unnamed: 0,6.575,4.98,15.3,504000.0
1,6.575,4.98,15.3,504000.0
2,6.421,9.14,17.8,453600.0
3,7.185,4.03,17.8,728700.0
4,6.998,2.94,18.7,701400.0
5,7.147,5.33,18.7,760200.0


In [75]:
#po.head()
df.dropna(axis=0)
df.head()

Unnamed: 0,6.575,4.98,15.3,504000.0
1,6.575,4.98,15.3,504000.0
2,6.421,9.14,17.8,453600.0
3,7.185,4.03,17.8,728700.0
4,6.998,2.94,18.7,701400.0
5,7.147,5.33,18.7,760200.0


In [85]:
'''from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_absolute_error
# load the dataset'''
df = read_csv('HousingData.csv', header=None)
'''# retrieve the array
data = df.values
# split into input and output elements
X, y = data[:, :-1], data[:, -1]
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
# identify outliers in the training dataset
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)
#cannot understand why the value of MAE value is so large ???????? Why?'''

"# retrieve the array\ndata = df.values\n# split into input and output elements\nX, y = data[:, :-1], data[:, -1]\n# split into train and test sets\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)\n# summarize the shape of the training dataset\nprint(X_train.shape, y_train.shape)\n# identify outliers in the training dataset\nlof = LocalOutlierFactor()\nyhat = lof.fit_predict(X_train)\n# select all rows that are not outliers\nmask = yhat != -1\nX_train, y_train = X_train[mask, :], y_train[mask]\n# summarize the shape of the updated training dataset\nprint(X_train.shape, y_train.shape)\n# fit the model\nmodel = LinearRegression()\nmodel.fit(X_train, y_train)\n# evaluate the model\nyhat = model.predict(X_test)\n# evaluate predictions\nmae = mean_absolute_error(y_test, yhat)\nprint('MAE: %.3f' % mae)\n#cannot understand why the value of MAE value is so large ???????? Why?"

In [86]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24
2,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
3,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
4,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502,0.06263,0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21,391.99,,22.4
503,0.04527,0,11.93,0,0.573,6.12,76.7,2.2875,1,273,21,396.9,9.08,20.6
504,0.06076,0,11.93,0,0.573,6.976,91,2.1675,1,273,21,396.9,5.64,23.9
505,0.10959,0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21,393.45,6.48,22


In [88]:
data=df.values

In [89]:
data

array([['CRIM', 'ZN', 'INDUS', ..., 'B', 'LSTAT', 'MEDV'],
       ['0.00632', '18', '2.31', ..., '396.9', '4.98', '24'],
       ['0.02731', '0', '7.07', ..., '396.9', '9.14', '21.6'],
       ...,
       ['0.06076', '0', '11.93', ..., '396.9', '5.64', '23.9'],
       ['0.10959', '0', '11.93', ..., '393.45', '6.48', '22'],
       ['0.04741', '0', '11.93', ..., '396.9', '7.88', '11.9']],
      dtype=object)

In [92]:
df.rename(columns=df.iloc[0],inplace =True)

In [96]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24
2,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
3,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
4,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4


In [97]:
df.drop([0])

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24
2,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
3,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
4,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502,0.06263,0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21,391.99,,22.4
503,0.04527,0,11.93,0,0.573,6.12,76.7,2.2875,1,273,21,396.9,9.08,20.6
504,0.06076,0,11.93,0,0.573,6.976,91,2.1675,1,273,21,396.9,5.64,23.9
505,0.10959,0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21,393.45,6.48,22


In [98]:
df.dropna(axis=0)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24
2,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
3,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
4,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,0.17783,0,9.69,0,0.585,5.569,73.5,2.3999,6,391,19.2,395.77,15.1,17.5
501,0.22438,0,9.69,0,0.585,6.027,79.7,2.4982,6,391,19.2,396.9,14.33,16.8
503,0.04527,0,11.93,0,0.573,6.12,76.7,2.2875,1,273,21,396.9,9.08,20.6
504,0.06076,0,11.93,0,0.573,6.976,91,2.1675,1,273,21,396.9,5.64,23.9


In [100]:
dx=df.drop([0])

In [101]:
dx

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24
2,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
3,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
4,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502,0.06263,0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21,391.99,,22.4
503,0.04527,0,11.93,0,0.573,6.12,76.7,2.2875,1,273,21,396.9,9.08,20.6
504,0.06076,0,11.93,0,0.573,6.976,91,2.1675,1,273,21,396.9,5.64,23.9
505,0.10959,0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21,393.45,6.48,22


In [105]:
dm=dx.dropna(axis=0)

In [106]:
#evaluate model on training dataset with outliers removed
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import IsolationForest
# load the dataset
#df = read_csv('housing.csv', header=None)
# retrieve the array
data = dm.values
# split into input and output elements
X, y = data[:, :-1], data[:, -1]
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
# identify outliers in the training dataset
lof = LocalOutlierFactor()
#iso=IsolationForest()
yhat = lof.fit_predict(X_train)
#iso2=iso(X_train)
# select all rows that are not outliers
mask = yhat != -1
#mask1=iso2 !=-1
X_train, y_train = X_train[mask, :], y_train[mask]

# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)
#cannot understand why the value of MAE value is so large ???????? Why?
#finally fixed the issues after wrangling with the data for 3 hours :)

(263, 13) (263,)
(230, 13) (230,)
MAE: 3.542


In [107]:
dm

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
1,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24
2,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
3,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
4,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
6,0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,0.17783,0,9.69,0,0.585,5.569,73.5,2.3999,6,391,19.2,395.77,15.1,17.5
501,0.22438,0,9.69,0,0.585,6.027,79.7,2.4982,6,391,19.2,396.9,14.33,16.8
503,0.04527,0,11.93,0,0.573,6.12,76.7,2.2875,1,273,21,396.9,9.08,20.6
504,0.06076,0,11.93,0,0.573,6.976,91,2.1675,1,273,21,396.9,5.64,23.9


In [108]:
#evaluate model on training dataset with outliers removed
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import IsolationForest
# load the dataset
#df = read_csv('housing.csv', header=None)
# retrieve the array
data = dm.values
# split into input and output elements
X, y = data[:, :-1], data[:, -1]
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
# identify outliers in the training dataset
#lof = LocalOutlierFactor()
iso=IsolationForest()
#yhat = lof.fit_predict(X_train)
iso2=iso(X_train)
# select all rows that are not outliers
#mask = yhat != -1
mask1=iso2 !=-1
X_train, y_train = X_train[mask1, :], y_train[mask1]

# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)
#cannot understand why the value of MAE value is so large ???????? Why?
#finally fixed the issues after wrangling with the data for 3 hours :)

(263, 13) (263,)


TypeError: 'IsolationForest' object is not callable

In [109]:
'''Make sure to learn how to implement these otlier detection algorithms'''

'Make sure to learn how to implement these otlier detection algorithms'