# 1. Load Data

In [3]:
from sklearn.datasets import load_boston
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): this call triggers the ethical-concerns warning captured in the
# cell output, in which the scikit-learn maintainers strongly discourage using
# this dataset and point to fetch_california_housing as an alternative.
# Newer scikit-learn releases reportedly drop load_boston entirely -- confirm
# against the installed version before re-running.
boston = load_boston()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [2]:
import pandas as pd
# Wrap the raw feature matrix in a DataFrame labelled with the dataset's feature names.
df = pd.DataFrame(boston.data, columns=boston.feature_names)

In [None]:
# Display the feature table built from boston.data (complete, before any values are removed).
df

# 2. Remove Data at Random (Introduce Missing Values)

In [6]:
import numpy as np
# Seed NumPy's global random number generator.
np.random.seed(123)

# Dataset dimensions: rows are samples, columns are features.
n_samples, n_features = df.shape

In [7]:
# Dedicated generator with its own fixed seed, separate from the global NumPy state.
rng = np.random.RandomState(0)

# One column index per row, drawn with replacement: the feature to blank out in that row.
missing_feature = rng.choice(n_features, size=n_samples, replace=True)

In [8]:
# Inspect the drawn column indices: entry i is the column that will be set to NaN in row i.
missing_feature

array([12,  5,  0,  3, 11,  3,  7,  9,  3,  5,  2,  4,  7,  6,  8,  8, 12,
       10,  1,  6,  7,  7,  8,  1,  5,  9,  8,  9,  4,  3,  0,  3,  5,  0,
        2,  3,  8,  1,  3,  3,  3,  7,  0,  1,  9,  9,  0, 10,  4,  7,  3,
       11,  2,  7, 12,  2,  0,  0,  4,  5,  5,  6,  8,  4,  1,  4,  9, 10,
       10,  8,  1,  1,  7,  9,  9,  3,  6,  7, 11,  2, 11,  0,  3,  5, 12,
        9, 10,  4, 11,  4,  6,  4,  4,  3, 12,  4,  4,  8,  4,  3, 10,  7,
        5,  5,  0,  1,  5,  9,  3,  0,  5,  0,  1,  2,  4,  2,  0,  3,  2,
       10,  0,  7,  5,  9,  0, 10,  2, 11, 10,  7, 11,  2,  9,  2,  3, 11,
        3,  2,  3,  4,  1,  2, 11,  9, 10,  1,  4, 10,  6, 11,  8, 11,  2,
        3,  0,  0,  6,  0,  6,  3, 10,  3,  8, 12,  8,  8,  2,  3,  2, 11,
        0,  8,  8,  3,  8, 10,  2,  8,  4,  3, 12,  0,  4,  3, 11, 12,  6,
        9, 11,  8,  0,  8,  5,  9,  0, 12,  9,  6,  5,  3,  1,  8,  0,  4,
       11, 11,  9,  6,  5,  7,  8,  8,  9,  2,  8,  6, 11,  6,  9,  1,  6,
       12,  8,  8,  3,  2

In [9]:
# Build a boolean mask with exactly one True per row, located at the column
# index stored in `missing_feature` for that row. Fancy indexing sets all of
# them at once instead of assigning through .iloc one row at a time.
row_positions = np.arange(len(df))
missing_mask = np.zeros(df.shape, dtype=bool)
missing_mask[row_positions, missing_feature] = True

# DataFrame.mask returns a new frame with NaN wherever the mask is True;
# `df` itself is left untouched so it can serve as ground truth later.
df_missing = df.mask(missing_mask)

df_missing

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,
1,0.02731,0.0,7.07,0.0,0.469,,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,,5.64
504,0.10959,0.0,11.93,,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [10]:
# Count NaNs per column. Exactly one cell was blanked in every row, so the
# per-column counts should add up to the number of rows.
df_missing.isna().sum()


CRIM       42
ZN         34
INDUS      36
CHAS       56
NOX        43
RM         36
AGE        32
DIS        39
RAD        47
TAX        41
PTRATIO    31
B          40
LSTAT      29
dtype: int64

# 3. MICE (Multiple Imputation by Chained Equations)

In [11]:
!pip install impyute


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting impyute
  Downloading impyute-0.0.8-py2.py3-none-any.whl (31 kB)
Installing collected packages: impyute
Successfully installed impyute-0.0.8


In [13]:
import impyute
from impyute.imputation.cs import mice


In [14]:
# Run impyute's MICE imputation on the plain value matrix (labels are dropped here).
# NOTE(review): df_mice holds the raw imputation result at this point and is
# rebound to a labelled DataFrame in the following cell.
df_mice = mice(df_missing.to_numpy())


In [15]:
# Re-attach the original column labels to the imputed values and show the result.
df_mice = pd.DataFrame(data=df_mice, columns=df.columns)

df_mice

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.006320,18.0,2.31,0.000000,0.538,6.57500,65.2,4.0900,1.0,296.000000,15.3,396.900000,9.133942
1,0.027310,0.0,7.07,0.000000,0.469,6.51195,78.9,4.9671,2.0,242.000000,17.8,396.900000,9.140000
2,-3.231193,0.0,7.07,0.000000,0.469,7.18500,61.1,4.9671,2.0,242.000000,17.8,392.830000,4.030000
3,0.032370,0.0,2.18,0.067194,0.458,6.99800,45.8,6.0622,3.0,222.000000,18.7,394.630000,2.940000
4,0.069050,0.0,2.18,0.000000,0.458,7.14700,54.2,6.0622,3.0,222.000000,18.7,399.996900,5.330000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.062630,0.0,11.93,0.000000,0.573,6.59300,69.1,2.4786,1.0,302.427965,21.0,391.990000,9.670000
502,0.045270,0.0,11.93,0.000000,0.573,6.12000,76.7,2.2875,1.0,273.000000,21.0,396.900000,13.558562
503,0.060760,0.0,11.93,0.000000,0.573,6.97600,91.0,2.1675,1.0,273.000000,21.0,413.510511,5.640000
504,0.109590,0.0,11.93,0.135587,0.573,6.79400,89.3,2.3889,1.0,273.000000,21.0,393.450000,6.480000


# 4. Check Results

In [16]:
# For every row, read the cell that was blanked out (its column index comes from
# missing_feature) from the untouched frame and from the MICE-imputed frame.
row_count = len(df)
real_set = [df.iloc[row, missing_feature[row]] for row in range(row_count)]
pred_set = [df_mice.iloc[row, missing_feature[row]] for row in range(row_count)]

# Echo each ground-truth / imputed pair, in row order.
for real, pred in zip(real_set, pred_set):
  print('Real value: ', real, '--Imputed value: ', pred)

Real value:  4.98 --Imputed value:  9.133941798860349
Real value:  6.421 --Imputed value:  6.511950214881191
Real value:  0.02729 --Imputed value:  -3.2311931869844446
Real value:  0.0 --Imputed value:  0.06719421003774081
Real value:  396.9 --Imputed value:  399.9969002052736
Real value:  0.0 --Imputed value:  0.06021550456654243
Real value:  5.5605 --Imputed value:  4.1768940258880205
Real value:  311.0 --Imputed value:  317.4320389854618
Real value:  0.0 --Imputed value:  0.05891438051850753
Real value:  6.004 --Imputed value:  6.043458122204067
Real value:  7.87 --Imputed value:  6.834063181731613
Real value:  0.524 --Imputed value:  0.5376029273138159
Real value:  5.4509 --Imputed value:  4.714836744177895
Real value:  61.8 --Imputed value:  59.658859153135765
Real value:  4.0 --Imputed value:  5.2067948908965
Real value:  4.0 --Imputed value:  5.450436747093903
Real value:  6.58 --Imputed value:  10.157842647033725
Real value:  21.0 --Imputed value:  18.621564025472786
Real value

In [17]:
# Side-by-side table of the removed true values and their imputed replacements.
rlt = pd.DataFrame(dict(real=real_set, pred=pred_set))

rlt

Unnamed: 0,real,pred
0,4.98000,9.133942
1,6.42100,6.511950
2,0.02729,-3.231193
3,0.00000,0.067194
4,396.90000,399.996900
...,...,...
501,273.00000,302.427965
502,9.08000,13.558562
503,396.90000,413.510511
504,0.00000,0.135587


## Performance evaluation

In [21]:
from sklearn.metrics import mean_squared_error

In [23]:
# Mean squared error between the removed true values and the imputed ones.
# NOTE(review): 'real'/'pred' pool cells taken from different columns, whose
# magnitudes range from below 1 to several hundred in the output above, so
# this single figure is likely dominated by the large-scale columns --
# consider per-column or scaled errors.
mse = mean_squared_error(y_true=rlt['real'], y_pred=rlt['pred'])

mse


684.2554412973799

In [24]:
# Root mean squared error: square root of the MSE computed above, which puts
# the error back in the same units as the pooled values.
rmse = mse ** 0.5
rmse

26.15827672644702

In [25]:
from sklearn.metrics import mean_absolute_error


In [26]:
# Mean absolute error between the removed true values and the imputed ones.
mae = mean_absolute_error(y_true=rlt['real'], y_pred=rlt['pred'])

mae

8.16440174739959