# How to handle missing data 

Python Machine Learning 2nd Edition by Sebastian Raschka, Packt Publishing Ltd. 2017

Code Repository: https://github.com/rasbt/python-machine-learning-book-2nd-edition

Code License: MIT License

In [3]:
# Replaciong missing/0 values
import numpy as np
from urllib.request import urlopen
from sklearn.preprocessing import Imputer

### Get data 

In [4]:
# First, let's load the Pima Indians Diabetes dataset
url = "http://goo.gl/j0Rvxq"
raw_data = urlopen(url)
dataset = np.loadtxt(raw_data, delimiter=",")
print(dataset.shape)

(768, 9)


In [5]:
# separate the data and target features
X = dataset[:,0:7]
y = dataset[:,8]

### Replace 0 with NaN 

In [6]:
# Find all 0s, replace them with NaN
modified_X = np.copy(X)
modified_X[modified_X==0]=np.nan

In [7]:
X is modified_X

False

In [9]:
X

array([[   6.   ,  148.   ,   72.   , ...,    0.   ,   33.6  ,    0.627],
       [   1.   ,   85.   ,   66.   , ...,    0.   ,   26.6  ,    0.351],
       [   8.   ,  183.   ,   64.   , ...,    0.   ,   23.3  ,    0.672],
       ..., 
       [   5.   ,  121.   ,   72.   , ...,  112.   ,   26.2  ,    0.245],
       [   1.   ,  126.   ,   60.   , ...,    0.   ,   30.1  ,    0.349],
       [   1.   ,   93.   ,   70.   , ...,    0.   ,   30.4  ,    0.315]])

In [10]:
modified_X

array([[   6.   ,  148.   ,   72.   , ...,      nan,   33.6  ,    0.627],
       [   1.   ,   85.   ,   66.   , ...,      nan,   26.6  ,    0.351],
       [   8.   ,  183.   ,   64.   , ...,      nan,   23.3  ,    0.672],
       ..., 
       [   5.   ,  121.   ,   72.   , ...,  112.   ,   26.2  ,    0.245],
       [   1.   ,  126.   ,   60.   , ...,      nan,   30.1  ,    0.349],
       [   1.   ,   93.   ,   70.   , ...,      nan,   30.4  ,    0.315]])

### Replace NaN with the mean 

In [11]:
# Impute missing values with mean of the features
imp = Imputer(missing_values='NaN', strategy='mean')
imputed_X = imp.fit_transform(modified_X)

In [12]:
imputed_X

array([[   6.        ,  148.        ,   72.        , ...,  155.54822335,
          33.6       ,    0.627     ],
       [   1.        ,   85.        ,   66.        , ...,  155.54822335,
          26.6       ,    0.351     ],
       [   8.        ,  183.        ,   64.        , ...,  155.54822335,
          23.3       ,    0.672     ],
       ..., 
       [   5.        ,  121.        ,   72.        , ...,  112.        ,
          26.2       ,    0.245     ],
       [   1.        ,  126.        ,   60.        , ...,  155.54822335,
          30.1       ,    0.349     ],
       [   1.        ,   93.        ,   70.        , ...,  155.54822335,
          30.4       ,    0.315     ]])