# Chapter 2 - Data Preparation Basics
## Segment 2 - Treating missing values

In [1]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

### Figuring out what data is missing

In [2]:
# np.nan is NumPy's "not a number" value; pandas treats it as a missing entry
missing = np.nan

# Build a Series from a plain list of eight values. No index is passed, so pandas assigns
# the default integer labels 0-7. The entries that would have been 'row 3' and 'row 7'
# are replaced with the missing marker
series_obj = Series(
    data=[
        'row 1', 'row 2', missing, 'row 4',
        'row 5', 'row 6', missing, 'row 8',
    ]
)
series_obj

0    row 1
1    row 2
2      NaN
3    row 4
4    row 5
5    row 6
6      NaN
7    row 8
dtype: object

In [3]:
# Now let's work with the isnull() method
# It returns a boolean Series with the same index as the original:
# True where the element is a null (missing) value, False otherwise
series_obj.isnull()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
dtype: bool

### Filling in for missing values

In [4]:
# Build a DataFrame of random numbers to experiment with

# Seed NumPy's global random generator so the same values are produced on every run
np.random.seed(25)

# Draw 36 uniform random numbers as a flat NumPy array, reshape it to 6 rows x 6 columns,
# and wrap the result in a DataFrame (default integer row and column labels)
DF_obj = DataFrame(data=np.random.rand(36).reshape((6, 6)))
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
3,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
4,0.514244,0.559053,0.03445,0.71993,0.421004,0.436935
5,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [5]:
# Now we use the .loc indexer to select rows and columns and set certain values in the DataFrame to missing.
# .loc selects by label and its slices include both endpoints; with this frame's default 0-5 labels,
# 3:5 covers rows 3, 4 and 5 of column 0. The assignment modifies DF_obj in place
DF_obj.loc[3:5, 0] = missing

# Do the same for a different selection: rows 1 through 4 (inclusive) of column 5
DF_obj.loc[1:4, 5] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [6]:
# Replace every NaN in the frame with 0 using DataFrame.fillna.
# The result is assigned to a new variable rather than written back to DF_obj
filled_DF = DF_obj.fillna(value=0)
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.0
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.0
3,0.0,0.836375,0.481343,0.516502,0.383048,0.0
4,0.0,0.559053,0.03445,0.71993,0.421004,0.0
5,0.0,0.900274,0.669612,0.456069,0.289804,0.525819


In [7]:
# fillna also accepts a dictionary: each key names a column and each value is the
# replacement used for the missing entries of that column.
# Here NaN values in column 0 become 0.1
# and NaN values in column 5 become 1.25
filled_DF = DF_obj.fillna(value={0: 0.1, 5: 1.25})
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,1.25
2,0.447031,0.585445,0.161985,0.520719,0.326051,1.25
3,0.1,0.836375,0.481343,0.516502,0.383048,1.25
4,0.1,0.559053,0.03445,0.71993,0.421004,1.25
5,0.1,0.900274,0.669612,0.456069,0.289804,0.525819


In [8]:
# We can also forward fill: each null value is replaced with the last non-null value
# that precedes it in the same column.
# DataFrame.ffill() is used here because fillna(method='ffill') is deprecated since
# pandas 2.1 and emits a FutureWarning; both produce the same result
fill_Df = DF_obj.ffill()
fill_Df

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.117376
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.117376
3,0.447031,0.836375,0.481343,0.516502,0.383048,0.117376
4,0.447031,0.559053,0.03445,0.71993,0.421004,0.117376
5,0.447031,0.900274,0.669612,0.456069,0.289804,0.525819


### Counting missing values

In [9]:
# A summary of a dataset often includes a count of the null values in each variable,
# which shows which columns are the most problematic

# Rebuild the same 6x6 frame (same seed, same NaN placement) so this section starts from a known state
np.random.seed(25)
DF_obj = DataFrame(data=np.random.rand(36).reshape((6, 6)))
DF_obj.loc[1:4, 5] = missing
DF_obj.loc[3:5, 0] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [10]:
# Count the null values in each column: isnull() gives a boolean mask,
# and summing it down the rows (axis=0) counts the True entries per column
DF_obj.isnull().sum(axis=0)

0    3
1    0
2    0
3    0
4    0
5    4
dtype: int64

### Filtering out missing values

In [11]:
# dropna removes rows by default: axis=0 works on rows and how='any' drops a row
# as soon as it contains at least one NaN (both are the defaults, spelled out here)
DF_no_NaN = DF_obj.dropna(axis=0, how='any')
DF_no_NaN

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376


In [12]:
# To drop columns instead of rows, point dropna at the column axis:
# every column that contains at least one NaN is removed
DF_no_NaN = DF_obj.dropna(axis='columns')
DF_no_NaN

Unnamed: 0,1,2,3,4
0,0.582277,0.278839,0.185911,0.4111
1,0.437611,0.556229,0.36708,0.402366
2,0.585445,0.161985,0.520719,0.326051
3,0.836375,0.481343,0.516502,0.383048
4,0.559053,0.03445,0.71993,0.421004
5,0.900274,0.669612,0.456069,0.289804
