## Dealing with missing values

In [1]:
import numpy as np
import pandas as pd

### Check for Missing Data / Null Values

In [2]:
# Build a 5x3 frame of random integers drawn from 1..100 on a sparse label set.
observed_labels = ['a', 'c', 'e', 'f', 'h']
column_names = ['column one', 'column two', 'column three']
random_values = np.random.randint(1, 101, (5, 3))
df = pd.DataFrame(random_values, index=observed_labels, columns=column_names)

# Reindexing onto the full a..h range adds rows for the labels that were not
# present above (b, d, g); those rows are filled with NaN.
df = df.reindex(list('abcdefgh'))

print('Sample DataFrame:\n\n',df)

Sample DataFrame:

    column one  column two  column three
a        90.0        68.0          71.0
b         NaN         NaN           NaN
c         5.0        73.0          18.0
d         NaN         NaN           NaN
e        25.0        16.0          43.0
f        46.0        15.0          50.0
g         NaN         NaN           NaN
h        67.0        64.0          30.0


In [3]:
# Element-wise mask: True wherever a cell holds a missing value.
null_mask = df.isna()
print('Is null:\n',null_mask)

Is null:
    column one  column two  column three
a       False       False         False
b        True        True          True
c       False       False         False
d        True        True          True
e       False       False         False
f       False       False         False
g        True        True          True
h       False       False         False


In [4]:
# Summing the boolean mask column by column gives the number of missing
# cells in each column.
null_counts = df.isna().sum()
print('Is null:\n',null_counts)

Is null:
 column one      3
column two      3
column three    3
dtype: int64


In [5]:
# Per-column flag: True if the column contains at least one missing value.
has_missing = df.isna().any()
print('Is null:\n',has_missing)

Is null:
 column one      True
column two      True
column three    True
dtype: bool


In [6]:
# Inverse of the null mask: True wherever a cell holds an actual value.
present_mask = df.notna()
print('\nNot null:\n',present_mask)


Not null:
    column one  column two  column three
a        True        True          True
b       False       False         False
c        True        True          True
d       False       False         False
e        True        True          True
f        True        True          True
g       False       False         False
h        True        True          True


In [7]:
# Build the frame as float from the start. The values are still whole numbers
# in 10..14, but NaN can then be written without pandas having to upcast an
# int64 column during assignment (pandas >= 2.1 warns about that implicit
# upcast).
df = pd.DataFrame(
    np.random.randint(10, 15, (3, 3)).astype(float),
    columns=['Column1', 'Column2', 'Column3'],
)

# Blank out one randomly chosen row at three randomly drawn column positions.
# The column positions are drawn with replacement, so between one and three
# cells of that row end up as NaN.
df.iloc[np.random.randint(0,3),np.random.randint(0,3,3)] = np.nan

print('Sample DataFrame:\n\n',df)

Sample DataFrame:

    Column1  Column2  Column3
0       12     11.0       10
1       14      NaN       14
2       13     11.0       10


In [8]:
print('Calculations with missing data:\n')
# Aggregations are named by string rather than passed as NumPy callables:
# pandas >= 2.1 emits a FutureWarning for np.sum / np.mean here and recommends
# the string names, which produce the same 'sum' and 'mean' rows.
# The printed result shows NaN cells being skipped rather than propagated.
print(df.agg(['sum', 'mean']))

Calculations with missing data:

      Column1  Column2    Column3
sum      39.0     22.0  34.000000
mean     13.0     11.0  11.333333


### Filling and dropping missing values

In [9]:
# Candidate event labels. NaN appears twice, so two of the five equally likely
# picks produce a missing event.
event_choices = ['Sunny','Rainy','Cloudy',np.nan,np.nan]

df = pd.DataFrame({
    'day': pd.date_range('07/01/2020',periods=10),
    # Cast to float up front so that the NaN assignment below does not have to
    # upcast an int64 column (pandas >= 2.1 warns about that implicit upcast).
    'temperature': np.random.randint(15,40,10).astype(float),
    'event': [event_choices[np.random.randint(0,5)] for _ in range(10)],
})

# Five row positions are drawn with replacement, so between one and five
# temperature readings (column position 1) become NaN.
df.iloc[np.random.randint(0,10,5),1] = np.nan

print('Sample DataFrame:\n\n',df)

Sample DataFrame:

          day  temperature   event
0 2020-07-01         37.0   Rainy
1 2020-07-02         22.0     NaN
2 2020-07-03          NaN     NaN
3 2020-07-04          NaN     NaN
4 2020-07-05         19.0     NaN
5 2020-07-06          NaN     NaN
6 2020-07-07          NaN     NaN
7 2020-07-08         34.0  Cloudy
8 2020-07-09          NaN     NaN
9 2020-07-10         30.0  Cloudy


In [10]:
# Per-column replacement values: a different filler is used for each column.
fill_values = {'temperature':0,'event':'no event'}
filled = df.fillna(value=fill_values)
print('Fill NaN values in temperature with 0 and in event with "no event":\n\n', filled)

Fill NaN values in temperature with 0 and in event with "no event":

          day  temperature     event
0 2020-07-01         37.0     Rainy
1 2020-07-02         22.0  no event
2 2020-07-03          0.0  no event
3 2020-07-04          0.0  no event
4 2020-07-05         19.0  no event
5 2020-07-06          0.0  no event
6 2020-07-07          0.0  no event
7 2020-07-08         34.0    Cloudy
8 2020-07-09          0.0  no event
9 2020-07-10         30.0    Cloudy


In [11]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() is used because the fillna(method='ffill') spelling is
# deprecated in recent pandas releases; the result is the same.
print('Forward fill:\n\n',df.ffill())

Forward fill:

          day  temperature   event
0 2020-07-01         37.0   Rainy
1 2020-07-02         22.0   Rainy
2 2020-07-03         22.0   Rainy
3 2020-07-04         22.0   Rainy
4 2020-07-05         19.0   Rainy
5 2020-07-06         19.0   Rainy
6 2020-07-07         19.0   Rainy
7 2020-07-08         34.0  Cloudy
8 2020-07-09         34.0  Cloudy
9 2020-07-10         30.0  Cloudy


In [12]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# DataFrame.bfill() is used because the fillna(method='bfill') spelling is
# deprecated in recent pandas releases; the result is the same.
print('Backward fill:\n\n',df.bfill())

Backward fill:

          day  temperature   event
0 2020-07-01         37.0   Rainy
1 2020-07-02         22.0  Cloudy
2 2020-07-03         19.0  Cloudy
3 2020-07-04         19.0  Cloudy
4 2020-07-05         19.0  Cloudy
5 2020-07-06         34.0  Cloudy
6 2020-07-07         34.0  Cloudy
7 2020-07-08         34.0  Cloudy
8 2020-07-09         30.0  Cloudy
9 2020-07-10         30.0  Cloudy


In [13]:
# Interpolate only the numeric 'temperature' column (default linear method).
# Calling interpolate() on the whole frame would also hand it the text 'event'
# column, and interpolating object-dtype data is deprecated in pandas >= 2.1.
# The 'event' column is passed through unchanged, so its NaN values remain.
interpolated = df.assign(temperature=df['temperature'].interpolate())
print('Interpolate:\n\n',interpolated)

Interpolate:

          day  temperature   event
0 2020-07-01         37.0   Rainy
1 2020-07-02         22.0     NaN
2 2020-07-03         21.0     NaN
3 2020-07-04         20.0     NaN
4 2020-07-05         19.0     NaN
5 2020-07-06         24.0     NaN
6 2020-07-07         29.0     NaN
7 2020-07-08         34.0  Cloudy
8 2020-07-09         32.0     NaN
9 2020-07-10         30.0  Cloudy


In [14]:
# Replace the calendar-day dates with ten consecutive business days (freq='B')
# and promote them to the index.
#
# `df` is rebound on purpose: the cells further down keep working with `df`
# and display the date-indexed frame. Building a fresh frame with a method
# chain makes that hand-over explicit, instead of aliasing `df` under a second
# name and then editing it in place with inplace=True.
df = (
    df.assign(day=pd.date_range('07/01/2020',periods=10,freq='B'))
      .set_index('day')
)
# Both names refer to the same date-indexed frame.
new_df = df
print('\nSame DataFrame but different date range(no weekends):\n\n',new_df)


Same DataFrame but different date range(no weekends):

             temperature   event
day                            
2020-07-01         37.0   Rainy
2020-07-02         22.0     NaN
2020-07-03          NaN     NaN
2020-07-06          NaN     NaN
2020-07-07         19.0     NaN
2020-07-08          NaN     NaN
2020-07-09          NaN     NaN
2020-07-10         34.0  Cloudy
2020-07-13          NaN     NaN
2020-07-14         30.0  Cloudy


In [15]:
# Time-weighted interpolation: gaps are filled in proportion to the elapsed
# time between index entries, so readings across a weekend move less per row
# than with plain linear interpolation. This method needs the datetime index
# that new_df carries.
# Only the numeric 'temperature' column is interpolated; handing the text
# 'event' column to interpolate() is deprecated in pandas >= 2.1, and that
# column is passed through unchanged.
time_interpolated = new_df.assign(
    temperature=new_df['temperature'].interpolate(method='time')
)
print('\nInterpolate biased on date:\n\n',time_interpolated)


Interpolate biased on date:

             temperature   event
day                            
2020-07-01         37.0   Rainy
2020-07-02         22.0     NaN
2020-07-03         21.4     NaN
2020-07-06         19.6     NaN
2020-07-07         19.0     NaN
2020-07-08         24.0     NaN
2020-07-09         29.0     NaN
2020-07-10         34.0  Cloudy
2020-07-13         31.0     NaN
2020-07-14         30.0  Cloudy


In [16]:
# Show the frame the dropna examples below operate on. At this point `df` is
# the business-day-indexed frame: an earlier cell bound `new_df` to the same
# object and re-indexed it.
header = 'Sample DataFrame:\n\n'
print(header, df)

Sample DataFrame:

             temperature   event
day                            
2020-07-01         37.0   Rainy
2020-07-02         22.0     NaN
2020-07-03          NaN     NaN
2020-07-06          NaN     NaN
2020-07-07         19.0     NaN
2020-07-08          NaN     NaN
2020-07-09          NaN     NaN
2020-07-10         34.0  Cloudy
2020-07-13          NaN     NaN
2020-07-14         30.0  Cloudy


In [17]:
# how='any' (the default) discards every row that has at least one NaN.
complete_rows = df.dropna(how='any')
print('\nDrop rows if any of the values were NaN:\n\n',complete_rows)


Drop rows if any of the values were NaN:

             temperature   event
day                            
2020-07-01         37.0   Rainy
2020-07-10         34.0  Cloudy
2020-07-14         30.0  Cloudy


In [18]:
# how='all' discards a row only when every one of its values is NaN.
non_empty_rows = df.dropna(how='all')
print('\nDrop rows if all of the values in the row were NaN:\n\n',non_empty_rows)


Drop rows if all of the values in the row were NaN:

             temperature   event
day                            
2020-07-01         37.0   Rainy
2020-07-02         22.0     NaN
2020-07-07         19.0     NaN
2020-07-10         34.0  Cloudy
2020-07-14         30.0  Cloudy


In [19]:
# thresh is the minimum number of non-NaN values a row needs in order to be kept.
min_non_null = 1
rows_with_data = df.dropna(thresh=min_non_null)
print('\nKeep rows with atleast 1 value otherwise drop:\n\n',rows_with_data)


Keep rows with atleast 1 value otherwise drop:

             temperature   event
day                            
2020-07-01         37.0   Rainy
2020-07-02         22.0     NaN
2020-07-07         19.0     NaN
2020-07-10         34.0  Cloudy
2020-07-14         30.0  Cloudy
