In [2]:
import pandas as pd
import numpy as np

In [3]:
# Base frame for the missing-data examples: 5 rows x 3 columns of standard
# normal draws. A seeded Generator is used so that Restart & Run All yields
# the same values on every run (the seed value itself is arbitrary).
df = pd.DataFrame(
    np.random.default_rng(0).standard_normal((5, 3)),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)
df

Unnamed: 0,one,two,three
a,-0.130144,-1.003936,0.9734
c,0.645567,0.749565,0.874958
e,-0.916102,0.932321,-1.10938
f,-1.271114,-0.604654,0.211032
h,0.183184,-0.87179,-2.542023


In [5]:
# Conform df to the labels a..h; labels that df does not have (b, d, g)
# come back as rows filled with NaN, as the output below shows.
df2 = df.reindex(
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
)
df2

Unnamed: 0,one,two,three
a,-0.130144,-1.003936,0.9734
b,,,
c,0.645567,0.749565,0.874958
d,,,
e,-0.916102,0.932321,-1.10938
f,-1.271114,-0.604654,0.211032
g,,,
h,0.183184,-0.87179,-2.542023


In [12]:
# Function form: element-wise missing-value mask for columns 'one' and 'two'.
pd.isna(df2.loc[:, ['one', 'two']])

Unnamed: 0,one,two
a,False,False
b,True,True
c,False,False
d,True,True
e,False,False
f,False,False
g,True,True
h,False,False


In [13]:
# Function form: element-wise "value present" mask for columns 'one' and 'two'.
pd.notna(df2.loc[:, ['one', 'two']])

Unnamed: 0,one,two
a,True,True
b,False,False
c,True,True
d,False,False
e,True,True
f,True,True
g,False,False
h,True,True


In [14]:
# Method form of the missing-value check, applied to every column of df2.
df2.isna()

Unnamed: 0,one,two,three
a,False,False,False
b,True,True,True
c,False,False,False
d,True,True,True
e,False,False,False
f,False,False,False
g,True,True,True
h,False,False,False


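### **np.nan != np.nan** (NaN never compares equal, not even to itself)
### **None == None**, and pandas treats `None` like `np.nan` when detecting missing values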

# Timestamp

In [24]:
# Work on a copy so that df2 keeps only its three numeric columns.
df3 = df2.copy()
# A single Timestamp is assigned to the whole column, so every row gets the
# same date (see the 'timestamp' column in the output below).
df3['timestamp'] = pd.Timestamp('20120101')
df3

Unnamed: 0,one,two,three,timestamp
a,-0.130144,-1.003936,0.9734,2012-01-01
b,,,,2012-01-01
c,0.645567,0.749565,0.874958,2012-01-01
d,,,,2012-01-01
e,-0.916102,0.932321,-1.10938,2012-01-01
f,-1.271114,-0.604654,0.211032,2012-01-01
g,,,,2012-01-01
h,0.183184,-0.87179,-2.542023,2012-01-01


In [22]:
# Blank out rows a, c and h in one numeric and one datetime column.
# The same np.nan shows up as NaN in 'one' and as NaT in 'timestamp'.
df3.loc[
    ['a', 'c', 'h'],
    ['one', 'timestamp'],
] = np.nan
df3

Unnamed: 0,one,two,three,timestamp
a,,-1.003936,0.9734,NaT
b,,,,2012-01-01
c,,0.749565,0.874958,NaT
d,,,,2012-01-01
e,-0.916102,0.932321,-1.10938,2012-01-01
f,-1.271114,-0.604654,0.211032,2012-01-01
g,,,,2012-01-01
h,,-0.87179,-2.542023,NaT


In [21]:
# Number of columns per dtype. DataFrame.get_dtype_counts() was deprecated in
# pandas 0.25 and removed in 1.0; dtypes.value_counts() is the replacement.
df3.dtypes.value_counts()

float64           3
datetime64[ns]    1
dtype: int64

# Inserting missing data

## Numeric

In [32]:
# Numeric containers always store a missing value as NaN, whatever marker is
# assigned. The Series is created as float up front: assigning None into an
# int64 Series relies on silent upcasting, which pandas has deprecated.
sint = pd.Series([1.0, 2.0, 3.0])
sint.loc[0] = None
sint

0    NaN
1    2.0
2    3.0
dtype: float64

In [27]:
# Assigning np.nan directly gives the same NaN marker as assigning None did.
sint.at[1] = np.nan
sint

0    NaN
1    NaN
2    3.0
dtype: float64

## Datetime

In [44]:
# Eleven consecutive days starting 2000-01-03, held in a Series so that
# individual entries can be overwritten.
range_date = pd.date_range(start='1/3/2000', periods=11, freq='1D')
sdate = pd.Series(data=range_date)

# In a datetime Series, None is stored as NaT.
sdate.loc[1] = None
sdate

0    2000-01-03
1           NaT
2    2000-01-05
3    2000-01-06
4    2000-01-07
5    2000-01-08
6    2000-01-09
7    2000-01-10
8    2000-01-11
9    2000-01-12
10   2000-01-13
dtype: datetime64[ns]

In [43]:
# np.nan assigned into the datetime Series is also shown as NaT (row 2 in the
# output below), and the dtype stays datetime64[ns].
sdate.loc[2] = np.nan
sdate

0    2000-01-03
1           NaT
2           NaT
3    2000-01-06
4    2000-01-07
5    2000-01-08
6    2000-01-09
7    2000-01-10
8    2000-01-11
9    2000-01-12
10   2000-01-13
dtype: datetime64[ns]

## Object

In [28]:
# Object containers keep whichever missing marker they are given, so None
# stays None here. dtype=object is spelled out so the example does not depend
# on how pandas infers a dtype for a list of Python strings.
schar = pd.Series(["a", "b", "c"], dtype=object)
schar.loc[0] = None
schar

0    None
1       b
2       c
dtype: object

In [30]:
# np.nan is stored as given too, so the Series now holds both None and NaN.
schar.at[1] = np.nan
schar

0    None
1     NaN
2       c
dtype: object

#### Object

# Filling missing data

In [46]:
# Frame used by the fill examples: df2 plus a constant timestamp column.
# It is built from df2 rather than copied from df3 because an earlier cell
# overwrites 'one' and 'timestamp' in df3 in place, so df3's contents depend
# on which cells have been run. Building it here keeps this section
# reproducible under Restart & Run All.
fldf = df2.copy()
fldf['timestamp'] = pd.Timestamp('20120101')
fldf

Unnamed: 0,one,two,three,timestamp
a,-0.130144,-1.003936,0.9734,2012-01-01
b,,,,2012-01-01
c,0.645567,0.749565,0.874958,2012-01-01
d,,,,2012-01-01
e,-0.916102,0.932321,-1.10938,2012-01-01
f,-1.271114,-0.604654,0.211032,2012-01-01
g,,,,2012-01-01
h,0.183184,-0.87179,-2.542023,2012-01-01


In [47]:
# Replace every missing entry with the scalar 0; returns a new frame.
fldf.fillna(value=0)

Unnamed: 0,one,two,three,timestamp
a,-0.130144,-1.003936,0.9734,2012-01-01
b,0.0,0.0,0.0,2012-01-01
c,0.645567,0.749565,0.874958,2012-01-01
d,0.0,0.0,0.0,2012-01-01
e,-0.916102,0.932321,-1.10938,2012-01-01
f,-1.271114,-0.604654,0.211032,2012-01-01
g,0.0,0.0,0.0,2012-01-01
h,0.183184,-0.87179,-2.542023,2012-01-01


In [54]:
# Replace every missing entry with a text placeholder; returns a new frame.
fldf.fillna(value='not known')

Unnamed: 0,one,two,three,timestamp
a,-0.130144,-1.00394,0.9734,2012-01-01
b,not known,not known,not known,2012-01-01
c,0.645567,0.749565,0.874958,2012-01-01
d,not known,not known,not known,2012-01-01
e,-0.916102,0.932321,-1.10938,2012-01-01
f,-1.27111,-0.604654,0.211032,2012-01-01
g,not known,not known,not known,2012-01-01
h,0.183184,-0.87179,-2.54202,2012-01-01


In [57]:
# Forward fill: each missing row takes the values of the nearest row above it
# (b <- a, d <- c, g <- f in the output below).
fldf.ffill()

Unnamed: 0,one,two,three,timestamp
a,-0.130144,-1.003936,0.9734,2012-01-01
b,-0.130144,-1.003936,0.9734,2012-01-01
c,0.645567,0.749565,0.874958,2012-01-01
d,0.645567,0.749565,0.874958,2012-01-01
e,-0.916102,0.932321,-1.10938,2012-01-01
f,-1.271114,-0.604654,0.211032,2012-01-01
g,-1.271114,-0.604654,0.211032,2012-01-01
h,0.183184,-0.87179,-2.542023,2012-01-01


# Filling DataFrames

In [60]:
# 10 x 3 frame of standard normal draws for the DataFrame-filling examples.
# A seeded Generator is used so that Restart & Run All yields the same values
# on every run (the seed value itself is arbitrary).
dff = pd.DataFrame(
    np.random.default_rng(1).standard_normal((10, 3)),
    columns=list('ABC'),
)
dff

Unnamed: 0,A,B,C
0,0.062115,-0.321973,-1.107027
1,1.408851,-0.926138,-0.294738
2,0.635803,-0.735559,-0.046086
3,-0.5213,0.6849,-0.569707
4,-0.166942,-0.27007,-0.317
5,0.007632,0.607943,0.466126
6,-1.073665,-1.548541,1.497753
7,-0.964708,0.378699,0.884103
8,0.961296,0.696125,0.97251
9,1.722649,0.753469,1.377772
