In [19]:
import pandas as pd
import numpy as np

In [20]:
# Build a 5x3 frame of standard-normal draws. The label index deliberately
# skips "b", "d" and "g" so that a later reindex onto "a".."h" introduces
# all-missing rows.
# NOTE(review): no random seed is set in the visible cells, so the numbers in
# the recorded outputs will not be reproduced on a fresh run.
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=["a", "c", "e", "f", "h"],
    columns=["one", "two", "three"],
)

df

Unnamed: 0,one,two,three
a,0.745602,0.880366,-1.469942
c,-1.584421,0.612144,0.176005
e,2.090175,0.238499,-2.106245
f,-1.536459,1.877619,-0.445507
h,-1.38279,-0.792796,0.843612


In [21]:
df["four"] = "bar"

In [22]:
df["five"] = df["one"] > 0

In [23]:
df

Unnamed: 0,one,two,three,four,five
a,0.745602,0.880366,-1.469942,bar,True
c,-1.584421,0.612144,0.176005,bar,False
e,2.090175,0.238499,-2.106245,bar,True
f,-1.536459,1.877619,-0.445507,bar,False
h,-1.38279,-0.792796,0.843612,bar,False


In [24]:
# Reindex onto the full run of labels "a" through "h"; the labels that are
# absent from df ("b", "d", "g") come back as all-missing rows.
df2 = df.reindex(index=list("abcdefgh"))
df2

Unnamed: 0,one,two,three,four,five
a,0.745602,0.880366,-1.469942,bar,True
b,,,,,
c,-1.584421,0.612144,0.176005,bar,False
d,,,,,
e,2.090175,0.238499,-2.106245,bar,True
f,-1.536459,1.877619,-0.445507,bar,False
g,,,,,
h,-1.38279,-0.792796,0.843612,bar,False


In [25]:
df2["one"]

a    0.745602
b         NaN
c   -1.584421
d         NaN
e    2.090175
f   -1.536459
g         NaN
h   -1.382790
Name: one, dtype: float64

In [26]:
pd.isna(df2["one"])

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [27]:
df2["four"].notna()

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: four, dtype: bool

In [28]:
df2.isna()

Unnamed: 0,one,two,three,four,five
a,False,False,False,False,False
b,True,True,True,True,True
c,False,False,False,False,False
d,True,True,True,True,True
e,False,False,False,False,False
f,False,False,False,False,False
g,True,True,True,True,True
h,False,False,False,False,False


In [29]:
None == None

True

In [30]:
np.nan == np.nan

False

In [31]:
# Elementwise == against np.nan is False for every row, including the rows
# that actually hold NaN (b, d, g), so it cannot be used to find missing
# values; use isna() / notna() as in the cells above instead.
df2["one"] == np.nan

a    False
b    False
c    False
d    False
e    False
f    False
g    False
h    False
Name: one, dtype: bool

### Integer dtypes and missing data

In [32]:
# With the nullable Int64 extension dtype the np.nan entry is displayed as
# <NA> and the remaining values stay integers instead of becoming floats.
pd.Series([1, 2, np.nan, 4], dtype=pd.Int64Dtype())

0       1
1       2
2    <NA>
3       4
dtype: Int64

### Datetimes

In [33]:
df2 = df.copy()

In [34]:
df2["timestamp"] = pd.Timestamp("20120101")

In [35]:
df2

Unnamed: 0,one,two,three,four,five,timestamp
a,0.745602,0.880366,-1.469942,bar,True,2012-01-01
c,-1.584421,0.612144,0.176005,bar,False,2012-01-01
e,2.090175,0.238499,-2.106245,bar,True,2012-01-01
f,-1.536459,1.877619,-0.445507,bar,False,2012-01-01
h,-1.38279,-0.792796,0.843612,bar,False,2012-01-01


In [36]:
# The same np.nan is assigned to a float column and a datetime column; in the
# frame displayed next, the datetime column shows the missing entries as NaT.
df2.loc[["a", "c", "h"], ["one", "timestamp"]] = np.nan

In [37]:
df2

Unnamed: 0,one,two,three,four,five,timestamp
a,,0.880366,-1.469942,bar,True,NaT
c,,0.612144,0.176005,bar,False,NaT
e,2.090175,0.238499,-2.106245,bar,True,2012-01-01
f,-1.536459,1.877619,-0.445507,bar,False,2012-01-01
h,,-0.792796,0.843612,bar,False,NaT


In [38]:
df2.dtypes.value_counts()

float64           3
bool              1
object            1
datetime64[ns]    1
dtype: int64

### Inserting missing data

In [39]:
s = pd.Series([1, 2, 3])

In [40]:
# Assigning None into the integer Series stores it as NaN; the displayed
# result below has dtype float64 (2.0, 3.0) rather than integers.
s.loc[0] = None

In [41]:
s

0    NaN
1    2.0
2    3.0
dtype: float64

In [42]:
s = pd.Series(["a", "b", "c"])

In [43]:
# In the object-dtype Series the assigned marker is kept as given: the output
# below shows None here, and NaN for the np.nan assigned in the next cell.
s.loc[0] = None

In [44]:
s.loc[1] = np.nan

In [45]:
s

0    None
1     NaN
2       c
dtype: object

### Calculations with missing data

In [48]:
df

Unnamed: 0,one,two,three,four,five
a,0.745602,0.880366,-1.469942,bar,True
c,-1.584421,0.612144,0.176005,bar,False
e,2.090175,0.238499,-2.106245,bar,True
f,-1.536459,1.877619,-0.445507,bar,False
h,-1.38279,-0.792796,0.843612,bar,False


In [49]:
df["one"].sum()

-1.667892378827441

In [50]:
# Row-wise mean. `df` also holds the string column "four", which cannot be
# averaged: older pandas silently skipped it, while pandas >= 2.0 raises
# unless numeric_only=True is passed. The recorded output is the average of
# one, two, three and the boolean "five" (True counted as 1), which is what
# the numeric/boolean-only reduction produces.
df.mean(axis=1, numeric_only=True)

a    0.289007
c   -0.199068
e    0.305607
f   -0.026087
h   -0.332994
dtype: float64

In [51]:
# cumsum runs down every column, not only the float ones: in the output the
# string column "four" is cumulatively concatenated ("bar", "barbar", ...)
# and the boolean column "five" becomes a running count of True values.
df.cumsum()

Unnamed: 0,one,two,three,four,five
a,0.745602,0.880366,-1.469942,bar,1
c,-0.838818,1.49251,-1.293937,barbar,1
e,1.251357,1.731009,-3.400181,barbarbar,2
f,-0.285102,3.608628,-3.845689,barbarbarbar,2
h,-1.667892,2.815832,-3.002076,barbarbarbarbar,2


### NA values in GroupBy

In [52]:
df

Unnamed: 0,one,two,three,four,five
a,0.745602,0.880366,-1.469942,bar,True
c,-1.584421,0.612144,0.176005,bar,False
e,2.090175,0.238499,-2.106245,bar,True
f,-1.536459,1.877619,-0.445507,bar,False
h,-1.38279,-0.792796,0.843612,bar,False


In [53]:
# Group on "one" and average the remaining columns. The string column "four"
# has no mean: older pandas dropped it silently (it is absent from the
# recorded output), while pandas >= 2.0 raises unless numeric_only=True is
# passed explicitly.
# NOTE(review): `df` as displayed above has no missing values in "one", so
# this cell does not actually exercise NA handling in the group keys.
df.groupby("one").mean(numeric_only=True)

Unnamed: 0_level_0,two,three,five
one,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.584421,0.612144,0.176005,False
-1.536459,1.877619,-0.445507,False
-1.38279,-0.792796,0.843612,False
0.745602,0.880366,-1.469942,True
2.090175,0.238499,-2.106245,True


### Cleaning / filling missing data
pandas objects are equipped with various data manipulation methods for dealing with missing data.

### Filling missing values: fillna
`fillna()` can “fill in” NA values with non-NA data in a couple of ways, which we illustrate below.
#### Replace NA with a scalar value

In [54]:
df2

Unnamed: 0,one,two,three,four,five,timestamp
a,,0.880366,-1.469942,bar,True,NaT
c,,0.612144,0.176005,bar,False,NaT
e,2.090175,0.238499,-2.106245,bar,True,2012-01-01
f,-1.536459,1.877619,-0.445507,bar,False,2012-01-01
h,,-0.792796,0.843612,bar,False,NaT


In [55]:
# A scalar fill value is applied to every column. Note in the output that the
# NaT entries of "timestamp" are replaced by the integer 0 as well, leaving
# that column with a mix of timestamps and integers.
df2.fillna(0)

Unnamed: 0,one,two,three,four,five,timestamp
a,0.0,0.880366,-1.469942,bar,True,0
c,0.0,0.612144,0.176005,bar,False,0
e,2.090175,0.238499,-2.106245,bar,True,2012-01-01 00:00:00
f,-1.536459,1.877619,-0.445507,bar,False,2012-01-01 00:00:00
h,0.0,-0.792796,0.843612,bar,False,0


In [56]:
# Filling a float column with a string: the result shown below has dtype
# object, holding both the original floats and the "missing" marker.
df2['one'].fillna('missing')

a     missing
c     missing
e    2.090175
f   -1.536459
h     missing
Name: one, dtype: object

#### Fill gaps forward or backward

Using the same filling arguments as reindexing, we can propagate non-NA values forward or backward:

In [62]:
df3 = df2.copy()
df3

Unnamed: 0,one,two,three,four,five,timestamp
a,,0.880366,-1.469942,bar,True,NaT
c,,0.612144,0.176005,bar,False,NaT
e,2.090175,0.238499,-2.106245,bar,True,2012-01-01
f,-1.536459,1.877619,-0.445507,bar,False,2012-01-01
h,,-0.792796,0.843612,bar,False,NaT


In [63]:
# Forward-fill: carry the last valid observation down each column.
# ffill() replaces the deprecated fillna(method="pad") spelling.
# Leading gaps (rows a and c of "one"/"timestamp") have no earlier value to
# copy, so they stay missing in the output.
df3.ffill()

Unnamed: 0,one,two,three,four,five,timestamp
a,,0.880366,-1.469942,bar,True,NaT
c,,0.612144,0.176005,bar,False,NaT
e,2.090175,0.238499,-2.106245,bar,True,2012-01-01
f,-1.536459,1.877619,-0.445507,bar,False,2012-01-01
h,-1.536459,-0.792796,0.843612,bar,False,2012-01-01


#### Limit the amount of filling

If we only want consecutive gaps filled up to a certain number of data points, we can use the `limit` keyword:

In [64]:
df3

Unnamed: 0,one,two,three,four,five,timestamp
a,,0.880366,-1.469942,bar,True,NaT
c,,0.612144,0.176005,bar,False,NaT
e,2.090175,0.238499,-2.106245,bar,True,2012-01-01
f,-1.536459,1.877619,-0.445507,bar,False,2012-01-01
h,,-0.792796,0.843612,bar,False,NaT


In [65]:
# Forward-fill, but fill at most one consecutive missing value after each
# valid observation. ffill(limit=...) replaces the deprecated
# fillna(method="pad", limit=...) spelling.
df3.ffill(limit=1)

Unnamed: 0,one,two,three,four,five,timestamp
a,,0.880366,-1.469942,bar,True,NaT
c,,0.612144,0.176005,bar,False,NaT
e,2.090175,0.238499,-2.106245,bar,True,2012-01-01
f,-1.536459,1.877619,-0.445507,bar,False,2012-01-01
h,-1.536459,-0.792796,0.843612,bar,False,2012-01-01


### Filling with a PandasObject

In [67]:
# Ten rows of random draws in columns A, B and C, with a staggered run of
# missing values written into each column so that the gaps only partly
# overlap. The default integer index lets inclusive label slices select the
# rows: 3-4 in A, 4-5 in B, 5-7 in C.
dff = pd.DataFrame(np.random.randn(10, 3), columns=["A", "B", "C"])
dff.loc[3:4, "A"] = np.nan
dff.loc[4:5, "B"] = np.nan
dff.loc[5:7, "C"] = np.nan
dff

Unnamed: 0,A,B,C
0,-0.940989,0.849398,-1.451357
1,0.474079,0.40152,-1.358311
2,-1.450596,1.438411,-0.735787
3,,-1.291267,0.34411
4,,,-0.63604
5,-2.671435,,
6,-1.108437,0.929766,
7,0.479037,-0.937561,
8,-0.607845,-1.570562,-0.287242
9,1.260404,-0.44363,1.895163


In [68]:
dff.fillna(dff.mean())

Unnamed: 0,A,B,C
0,-0.940989,0.849398,-1.451357
1,0.474079,0.40152,-1.358311
2,-1.450596,1.438411,-0.735787
3,-0.570723,-1.291267,0.34411
4,-0.570723,-0.07799,-0.63604
5,-2.671435,-0.07799,-0.318495
6,-1.108437,0.929766,-0.318495
7,0.479037,-0.937561,-0.318495
8,-0.607845,-1.570562,-0.287242
9,1.260404,-0.44363,1.895163


In [69]:
# dff.mean() is a Series keyed by column name; the label slice "B":"C"
# includes both ends, so only B and C receive a fill value. Column A is not
# in the slice and keeps its NaN (rows 3 and 4 in the output).
dff.fillna(dff.mean()["B":"C"])

Unnamed: 0,A,B,C
0,-0.940989,0.849398,-1.451357
1,0.474079,0.40152,-1.358311
2,-1.450596,1.438411,-0.735787
3,,-1.291267,0.34411
4,,-0.07799,-0.63604
5,-2.671435,-0.07799,-0.318495
6,-1.108437,0.929766,-0.318495
7,0.479037,-0.937561,-0.318495
8,-0.607845,-1.570562,-0.287242
9,1.260404,-0.44363,1.895163


In [70]:
# Keep each value where it is not NA, otherwise take the replacement from the
# Series of column means; axis="columns" lines that Series up with the
# column labels. The output matches dff.fillna(dff.mean()) above.
dff.where(pd.notna(dff), dff.mean(), axis="columns")

Unnamed: 0,A,B,C
0,-0.940989,0.849398,-1.451357
1,0.474079,0.40152,-1.358311
2,-1.450596,1.438411,-0.735787
3,-0.570723,-1.291267,0.34411
4,-0.570723,-0.07799,-0.63604
5,-2.671435,-0.07799,-0.318495
6,-1.108437,0.929766,-0.318495
7,0.479037,-0.937561,-0.318495
8,-0.607845,-1.570562,-0.287242
9,1.260404,-0.44363,1.895163


### Dropping axis labels with missing data: dropna

In [74]:
df3

Unnamed: 0,one,two,three,four,five,timestamp
a,,0.880366,-1.469942,bar,True,NaT
c,,0.612144,0.176005,bar,False,NaT
e,2.090175,0.238499,-2.106245,bar,True,2012-01-01
f,-1.536459,1.877619,-0.445507,bar,False,2012-01-01
h,,-0.792796,0.843612,bar,False,NaT


In [75]:
# axis=0 drops rows: every row containing a missing value is removed, which
# leaves only e and f here.
df3.dropna(axis=0)

Unnamed: 0,one,two,three,four,five,timestamp
e,2.090175,0.238499,-2.106245,bar,True,2012-01-01
f,-1.536459,1.877619,-0.445507,bar,False,2012-01-01


In [76]:
# axis=1 drops columns: "one" and "timestamp" contain missing values and are
# removed, while all five rows are kept.
df3.dropna(axis=1)

Unnamed: 0,two,three,four,five
a,0.880366,-1.469942,bar,True
c,0.612144,0.176005,bar,False
e,0.238499,-2.106245,bar,True
f,1.877619,-0.445507,bar,False
h,-0.792796,0.843612,bar,False


In [78]:
df3["one"].dropna()

e    2.090175
f   -1.536459
Name: one, dtype: float64