# Missing Values
1. Identifying missing data
2. Dropping missing data
3. Filling in missing data
4. Pandas functions automatically ignore missing values

In [1]:
import pandas as pd 
import numpy as np
# Single-day returns keyed by ticker; TSLA is deliberately missing.
ser = pd.Series(
    [0.01, -0.01, np.nan, 0.005],
    index=['AAPL', 'MSFT', 'TSLA', 'LULU'],
)

# Daily returns: one row per date string, one column per ticker.
columns = ['AAPL', 'MSFT', 'TSLA', 'LULU']
index = ['20201201', '20201202', '20201203', '20201204']

# The gaps are arranged so that AAPL is missing on every date and
# 20201203 is missing for every ticker.
data = [
    [np.nan, 0.03, 0.05, 0.005],
    [np.nan, np.nan, -0.05, -0.0025],
    [np.nan, np.nan, np.nan, np.nan],
    [np.nan, 0.015, 0.03, 0.01],
]

df = pd.DataFrame(data=data, columns=columns, index=index)
df

Unnamed: 0,AAPL,MSFT,TSLA,LULU
20201201,,0.03,0.05,0.005
20201202,,,-0.05,-0.0025
20201203,,,,
20201204,,0.015,0.03,0.01


## Identifying Missing Data

In [2]:
# Missing data is represented by np.nan (a float "not a number" value).
# Bind it to a short name so the scalar checks below can reuse it.
nan=np.nan
nan

nan

In [3]:
# pd.isnull also accepts a scalar: NaN is reported as missing
pd.isnull(nan)

True

In [4]:
# An ordinary number is not missing
pd.isnull(5)

False

In [5]:
# Redisplay the Series; TSLA is its one missing entry
ser

AAPL    0.010
MSFT   -0.010
TSLA      NaN
LULU    0.005
dtype: float64

In [6]:
# Element-wise mask: True where the value is missing (only TSLA here)
ser.isnull()

AAPL    False
MSFT    False
TSLA     True
LULU    False
dtype: bool

In [7]:
# Complement of isnull(): True where a value is present
ser.notnull()

AAPL     True
MSFT     True
TSLA    False
LULU     True
dtype: bool

In [8]:
# count() tallies only the non-missing entries (3 of the 4 tickers)
ser.count()

3

In [9]:
# Same mask for a DataFrame: one boolean per cell, True where the cell is NaN
df.isnull()

Unnamed: 0,AAPL,MSFT,TSLA,LULU
20201201,True,False,False,False
20201202,True,True,False,False
20201203,True,True,True,True
20201204,True,False,False,False


In [10]:
# Complement mask: True where the cell holds a value
df.notnull()

Unnamed: 0,AAPL,MSFT,TSLA,LULU
20201201,False,True,True,True
20201202,False,False,True,True
20201203,False,False,False,False
20201204,False,True,True,True


In [11]:
# Number of non-missing entries per column; AAPL has none
df.count()

AAPL    0
MSFT    2
TSLA    3
LULU    3
dtype: int64

## Dropping Missing Data

In [None]:
# Boolean-mask selection keeps only the entries that are present
ser[ser.notnull()]

In [None]:
# dropna() gives the same result as ser[ser.notnull()] without building the mask by hand
ser.dropna()

In [None]:
# By default (how='any') dropna drops every row that contains at least one NaN.
# Every row of df has a NaN in the AAPL column, so the result here is empty.
df.dropna()

In [None]:
# df itself is unchanged: dropna() returned a new DataFrame rather than modifying it in place
df

In [None]:
# how='all' drops a row only when every value in it is NaN (here: 20201203)
df.dropna(how='all')

In [None]:
# thresh=3 keeps rows with at least 3 non-NaN values (here: 20201201 and 20201204)
df.dropna(thresh=3)

In [None]:
# axis=1 applies the threshold to columns: only TSLA and LULU have at least 3 non-NaN values
df.dropna(axis=1,thresh=3)

## Filling Missing Data

In [None]:
# Starting point for the fill examples: TSLA is NaN
ser

In [None]:
# Replace every NaN with a constant; returns a new Series, ser is not modified
ser.fillna(0)

In [None]:
# Starting point for the DataFrame fill examples
df

In [None]:
# Same for a DataFrame: every NaN cell becomes 0 in the returned copy
df.fillna(0)

In [None]:
# A dict supplies a separate fill value per column, keyed by column name
df.fillna({'AAPL':0.02,'TSLA':0.02,'MSFT':0.02,'LULU':0.01})

In [None]:
# Column means computed over the non-NaN values only.
# AAPL has no values at all, so its mean is NaN.
avg=df.mean()
avg

In [None]:
# Fill each column's gaps with that column's mean (avg is matched to df by column name).
# AAPL stays NaN because its mean is itself NaN.
df.fillna(avg)

In [None]:
# Forward-fill each column from the previous row, filling at most one
# consecutive NaN per gap (limit=1). Leading NaNs (e.g. all of AAPL) have
# no earlier value to copy and therefore stay NaN.
# DataFrame.ffill is used instead of fillna(method='ffill'): the `method`
# keyword of fillna is deprecated (FutureWarning since pandas 2.1).
df.ffill(limit=1)

In [None]:
# None of the fill operations above modified df in place; the original NaN values are still there
df

## Pandas Functions Auto Exclude NaN

In [None]:
# Reductions such as mean() skip NaN by default (skipna=True)
df.mean()

In [None]:
# Manual mean over the observed values only: NaN counts as 0 in the
# numerator, and the denominator is the per-column number of non-NaN entries.
df.fillna(0).sum().div(df.count())

In [None]:
# NumPy on the raw array does NOT skip NaN: a row mean (axis=1) is nan as soon as
# the row contains one NaN, which is every row here because of the AAPL column.
# Presumably shown as a contrast with pandas' NaN-skipping reductions;
# np.nanmean is the NaN-skipping NumPy counterpart.
np.mean(df.values,1)

In [None]:
# Ranks values within each column; NaN cells are left as NaN (na_option='keep' is the default)
df.rank()