<a href="https://colab.research.google.com/github/manojcsathreya/Pandas/blob/main/8_Pandas_Fix_missing_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#How to handle missing data in pandas?

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample Series of names with one missing entry at index 1
s = pd.Series(data=["Sam", np.nan, "Tim", "Kim"])
s

0    Sam
1    NaN
2    Tim
3    Kim
dtype: object

In [3]:
# Boolean mask over the Series: True where the value is null (NaN), False otherwise
s.isnull() 

0    False
1     True
2    False
3    False
dtype: bool

In [4]:
# Inverse mask: True where the value is present (not null)
s.notnull()

0     True
1    False
2     True
3     True
dtype: bool

In [6]:
# Overwrite the entry at index label 3 with a missing value.
# Use np.nan: the capitalised alias np.NaN was removed in NumPy 2.0.
s[3] = np.nan

In [7]:
# Show the Series after index 3 was overwritten with NaN
s

0    Sam
1    NaN
2    Tim
3    NaN
dtype: object

In [8]:
# Drop the entries whose value is null; the result keeps the original index labels
s.dropna()

0    Sam
2    Tim
dtype: object

In [9]:
# Short alias for the missing-value marker used in the frames below.
# Import `nan` (not `NaN`): the `NaN` alias was removed in NumPy 2.0.
from numpy import nan as NA

In [10]:
# 3x3 frame: row 0 is complete, row 1 has one missing value, row 2 is entirely missing
df = pd.DataFrame(
    [
        [1, 2, 3],
        [4, NA, 5],
        [NA, NA, NA],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,,


In [11]:
# Drop every row that contains at least one null value; only the complete row 0 remains
df.dropna()

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [13]:
# dropna() returned a new frame; the original df still holds all three rows
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,,


In [16]:
# The `how` parameter controls the drop rule
df.dropna(how='all')
# how='all' drops a row only when every value in it is null (here: row 2)

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0


In [17]:
# how='any' drops a row as soon as it has a single null value (here: rows 1 and 2)
df.dropna(how='any')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [19]:
# Show df again: the dropna() calls above did not modify it
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,,


In [24]:
# thresh=N keeps only the rows that have at least N non-null values
df.dropna(thresh=1)
# with thresh=1 only the all-null row (row 2) is dropped; row 1 has two non-null values and stays

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0


In [25]:
# Replace every NaN in the frame with the same scalar (0); returns a new frame
df.fillna(0)

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,0.0,5.0
2,0.0,0.0,0.0


In [26]:
# Show df again: fillna(0) above did not modify it
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,,


In [29]:
# A dict fills column-wise: each key is a column label, each value the fill for that column
df.fillna({0:15,1:25,2:35})
# NaNs in column 0 become 15, in column 1 become 25, and in column 2 become 35

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,25.0,5.0
2,15.0,25.0,35.0


In [30]:
#dropna() and fillna() return new objects and leave df untouched; to keep the result, assign it back (df = df.dropna()) or pass inplace=True

In [33]:
# ffill (forward fill) replaces each missing value with the last valid value
# above it in the same column.
# Use DataFrame.ffill(): fillna(method='ffill') is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,2.0,5.0
2,4.0,2.0,5.0


In [34]:
# limit=N caps how many consecutive NaNs are forward-filled in each column;
# with limit=1 the second NaN in column 1 stays missing.
# Use DataFrame.ffill(): fillna(method='ffill') is deprecated since pandas 2.1.
df.ffill(limit=1)

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,2.0,5.0
2,4.0,,5.0


In [35]:
# Show df again: the forward-fill calls above did not modify it
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,,


In [37]:
# Set the last row of column 0 to 1 in a single indexing step.
# The chained form df[0].iloc[-1] = 1 assigns through an intermediate Series:
# it raises SettingWithCopyWarning and does not update df under Copy-on-Write.
df.iloc[-1, 0] = 1
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,1.0,,


In [39]:
# With axis=1 the fill runs across columns: each NaN takes the last valid
# value to its left in the same row.
# Use DataFrame.ffill(): fillna(method='ffill') is deprecated since pandas 2.1.
df.ffill(axis=1)

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,4.0,5.0
2,1.0,1.0,1.0


In [40]:
# Rebuild the original 3x3 frame so the last row is entirely missing again
df = pd.DataFrame(
    [
        [1, 2, 3],
        [4, NA, 5],
        [NA, NA, NA],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,,


In [41]:
# Numeric Series with a single missing value at index 2
data = pd.Series(data=[1, 0, NA, 5])
data

0    1.0
1    0.0
2    NaN
3    5.0
dtype: float64

In [42]:
# Impute the missing entry with the mean of the present values: (1 + 0 + 5) / 3 = 2.0
data.fillna(data.mean())

0    1.0
1    0.0
2    2.0
3    5.0
dtype: float64

In [44]:
# df.mean() gives one mean per column; each column's NaNs are filled with that column's mean
df.fillna(df.mean())

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,2.0,5.0
2,2.5,2.0,4.0
