In [1]:
import numpy as np
import pandas as pd

In [17]:
# Build a small 3x4 demo frame containing a few NaN cells
df = pd.DataFrame(
    [
        [1, 2, 3, 4],
        [5, np.nan, np.nan, 6],
        [7, np.nan, 8, 9],
    ]
)

In [18]:
# Display the frame to see where the missing values sit
df

Unnamed: 0,0,1,2,3
0,1,2.0,3.0,4
1,5,,,6
2,7,,8.0,9


In [19]:
# Boolean mask with the same shape as df: True where a cell is NaN, False otherwise
df.isna()

Unnamed: 0,0,1,2,3
0,False,False,False,False
1,False,True,True,False
2,False,True,False,False


In [5]:
# With default arguments, dropna() drops every row that contains at least one NaN,
# so only rows that are completely null-free are returned.
# NOTE(review): this cell's execution count (5) predates the cell that defines df (17),
# and the saved output has three columns while df has four - it looks stale; re-run to refresh.
df.dropna()

Unnamed: 0,0,1,2
0,1,2.0,3.0


In [8]:
# axis=0 (the default) drops ROWS that contain any NaN, keeping only null-free rows.
# axis=1 drops COLUMNS that contain any NaN, keeping only null-free columns.
# NOTE(review): this cell's execution count (8) predates the cell that defines df (17),
# and the saved output does not match the current df (whose columns 0 and 3 are the
# null-free ones) - it looks stale; re-run to refresh.
df.dropna(axis=1)

Unnamed: 0,0
0,1
1,4
2,6


In [27]:
# how='all' drops a row only when EVERY value in it is NaN. No row of df is entirely
# NaN, so nothing is dropped and the frame comes back unchanged.
# To tolerate a few NaNs per row, use thresh instead: thresh=N keeps only the rows
# that have at least N non-null values, e.g. df.dropna(thresh=3).
df.dropna(how='all')

Unnamed: 0,0,1,2,3
0,1,2.0,3.0,4
1,5,,,6
2,7,,8.0,9


In [22]:
# Replace every NaN with a fixed default value (0 here); the result is a new frame
df.fillna(0)

Unnamed: 0,0,1,2,3
0,1,2.0,3.0,4
1,5,0.0,0.0,6
2,7,0.0,8.0,9


In [30]:
# There are two directional fill methods: forward fill (ffill) and backward fill (bfill).
# ffill(axis=0) replaces each NaN with the last valid value above it in the same column.
# A NaN in the first row has nothing above it and would stay NaN.
df.ffill(axis=0)

Unnamed: 0,0,1,2,3
0,1,2.0,3.0,4
1,5,2.0,3.0,6
2,7,2.0,8.0,9


In [31]:
# ffill(axis=1) replaces each NaN with the last valid value to its left in the same row.
# A NaN in the first column has nothing to its left and would stay NaN.
df.ffill(axis=1)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,4.0
1,5.0,5.0,5.0,6.0
2,7.0,7.0,8.0,9.0


In [32]:
# bfill is the mirror image of ffill: each NaN takes the next valid value below it
# (axis=0) or to its right (axis=1). Here column 1 has no valid value below its NaNs,
# so they remain NaN.
df.bfill(axis=0)

Unnamed: 0,0,1,2,3
0,1,2.0,3.0,4
1,5,,8.0,6
2,7,,8.0,9


In [34]:
# limit caps how many consecutive NaNs are filled in each gap; with limit=1 only the
# NaN nearest to the fill source is replaced and the rest of the gap stays NaN.
# limit works with both ffill and bfill.
df.bfill(axis=1, limit=1)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,4.0
1,5.0,,6.0,6.0
2,7.0,8.0,8.0,9.0


In [38]:
# Load the Titanic sample; the file is read as tab-separated (sep='\t') despite its .csv name
df_tt = pd.read_csv('titanic.csv', sep='\t')

In [39]:
# Preview the first rows of the loaded frame
df_tt.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [40]:
# count() returns the number of non-null values in each column
df_tt.count()

PassengerId    156
Survived       156
Pclass         156
Name           156
Sex            156
Age            126
SibSp          156
Parch          156
Ticket         156
Fare           156
Cabin           31
Embarked       155
dtype: int64

In [43]:
# shape is (rows, columns): the frame has 156 rows and 12 columns
df_tt.shape

(156, 12)

In [45]:
# Number of NaN values in each column:
# Age has 30, Cabin has 125 and Embarked has 1
df_tt.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             30
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          125
Embarked         1
dtype: int64

In [46]:
# Keep only the rows with no NaN at all: a row is dropped as soon as any value is missing
df_tt_dropped = df_tt.dropna(axis=0, how='any')

In [47]:
# Shape of the original frame, for comparison
df_tt.shape

(156, 12)

In [48]:
# Shape after dropping every row that contains a NaN
df_tt_dropped.shape

(27, 12)

In [49]:
# Dropping every row with any NaN discards most of the data, so try a gentler rule:
# how='all' drops a row only when every one of its values is NaN.
df_tt_dropped_all = df_tt.dropna(how='all')

In [52]:
# Same shape as the original frame: nothing was dropped by how='all'
df_tt_dropped_all.shape

(156, 12)

In [53]:
# Rather than dropping rows, substitute a default (0) for every missing value
df_tt_replace_zeros = df_tt.fillna(value=0)

In [57]:
# Every NaN has been replaced with 0, so no null values remain.
# However, one blanket fill value is not appropriate for every column.
df_tt_replace_zeros.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [58]:
# Null counts of df_tt itself, to compare against the zero-filled copy
df_tt.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             30
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          125
Embarked         1
dtype: int64

In [60]:
# Filling NaN with 0 makes no sense for Cabin, so first inspect the distinct values it holds
df_tt.Cabin.unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2'], dtype=object)

In [61]:
# Replace every NaN in Cabin with the placeholder 'C2', updating df_tt itself.
# The filled column is assigned back instead of calling fillna(..., inplace=True) on
# df_tt.Cabin: an in-place call on a selected column is chained assignment, which
# raises a FutureWarning in pandas 2.x and does not update df_tt under Copy-on-Write.
df_tt['Cabin'] = df_tt['Cabin'].fillna('C2')

In [62]:
# Now we have non null data for most of the columns
df_tt.isna().sum()

PassengerId     0
Survived        0
Pclass          0
Name            0
Sex             0
Age            30
SibSp           0
Parch           0
Ticket          0
Fare            0
Cabin           0
Embarked        1
dtype: int64

In [67]:
# Fill missing ages with the column mean (rounded), which is a more sensible default
# than 0, and update df_tt itself.
# The filled column is assigned back instead of calling fillna(..., inplace=True) on
# df_tt.Age: an in-place call on a selected column is chained assignment, which
# raises a FutureWarning in pandas 2.x and does not update df_tt under Copy-on-Write.
df_tt['Age'] = df_tt['Age'].fillna(df_tt['Age'].mean().round())

In [68]:
# Only one column still contains a null value
df_tt.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       1
dtype: int64

In [69]:
# Same approach as for Cabin: inspect the distinct values before choosing a fill value
df_tt.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [70]:
# Replace the missing Embarked value with the default 'S', updating df_tt itself.
# The filled column is assigned back instead of calling fillna(..., inplace=True) on
# df_tt.Embarked: an in-place call on a selected column is chained assignment, which
# raises a FutureWarning in pandas 2.x and does not update df_tt under Copy-on-Write.
df_tt['Embarked'] = df_tt['Embarked'].fillna('S')

In [71]:
# Every column of df_tt is now free of null values
df_tt.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64