In [1]:
import pandas as pd

In [2]:
# Sample data to play with and clean
data = {
    'age': [27, 50, 34, None, None, None],
    'gender': ['f', 'f', 'f', 'm', 'm', None],
    'height' : [64, None, 71, 66, 68, None],
    'weight' : [140, None, 130, 110, 160, None],
}
df = pd.DataFrame(data)

In [3]:
# Full dataset
print(df)

    age gender  height  weight
0  27.0      f    64.0   140.0
1  50.0      f     NaN     NaN
2  34.0      f    71.0   130.0
3   NaN      m    66.0   110.0
4   NaN      m    68.0   160.0
5   NaN   None     NaN     NaN


In [4]:
# Drop all rows that have any missing values in any column
print(df.dropna())

    age gender  height  weight
0  27.0      f    64.0   140.0
2  34.0      f    71.0   130.0


In [6]:
# Drop only rows where all values are missing
print(df.dropna(how='all'))

    age gender  height  weight
0  27.0      f    64.0   140.0
1  50.0      f     NaN     NaN
2  34.0      f    71.0   130.0
3   NaN      m    66.0   110.0
4   NaN      m    68.0   160.0


In [7]:
# Drop only rows where more than two values are missing
print(df.dropna(thresh=2))

    age gender  height  weight
0  27.0      f    64.0   140.0
1  50.0      f     NaN     NaN
2  34.0      f    71.0   130.0
3   NaN      m    66.0   110.0
4   NaN      m    68.0   160.0


In [9]:
# Drop all rows that have any missing values in the 'gender' or 'height' columns
print(df.dropna(subset=['gender', 'height']))

    age gender  height  weight
0  27.0      f    64.0   140.0
2  34.0      f    71.0   130.0
3   NaN      m    66.0   110.0
4   NaN      m    68.0   160.0


In [10]:
print(df.dropna(subset=['height', 'weight']))

    age gender  height  weight
0  27.0      f    64.0   140.0
2  34.0      f    71.0   130.0
3   NaN      m    66.0   110.0
4   NaN      m    68.0   160.0


In [12]:
# For each numeric column, replace the missing values with the mean for that column
df.fillna(df.mean(), inplace=True)
print(df)

    age gender  height  weight
0  27.0      f   64.00   140.0
1  50.0      f   67.25   135.0
2  34.0      f   71.00   130.0
3  37.0      m   66.00   110.0
4  37.0      m   68.00   160.0
5  37.0   None   67.25   135.0


In [13]:
# For each column, replace the missing values with the most common value for that
# column. Useful for filling in missing categorical values.
# As written, this command will fill in missing values for both numerical and
# categorical columns.
df = pd.DataFrame(data)
df = df.apply(lambda x:x.fillna(x.value_counts().index[0]))
print(df)

    age gender  height  weight
0  27.0      f    64.0   140.0
1  50.0      f    68.0   160.0
2  34.0      f    71.0   130.0
3  34.0      m    66.0   110.0
4  34.0      m    68.0   160.0
5  34.0      f    68.0   160.0


In [14]:
df = pd.DataFrame(data)
print(df)

    age gender  height  weight
0  27.0      f    64.0   140.0
1  50.0      f     NaN     NaN
2  34.0      f    71.0   130.0
3   NaN      m    66.0   110.0
4   NaN      m    68.0   160.0
5   NaN   None     NaN     NaN


In [16]:
df.fillna(df.mean(), inplace=True)
print(df)

    age gender  height  weight
0  27.0      f   64.00   140.0
1  50.0      f   67.25   135.0
2  34.0      f   71.00   130.0
3  37.0      m   66.00   110.0
4  37.0      m   68.00   160.0
5  37.0   None   67.25   135.0


In [19]:
df = pd.DataFrame(data)
df.fillna(df.mode(), inplace=True)
print(df)

    age gender  height  weight
0  27.0      f    64.0   140.0
1  50.0      f    66.0   130.0
2  34.0      f    71.0   130.0
3   NaN      m    66.0   110.0
4   NaN      m    68.0   160.0
5   NaN    NaN     NaN     NaN
