In [1]:
import pandas as pd
import numpy as np

# Read the semicolon-delimited penguin file and keep only the two columns
# this notebook works with: the grouping key and the measurement to impute.
df = pd.read_csv('penguins_simple.csv', sep=';')[['Species', 'Culmen Depth (mm)']]
# Preview the first five rows.
df.head(5)

Unnamed: 0,Species,Culmen Depth (mm)
0,Adelie,18.7
1,Adelie,17.4
2,Adelie,18.0
3,Adelie,19.3
4,Adelie,20.6


In [2]:
# Introduce 150 artificial NaNs in random row positions so that there is
# something to impute (don't do this for the Titanic data!).
# random_state pins the sample, so the same rows are blanked on every run.
ids = df.sample(150, random_state=42).index
# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises
# AttributeError there, while np.nan works on every NumPy version.
df.loc[ids, 'Culmen Depth (mm)'] = np.nan

In [3]:
# Preview the first five rows again; some measurements are now missing.
df.head(5)

Unnamed: 0,Species,Culmen Depth (mm)
0,Adelie,18.7
1,Adelie,17.4
2,Adelie,18.0
3,Adelie,
4,Adelie,20.6


In [4]:
# Approach 1: impute every gap with one overall median.
# The median is taken over the column as it stands, i.e. after the NaNs
# were injected.
median = df['Culmen Depth (mm)'].median()
# Show the fill value followed by a blank line.
print(median, end='\n\n')
# Store the imputed values in a new column so the original stays untouched.
df['fill_1'] = df['Culmen Depth (mm)'].fillna(value=median)

17.3



In [5]:
# Approach 2. fill by median for each species
# Print the per-species medians (for illustration only). This uses the
# median rather than the mean so the numbers shown are the same statistic
# that this approach fills with.
print(df.groupby('Species')['Culmen Depth (mm)'].median().round(1))
print()

Species
Adelie       18.4
Chinstrap    18.5
Gentoo       15.0
Name: Culmen Depth (mm), dtype: float64



In [6]:
# transform() returns one value per original row (the median of that row's
# species), so the result lines up with df row for row.
medians = (
    df.groupby('Species')['Culmen Depth (mm)']
    .transform('median')  # 'mean', 'sum' or a custom function work as well
)
# fillna() with a Series matches fill values by index label, which is why
# `medians` must carry the same index as df.
df['fill_2'] = df['Culmen Depth (mm)'].fillna(value=medians)
# Print the original column and both imputations side by side.
print(df)

    Species  Culmen Depth (mm)  fill_1  fill_2
0    Adelie               18.7    18.7    18.7
1    Adelie               17.4    17.4    17.4
2    Adelie               18.0    18.0    18.0
3    Adelie                NaN    17.3    18.4
4    Adelie               20.6    20.6    20.6
..      ...                ...     ...     ...
328  Gentoo                NaN    17.3    15.0
329  Gentoo               14.3    14.3    14.3
330  Gentoo                NaN    17.3    15.0
331  Gentoo               14.8    14.8    14.8
332  Gentoo               16.1    16.1    16.1

[333 rows x 4 columns]
