In [2]:
import pandas as pd
import numpy as np

In [3]:
# Toy dataset for the imputation walkthrough. Each column carries at least one
# np.nan so every strategy below has something to fill or drop.
raw_columns = dict(
    Gender=['M', 'F', 'M', 'M', 'F', 'M', np.nan, 'M'],
    Manpower=[25, np.nan, 33, np.nan, 25, 29, 26, 32],
    Sales=[343, 280, 332, 272, np.nan, 326, 259, 297],
)
df = pd.DataFrame(raw_columns)

In [4]:
# Display the raw frame; the blank cells in the output are the np.nan entries.
df

Unnamed: 0,Gender,Manpower,Sales
0,M,25.0,343.0
1,F,,280.0
2,M,33.0,332.0
3,M,,272.0
4,F,25.0,
5,M,29.0,326.0
6,,26.0,259.0
7,M,32.0,297.0


## Drop missing values

In [5]:
# Drop every row that has a missing value in any column (returns a new frame).
df.dropna(axis='index', how='any')

Unnamed: 0,Gender,Manpower,Sales
0,M,25.0,343.0
2,M,33.0,332.0
5,M,29.0,326.0
7,M,32.0,297.0


In [6]:
# Drop every column that has a missing value. All three columns contain a NaN,
# so only the index is left.
df.dropna(axis='columns', how='any')

0
1
2
3
4
5
6
7


In [7]:
# Only look at Gender when deciding which rows to drop; gaps in the other
# columns are kept.
df.dropna(axis='index', subset=['Gender'], how='all')

Unnamed: 0,Gender,Manpower,Sales
0,M,25.0,343.0
1,F,,280.0
2,M,33.0,332.0
3,M,,272.0
4,F,25.0,
5,M,29.0,326.0
7,M,32.0,297.0


### Statistical imputation

#### Generalized imputation

In [19]:
# mode() returns a Series of the most frequent value(s); take the first one.
mode_gender = df['Gender'].mode().iloc[0]
mode_gender

'M'

In [20]:
# Index labels of the rows whose Gender is missing.
missing_index_gender = df.index[df['Gender'].isna()]

In [21]:
# Fill the missing Gender entries with the mode. This modifies df in place,
# so every later cell sees the imputed Gender column.
df.loc[missing_index_gender,'Gender'] = mode_gender

In [22]:
# Re-display df: the previously missing Gender (row 6) now holds the mode.
df

Unnamed: 0,Gender,Manpower,Sales
0,M,25.0,343.0
1,F,,280.0
2,M,33.0,332.0
3,M,,272.0
4,F,25.0,
5,M,29.0,326.0
6,M,26.0,259.0
7,M,32.0,297.0


In [23]:
# Median of the observed Manpower values (NaNs are skipped by default).
median_power = df['Manpower'].median()

In [30]:
# Preview of median imputation. fillna returns a new Series and the result is
# not stored, so df itself still contains the missing Manpower values.
df['Manpower'].fillna(value=median_power)

0    25.0
1    27.5
2    33.0
3    27.5
4    25.0
5    29.0
6    26.0
7    32.0
Name: Manpower, dtype: float64

In [33]:
# Re-display df: Manpower is still missing in rows 1 and 3 because the
# fillna result was only displayed, never assigned back.
df

Unnamed: 0,Gender,Manpower,Sales
0,M,25.0,343.0
1,F,,280.0
2,M,33.0,332.0
3,M,,272.0
4,F,25.0,
5,M,29.0,326.0
6,M,26.0,259.0
7,M,32.0,297.0


#### Similar-case imputation

In [34]:
# Per-gender mean of each numeric column; NaNs are ignored within each group.
df.groupby(by='Gender', sort=True).mean()

Unnamed: 0_level_0,Manpower,Sales
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,25.0,280.0
M,29.0,304.833333


In [35]:
# Mean Manpower per gender, computed once and looked up by label.
# Label-based access ('F' / 'M') does not depend on how the groups happen to
# be sorted or on the position of the Manpower column, unlike iloc[0, 0].
manpower_mean_by_gender = df.groupby('Gender')['Manpower'].mean()
mean_F = manpower_mean_by_gender.loc['F']
mean_M = manpower_mean_by_gender.loc['M']

In [37]:
# Index labels of rows with missing Manpower, split by gender.
manpower_missing = df['Manpower'].isna()
miss_M_index = df.index[manpower_missing & df['Gender'].eq('M')]
miss_F_index = df.index[manpower_missing & df['Gender'].eq('F')]


In [38]:
# Fill each gender's missing Manpower with that gender's mean (in place).
for target_index, group_mean in ((miss_M_index, mean_M), (miss_F_index, mean_F)):
    df.loc[target_index, 'Manpower'] = group_mean


In [39]:
# Re-display df: rows 1 and 3 now hold the mean Manpower of their gender
# group; Sales in row 4 is still missing.
df

Unnamed: 0,Gender,Manpower,Sales
0,M,25.0,343.0
1,F,25.0,280.0
2,M,33.0,332.0
3,M,29.0,272.0
4,F,25.0,
5,M,29.0,326.0
6,M,26.0,259.0
7,M,32.0,297.0


In [40]:
## Predictive model imputation

In [42]:
# Estimate the remaining gaps with a 2nd-order spline fitted over the row index
# (needs scipy). Interpolation is only defined for numeric data, so it is
# applied to the numeric columns only; calling interpolate on a frame that
# still contains the object-dtype Gender column is deprecated in recent pandas.
numeric_cols = df.select_dtypes(include='number').columns
df_interpolated = df.copy()
df_interpolated[numeric_cols] = df[numeric_cols].interpolate(method='spline', order=2)
df_interpolated

Unnamed: 0,Gender,Manpower,Sales
0,M,25.0,343.0
1,F,25.0,280.0
2,M,33.0,332.0
3,M,29.0,272.0
4,F,25.0,299.521892
5,M,29.0,326.0
6,M,26.0,259.0
7,M,32.0,297.0
