### Impute/Replace Missing Values

- Dataset: msleep.csv
- Learning Date: 13-Dec-23
- Learning from: Prasert Kanawattanachai (CBS)
    - Github: https://github.com/prasertcbs/

In [1]:
# import libraries

import pandas as pd 
import numpy as np 

In [2]:
# Record the pandas version this notebook was run with
pandas_version = pd.__version__
print(f'Pandas version: {pandas_version}')

Pandas version: 1.5.2


In [3]:
# Load the msleep dataset from the tutorial repository into a DataFrame
MSLEEP_URL = 'https://github.com/prasertcbs/tutorial/raw/master/msleep.csv'

df = pd.read_csv(MSLEEP_URL)
df.head(4)

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
0,Cheetah,Acinonyx,carni,Carnivora,lc,12.1,,,11.9,,50.0
1,Owl monkey,Aotus,omni,Primates,,17.0,1.8,,7.0,0.0155,0.48
2,Mountain beaver,Aplodontia,herbi,Rodentia,nt,14.4,2.4,,9.6,,1.35
3,Greater short-tailed shrew,Blarina,omni,Soricomorpha,lc,14.9,2.3,0.133333,9.1,0.00029,0.019


In [4]:
# Draw a random 20-row sample; the fixed random_state makes the same rows come back on every run.
# NOTE: this rebinds df, so the full dataset is no longer reachable under that name below.
df = df.sample(n=20, random_state=123)
df.head(4)

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
69,Arctic ground squirrel,Spermophilus,herbi,Rodentia,lc,16.6,,,7.4,0.0057,0.92
75,Eastern american chipmunk,Tamias,herbi,Rodentia,,15.8,,,8.2,,0.112
4,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667,20.0,0.423,600.0
78,Tree shrew,Tupaia,omni,Scandentia,,8.9,2.6,0.233333,15.1,0.0025,0.104


In [5]:
df.shape  # (rows, columns) of the sampled frame

(20, 11)

In [6]:
df.info()  # the non-null count per column shows which columns contain missing values

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20 entries, 69 to 81
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          20 non-null     object 
 1   genus         20 non-null     object 
 2   vore          18 non-null     object 
 3   order         20 non-null     object 
 4   conservation  14 non-null     object 
 5   sleep_total   20 non-null     float64
 6   sleep_rem     14 non-null     float64
 7   sleep_cycle   9 non-null      float64
 8   awake         20 non-null     float64
 9   brainwt       16 non-null     float64
 10  bodywt        20 non-null     float64
dtypes: float64(6), object(5)
memory usage: 1.9+ KB


### Check for NA in the 'vore' column

In [7]:
# Show the rows whose 'vore' value is missing
df.loc[df['vore'].isna()]

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
62,Rock hyrax,Procavia,,Hyracoidea,lc,5.4,0.5,,18.6,0.021,3.6
72,Musk shrew,Suncus,,Soricomorpha,,12.8,2.0,0.183333,11.2,0.00033,0.048


### scikit-learn: SimpleImputer

In [8]:
import sklearn
# fillna() is the usual choice for a DataFrame, but SimpleImputer is more flexible
# when the data is not in DataFrame form (e.g. a NumPy array).
from sklearn.impute import SimpleImputer

In [9]:
# Apply SimpleImputer to one column of a pandas DataFrame.
# 'most_frequent' replaces each NA with the most common value in the column.
imp = SimpleImputer(strategy='most_frequent')

# fit_transform expects 2-D input, so select the column with double brackets
# (a one-column DataFrame) rather than as a Series.
vore_frame = df[['vore']]
imp.fit_transform(vore_frame)

array([['herbi'],
       ['herbi'],
       ['herbi'],
       ['omni'],
       ['herbi'],
       ['herbi'],
       ['omni'],
       ['omni'],
       ['herbi'],
       ['carni'],
       ['carni'],
       ['carni'],
       ['herbi'],
       ['herbi'],
       ['carni'],
       ['herbi'],
       ['herbi'],
       ['omni'],
       ['herbi'],
       ['carni']], dtype=object)

In [10]:
df.head()  # df itself is unchanged: the imputed array above was only displayed, not assigned back

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
69,Arctic ground squirrel,Spermophilus,herbi,Rodentia,lc,16.6,,,7.4,0.0057,0.92
75,Eastern american chipmunk,Tamias,herbi,Rodentia,,15.8,,,8.2,,0.112
4,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667,20.0,0.423,600.0
78,Tree shrew,Tupaia,omni,Scandentia,,8.9,2.6,0.233333,15.1,0.0025,0.104
29,Giraffe,Giraffa,herbi,Artiodactyla,cd,1.9,0.4,,22.1,,899.995


In [11]:
# 'vore' still has its missing rows, because nothing has been written back to df yet
df.loc[df['vore'].isna()]

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
62,Rock hyrax,Procavia,,Hyracoidea,lc,5.4,0.5,,18.6,0.021,3.6
72,Musk shrew,Suncus,,Soricomorpha,,12.8,2.0,0.183333,11.2,0.00033,0.048


In [12]:
# Store the most-frequent imputation in a new column so the original 'vore' stays intact.
# The input is selected with double brackets because fit_transform expects 2-D data.
vore_mode_filled = imp.fit_transform(df[['vore']])
df['vore2'] = vore_mode_filled
df.head(4)

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt,vore2
69,Arctic ground squirrel,Spermophilus,herbi,Rodentia,lc,16.6,,,7.4,0.0057,0.92,herbi
75,Eastern american chipmunk,Tamias,herbi,Rodentia,,15.8,,,8.2,,0.112,herbi
4,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667,20.0,0.423,600.0,herbi
78,Tree shrew,Tupaia,omni,Scandentia,,8.9,2.6,0.233333,15.1,0.0025,0.104,omni


In [13]:
# Rows where the imputed column 'vore2' is still NA (none expected after imputation)
df.loc[df['vore2'].isna()]

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt,vore2


In [14]:
# For the rows where 'vore' was missing, show the original next to the imputed value
vore_missing = df['vore'].isna()
df.loc[vore_missing, ['name', 'vore', 'vore2']]

Unnamed: 0,name,vore,vore2
62,Rock hyrax,,herbi
72,Musk shrew,,herbi


In [15]:
# 'constant' strategy: every NA is replaced with the fixed value 'omni'
imp2 = SimpleImputer(fill_value='omni', strategy='constant')
imp2

In [16]:
# Store the constant-value imputation of 'vore' in its own column
vore_const_filled = imp2.fit_transform(df[['vore']])
df['vore3'] = vore_const_filled
df.head(4)

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt,vore2,vore3
69,Arctic ground squirrel,Spermophilus,herbi,Rodentia,lc,16.6,,,7.4,0.0057,0.92,herbi,herbi
75,Eastern american chipmunk,Tamias,herbi,Rodentia,,15.8,,,8.2,,0.112,herbi,herbi
4,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667,20.0,0.423,600.0,herbi,herbi
78,Tree shrew,Tupaia,omni,Scandentia,,8.9,2.6,0.233333,15.1,0.0025,0.104,omni,omni


In [17]:
# Compare both imputations on the rows where 'vore' was originally missing
vore_missing = df['vore'].isna()
df.loc[vore_missing, ['name', 'vore', 'vore2', 'vore3']]

Unnamed: 0,name,vore,vore2,vore3
62,Rock hyrax,,herbi,omni
72,Musk shrew,,herbi,omni


In [18]:
df  # the sampled frame, with the original 'vore' alongside the imputed 'vore2' / 'vore3' columns

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt,vore2,vore3
69,Arctic ground squirrel,Spermophilus,herbi,Rodentia,lc,16.6,,,7.4,0.0057,0.92,herbi,herbi
75,Eastern american chipmunk,Tamias,herbi,Rodentia,,15.8,,,8.2,,0.112,herbi,herbi
4,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667,20.0,0.423,600.0,herbi,herbi
78,Tree shrew,Tupaia,omni,Scandentia,,8.9,2.6,0.233333,15.1,0.0025,0.104,omni,omni
29,Giraffe,Giraffa,herbi,Artiodactyla,cd,1.9,0.4,,22.1,,899.995,herbi,herbi
62,Rock hyrax,Procavia,,Hyracoidea,lc,5.4,0.5,,18.6,0.021,3.6,herbi,omni
24,European hedgehog,Erinaceus,omni,Erinaceomorpha,lc,10.1,3.5,0.283333,13.9,0.0035,0.77,omni,omni
37,Macaque,Macaca,omni,Primates,,10.1,1.2,0.75,13.9,0.179,6.8,omni,omni
76,Brazilian tapir,Tapirus,herbi,Perissodactyla,vu,4.4,1.0,0.9,19.6,0.169,207.501,herbi,herbi
31,Gray seal,Haliochoerus,carni,Carnivora,lc,6.2,1.5,,17.8,0.325,85.0,carni,carni


In [19]:
# Other strategies that could be used for a numeric column: 'mean', 'median'.
# This cell uses 'constant' to mark every missing sleep_rem with the sentinel value -99.
imp3 = SimpleImputer(fill_value=-99, strategy='constant')

sleep_rem_filled = imp3.fit_transform(df[['sleep_rem']])
df['sleep_rem4'] = sleep_rem_filled
df.tail(4)

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt,vore2,vore3,sleep_rem4
23,Donkey,Equus,herbi,Perissodactyla,domesticated,3.1,0.4,,20.9,0.419,187.0,herbi,herbi,0.4
19,North American Opossum,Didelphis,omni,Didelphimorphia,lc,18.0,4.9,0.333333,6.0,0.0063,1.7,omni,omni,4.9
72,Musk shrew,Suncus,,Soricomorpha,,12.8,2.0,0.183333,11.2,0.00033,0.048,herbi,omni,2.0
81,Arctic fox,Vulpes,carni,Carnivora,,12.5,,,11.5,0.0445,3.38,carni,carni,-99.0


In [20]:
# For the rows where sleep_rem was missing, show the original next to the sentinel-filled column
sleep_rem_missing = df['sleep_rem'].isna()
df.loc[sleep_rem_missing, ['name', 'sleep_rem', 'sleep_rem4']]

Unnamed: 0,name,sleep_rem,sleep_rem4
69,Arctic ground squirrel,,-99.0
75,Eastern american chipmunk,,-99.0
44,Slow loris,,-99.0
9,Roe deer,,-99.0
59,Common porpoise,,-99.0
81,Arctic fox,,-99.0


In [22]:
# Reverse direction: treat the sentinel -99 as the "missing" marker and replace it with NaN.
# (Alternatives tried in this cell: strategy='mean', strategy='median'.)
imp_x = SimpleImputer(missing_values=-99, fill_value=np.nan, strategy='constant')

sleep_rem_restored = imp_x.fit_transform(df[['sleep_rem4']])
df['sleep_rem5'] = sleep_rem_restored

# Rows that carried the sentinel, shown with the restored column
is_sentinel = df['sleep_rem4'].eq(-99)
df.loc[is_sentinel, ['name', 'sleep_rem4', 'sleep_rem5']]

Unnamed: 0,name,sleep_rem4,sleep_rem5
69,Arctic ground squirrel,-99.0,
75,Eastern american chipmunk,-99.0,
44,Slow loris,-99.0,
9,Roe deer,-99.0,
59,Common porpoise,-99.0,
81,Arctic fox,-99.0,


In [23]:
# Fill the NaN values in sleep_rem5 with the column mean.
# (Alternatives tried in this cell: strategy='median', or the -99 -> NaN constant imputer.)
imp_x = SimpleImputer(strategy='mean')

sleep_rem_mean_filled = imp_x.fit_transform(df[['sleep_rem5']])
df['sleep_rem6'] = sleep_rem_mean_filled

# Rows that were NaN in sleep_rem5, shown across all three derived columns
was_missing = df['sleep_rem5'].isna()
df.loc[was_missing, ['name', 'sleep_rem4', 'sleep_rem5', 'sleep_rem6']]

Unnamed: 0,name,sleep_rem4,sleep_rem5,sleep_rem6
69,Arctic ground squirrel,-99.0,,2.0
75,Eastern american chipmunk,-99.0,,2.0
44,Slow loris,-99.0,,2.0
9,Roe deer,-99.0,,2.0
59,Common porpoise,-99.0,,2.0
81,Arctic fox,-99.0,,2.0
