## Impute/Replace missing values

In [44]:
# import libraries

import pandas as pd
import numpy as np

In [45]:
# Report the library versions used to produce the recorded outputs.
# The version string is interpolated directly (not wrapped in a list) so it
# prints as plain text, in the same format as the numpy line.
print(f'pandas version: {pd.__version__}')
print(f'numpy version: {np.__version__}')

pandas version: ['1.5.2']
numpy version: 1.24.1


In [46]:
# Load the msleep dataset (CSV hosted in Aj. Prasert's GitHub repository)
# and preview its first three rows.

df = pd.read_csv(filepath_or_buffer='https://github.com/prasertcbs/tutorial/raw/master/msleep.csv')
df.head(n=3)

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
0,Cheetah,Acinonyx,carni,Carnivora,lc,12.1,,,11.9,,50.0
1,Owl monkey,Aotus,omni,Primates,,17.0,1.8,,7.0,0.0155,0.48
2,Mountain beaver,Aplodontia,herbi,Rodentia,nt,14.4,2.4,,9.6,,1.35


In [47]:
# dimensions of the loaded dataset as (rows, columns)
df.shape

(83, 11)

In [48]:
# per-column non-null counts and dtypes; columns with fewer non-null entries than rows contain missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83 entries, 0 to 82
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          83 non-null     object 
 1   genus         83 non-null     object 
 2   vore          76 non-null     object 
 3   order         83 non-null     object 
 4   conservation  54 non-null     object 
 5   sleep_total   83 non-null     float64
 6   sleep_rem     61 non-null     float64
 7   sleep_cycle   32 non-null     float64
 8   awake         83 non-null     float64
 9   brainwt       56 non-null     float64
 10  bodywt        83 non-null     float64
dtypes: float64(6), object(5)
memory usage: 7.3+ KB


In [49]:
# Keep a random sample of 20 rows; the fixed random_state makes the draw repeatable.

df = df.sample(n=20, random_state=123)

In [50]:
# confirm the sampled frame's dimensions as (rows, columns)
df.shape

(20, 11)

In [51]:
# display the sampled rows
df

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
69,Arctic ground squirrel,Spermophilus,herbi,Rodentia,lc,16.6,,,7.4,0.0057,0.92
75,Eastern american chipmunk,Tamias,herbi,Rodentia,,15.8,,,8.2,,0.112
4,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667,20.0,0.423,600.0
78,Tree shrew,Tupaia,omni,Scandentia,,8.9,2.6,0.233333,15.1,0.0025,0.104
29,Giraffe,Giraffa,herbi,Artiodactyla,cd,1.9,0.4,,22.1,,899.995
62,Rock hyrax,Procavia,,Hyracoidea,lc,5.4,0.5,,18.6,0.021,3.6
24,European hedgehog,Erinaceus,omni,Erinaceomorpha,lc,10.1,3.5,0.283333,13.9,0.0035,0.77
37,Macaque,Macaca,omni,Primates,,10.1,1.2,0.75,13.9,0.179,6.8
76,Brazilian tapir,Tapirus,herbi,Perissodactyla,vu,4.4,1.0,0.9,19.6,0.169,207.501
31,Gray seal,Haliochoerus,carni,Carnivora,lc,6.2,1.5,,17.8,0.325,85.0


In [52]:
# non-null counts per column for the sampled rows
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20 entries, 69 to 81
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          20 non-null     object 
 1   genus         20 non-null     object 
 2   vore          18 non-null     object 
 3   order         20 non-null     object 
 4   conservation  14 non-null     object 
 5   sleep_total   20 non-null     float64
 6   sleep_rem     14 non-null     float64
 7   sleep_cycle   9 non-null      float64
 8   awake         20 non-null     float64
 9   brainwt       16 non-null     float64
 10  bodywt        20 non-null     float64
dtypes: float64(6), object(5)
memory usage: 1.9+ KB


### Check NA in vore column

In [53]:
# select the rows whose vore value is missing (NaN)
df.loc[df['vore'].isna()]

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
62,Rock hyrax,Procavia,,Hyracoidea,lc,5.4,0.5,,18.6,0.021,3.6
72,Musk shrew,Suncus,,Soricomorpha,,12.8,2.0,0.183333,11.2,0.00033,0.048


### scikit-learn: SimpleImputer

In [54]:
# import libraries

import sklearn
from sklearn.impute import SimpleImputer

In [55]:
# Report the scikit-learn version used for the imputation examples.
print(f'sklearn version: {sklearn.__version__}')

sklean version: 1.2.0


In [56]:
# Imputer that replaces missing entries with the most frequent value (the mode).
imp = SimpleImputer(strategy='most_frequent')

In [57]:
# display the configured imputer
imp

In [58]:
# fit on the vore column and show the imputed values; the result is not assigned anywhere in this cell
imp.fit_transform(df[['vore']])

array([['herbi'],
       ['herbi'],
       ['herbi'],
       ['omni'],
       ['herbi'],
       ['herbi'],
       ['omni'],
       ['omni'],
       ['herbi'],
       ['carni'],
       ['carni'],
       ['carni'],
       ['herbi'],
       ['herbi'],
       ['carni'],
       ['herbi'],
       ['herbi'],
       ['omni'],
       ['herbi'],
       ['carni']], dtype=object)

In [59]:
# preview the first three rows of df after the fit_transform call above
df.head(3)

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
69,Arctic ground squirrel,Spermophilus,herbi,Rodentia,lc,16.6,,,7.4,0.0057,0.92
75,Eastern american chipmunk,Tamias,herbi,Rodentia,,15.8,,,8.2,,0.112
4,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667,20.0,0.423,600.0


In [60]:
df['vore2'] = imp.fit_transform(df[['vore']]) # df[['vore']] (double brackets) passes a one-column DataFrame; the mode-imputed result is stored as a new column

In [61]:
# display df with the new vore2 column
df

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt,vore2
69,Arctic ground squirrel,Spermophilus,herbi,Rodentia,lc,16.6,,,7.4,0.0057,0.92,herbi
75,Eastern american chipmunk,Tamias,herbi,Rodentia,,15.8,,,8.2,,0.112,herbi
4,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667,20.0,0.423,600.0,herbi
78,Tree shrew,Tupaia,omni,Scandentia,,8.9,2.6,0.233333,15.1,0.0025,0.104,omni
29,Giraffe,Giraffa,herbi,Artiodactyla,cd,1.9,0.4,,22.1,,899.995,herbi
62,Rock hyrax,Procavia,,Hyracoidea,lc,5.4,0.5,,18.6,0.021,3.6,herbi
24,European hedgehog,Erinaceus,omni,Erinaceomorpha,lc,10.1,3.5,0.283333,13.9,0.0035,0.77,omni
37,Macaque,Macaca,omni,Primates,,10.1,1.2,0.75,13.9,0.179,6.8,omni
76,Brazilian tapir,Tapirus,herbi,Perissodactyla,vu,4.4,1.0,0.9,19.6,0.169,207.501,herbi
31,Gray seal,Haliochoerus,carni,Carnivora,lc,6.2,1.5,,17.8,0.325,85.0,carni


In [62]:
# rows where the original vore is missing, now including the imputed column
df.loc[df['vore'].isna()]

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt,vore2
62,Rock hyrax,Procavia,,Hyracoidea,lc,5.4,0.5,,18.6,0.021,3.6,herbi
72,Musk shrew,Suncus,,Soricomorpha,,12.8,2.0,0.183333,11.2,0.00033,0.048,herbi


In [63]:
# compare original and mode-imputed vore for the rows that were missing
df.loc[df['vore'].isna(), ['name', 'vore', 'vore2']]

Unnamed: 0,name,vore,vore2
62,Rock hyrax,,herbi
72,Musk shrew,,herbi


In [65]:
# Constant-fill imputation: every missing vore is replaced with 'omni'.
imp2 = SimpleImputer(strategy='constant', fill_value='omni')
df['vore3'] = imp2.fit_transform(df[['vore']])
df.loc[df['vore'].isna(), ['name', 'vore', 'vore2', 'vore3']]

Unnamed: 0,name,vore,vore2,vore3
62,Rock hyrax,,herbi,omni
72,Musk shrew,,herbi,omni


In [67]:
# Other strategies used with SimpleImputer in this notebook: 'mean', 'median', 'most_frequent'.
# Here missing sleep_rem values are filled with the constant -99.

imp3 = SimpleImputer(strategy='constant', fill_value=-99)

df['sleep_rem2'] = imp3.fit_transform(df[['sleep_rem']])
df.loc[df['sleep_rem'].isna(), ['name', 'sleep_rem', 'sleep_rem2']]

Unnamed: 0,name,sleep_rem,sleep_rem2
69,Arctic ground squirrel,,-99.0
75,Eastern american chipmunk,,-99.0
44,Slow loris,,-99.0
9,Roe deer,,-99.0
59,Common porpoise,,-99.0
81,Arctic fox,,-99.0


In [68]:
# Treat -99 as the missing-value marker and replace it with NaN,
# so these entries can be handled as missing values in a following step.
imp_x = SimpleImputer(missing_values=-99, strategy='constant', fill_value=np.nan)

df['sleep_rem3'] = imp_x.fit_transform(df[['sleep_rem2']])
df.loc[df['sleep_rem2'] == -99, ['name', 'sleep_rem', 'sleep_rem2', 'sleep_rem3']]

Unnamed: 0,name,sleep_rem,sleep_rem2,sleep_rem3
69,Arctic ground squirrel,,-99.0,
75,Eastern american chipmunk,,-99.0,
44,Slow loris,,-99.0,
9,Roe deer,,-99.0,
59,Common porpoise,,-99.0,
81,Arctic fox,,-99.0,


In [70]:
# Mean imputation: fill missing sleep_rem values with the column mean.
imp_mean = SimpleImputer(strategy='mean')

df['sleep_rem4'] = imp_mean.fit_transform(df[['sleep_rem']])
df.loc[df['sleep_rem'].isna(), ['name', 'sleep_rem', 'sleep_rem2', 'sleep_rem3', 'sleep_rem4']]

Unnamed: 0,name,sleep_rem,sleep_rem2,sleep_rem3,sleep_rem4
69,Arctic ground squirrel,,-99.0,,2.0
75,Eastern american chipmunk,,-99.0,,2.0
44,Slow loris,,-99.0,,2.0
9,Roe deer,,-99.0,,2.0
59,Common porpoise,,-99.0,,2.0
81,Arctic fox,,-99.0,,2.0


In [71]:
# Median imputation: fill missing sleep_rem values with the column median.
imp_median = SimpleImputer(strategy='median')

df['sleep_rem5'] = imp_median.fit_transform(df[['sleep_rem']])
df.loc[df['sleep_rem'].isna(), ['name', 'sleep_rem', 'sleep_rem2', 'sleep_rem3', 'sleep_rem4', 'sleep_rem5']]

Unnamed: 0,name,sleep_rem,sleep_rem2,sleep_rem3,sleep_rem4,sleep_rem5
69,Arctic ground squirrel,,-99.0,,2.0,1.75
75,Eastern american chipmunk,,-99.0,,2.0,1.75
44,Slow loris,,-99.0,,2.0,1.75
9,Roe deer,,-99.0,,2.0,1.75
59,Common porpoise,,-99.0,,2.0,1.75
81,Arctic fox,,-99.0,,2.0,1.75
