<a href="https://colab.research.google.com/github/mvince33/Coding-Dojo/blob/main/simple_imputer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn import set_config
set_config(display = 'diagram')

In [2]:
# Load the dataset from a published Google Sheets CSV export
url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTjuYaoDqW6uQmo8Xx1W2jkCxJ33mEovfeG8iRueZwu-PMWTbZ36_N645kB61Z7JDmzVu1RGEzGPV5G/pub?output=csv'
df = pd.read_csv(url)
# Preview the first rows to sanity-check the load
df.head()

Unnamed: 0,State,Lat,Lng,Area,Children,Age,Income,Marital,Gender,ReAdmis,...,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis,Reflux_esophagitis,Asthma,Services,Initial_days,TotalCharge,Additional_charges
0,AL,34.3496,-86.72508,Suburban,1.0,53,86575.93,Divorced,Male,0,...,0.0,1.0,1.0,1.0,0,1,Blood Work,10.58577,3726.70286,17939.40342
1,FL,30.84513,-85.22907,Urban,3.0,51,46805.99,Married,Female,0,...,0.0,0.0,0.0,0.0,1,0,Intravenous,15.129562,4193.190458,17612.99812
2,SD,43.54321,-96.63772,Suburban,3.0,53,14370.14,Widowed,Female,0,...,0.0,0.0,0.0,0.0,0,0,Blood Work,4.772177,2434.234222,17505.19246
3,MN,43.89744,-93.51479,Suburban,0.0,78,39741.49,Married,Male,0,...,0.0,0.0,0.0,0.0,1,1,Blood Work,1.714879,2127.830423,12993.43735
4,VA,37.59894,-76.88958,Rural,1.0,22,1209.56,Widowed,Female,0,...,1.0,0.0,0.0,1.0,0,0,CT Scan,1.254807,2113.073274,3716.525786


In [3]:
# Summarize the dtype and non-null count of every column
df.info()

# Count the missing cells across the whole frame
n_missing = df.isna().sum().sum()
print('\nThere are', n_missing, 'missing values.')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   State               995 non-null    object 
 1   Lat                 1000 non-null   float64
 2   Lng                 1000 non-null   float64
 3   Area                995 non-null    object 
 4   Children            993 non-null    float64
 5   Age                 1000 non-null   int64  
 6   Income              1000 non-null   float64
 7   Marital             995 non-null    object 
 8   Gender              995 non-null    object 
 9   ReAdmis             1000 non-null   int64  
 10  VitD_levels         1000 non-null   float64
 11  Doc_visits          1000 non-null   int64  
 12  Full_meals_eaten    1000 non-null   int64  
 13  vitD_supp           1000 non-null   int64  
 14  Soft_drink          1000 non-null   int64  
 15  Initial_admin       995 non-null    object 
 16  HighBlo

In [4]:
# Define the features and target, then split the data
target = 'Additional_charges'
y = df[target]
X = df.drop(columns = target)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [5]:
# Selectors that pick columns by dtype
cat_selector = make_column_selector(dtype_include = 'object')
num_selector = make_column_selector(dtype_include = 'number')

# Resolve each selector against the training features
cat_columns, num_columns = cat_selector(X_train), num_selector(X_train)

# Show which columns landed in each group
for group_label, group_columns in (('Categorical columns:', cat_columns),
                                   ('Numeric columns:', num_columns)):
    print(group_label, group_columns)

Categorical columns: ['State', 'Area', 'Marital', 'Gender', 'Initial_admin', 'Complication_risk', 'Services']
Numeric columns: ['Lat', 'Lng', 'Children', 'Age', 'Income', 'ReAdmis', 'VitD_levels', 'Doc_visits', 'Full_meals_eaten', 'vitD_supp', 'Soft_drink', 'HighBlood', 'Stroke', 'Overweight', 'Arthritis', 'Diabetes', 'Hyperlipidemia', 'BackPain', 'Anxiety', 'Allergic_rhinitis', 'Reflux_esophagitis', 'Asthma', 'Initial_days', 'TotalCharge']


In [13]:
# Flag every training column that holds at least one missing value
X_train.isna().any(axis = 0)

State                  True
Lat                   False
Lng                   False
Area                   True
Children               True
Age                   False
Income                False
Marital                True
Gender                 True
ReAdmis               False
VitD_levels           False
Doc_visits            False
Full_meals_eaten      False
vitD_supp             False
Soft_drink            False
Initial_admin          True
HighBlood             False
Stroke                False
Complication_risk      True
Overweight            False
Arthritis              True
Diabetes               True
Hyperlipidemia         True
BackPain               True
Anxiety                True
Allergic_rhinitis      True
Reflux_esophagitis    False
Asthma                False
Services               True
Initial_days          False
TotalCharge           False
dtype: bool

In [16]:
# Construct a median imputer; the medians are learned from the training data only
median_imputer = SimpleImputer(strategy = 'median')
train_numeric_filled = median_imputer.fit_transform(X_train[num_columns])
test_numeric_filled = median_imputer.transform(X_test[num_columns])

# Write the imputed values back into the numeric columns
X_train.loc[:, num_columns] = train_numeric_filled
X_test.loc[:, num_columns] = test_numeric_filled

In [18]:
# Re-check which training columns still hold missing values after the median imputation
X_train.isna().any(axis = 0)

State                  True
Lat                   False
Lng                   False
Area                   True
Children              False
Age                   False
Income                False
Marital                True
Gender                 True
ReAdmis               False
VitD_levels           False
Doc_visits            False
Full_meals_eaten      False
vitD_supp             False
Soft_drink            False
Initial_admin          True
HighBlood             False
Stroke                False
Complication_risk      True
Overweight            False
Arthritis             False
Diabetes              False
Hyperlipidemia        False
BackPain              False
Anxiety               False
Allergic_rhinitis     False
Reflux_esophagitis    False
Asthma                False
Services               True
Initial_days          False
TotalCharge           False
dtype: bool

In [21]:
# Start over from a fresh train/test split (same random_state) so the
# ColumnTransformer-based imputation works on data that still has its
# missing values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

# Check which training columns contain missing values
X_train.isna().any(axis = 0)

State                  True
Lat                   False
Lng                   False
Area                   True
Children               True
Age                   False
Income                False
Marital                True
Gender                 True
ReAdmis               False
VitD_levels           False
Doc_visits            False
Full_meals_eaten      False
vitD_supp             False
Soft_drink            False
Initial_admin          True
HighBlood             False
Stroke                False
Complication_risk      True
Overweight            False
Arthritis              True
Diabetes               True
Hyperlipidemia         True
BackPain               True
Anxiety                True
Allergic_rhinitis      True
Reflux_esophagitis    False
Asthma                False
Services               True
Initial_days          False
TotalCharge           False
dtype: bool

In [23]:
# Build the dtype-based column selectors
cat_selector = make_column_selector(dtype_include = 'object')
num_selector = make_column_selector(dtype_include = 'number')

In [24]:
# Build the imputers: one filling with the column median,
# one filling with the most frequent value
median_imputer = SimpleImputer(strategy = 'median')
freq_imputer = SimpleImputer(strategy = 'most_frequent')

In [26]:
# Pair each imputer with the selector for the columns it should fill
num_tuple = (median_imputer, num_selector)
cat_tuple = (freq_imputer, cat_selector)

# Assemble the column transformer (numeric pair first, categorical pair second);
# columns matched by neither selector are passed through
col_transformer = make_column_transformer(
    num_tuple,
    cat_tuple,
    remainder = 'passthrough',
)
col_transformer

In [31]:
# Impute missing values with ColumnTransformer

# Fit the column transformer on the training data only
col_transformer.fit(X_train)

# Impute the missing values
X_train_imputed = col_transformer.transform(X_train)
# NOTE: X_test_imputed is kept as the raw transformed array; its columns
# follow the order of transformed_cols below, not X_test.columns.
X_test_imputed = col_transformer.transform(X_test)

# The transformed array is ordered by transformer, NOT by the original
# column order: numeric columns first, then categorical columns, then any
# passthrough remainder. Labelling it with X_train.columns directly would
# attach every name to the wrong data, so rebuild the labels in output order.
num_cols = num_selector(X_train)
cat_cols = cat_selector(X_train)
remainder_cols = [col for col in X_train.columns if col not in num_cols + cat_cols]
transformed_cols = num_cols + cat_cols + remainder_cols

# Convert back to DataFrame, keeping the original row index so the rows
# still line up with y_train
X_train_imputed = pd.DataFrame(X_train_imputed,
                               columns = transformed_cols,
                               index = X_train.index)

# Restore the original column order, and the original dtypes (the stacked
# numeric + string array is presumably object dtype — verify with .dtypes)
X_train_imputed = X_train_imputed[X_train.columns].astype(X_train.dtypes.to_dict())

# Confirm that no missing values remain
X_train_imputed.isna().any()

State                 False
Lat                   False
Lng                   False
Area                  False
Children              False
Age                   False
Income                False
Marital               False
Gender                False
ReAdmis               False
VitD_levels           False
Doc_visits            False
Full_meals_eaten      False
vitD_supp             False
Soft_drink            False
Initial_admin         False
HighBlood             False
Stroke                False
Complication_risk     False
Overweight            False
Arthritis             False
Diabetes              False
Hyperlipidemia        False
BackPain              False
Anxiety               False
Allergic_rhinitis     False
Reflux_esophagitis    False
Asthma                False
Services              False
Initial_days          False
TotalCharge           False
dtype: bool