<a href="https://colab.research.google.com/github/mvince33/Coding-Dojo/blob/main/pipelines_and_columntransformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
# Show scikit-learn estimators as diagrams when they are displayed in a cell.
set_config(display = 'diagram')

In [8]:
# Load the data
# The dataset is read from the CSV export of a published Google Sheet, so this
# cell needs network access to run.
url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTjuYaoDqW6uQmo8Xx1W2jkCxJ33mEovfeG8iRueZwu-PMWTbZ36_N645kB61Z7JDmzVu1RGEzGPV5G/pub?output=csv'
df = pd.read_csv(url)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,State,Lat,Lng,Area,Children,Age,Income,Marital,Gender,ReAdmis,...,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis,Reflux_esophagitis,Asthma,Services,Initial_days,TotalCharge,Additional_charges
0,AL,34.3496,-86.72508,Suburban,1.0,53,86575.93,Divorced,Male,0,...,0.0,1.0,1.0,1.0,0,1,Blood Work,10.58577,3726.70286,17939.40342
1,FL,30.84513,-85.22907,Urban,3.0,51,46805.99,Married,Female,0,...,0.0,0.0,0.0,0.0,1,0,Intravenous,15.129562,4193.190458,17612.99812
2,SD,43.54321,-96.63772,Suburban,3.0,53,14370.14,Widowed,Female,0,...,0.0,0.0,0.0,0.0,0,0,Blood Work,4.772177,2434.234222,17505.19246
3,MN,43.89744,-93.51479,Suburban,0.0,78,39741.49,Married,Male,0,...,0.0,0.0,0.0,0.0,1,1,Blood Work,1.714879,2127.830423,12993.43735
4,VA,37.59894,-76.88958,Rural,1.0,22,1209.56,Widowed,Female,0,...,1.0,0.0,0.0,1.0,0,0,CT Scan,1.254807,2113.073274,3716.525786


In [10]:
# Check datatypes
# The summary lists each column's dtype and non-null count, which shows which
# columns are text (object) vs numeric and which ones contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   State               995 non-null    object 
 1   Lat                 1000 non-null   float64
 2   Lng                 1000 non-null   float64
 3   Area                995 non-null    object 
 4   Children            993 non-null    float64
 5   Age                 1000 non-null   int64  
 6   Income              1000 non-null   float64
 7   Marital             995 non-null    object 
 8   Gender              995 non-null    object 
 9   ReAdmis             1000 non-null   int64  
 10  VitD_levels         1000 non-null   float64
 11  Doc_visits          1000 non-null   int64  
 12  Full_meals_eaten    1000 non-null   int64  
 13  vitD_supp           1000 non-null   int64  
 14  Soft_drink          1000 non-null   int64  
 15  Initial_admin       995 non-null    object 
 16  HighBlo

In [15]:
# Ordinal encode 'Complication_risk'
# Map the ordered risk labels to integer codes. 'Med' is an alternate spelling
# of 'Medium', so both collapse to the same code.
replacement_dict = {'High': 2, 'Medium': 1, 'Med': 1, 'Low': 0}

# Show the label counts before encoding. Only the last expression of a cell is
# rendered automatically, so this one needs an explicit display().
display(df['Complication_risk'].value_counts())

# Assign the result back to the column instead of calling
# replace(..., inplace=True) on df['Complication_risk']: the in-place form
# operates on a selection of the frame, which pandas 2.x warns about and which
# does not update df once Copy-on-Write is enabled.
# The explicit cast to float64 (float, because the column has missing values)
# makes the column numeric without relying on replace()'s deprecated implicit
# downcasting. It also fails loudly if a label that is not in replacement_dict
# is still present. Re-running the cell is a no-op on already-encoded values.
df['Complication_risk'] = (
    df['Complication_risk']
    .replace(replacement_dict)
    .astype('float64')
)

# Counts after encoding (missing values are excluded by value_counts()).
df['Complication_risk'].value_counts()

1.0    463
2.0    311
0.0    221
Name: Complication_risk, dtype: int64

In [20]:
# Separate the predictors from the target ('Additional_charges'), then hold
# out a test set. The fixed random_state keeps the split reproducible.
y = df['Additional_charges']
X = df.drop(columns='Additional_charges')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [21]:
# Column selectors pick columns by dtype when the column transformer is fit:
# numeric columns go to the numeric pipeline, object (text) columns go to the
# categorical pipeline.
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

In [22]:
# Instantiate transformers

# Imputers: most frequent value for categorical columns, mean for numeric ones.
freq_imputer = SimpleImputer(strategy = 'most_frequent')
mean_imputer = SimpleImputer(strategy = 'mean')

# Scaler
scaler = StandardScaler()

# OneHotEncoder
# Return a dense array so the encoded columns can be inspected with NumPy, and
# ignore categories that appear only in the test set instead of raising.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current name first and fall back for older releases.
try:
    ohe = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
except TypeError:
    # scikit-learn < 1.2 does not know `sparse_output`.
    ohe = OneHotEncoder(sparse = False, handle_unknown = 'ignore')

In [25]:
# Build one preprocessing pipeline per column type.

# Numeric columns: fill missing values with the mean, then standardize.
numeric_pipe = make_pipeline(mean_imputer, scaler)

# Categorical columns: fill missing values with the mode, then one-hot encode.
categorical_pipe = make_pipeline(freq_imputer, ohe)

# Render both pipelines, numeric first.
display(numeric_pipe, categorical_pipe)

In [26]:
# Pair each pipeline with the selector that chooses its columns.
number_tuple = (numeric_pipe, num_selector)
categorical_tuple = (categorical_pipe, cat_selector)

# Combine the pairs into a single preprocessor. Columns matched by neither
# selector are dropped, which is the default spelled out here.
preprocessor = make_column_transformer(
    number_tuple,
    categorical_tuple,
    remainder='drop',
)
preprocessor

In [27]:
# Fit the column transformer on the training data
# Only X_train is used here, so the imputation values, scaling statistics and
# one-hot categories are learned without looking at the test set.
preprocessor.fit(X_train)

In [29]:
# Apply the already-fitted preprocessor to both splits, train first.
X_train_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

In [36]:
# Sanity-check the processed arrays: no NaNs left, a single numeric dtype, and
# the expected shape. `end='\n\n'` adds the blank line between report sections.
print(np.count_nonzero(np.isnan(X_train_processed)), 'missing values in training data')
print(np.count_nonzero(np.isnan(X_test_processed)), 'missing values in test data', end='\n\n')
print('All data in the training set are', X_train_processed.dtype)
print('All the data in the test set are', X_test_processed.dtype, end='\n\n')
print('The shape of the data is', X_train_processed.shape, end='\n\n')
# Last expression: show the (truncated) array itself.
X_train_processed

0 missing values in training data
0 missing values in test data

All data in the training set are float64
All the data in the test set are float64

The shape of the data is (750, 97)



array([[-0.50820472,  0.28193545, -0.06527826, ...,  0.        ,
         1.        ,  0.        ],
       [-0.72064168,  0.25283631,  1.23912135, ...,  0.        ,
         0.        ,  0.        ],
       [-0.49340318,  0.48282262, -0.50007813, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.27295848,  0.63816773, -0.93487801, ...,  0.        ,
         0.        ,  0.        ],
       [-0.89653885, -1.73729615, -0.93487801, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.30727477,  1.1082109 , -0.93487801, ...,  0.        ,
         0.        ,  0.        ]])