<a href="https://colab.research.google.com/github/mvince33/Coding-Dojo/blob/main/column_transfer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Read the dataset straight from its published CSV export, then preview the first rows
url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTBHJmMvKTS0d5EQOiBRa-kW7E1yHZscF7HreM7ffDpPMSit-1t1E1koL2tDp3Xwtmctw3IbAlWC1pt/pub?output=csv'
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

Unnamed: 0,State,Lat,Lng,Area,Children,Age,Income,Marital,Gender,ReAdmis,...,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis,Reflux_esophagitis,Asthma,Services,Initial_days,TotalCharge,Additional_charges
0,AL,34.3496,-86.72508,Suburban,1,53,86575.93,Divorced,Male,0,...,0,1,1,1,0,1,Blood Work,10.58577,3726.70286,17939.40342
1,FL,30.84513,-85.22907,Urban,3,51,46805.99,Married,Female,0,...,0,0,0,0,1,0,Intravenous,15.129562,4193.190458,17612.99812
2,SD,43.54321,-96.63772,Suburban,3,53,14370.14,Widowed,Female,0,...,0,0,0,0,0,0,Blood Work,4.772177,2434.234222,17505.19246
3,MN,43.89744,-93.51479,Suburban,0,78,39741.49,Married,Male,0,...,0,0,0,0,1,1,Blood Work,1.714879,2127.830423,12993.43735
4,VA,37.59894,-76.88958,Rural,1,22,1209.56,Widowed,Female,0,...,1,0,0,1,0,0,CT Scan,1.254807,2113.073274,3716.525786


In [None]:
# Separate the features from the target, then hold out a test set.
# 'Additional_charges' is the value to predict; every other column is a feature.
X = df.drop(columns = 'Additional_charges')
y = df['Additional_charges']

# random_state pins the shuffle so the split is reproducible across runs.
# No test_size is passed, so scikit-learn's default hold-out fraction applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

# The held-out target used to be bound to `y_split`, which did not match
# X_test / y_train; keep the old name as an alias so any cell that still
# refers to it keeps working.
y_split = y_test

In [40]:
# Build dtype-based selectors: one picks the numeric columns,
# the other picks the object-dtype (text) columns
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

In [None]:
# Create the two preprocessing steps.
# handle_unknown='ignore' lets the encoder accept categories it did not see
# while fitting instead of raising an error at transform time.
ohe = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()

In [None]:
# Pair each preprocessing step with the selector for the columns it applies to
cat_tuple = (ohe, cat_selector)
num_tuple = (scaler, num_selector)

In [None]:
# Combine both (transformer, selector) pairs into a single ColumnTransformer;
# columns matched by neither selector are passed through unchanged
col_transformer = make_column_transformer(
    num_tuple,
    cat_tuple,
    remainder='passthrough',
)

In [None]:
# Fit on the training split only, so nothing is learned from the test rows
col_transformer.fit(X=X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('standardscaler', StandardScaler(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fd889c32050>),
                                ('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7fd889c32490>)])

In [None]:
# Apply the already-fitted transformer to both splits (train first, then test)
X_train_processed, X_test_processed = (
    col_transformer.transform(split) for split in (X_train, X_test)
)

In [None]:
# Wrap the processed training matrix in a DataFrame so it can be inspected
X_train_df = pd.DataFrame(data=X_train_processed)
X_train_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.508205,0.281935,-0.060153,0.272586,-1.123467,0.0,0.509399,-0.008943,0.014639,-0.620174,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,-0.720642,0.252836,1.241233,1.119125,-0.619881,0.0,-0.999823,1.907372,0.014639,-0.620174,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,-0.493403,0.482823,-0.493948,0.272586,-0.518276,0.0,0.119354,-0.9671,0.014639,-0.620174,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.134821,-0.434666,2.108824,1.307245,1.93872,0.0,-1.044875,-0.008943,-0.983474,2.762592,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.587322,0.497439,-0.927744,-0.809103,-0.32824,0.0,1.283708,-0.008943,0.014639,-0.620174,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [None]:
# Summarize the processed training frame: per-column dtype and non-null count
X_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 100 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       750 non-null    float64
 1   1       750 non-null    float64
 2   2       750 non-null    float64
 3   3       750 non-null    float64
 4   4       750 non-null    float64
 5   5       750 non-null    float64
 6   6       750 non-null    float64
 7   7       750 non-null    float64
 8   8       750 non-null    float64
 9   9       750 non-null    float64
 10  10      750 non-null    float64
 11  11      750 non-null    float64
 12  12      750 non-null    float64
 13  13      750 non-null    float64
 14  14      750 non-null    float64
 15  15      750 non-null    float64
 16  16      750 non-null    float64
 17  17      750 non-null    float64
 18  18      750 non-null    float64
 19  19      750 non-null    float64
 20  20      750 non-null    float64
 21  21      750 non-null    float64
 22  2