<a href="https://colab.research.google.com/github/mvince33/Coding-Dojo/blob/main/ordinal_and_one_hot_encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [2]:
# Read the published Google Sheets CSV export into a DataFrame and preview it
url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSnIjhFxAYodtV-lqOvvRjhFkhXyn6Bb6E3cgKRixgyCQbCjUCyQAjS0fV-Q4w7HkHXjhgKanpkqOBZ/pub?output=csv'
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,State,Lat,Lng,Area,Children,Age,Income,Marital,Gender,...,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis,Reflux_esophagitis,Asthma,Services,Initial_days,TotalCharge,Additional_charges
0,1,AL,34.3496,-86.72508,Suburban,1.0,53,86575.93,Divorced,Male,...,0.0,1.0,1.0,1.0,0,1,Blood Work,10.58577,3726.70286,17939.40342
1,2,FL,30.84513,-85.22907,Urban,3.0,51,46805.99,Married,Female,...,0.0,0.0,0.0,0.0,1,0,Intravenous,15.129562,4193.190458,17612.99812
2,3,SD,43.54321,-96.63772,Suburban,3.0,53,14370.14,Widowed,Female,...,0.0,0.0,0.0,0.0,0,0,Blood Work,4.772177,2434.234222,17505.19246
3,4,MN,43.89744,-93.51479,Suburban,0.0,78,39741.49,Married,Male,...,0.0,0.0,0.0,0.0,1,1,Blood Work,1.714879,2127.830423,12993.43735
4,5,VA,37.59894,-76.88958,Rural,1.0,22,1209.56,Widowed,Female,...,1.0,0.0,0.0,1.0,0,0,CT Scan,1.254807,2113.073274,3716.525786


In [3]:
# Summarize each column's dtype and non-null count to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          1000 non-null   int64  
 1   State               1000 non-null   object 
 2   Lat                 1000 non-null   float64
 3   Lng                 1000 non-null   float64
 4   Area                1000 non-null   object 
 5   Children            993 non-null    float64
 6   Age                 1000 non-null   int64  
 7   Income              1000 non-null   float64
 8   Marital             1000 non-null   object 
 9   Gender              1000 non-null   object 
 10  ReAdmis             1000 non-null   int64  
 11  VitD_levels         1000 non-null   float64
 12  Doc_visits          1000 non-null   int64  
 13  Full_meals_eaten    1000 non-null   int64  
 14  vitD_supp           1000 non-null   int64  
 15  Soft_drink          1000 non-null   int64  
 16  Initial

In [4]:
# Inspect the distinct 'Complication_risk' labels and how often each occurs,
# so inconsistent spellings can be caught before encoding
df['Complication_risk'].value_counts()

Medium    462
High      312
Low       222
Med         4
Name: Complication_risk, dtype: int64

In [5]:
# Ordinal encode 'Complication_risk' as Low=0 < Medium=1 < High=2.
# 'Med' is an inconsistent spelling of 'Medium', so both map to the same rank.
risk_order = {'Low': 0, 'Medium': 1, 'Med': 1, 'High': 2}
# Assign the result back to the column instead of calling
# replace(..., inplace=True) on df['Complication_risk']: the inplace form is
# chained assignment, which raises a FutureWarning in pandas 2.2 and leaves
# df unchanged once Copy-on-Write is enabled (the default in pandas 3).
# The explicit cast keeps the column integer-typed without relying on
# replace()'s deprecated automatic downcasting of object columns.
df['Complication_risk'] = df['Complication_risk'].replace(risk_order).astype('int64')
df['Complication_risk'].value_counts()

1    466
2    312
0    222
Name: Complication_risk, dtype: int64

In [6]:
# Target: the 'Additional_charges' column
y = df['Additional_charges']
# Features: everything except the target and the 'Unnamed: 0' column
X = df.drop(['Unnamed: 0', 'Additional_charges'], axis='columns')
# Hold out a test set; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
# Make a selector that picks out the object-dtype (categorical) columns
cat_selector = make_column_selector(dtype_include = 'object')
# Calling the selector on a DataFrame returns the list of matching column names
cat_selector(X_train)

['State', 'Area', 'Marital', 'Gender', 'Initial_admin', 'Services']

In [8]:
# Pull the categorical (object-dtype) columns out of each split
train_cat_data = X_train.loc[:, cat_selector(X_train)]
test_cat_data = X_test.loc[:, cat_selector(X_test)]
# Preview the categorical training data
train_cat_data

Unnamed: 0,State,Area,Marital,Gender,Initial_admin,Services
82,TN,Urban,Never Married,Female,Emergency Admission,Intravenous
991,AL,Urban,Married,Male,Emergency Admission,Blood Work
789,TN,Urban,Married,Nonbinary,Observation Admission,Intravenous
894,SD,Rural,Never Married,Male,Observation Admission,Blood Work
398,MI,Suburban,Widowed,Female,Elective Admission,Blood Work
...,...,...,...,...,...,...
106,NY,Suburban,Widowed,Male,Observation Admission,Intravenous
270,IN,Suburban,Married,Male,Observation Admission,Blood Work
860,OH,Urban,Divorced,Male,Elective Admission,Blood Work
435,CA,Suburban,Separated,Male,Observation Admission,Blood Work


In [9]:
# Instantiate a OneHotEncoder that returns dense arrays and encodes categories
# unseen during fit as all-zero rows (handle_unknown='ignore').
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the new name first and fall back for older releases.
try:
    ohe_encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
except TypeError:
    ohe_encoder = OneHotEncoder(sparse = False, handle_unknown = 'ignore')
# Fit on the training data only, so the test set does not influence the
# learned categories, and transform the training data in the same step
train_ohe = ohe_encoder.fit_transform(train_cat_data)
# Transform the test data with the categories learned from the training data
test_ohe = ohe_encoder.transform(test_cat_data)
train_ohe

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
# Ask the encoder for the generated column names; passing the original column
# names makes each new name carry its source column as a prefix
ohe_column_names = ohe_encoder.get_feature_names_out(train_cat_data.columns)
# Wrap the encoded arrays in DataFrames labelled with those names
train_ohe = pd.DataFrame(data=train_ohe, columns=ohe_column_names)
test_ohe = pd.DataFrame(data=test_ohe, columns=ohe_column_names)
train_ohe.head()

Unnamed: 0,State_AK,State_AL,State_AR,State_AZ,State_CA,State_CO,State_CT,State_DC,State_FL,State_GA,...,Gender_f,Gender_m,Gender_male,Initial_admin_Elective Admission,Initial_admin_Emergency Admission,Initial_admin_Observation Admission,Services_Blood Work,Services_CT Scan,Services_Intravenous,Services_MRI
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [14]:
# Selector for the numeric columns
num_selector = make_column_selector(dtype_include='number')
# Take the numeric columns of each split and reset their index to 0..n-1 so
# the rows line up positionally with the one-hot-encoded frames
train_nums = X_train.loc[:, num_selector(X_train)].reset_index(drop=True)
test_nums = X_test.loc[:, num_selector(X_test)].reset_index(drop=True)
# Join numeric and one-hot-encoded columns side by side
X_train_processed = pd.concat([train_nums, train_ohe], axis='columns')
X_test_processed = pd.concat([test_nums, test_ohe], axis='columns')

In [15]:
# Display the combined numeric + one-hot-encoded training features
X_train_processed

Unnamed: 0,Lat,Lng,Children,Age,Income,ReAdmis,VitD_levels,Doc_visits,Full_meals_eaten,vitD_supp,...,Gender_f,Gender_m,Gender_male,Initial_admin_Elective Admission,Initial_admin_Emergency Admission,Initial_admin_Observation Admission,Services_Blood Work,Services_CT Scan,Services_Intravenous,Services_MRI
0,36.16307,-86.66510,2.0,60,8459.99,0,19.034162,5,1,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,34.96594,-87.12179,5.0,78,22669.31,0,15.903388,7,1,0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,36.24648,-83.51232,1.0,60,25536.25,0,18.225040,4,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,45.42189,-97.91165,7.0,82,94863.57,0,15.809932,5,0,2,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,42.33661,-83.28292,0.0,37,30898.36,0,20.640410,5,1,0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,42.05701,-77.43901,1.0,32,4788.93,0,19.029312,6,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
746,40.47773,-86.38658,4.0,27,29461.62,0,15.293840,5,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
747,40.56510,-81.07429,0.0,57,79094.04,0,19.459084,5,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
748,33.97472,-118.35549,0.0,56,25697.12,0,15.871725,5,1,0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
