# Pre-Processing Exercise (Practice)
- **Student:** Michael McCann
- **Date:** 22 FEB 2022

In [1]:
## Mount Drive
# Colab-only step: mounts Google Drive at /content/drive so the data file
# stored under that path can be read later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
## Import Libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector

In [11]:
## Data Filepaths
# Location of the insurance CSV on the mounted Google Drive.
insurance_filepath = '/content/drive/MyDrive/Data/insurance.csv'
# Read the CSV into a DataFrame; every later cell derives from ins_df.
ins_df = pd.read_csv(insurance_filepath)

# Preview the first rows to confirm the file loaded as expected.
ins_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Define Feature and Target

In [15]:
# Target: the `charges` column we want to predict.
y = ins_df['charges']
# Features: every remaining column of the raw frame.
X = ins_df.drop(columns='charges')

## Train Test Split

In [None]:
# Split before any fitting so the encoder and scaler below learn from training rows only.
# random_state=42 makes the shuffle reproducible; test_size is not given, so the
# library's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Identify Features

In [16]:
# Inspect dtypes and non-null counts to decide how each feature should be pre-processed.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1003 entries, 693 to 1126
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1003 non-null   int64  
 1   sex       1003 non-null   object 
 2   bmi       1003 non-null   float64
 3   children  1003 non-null   int64  
 4   smoker    1003 non-null   object 
 5   region    1003 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 54.9+ KB


- age - Numerical: Already set up in numbers
- sex - Nominal: categorical male/female/nonbinary
- bmi - Numerical: continuous measurement stored as a float
- children - Numerical: Already set up in numbers
- smoker - Nominal: Binary yes/no
- region - Nominal: categorical based on region

## Ordinal Encode

There are no Ordinal Features in this dataset


## One Hot Encode
The nominal features `sex`, `smoker`, and `region` need to be one-hot encoded (OHE).

In [17]:
# create category selector for object types
# (the selector is called on a DataFrame below and its result is used to index columns)
cat_selector = make_column_selector(dtype_include='object')

In [None]:
# Keep only the columns picked by the object-dtype selector in each split;
# these are the features that will be one-hot encoded.
train_cat_data = X_train.loc[:, cat_selector(X_train)]
test_cat_data = X_test.loc[:, cat_selector(X_test)]

In [26]:
# Instantiate OHE with dense (ndarray) output so the result can be wrapped in a DataFrame.
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed `sparse` in 1.4,
# so try the current keyword first and fall back to the old one on older installs.
try:
    ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Train OHE on the training split only; the test split is transformed with the
# categories learned here.
ohe_encoder.fit(train_cat_data)

# Human-readable names for the encoded columns, built from the input column names.
ohe_column_names = ohe_encoder.get_feature_names_out(train_cat_data.columns)

# No index is passed, so both frames get a fresh 0..n-1 index.
train_ohe = pd.DataFrame(ohe_encoder.transform(train_cat_data), columns = ohe_column_names)
test_ohe = pd.DataFrame(ohe_encoder.transform(test_cat_data), columns = ohe_column_names)

## Scale Numerics
Age, BMI, and Children need to be scaled

In [32]:
# create numeric selector; it is called on each split below to pick the columns to scale
num_selector = make_column_selector(dtype_include='number')

In [35]:
# Keep only the columns picked by the numeric selector in each split;
# these are the features that will be scaled.
train_num_data = X_train.loc[:, num_selector(X_train)]
test_num_data = X_test.loc[:, num_selector(X_test)]

Unnamed: 0,age,bmi,children
693,24,23.655,0
1297,28,26.510,2
634,51,39.700,1
1022,47,36.080,1
178,46,28.900,2
...,...,...,...
1095,18,31.350,4
1130,39,23.870,5
1294,58,25.175,0
860,37,47.600,2


In [36]:
# instantiate StandardScaler
scaler = StandardScaler()

# Fit on the training numerics only; the test split is later transformed
# with this same fitted scaler rather than being fitted separately.
scaler.fit(train_num_data)

StandardScaler()

In [46]:
# Apply the fitted scaler to each split and rebuild DataFrames with the
# original numeric column names. reset_index(drop=True) leaves a plain
# 0..n-1 index on both frames.
train_scaled = pd.DataFrame(
    scaler.transform(train_num_data),
    columns=train_num_data.columns,
).reset_index(drop=True)
test_scaled = pd.DataFrame(
    scaler.transform(test_num_data),
    columns=test_num_data.columns,
).reset_index(drop=True)

## Concatenate back into a DF

In [52]:
# Join the scaled numeric columns and the one-hot columns side by side.
# Both pieces carry a fresh 0..n-1 index, so rows pair up by position.
X_train_processed = pd.concat([train_scaled, train_ohe], axis='columns')
X_test_processed = pd.concat([test_scaled, test_ohe], axis='columns')