<a href="https://colab.research.google.com/github/mvince33/Coding-Dojo/blob/main/week05/pre_processing_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the data set from the published Google Sheets CSV export
url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vRIzNQ_kKyVKD6_m8eatqTl7OA-yubTX2ai5sEyaiTyaK4U4NscmPgUuVVmmtUiwAWESvgBSv8tXfjg/pub?output=csv'
df = pd.read_csv(filepath_or_buffer = url)
# Preview the first five rows
df.head(n = 5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# Separate the feature matrix (X) from the target column (y)
X = df.drop(columns = ['charges'])
y = df['charges']

In [None]:
# Determine which columns are numeric and categorical:
# df.info() lists each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


0    574
1    324
2    240
3    157
4     25
5     18
Name: children, dtype: int64

> The numeric features are:
- age
- bmi
- children
- charges (this is the target, not a feature)

> There are no ordinal features.

> The nominal features are:
- sex
- smoker
- region

In [None]:
# Partition features and target into train and test sets;
# the fixed random_state keeps the split reproducible between runs
split_parts = train_test_split(X, y, random_state = 44)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Selector that picks out the object-dtype (categorical) columns
cat_selector = make_column_selector(dtype_include = 'object')

# Resolve the categorical column names for each split, then slice them out
train_cat_columns = cat_selector(X_train)
test_cat_columns = cat_selector(X_test)
train_cat_data = X_train[train_cat_columns]
test_cat_data = X_test[test_cat_columns]

In [None]:
# All the categorical data is nominal
# so we one-hot encode it.
#
# The keyword that requests a dense array was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 (the old name was later removed).
# Try the new keyword first and fall back to the old one so this cell
# runs on both older and current installations.
try:
    ohe_encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
except TypeError:
    ohe_encoder = OneHotEncoder(sparse = False, handle_unknown = 'ignore')

# Learn the categories from the training split only, then apply the same
# encoding to both splits so no information leaks from the test set
ohe_encoder.fit(train_cat_data)
train_ohe = ohe_encoder.transform(train_cat_data)
test_ohe = ohe_encoder.transform(test_cat_data)

In [None]:
# Get the numeric data
num_selector = make_column_selector(dtype_include = 'number')
train_num_data = X_train[num_selector(X_train)]
# The test slice must come from X_test; slicing X_train here would make the
# "test" numeric data a copy of the training rows
test_num_data = X_test[num_selector(X_test)]

In [None]:
# Standardize the numeric columns: the scaler is fitted on the training
# split only, and that same fitted scaler is then applied to the test split
scaler = StandardScaler()
train_num_scaled = scaler.fit_transform(train_num_data)
test_num_scaled = scaler.transform(test_num_data)

In [None]:
# Convert the categorical data back into a DataFrame
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; prefer the new method and fall back to the old
# one so the cell works on either version.
if hasattr(ohe_encoder, 'get_feature_names_out'):
    ohe_column_names = ohe_encoder.get_feature_names_out(train_cat_data.columns)
else:
    ohe_column_names = ohe_encoder.get_feature_names(train_cat_data.columns)
train_ohe = pd.DataFrame(train_ohe, columns = ohe_column_names)
test_ohe = pd.DataFrame(test_ohe, columns = ohe_column_names)



In [None]:
# Concatenate numeric and categorical data back into one DataFrame

# Reset the index on the numeric data (reassigned rather than modified
# in place, because these frames are slices of X_train / X_test)
train_num_data = train_num_data.reset_index(drop = True)
test_num_data = test_num_data.reset_index(drop = True)

# Wrap the *scaled* numeric arrays in DataFrames so the scaling step is
# actually reflected in the processed output; the new frames get a fresh
# 0..n-1 index, which lines up with the one-hot encoded frames
train_num_scaled_df = pd.DataFrame(train_num_scaled, columns = train_num_data.columns)
test_num_scaled_df = pd.DataFrame(test_num_scaled, columns = test_num_data.columns)

# Combine the data
X_train_processed = pd.concat([train_ohe, train_num_scaled_df], axis = 1)
X_test_processed = pd.concat([test_ohe, test_num_scaled_df], axis = 1)
X_train_processed

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,age,bmi,children
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,60,32.80,0
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,47,36.20,1
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,18,33.33,0
3,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,19,35.15,0
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,50,32.30,2
...,...,...,...,...,...,...,...,...,...,...,...
998,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,37,34.80,2
999,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,54,30.80,3
1000,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,18,37.29,1
1001,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,41,33.55,0
