# Abalone Preprocessing Exercise (Core)
- **Student:** Michael McCann
- **Date:** 23 FEB 2022

In [29]:
## Mount Drive
# Mount Google Drive at /content/drive (Colab-only import) so that the dataset
# path used in the load cell can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
## Import Libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer

In [31]:
## Load and Inspect the Data
# The raw file is read without a header row; the column names below were
# retrieved from the abalone.names documentation provided with the dataset.
abalone_data_filepath = '/content/drive/MyDrive/Data/abalone.data'
col_names = [
    'sex',
    'length',
    'diameter',
    'height',
    'whole weight',
    'shucked weight',
    'viscera weight',
    'shell weight',
    'rings',
]
abalone_df = pd.read_csv(abalone_data_filepath, header=None, names=col_names)

# Preview the first rows to confirm the columns line up with their names
abalone_df.head()

Unnamed: 0,sex,length,diameter,height,whole weight,shucked weight,viscera weight,shell weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


## Define Features and Target

In [32]:
# Separate the prediction target from the predictors:
# y is the 'rings' column, X is every other column of abalone_df.
y = abalone_df['rings']
X = abalone_df.drop(columns='rings')

## Train Test Split

In [33]:
# Split features and target into train and test sets using the default split
# size; random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

## Column Transformer

### Column Selectors

In [34]:
# Create dtype-based column selectors: one for the numeric columns and one
# for the object (string) columns. They are paired with the scaler and the
# one-hot encoder respectively further down.
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

### OHE to encode categorical columns

In [35]:
# Instantiate the one-hot encoder for the categorical columns.
# - handle_unknown='ignore': a category not seen during fit is encoded as all
#   zeros at transform time instead of raising an error.
# - Dense (non-sparse) output is requested so the processed data is a plain
#   NumPy array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the new keyword first and fall back to the old
# one on older installations.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

### StandardScaler to scale numeric columns

In [36]:
# Instantiate the StandardScaler that is applied to the numeric columns
scaler = StandardScaler()

### ColumnTransformer

In [37]:
# Pair each transformer with the selector for the columns it should handle,
# as (transformer, columns) tuples for make_column_transformer.
cat_tuple = (ohe, cat_selector)
num_tuple = (scaler, num_selector)

In [38]:
# Combine both (transformer, selector) pairs into a single ColumnTransformer.
# remainder='passthrough' keeps any column matched by neither selector
# instead of dropping it.
col_transformer = make_column_transformer(
    num_tuple,
    cat_tuple,
    remainder='passthrough',
)

In [39]:
# Fit the transformer on the training features only, so the test set plays no
# part in what the scaler and encoder learn.
col_transformer.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('standardscaler', StandardScaler(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f88b08a6b10>),
                                ('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f88b08a6d10>)])

In [40]:
# Apply the already-fitted transformer to both the training and test features
X_train_processed = col_transformer.transform(X_train)
X_test_processed = col_transformer.transform(X_test)

In [41]:
# Display the processed training features
# (scaled numeric columns first, followed by the one-hot encoded columns)
X_train_processed

array([[ 0.74929076,  0.46422584, -0.11886923, ...,  1.        ,
         0.        ,  0.        ],
       [-0.09025371, -0.14465442, -0.0016468 , ...,  1.        ,
         0.        ,  0.        ],
       [ 1.12708577,  1.22532616,  0.81891021, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.13223093, -0.14465442, -0.35331409, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.41347297,  0.56570588, -0.47053652, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.58138187,  0.66718592,  0.46724292, ...,  1.        ,
         0.        ,  0.        ]])

In [42]:
# Display the processed test features
# (scaled numeric columns first, followed by the one-hot encoded columns)
X_test_processed

array([[ 0.66533631,  0.46422584,  0.46724292, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.53940464,  0.31200577,  0.23279806, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.2875413 ,  0.36274579,  1.28779993, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 0.91719966,  1.02236608,  1.05335507, ...,  0.        ,
         0.        ,  1.        ],
       [-0.55200317, -0.34761451, -0.0016468 , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.03567796, -0.24613447, -0.35331409, ...,  1.        ,
         0.        ,  0.        ]])