<a href="https://colab.research.google.com/github/mvince33/Coding-Dojo/blob/main/week05/abalone_preprocessing_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Abalone Preprocessing Exercise
- Michael Vincent
- 7/18/22

## Imports


In [1]:
# Imports
import numpy as np
import pandas as pd
from google.colab import drive

from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the data

In [2]:
# Mount Google Drive first: the dataset path below lives under /content/drive,
# which only exists once Drive is mounted (otherwise a fresh runtime fails here)
drive.mount('/content/drive')

# Read the raw data; header = None makes pandas number the columns 0..8
# instead of consuming the first data row as column names
path = '/content/drive/MyDrive/Coding Dojo/05 Week 1: Intro to Machine Learning and Pre-Processing/abalone.data'
df = pd.read_csv(path, header = None)
# Preview the first rows to confirm the file was parsed as expected
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


# Enter the column names

In [3]:
# Map each positional column index (0-8) to its descriptive column name.
# Index 5 is spelled 'Shucked weight' (previously misspelled 'Schucked weight').
col_dict = {0: 'Sex',
            1: 'Length',
            2: 'Diameter',
            3: 'Height',
            4: 'Whole weight',
            5: 'Shucked weight',
            6: 'Viscera weight',
            7: 'Shell weight',
            8: 'Rings'}

In [5]:
# Give the columns their descriptive names. The renamed frame is assigned back
# rather than mutated with inplace = True, which offers no benefit and makes
# cell re-runs harder to reason about.
df = df.rename(columns = col_dict)
# Confirm the new names and check dtypes / non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Sex              4177 non-null   object 
 1   Length           4177 non-null   float64
 2   Diameter         4177 non-null   float64
 3   Height           4177 non-null   float64
 4   Whole weight     4177 non-null   float64
 5   Schucked weight  4177 non-null   float64
 6   Viscera weight   4177 non-null   float64
 7   Shell weight     4177 non-null   float64
 8   Rings            4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


# Set the features and target and split the data

In [6]:
# 'Rings' is the target; every other column is kept as a feature
y = df['Rings']
X = df.drop(columns = ['Rings'])

In [7]:
# Split into train and test sets using the default split proportions;
# the fixed random_state makes the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

# Setup the preprocessing

In [8]:
# Column selectors choose columns by dtype when the transformer is fitted:
# object columns are treated as categorical, numeric dtypes as numeric
cat_selector = make_column_selector(dtype_include = 'object')
num_selector = make_column_selector(dtype_include = 'number')

In [9]:
# Instantiate the individual transformers (these are combined into a single
# column transformer in a later step)
scaler = StandardScaler()
# handle_unknown = 'ignore' avoids an error when transform() meets a category
# that was not present during fit
ohe = OneHotEncoder(handle_unknown = 'ignore')

In [10]:
# Pair each transformer with the selector for the columns it should process
num_tuple = (scaler, num_selector)
cat_tuple = (ohe, cat_selector)

In [11]:
# Build the column transformer: numeric columns are scaled, categorical columns
# are one-hot encoded, and remainder = 'passthrough' keeps any column matched by
# neither selector instead of dropping it
col_transformer = make_column_transformer(num_tuple, cat_tuple, remainder = 'passthrough')

# Preprocess the data

In [12]:
# Fit the column transformer on the training set only, so the test set
# does not influence the learned preprocessing
col_transformer.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('standardscaler', StandardScaler(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b5d230510>),
                                ('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b5cf19ed0>)])

In [13]:
# Apply the fitted transformer to the training split, then the test split
X_train_processed, X_test_processed = (
    col_transformer.transform(split) for split in (X_train, X_test)
)

In [14]:
# Wrap the transformed arrays in DataFrames for easier inspection
# (columns receive default integer labels)
X_train_df, X_test_df = pd.DataFrame(X_train_processed), pd.DataFrame(X_test_processed)

In [15]:
# Show the processed training frame followed by the processed test frame
display(X_train_df, X_test_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.749291,0.464226,-0.118869,0.457447,0.499098,0.743973,0.241135,1.0,0.0,0.0
1,-0.090254,-0.144654,-0.001647,-0.301655,-0.364269,-0.514040,-0.145838,1.0,0.0,0.0
2,1.127086,1.225326,0.818910,1.523852,1.692114,1.544526,1.179902,0.0,0.0,1.0
3,-0.593980,-0.449095,-1.056649,-0.651696,-0.617673,-0.738195,-0.647469,0.0,0.0,1.0
4,-0.258163,-0.093914,0.350020,-0.052352,-0.572823,-0.605532,0.785763,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
3127,-0.300140,-0.093914,-0.587759,-0.539765,-0.476395,-0.449995,-0.683300,1.0,0.0,0.0
3128,1.211040,1.428286,1.170577,1.132090,0.808565,1.114515,1.144071,1.0,0.0,0.0
3129,-0.132231,-0.144654,-0.353314,-0.516361,-0.530215,-0.440846,-0.375155,0.0,0.0,1.0
3130,0.413473,0.565706,-0.470537,0.446253,0.689711,0.336834,-0.038345,0.0,0.0,1.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.665336,0.464226,0.467243,0.548010,0.263634,1.096216,0.606609,0.0,0.0,1.0
1,0.539405,0.312006,0.232798,0.077896,0.111143,0.304812,0.033316,0.0,0.0,1.0
2,0.287541,0.362746,1.287800,0.298707,-0.256629,0.391729,0.678271,1.0,0.0,0.0
3,0.917200,0.819406,0.701688,0.869559,0.790624,0.775995,1.000748,1.0,0.0,0.0
4,-0.426072,-0.246134,0.115576,-0.441061,-0.570580,-0.674150,-0.181669,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
1040,-0.468049,-0.398355,-0.470537,-0.740224,-0.797074,-0.697023,-0.611638,0.0,1.0,0.0
1041,0.455450,0.514966,0.232798,0.370954,0.577586,0.359707,0.176639,0.0,0.0,1.0
1042,0.917200,1.022366,1.053355,0.964193,0.815292,1.247178,0.979249,0.0,0.0,1.0
1043,-0.552003,-0.347615,-0.001647,-0.658819,-0.763437,-0.706173,-0.253330,0.0,0.0,1.0
