<a href="https://colab.research.google.com/github/mvince33/Coding-Dojo/blob/main/week05/pipeline_activity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pipeline Activity
- Michael Vincent
- 7/20/22

## Imports and data loading

In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import set_config

# Show estimators as diagrams when they are displayed in the notebook
set_config(display='diagram')

In [2]:
# Read the cereal data from the published Google Sheets CSV export and
# index the rows by the cereal's name
url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTJvi9qGevLpM9zf98uxf6y9gZ7xsufCUac60w6P_LVJ5Nb95DF4ZsRAlRl-9EcwA/pub?output=csv'
df = pd.read_csv(url, index_col='name')

# Preview the first few rows
df.head()

Unnamed: 0_level_0,Manufacturer,type,calories per serving,grams of protein,grams of fat,milligrams of sodium,grams of dietary fiber,grams of complex carbohydrates,grams of sugars,milligrams of potassium,vitamins and minerals (% of FDA recommendation),Display shelf,Weight in ounces per one serving,Number of cups in one serving,Rating of cereal
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Apple Cinnamon Cheerios,General Mills,Cold,110.0,2,2.0,180.0,1.5,10.5,10.0,70,25.0,1,1.0,0.75,29.509541
Basic 4,General Mills,Cold,130.0,3,2.0,,2.0,18.0,,100,25.0,3,1.33,0.75,37.038562
Cheerios,General Mills,Cold,,6,2.0,290.0,2.0,17.0,1.0,105,25.0,1,1.0,1.25,50.764999
Cinnamon Toast Crunch,General Mills,Cold,120.0,1,3.0,210.0,0.0,13.0,9.0,45,25.0,2,1.0,0.75,19.823573
Clusters,General Mills,Cold,110.0,3,2.0,140.0,2.0,13.0,7.0,105,25.0,3,1.0,0.5,40.400208


## Classify the features

In [3]:
# Classify the features by dtype.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own here; wrapping it in print() would add a stray "None"
# line after each report.
df.select_dtypes(include='object').info()
print()
df.select_dtypes(include='number').info()

<class 'pandas.core.frame.DataFrame'>
Index: 77 entries, Apple Cinnamon Cheerios to Quaker Oatmeal
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Manufacturer  77 non-null     object
 1   type          68 non-null     object
dtypes: object(2)
memory usage: 1.8+ KB
None 

<class 'pandas.core.frame.DataFrame'>
Index: 77 entries, Apple Cinnamon Cheerios to Quaker Oatmeal
Data columns (total 13 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   calories per serving                             70 non-null     float64
 1   grams of protein                                 77 non-null     int64  
 2   grams of fat                                     69 non-null     float64
 3   milligrams of sodium                             76 non-null     float64
 4   grams of dietary fiber                           

> The nominal features are:
- Manufacturer
- type

> The numeric features are:
- calories per serving
- grams of protein
- grams of fat
- milligrams of sodium
- grams of dietary fiber
- grams of complex carbohydrates
- grams of sugars
- milligrams of potassium
- vitamins and minerals (% of FDA recommendation)
- Weight in ounces per one serving
- Number of cups in one serving
- Rating of cereal

> The ordinal features are:
- Display shelf. (One could argue this is nominal, but I am considering the display shelf as being ordered.)

## Set the target and features and split the data

In [4]:
# Choose the predictor columns
features = [
    'Manufacturer',
    'type',
    'grams of fat',
    'grams of sugars',
    'Weight in ounces per one serving',
]

# Separate the feature matrix from the target
# NOTE(review): the .info() output earlier in the notebook reports 70 non-null
# values out of 77 rows for 'calories per serving', so y contains missing
# values -- confirm how these should be handled before fitting a model.
X, y = df[features], df['calories per serving']

# Hold out a test set; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Construct the Preprocessor

In [5]:
# Build the column selectors: numeric columns vs. object (string) columns
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

# Instantiate the column transformers
scaler = StandardScaler()

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current keyword first and fall back to
# the legacy one so this cell runs on both older and newer releases.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Instantiate the imputers
mean_imputer = SimpleImputer(strategy='mean')
freq_imputer = SimpleImputer(strategy='most_frequent')

# Instantiate the pipelines: impute first, then scale / encode
num_pipeline = make_pipeline(mean_imputer, scaler)
cat_pipeline = make_pipeline(freq_imputer, ohe)

# Pair each pipeline with the selector that picks its columns
num_tuple = (num_pipeline, num_selector)
cat_tuple = (cat_pipeline, cat_selector)

# Instantiate the ColumnTransformer
preprocessor = make_column_transformer(num_tuple, cat_tuple)

## Preprocess the data

In [6]:
# Fit the preprocessor on the training data only
preprocessor.fit(X_train)

# Transform both splits with the fitted preprocessor
X_train_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

In [7]:
# Confirm there are no missing values.
# np.isnan(...).sum() already reduces over the whole array to a single count,
# so one .sum() is enough.
print('There are', np.isnan(X_train_processed).sum(), 'missing values in the training set.')
print('There are', np.isnan(X_test_processed).sum(), 'missing values in the test set.')

# Display the data
print()
print(X_train_processed)
print()
print(X_test_processed)

There are 0 missing values in the traning set.
There are 0 missing values in the test set.

[[-9.74679434e-01  9.94481647e-01 -1.32764897e-01  0.00000000e+00
   1.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  1.22191915e+00  2.03880702e+00  0.00000000e+00
   1.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00  0.00000000e+00]
 [-9.74679434e-01 -8.25018407e-01 -1.32764897e-01  0.00000000e+00
   1.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  1.67679417e+00  3.15749558e+00  1.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00 -1.42705887e-01 -1.32764897e-01  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  1.00000000e+00  0.0000000