# Pipelines Activity (Core)
- **Student:** Michael McCann
- **Date:** 24 FEB 2022

## Setup -Mount Drive, Import Libraries and Data

In [2]:
## Mount Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
## Import Libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn import set_config
set_config(display='diagram')

In [4]:
## Load and Inspect the Data
cereal_filepath = '/content/drive/MyDrive/Data/Cereal_w_missing_values.xlsx'
cereal_df = pd.read_excel(cereal_filepath)

cereal_df.head()

Unnamed: 0,name,Manufacturer,type,calories per serving,grams of protein,grams of fat,milligrams of sodium,grams of dietary fiber,grams of complex carbohydrates,grams of sugars,milligrams of potassium,vitamins and minerals (% of FDA recommendation),Display shelf,Weight in ounces per one serving,Number of cups in one serving,Rating of cereal
0,Apple Cinnamon Cheerios,General Mills,Cold,110.0,2.0,2.0,180.0,1.5,10.5,10.0,70.0,25.0,1.0,1.0,0.75,29.509541
1,Basic 4,General Mills,Cold,130.0,3.0,2.0,,2.0,18.0,,100.0,25.0,3.0,1.33,0.75,37.038562
2,Cheerios,General Mills,Cold,,6.0,2.0,290.0,2.0,17.0,1.0,105.0,25.0,1.0,1.0,1.25,50.764999
3,Cinnamon Toast Crunch,General Mills,Cold,120.0,1.0,3.0,210.0,0.0,13.0,9.0,45.0,25.0,2.0,1.0,0.75,19.823573
4,Clusters,General Mills,Cold,110.0,3.0,2.0,140.0,2.0,13.0,7.0,105.0,25.0,3.0,1.0,0.5,40.400208


## Define Feature and Target

- Target: calories
- features matrix: manufacturer, type, grams of fat, grams of sugars, and weight per serving

In [5]:
list(cereal_df.columns)

['name',
 'Manufacturer',
 'type',
 'calories per serving',
 'grams of protein',
 'grams of fat',
 'milligrams of sodium',
 'grams of dietary fiber',
 'grams of complex carbohydrates',
 'grams of sugars',
 'milligrams of potassium',
 'vitamins and minerals (% of FDA recommendation)',
 'Display shelf',
 'Weight in ounces per one serving',
 'Number of cups in one serving',
 'Rating of cereal']

In [6]:
# Define target (y) and feature matrix (X)
X = cereal_df.drop(columns = ['calories per serving', 'name', 'grams of protein', 
                              'milligrams of sodium', 'grams of dietary fiber', 
                              'grams of complex carbohydrates', 'milligrams of potassium', 
                              'vitamins and minerals (% of FDA recommendation)', 'Display shelf', 
                              'Number of cups in one serving', 'Rating of cereal'])
y = cereal_df['calories per serving']

In [7]:
X.head()

Unnamed: 0,Manufacturer,type,grams of fat,grams of sugars,Weight in ounces per one serving
0,General Mills,Cold,2.0,10.0,1.0
1,General Mills,Cold,2.0,,1.33
2,General Mills,Cold,2.0,1.0,1.0
3,General Mills,Cold,3.0,9.0,1.0
4,General Mills,Cold,2.0,7.0,1.0


In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 5 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Manufacturer                      77 non-null     object 
 1   type                              68 non-null     object 
 2   grams of fat                      69 non-null     float64
 3   grams of sugars                   68 non-null     float64
 4   Weight in ounces per one serving  77 non-null     float64
dtypes: float64(3), object(2)
memory usage: 3.1+ KB


type, grams of fat, and grams of sugar contain NA values. 
- Type is an object datatype and will be resolved using SimpleImputer most frequent strategy
- grams of fat and grams of sugar are numeric and will be resolved using SimpleImputer mean strategy

## Train Test Split

In [9]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

## Identify Features

- Manufacturer: Nominal
- Cereal Type: Nominal
- Grams of Fat: Numeric
- Grams of Sugar: Numeric
- Weight: Numeric

## Column Transformer

### Column Selectors

In [10]:
# Create 
cat_sel = make_column_selector(dtype_include = 'object')
num_sel = make_column_selector(dtype_include = 'number')

### Instantiate Transformers

In [11]:
cat_imputer = SimpleImputer(strategy = "most_frequent")
num_imputer = SimpleImputer(strategy = "mean")

scaler = StandardScaler()

ohe = OneHotEncoder(handle_unknown='ignore', sparse = False)

### Create Pipeline

In [12]:
num_pipe = make_pipeline(num_imputer, scaler)
# Alt: num_pipe = make_pipeline(SimpleImputer(strategy = 'mean'), StandardScaler())

cat_pipe = make_pipeline(cat_imputer, ohe)
#cat_pipe = make_pipeline(SimpleImputer(strategy = 'most_frequent', OneHotEncoder(handle_unknown = 'ignore', sparse = False)))

### ColumnTransformer

In [13]:
# Create the tuple
num_tuple = (num_pipe, num_sel)
cat_tuple = (cat_pipe, cat_sel)

In [14]:
# Instantiate make preproccesor
preprocessor = make_column_transformer(num_tuple, cat_tuple, remainder = 'passthrough')

In [15]:
# fit the data to the preproccesor
preprocessor.fit(X_train)

In [16]:
# Transform the test/training sets
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [17]:
# Display X_train_processed
display(np.isnan(X_train_processed).any())
print('\n\n')
X_train_processed

False






(57, 11)

In [24]:
# Display X_test_processed
display(np.isnan(X_test_processed).any())
print('\n\n')
X_test_processed

False






(20, 11)

### Pull out column names

In [19]:
cat_feat_names = preprocessor.named_transformers_['pipeline-2'].named_steps['onehotencoder'].get_feature_names_out(cat_sel(X_train))

In [20]:
final_cols = num_sel(X_train) + list(cat_feat_names)
final_cols

['grams of fat',
 'grams of sugars',
 'Weight in ounces per one serving',
 'Manufacturer_General Mills',
 'Manufacturer_Kelloggs',
 'Manufacturer_Nabisco',
 'Manufacturer_Post',
 'Manufacturer_Quaker Oats',
 'Manufacturer_Ralston Purina',
 'type_Cold',
 'type_Hot']

### change back into DF

In [21]:
X_train_output = pd.DataFrame(X_train_processed, columns = final_cols)
X_test_output = pd.DataFrame(X_test_processed, columns = final_cols)

In [22]:
print('Training Set DataFrame:')
display(X_train_output.head())
print('\n\n\nTesting Set DataFrame:')
X_test_output.head()

Training Set DataFrame:


Unnamed: 0,grams of fat,grams of sugars,Weight in ounces per one serving,Manufacturer_General Mills,Manufacturer_Kelloggs,Manufacturer_Nabisco,Manufacturer_Post,Manufacturer_Quaker Oats,Manufacturer_Ralston Purina,type_Cold,type_Hot
0,-0.974679,0.994482,-0.132765,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.221919,2.038807,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.974679,-0.825018,-0.132765,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.676794,3.157496,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,-0.142706,-0.132765,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0





Testing Set DataFrame:


Unnamed: 0,grams of fat,grams of sugars,Weight in ounces per one serving,Manufacturer_General Mills,Manufacturer_Kelloggs,Manufacturer_Nabisco,Manufacturer_Post,Manufacturer_Quaker Oats,Manufacturer_Ralston Purina,type_Cold,type_Hot
0,0.974679,0.084732,-0.132765,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.974679,1.449357,3.157496,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,-0.825018,-0.132765,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.974679,0.767044,-0.132765,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,-0.142706,-0.132765,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
