In [None]:
# Core data libraries (NumPy's importable package name is `numpy`)
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the data.
# The path uses a forward slash: inside a normal string literal the sequence
# '\a' is the BEL control character, so a backslash-separated 'data\auto.csv'
# would not name the intended file.
data = pd.read_csv('data/auto.csv')

# Separate the target (the Price column) from the predictors
y = data['Price']
X = data.drop(columns=['Price'])

### TRAIN | TEST | SPLIT

In [None]:
# Hold out 20% of the rows for validation (fixed seed for reproducibility).
# This presumes the data has already been cleaned up.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=44,
)

### Find categorical columns whose validation values do not all appear in the training data

In [None]:
# All categorical (object-dtype) columns in the training split
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely ordinal encoded: every value found in the
# validation split also occurs in the training split
good_label_cols = [col for col in object_cols
                   if set(X_valid[col]).issubset(set(X_train[col]))]

# Problematic columns that will be dropped from the dataset.
# Filtering object_cols (rather than taking a set difference) keeps the
# result in column order, so the list is identical on every run.
_good_label_set = set(good_label_cols)
bad_label_cols = [col for col in object_cols if col not in _good_label_set]

print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that could be dropped from the dataset:', bad_label_cols)

In [None]:
# Count the distinct values in every categorical column
object_nunique = [X_train[name].nunique() for name in object_cols]
d = dict(zip(object_cols, object_nunique))

# Display (column, unique-count) pairs, lowest cardinality first
sorted(d.items(), key=lambda pair: pair[1])

### PIPELINES ###

With the pipeline, you preprocess the training data and fit the model in a single line of code.

In contrast, without a pipeline, you have to do imputation, one-hot encoding, and model training in separate steps. This becomes messy if you have to deal with both numerical and categorical variables.
With the pipeline, the unprocessed features in X_valid are passed straight to predict(), and the pipeline automatically preprocesses them before generating predictions.

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

### one hot encoding

In [None]:
# Apply one-hot encoder to each column with categorical data.
# The dense-output flag is spelled `sparse_output` in scikit-learn >= 1.2;
# the older `sparse` keyword was deprecated there and later removed. Try the
# current spelling first and fall back for older installations.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# The encoder returns a plain array, which has no row labels; re-attach the
# training index so the frame lines up with X_train.
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[object_cols]),
    index=X_train.index,
)

OH_cols_train  # each categorical column is expanded into one indicator
               # column per distinct value it takes

In [None]:
# Keep only the non-categorical features; the categorical ones are
# replaced by their one-hot encoding
num_X_train = X_train.drop(columns=object_cols)
num_X_train

In [None]:
# Place the one-hot encoded columns alongside the numerical features
OH_X_train = pd.concat(
    [num_X_train, OH_cols_train],
    axis="columns",
)
OH_X_train

### using an imputer

In [None]:
### BASIC EXAMPLE ###
# Select the non-object columns once and reuse the selection below
numeric_train = X_train.select_dtypes(exclude=['object'])

my_imputer = SimpleImputer()

# Imputation returns a plain array without labels; put the column names and
# the row index back so the result stays aligned with X_train.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(numeric_train),
    columns=numeric_train.columns,
    index=numeric_train.index,
)
imputed_X_train

In [None]:
### ADD A COLUMN TO IDENTIFY WHICH VALUES WERE MISSING ###

# Make copy to avoid changing original data (when imputing)
X_train_plus = X_train.select_dtypes(exclude=['object']).copy()

# Make new columns indicating what will be imputed.
# Iterate over a snapshot of this frame's own columns: the loop adds columns
# as it goes, and using the frame itself avoids depending on another cell.
for col in list(X_train_plus.columns):
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()

# Note: this re-fits my_imputer on the widened frame.
# Imputation returns a plain array without labels; put the column names and
# the row index back.
imputed_X_train_plus = pd.DataFrame(
    my_imputer.fit_transform(X_train_plus),
    columns=X_train_plus.columns,
    index=X_train_plus.index,
)

In [None]:
# Column groups routed to each transformer, split on object dtype in the
# same way as the manual examples in this notebook
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()

# Preprocessing for numerical data: fill missing values with a constant
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data: fill missing values with the most
# frequent category, then one-hot encode (categories unseen during fit are
# ignored at transform time)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])