## Exercises

1. Fill the missing values in the numeric columns with the median using Scikit-Learn and see if that helps our best model's performance (hint: see [`sklearn.impute.SimpleImputer`](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer) for more).
2. Try putting multiple steps together (e.g. preprocessing -> modelling) with Scikit-Learn's [`sklearn.pipeline.Pipeline`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) features. 
3. Try using another regression model/estimator on our preprocessed dataset and see how it goes. See the [Scikit-Learn machine learning map](https://scikit-learn.org/stable/machine_learning_map.html) for potential model options.
4. Try replacing the `sklearn.preprocessing.OrdinalEncoder` we used for the categorical variables with `sklearn.preprocessing.OneHotEncoder` (you may even want to do this within a pipeline) with the `sklearn.ensemble.RandomForestRegressor` model and see how it performs. Which is better for our specific dataset? 

## Example Exercise Solutions

The following are examples of how to solve the above exercises.

### 1. Fill the missing values in the numeric columns with the median using Scikit-Learn and see if that helps our best model's performance

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_log_error


# Load the training samples, parsing `saledate` as a datetime so the rows can
# be placed in chronological order.
train_df = pd.read_csv(
    filepath_or_buffer="../data/bluebook-for-bulldozers/Train.csv",
    parse_dates=["saledate"],
    low_memory=False,
)
train_df = train_df.sort_values(by="saledate")

# Load the validation samples (dates parsed the same way; sorted further down).
valid_df = pd.read_csv(
    filepath_or_buffer="../data/bluebook-for-bulldozers/Valid.csv",
    parse_dates=["saledate"],
)

# ValidSolution.csv holds the SalePrice values for the samples in Valid.csv.
valid_solution = pd.read_csv(
    filepath_or_buffer="../data/bluebook-for-bulldozers/ValidSolution.csv",
)

# Attach a SalePrice to every validation row by looking up its SalesID.
valid_df["SalePrice"] = valid_df["SalesID"].map(
    valid_solution.set_index("SalesID")["SalePrice"]
)

# Put the validation rows in chronological order and renumber them from 0.
valid_df = valid_df.sort_values(by="saledate").reset_index(drop=True)

# Report the size of each split.
print(f"[INFO] Number of samples in training DataFrame: {len(train_df)}")
print(f"[INFO] Number of samples in validation DataFrame: {len(valid_df)}")


# Make a function to add date columns
def add_datetime_features_to_df(df, date_column="saledate"):
    """Return a copy of ``df`` with date-part columns replacing ``date_column``.

    Args:
        df: DataFrame containing a datetime column named ``date_column``.
        date_column: Name of the datetime column to expand and then drop.

    Returns:
        A new DataFrame with ``saleYear``, ``saleMonth``, ``saleDay``,
        ``saleDayofweek`` and ``saleDayofyear`` added and ``date_column``
        removed. The input DataFrame is left untouched.
    """
    # Work on a copy so the caller's DataFrame does not gain the new columns.
    df = df.copy()

    # Add datetime parameters derived from the date column
    df["saleYear"] = df[date_column].dt.year
    df["saleMonth"] = df[date_column].dt.month
    df["saleDay"] = df[date_column].dt.day
    df["saleDayofweek"] = df[date_column].dt.dayofweek
    df["saleDayofyear"] = df[date_column].dt.dayofyear

    # Drop the column that was actually expanded (not a hard-coded name)
    df = df.drop(date_column, axis=1)

    return df

# Expand the sale date into its date-part columns for both splits
train_df = add_datetime_features_to_df(df=train_df)
valid_df = add_datetime_features_to_df(df=valid_df)


# Training split: everything except SalePrice is a feature, SalePrice is the label
X_train = train_df.drop(columns="SalePrice")
y_train = train_df["SalePrice"]

# Validation split, separated the same way
X_valid = valid_df.drop(columns="SalePrice")
y_valid = valid_df["SalePrice"]


# Group the feature columns by dtype (the grouping is decided on X_train only)
numerical_features = [
    column_name
    for column_name, column in X_train.items()
    if pd.api.types.is_numeric_dtype(column)
]
categorical_features = [
    column_name
    for column_name, column in X_train.items()
    if not pd.api.types.is_numeric_dtype(column)
]


### Filling missing values ###

# Ordinal encoder: maps each category to a number. Categories not seen during
# fitting are encoded as np.nan.
ordinal_encoder = OrdinalEncoder(
    categories="auto",
    handle_unknown="use_encoded_value",
    unknown_value=np.nan,
    encoded_missing_value=np.nan,
)

# Imputer that replaces missing numeric values with the column median
simple_imputer_median = SimpleImputer(missing_values=np.nan, strategy="median")

# Fit on the training data and transform it. A copy is used so the original
# DataFrame stays intact and can still be inspected later.
# OrdinalEncoder expects all values to share one type, hence the cast to str.
# NOTE(review): `.astype(str)` appears to turn missing values into the string
# "nan", so they would be encoded as an ordinary category - confirm.
X_train_preprocessed = X_train.copy()
X_train_preprocessed[categorical_features] = ordinal_encoder.fit_transform(
    X_train[categorical_features].astype(str)
)
X_train_preprocessed[numerical_features] = simple_imputer_median.fit_transform(
    X_train[numerical_features]
)

# Validation data: only `transform`, reusing what was fitted on the training data
X_valid_preprocessed = X_valid.copy()
X_valid_preprocessed[categorical_features] = ordinal_encoder.transform(
    X_valid[categorical_features].astype(str)
)
X_valid_preprocessed[numerical_features] = simple_imputer_median.transform(
    X_valid[numerical_features]
)


# Create function to evaluate our model
def show_scores(model,
                train_features=X_train_preprocessed,
                train_labels=y_train,
                valid_features=X_valid_preprocessed,
                valid_labels=y_valid):
    """Evaluate a fitted regression model on the training and validation data.

    Args:
        model: Fitted estimator exposing ``predict``.
        train_features: Training features (defaults are bound when this
            function is defined, i.e. to the objects that existed at that time).
        train_labels: True training targets.
        valid_features: Validation features.
        valid_labels: True validation targets.

    Returns:
        dict with MAE, RMSLE and R^2 for the training and validation data.
    """
    # Local import: only the MAE and RMSLE metrics are imported at the top of the cell.
    from sklearn.metrics import r2_score

    # Predict once per split and reuse the predictions for every metric.
    # A regressor's `.score` is the R^2 of its own predictions, so calling it
    # would repeat the prediction work for the same result.
    train_preds = model.predict(X=train_features)
    val_preds = model.predict(X=valid_features)

    # Create a scores dictionary of different evaluation metrics
    scores = {"Training MAE": mean_absolute_error(y_true=train_labels, y_pred=train_preds),
              "Valid MAE": mean_absolute_error(y_true=valid_labels, y_pred=val_preds),
              "Training RMSLE": root_mean_squared_log_error(y_true=train_labels, y_pred=train_preds),
              "Valid RMSLE": root_mean_squared_log_error(y_true=valid_labels, y_pred=val_preds),
              "Training R^2": r2_score(y_true=train_labels, y_pred=train_preds),
              "Valid R^2": r2_score(y_true=valid_labels, y_pred=val_preds)}

    return scores


# Build the random forest with the best hyperparameters found so far
ideal_model_2 = RandomForestRegressor(
    n_estimators=90,
    max_features=0.5,
    min_samples_split=5,
    min_samples_leaf=1,
    max_depth=None,
    max_samples=None,
    n_jobs=-1,
)

# Train on the preprocessed (encoded + median-imputed) training data
ideal_model_2.fit(X=X_train_preprocessed, y=y_train)


# Evaluate the model (show_scores falls back to its default train/valid data)
ideal_model_2_scores = show_scores(model=ideal_model_2)
ideal_model_2_scores

[INFO] Number of samples in training DataFrame: 401125
[INFO] Number of samples in validation DataFrame: 11573


{'Training MAE': 1950.6890126915082,
 'Valid MAE': 5938.157777189047,
 'Training RMSLE': 0.1017304155606623,
 'Valid RMSLE': 0.2454972961150074,
 'Training R^2': 0.9811165852928445,
 'Valid R^2': 0.8820009919620663}

Looks like filling the missing numeric values made our `ideal_model_2` perform slightly worse than our original `ideal_model`.

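`ideal_model_2` had a validation RMSLE of `0.2454972961150074`, whereas `ideal_model` had a validation RMSLE of `0.2442769808502894`. Note that the gap is small and the `RandomForestRegressor` above is created without a fixed `random_state`, so part of the difference may simply be run-to-run variation.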

### 2. Try putting multiple steps together (e.g. preprocessing -> modelling) with Scikit-Learn's `sklearn.pipeline.Pipeline`

In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_log_error


# Load the training data with parsed sale dates, oldest sale first
train_df = pd.read_csv(
    "../data/bluebook-for-bulldozers/Train.csv",
    parse_dates=["saledate"],
    low_memory=False,
)
train_df = train_df.sort_values(by="saledate")

# Load the validation data with parsed sale dates
valid_df = pd.read_csv(
    "../data/bluebook-for-bulldozers/Valid.csv",
    parse_dates=["saledate"],
)

# Look up each validation row's SalePrice by SalesID, then sort by date and
# renumber the rows from 0
valid_solution = pd.read_csv("../data/bluebook-for-bulldozers/ValidSolution.csv")
valid_df["SalePrice"] = valid_df["SalesID"].map(
    valid_solution.set_index("SalesID")["SalePrice"]
)
valid_df = valid_df.sort_values(by="saledate").reset_index(drop=True)

# Add datetime features
def add_datetime_features_to_df(df, date_column="saledate"):
    """Return a copy of ``df`` where ``date_column`` is replaced by date parts.

    The copy gains ``saleYear``, ``saleMonth``, ``saleDay``, ``saleDayofweek``
    and ``saleDayofyear`` columns, and ``date_column`` itself is dropped.
    """
    # (new column name, attribute of the `.dt` accessor), in insertion order
    date_parts = (
        ("saleYear", "year"),
        ("saleMonth", "month"),
        ("saleDay", "day"),
        ("saleDayofweek", "dayofweek"),
        ("saleDayofyear", "dayofyear"),
    )

    enriched = df.copy()
    for new_column, dt_attribute in date_parts:
        enriched[new_column] = getattr(enriched[date_column].dt, dt_attribute)

    return enriched.drop(date_column, axis=1)

# Expand the sale date into date-part columns for both splits
train_df = add_datetime_features_to_df(df=train_df)
valid_df = add_datetime_features_to_df(df=valid_df)


# Separate the SalePrice label from the feature columns
X_train = train_df.drop(columns="SalePrice")
y_train = train_df["SalePrice"]
X_valid = valid_df.drop(columns="SalePrice")
y_valid = valid_df["SalePrice"]


# Group the feature columns by dtype (decided on X_train only)
numeric_features = [
    column_name
    for column_name, column in X_train.items()
    if pd.api.types.is_numeric_dtype(column)
]
categorical_features = [
    column_name
    for column_name, column in X_train.items()
    if not pd.api.types.is_numeric_dtype(column)
]


# Numeric columns: fill missing values with the column median
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)

# Categorical columns: cast every value to str, then ordinal-encode.
# NOTE(review): the str cast appears to turn missing values into the string
# "nan", so they would be encoded as a regular category - confirm.
categorical_transformer = Pipeline(
    steps=[
        ("string_converter", FunctionTransformer(func=lambda frame: frame.astype(str))),
        (
            "ordinal",
            OrdinalEncoder(
                handle_unknown="use_encoded_value",
                unknown_value=np.nan,
                encoded_missing_value=np.nan,
            ),
        ),
    ]
)

# Route each group of columns to its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical_transforms", numeric_transformer, numeric_features),
        ("categorical_transforms", categorical_transformer, categorical_features),
    ]
)


# Full pipeline: preprocessing followed by the random forest
model_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            RandomForestRegressor(
                n_estimators=90,
                max_features=0.5,
                min_samples_split=5,
                min_samples_leaf=1,
                max_depth=None,
                max_samples=None,
                n_jobs=-1,
            ),
        ),
    ]
)


# Function to evaluate the pipeline
def evaluate_pipeline(pipeline, X_train, y_train, X_valid, y_valid):
    """Score a fitted regression pipeline on the training and validation data.

    Args:
        pipeline: Fitted pipeline/estimator exposing ``predict``.
        X_train: Training features.
        y_train: True training targets.
        X_valid: Validation features.
        y_valid: True validation targets.

    Returns:
        dict with MAE, RMSLE and R^2 for the training and validation data.
    """
    # Local import: only the MAE and RMSLE metrics are imported at the top of the cell.
    from sklearn.metrics import r2_score

    # Predict once per split and reuse the predictions for every metric.
    # A regressor's `.score` is the R^2 of its own predictions, so calling it
    # would push the data through the whole pipeline a second time.
    train_preds = pipeline.predict(X_train)
    valid_preds = pipeline.predict(X_valid)

    # Calculate scores
    scores = {
        "Training MAE": mean_absolute_error(y_train, train_preds),
        "Valid MAE": mean_absolute_error(y_valid, valid_preds),
        "Training RMSLE": root_mean_squared_log_error(y_train, train_preds),
        "Valid RMSLE": root_mean_squared_log_error(y_valid, valid_preds),
        "Training R^2": r2_score(y_train, train_preds),
        "Valid R^2": r2_score(y_valid, valid_preds),
    }

    return scores


# Train the whole pipeline (preprocessing + model) on the raw training features
model_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on both splits and display the results
pipeline_scores = evaluate_pipeline(
    pipeline=model_pipeline,
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
)
print("\nPipeline Scores:")
pipeline_scores


Pipeline Scores:


{'Training MAE': 1950.4811108609263,
 'Valid MAE': 5949.404248941762,
 'Training RMSLE': 0.1018847331610539,
 'Valid RMSLE': 0.2466149973069877,
 'Training R^2': 0.9811144473380727,
 'Valid R^2': 0.8814994739990883}

### 3. Try using another regression model/estimator on our preprocessed dataset and see how it goes

Going to use [`sklearn.ensemble.HistGradientBoostingRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html#histgradientboostingregressor).

In [3]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_log_error


# Load the training data with parsed sale dates, oldest sale first
train_df = pd.read_csv(
    "../data/bluebook-for-bulldozers/Train.csv",
    parse_dates=["saledate"],
    low_memory=False,
)
train_df = train_df.sort_values(by="saledate")

# Load the validation data with parsed sale dates
valid_df = pd.read_csv(
    "../data/bluebook-for-bulldozers/Valid.csv",
    parse_dates=["saledate"],
)

# Look up each validation row's SalePrice by SalesID, then sort by date and
# renumber the rows from 0
valid_solution = pd.read_csv("../data/bluebook-for-bulldozers/ValidSolution.csv")
valid_df["SalePrice"] = valid_df["SalesID"].map(
    valid_solution.set_index("SalesID")["SalePrice"]
)
valid_df = valid_df.sort_values(by="saledate").reset_index(drop=True)


# Add datetime features
def add_datetime_features_to_df(df, date_column="saledate"):
    """Return a copy of ``df`` where ``date_column`` is replaced by date parts.

    The copy gains ``saleYear``, ``saleMonth``, ``saleDay``, ``saleDayofweek``
    and ``saleDayofyear`` columns, and ``date_column`` itself is dropped.
    """
    # (new column name, attribute of the `.dt` accessor), in insertion order
    date_parts = (
        ("saleYear", "year"),
        ("saleMonth", "month"),
        ("saleDay", "day"),
        ("saleDayofweek", "dayofweek"),
        ("saleDayofyear", "dayofyear"),
    )

    enriched = df.copy()
    for new_column, dt_attribute in date_parts:
        enriched[new_column] = getattr(enriched[date_column].dt, dt_attribute)

    return enriched.drop(date_column, axis=1)

# Expand the sale date into date-part columns for both splits
train_df = add_datetime_features_to_df(df=train_df)
valid_df = add_datetime_features_to_df(df=valid_df)


# Separate the SalePrice label from the feature columns
X_train = train_df.drop(columns="SalePrice")
y_train = train_df["SalePrice"]
X_valid = valid_df.drop(columns="SalePrice")
y_valid = valid_df["SalePrice"]


# Group the feature columns by dtype (decided on X_train only)
numeric_features = [
    column_name
    for column_name, column in X_train.items()
    if pd.api.types.is_numeric_dtype(column)
]
categorical_features = [
    column_name
    for column_name, column in X_train.items()
    if not pd.api.types.is_numeric_dtype(column)
]


# Numeric columns: fill missing values with the column median
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)

# Categorical columns: cast every value to str, then ordinal-encode.
# NOTE(review): the str cast appears to turn missing values into the string
# "nan", so they would be encoded as a regular category - confirm.
categorical_transformer = Pipeline(
    steps=[
        ("string_converter", FunctionTransformer(func=lambda frame: frame.astype(str))),
        (
            "ordinal",
            OrdinalEncoder(
                categories="auto",
                handle_unknown="use_encoded_value",
                unknown_value=np.nan,
                encoded_missing_value=np.nan,
            ),
        ),
    ]
)

# Route each group of columns to its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical_transforms", numeric_transformer, numeric_features),
        ("categorical_transforms", categorical_transformer, categorical_features),
    ]
)

# Full pipeline: same preprocessing, but the model is now a
# HistGradientBoostingRegressor with its default settings
model_pipeline_hist_gradient_boosting_regressor = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", HistGradientBoostingRegressor()),
    ]
)


# Function to evaluate the pipeline
def evaluate_pipeline(pipeline, X_train, y_train, X_valid, y_valid):
    """Score a fitted regression pipeline on the training and validation data.

    Args:
        pipeline: Fitted pipeline/estimator exposing ``predict``.
        X_train: Training features.
        y_train: True training targets.
        X_valid: Validation features.
        y_valid: True validation targets.

    Returns:
        dict with MAE, RMSLE and R^2 for the training and validation data.
    """
    # Local import: only the MAE and RMSLE metrics are imported at the top of the cell.
    from sklearn.metrics import r2_score

    # Predict once per split and reuse the predictions for every metric.
    # A regressor's `.score` is the R^2 of its own predictions, so calling it
    # would push the data through the whole pipeline a second time.
    train_preds = pipeline.predict(X_train)
    valid_preds = pipeline.predict(X_valid)

    # Calculate scores
    scores = {
        "Training MAE": mean_absolute_error(y_train, train_preds),
        "Valid MAE": mean_absolute_error(y_valid, valid_preds),
        "Training RMSLE": root_mean_squared_log_error(y_train, train_preds),
        "Valid RMSLE": root_mean_squared_log_error(y_valid, valid_preds),
        "Training R^2": r2_score(y_train, train_preds),
        "Valid R^2": r2_score(y_valid, valid_preds),
    }

    return scores

# Train the whole pipeline (preprocessing + model) on the raw training features
print("[INFO] Fitting HistGradientBoostingRegressor model with pipeline...")
model_pipeline_hist_gradient_boosting_regressor.fit(X_train, y_train)

# Score the fitted pipeline on both splits
print("[INFO] Evaluating HistGradientBoostingRegressor model with pipeline...")
pipeline_hist_scores = evaluate_pipeline(
    pipeline=model_pipeline_hist_gradient_boosting_regressor,
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
)

# Display the results
print("\nPipeline HistGradientBoostingRegressor Scores:")
pipeline_hist_scores

[INFO] Fitting HistGradientBoostingRegressor model with pipeline...
[INFO] Evaluating HistGradientBoostingRegressor model with pipeline...

Pipeline HistGradientBoostingRegressor Scores:


{'Training MAE': 5627.763561175847,
 'Valid MAE': 7241.016939519321,
 'Training RMSLE': 0.2690753757199006,
 'Valid RMSLE': 0.3026875195607664,
 'Training R^2': 0.8653678318987129,
 'Valid R^2': 0.8336503961646513}

### 4. Try replacing the `sklearn.preprocessing.OrdinalEncoder` we used for the categorical variables with `sklearn.preprocessing.OneHotEncoder`

> **Note:** This may take quite a long time depending on your machine. For example, on my ASUS Vivobook X1605ZA it took ~15 minutes with `n_estimators=10` (9x lower than what we used for our `best_model`). This is because using `sklearn.preprocessing.OneHotEncoder` adds many more features to our dataset (each feature gets turned into an array of 0's and 1's for each unique value). And the more features, the longer it takes to compute and find patterns between them.

In [4]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_log_error


# Load the training data with parsed sale dates, oldest sale first
train_df = pd.read_csv(
    "../data/bluebook-for-bulldozers/Train.csv",
    parse_dates=["saledate"],
    low_memory=False,
)
train_df = train_df.sort_values(by="saledate")

# Load the validation data with parsed sale dates
valid_df = pd.read_csv(
    "../data/bluebook-for-bulldozers/Valid.csv",
    parse_dates=["saledate"],
)

# Look up each validation row's SalePrice by SalesID, then sort by date and
# renumber the rows from 0
valid_solution = pd.read_csv("../data/bluebook-for-bulldozers/ValidSolution.csv")
valid_df["SalePrice"] = valid_df["SalesID"].map(
    valid_solution.set_index("SalesID")["SalePrice"]
)
valid_df = valid_df.sort_values(by="saledate").reset_index(drop=True)


# Add datetime features
def add_datetime_features_to_df(df, date_column="saledate"):
    """Return a copy of ``df`` where ``date_column`` is replaced by date parts.

    The copy gains ``saleYear``, ``saleMonth``, ``saleDay``, ``saleDayofweek``
    and ``saleDayofyear`` columns, and ``date_column`` itself is dropped.
    """
    # (new column name, attribute of the `.dt` accessor), in insertion order
    date_parts = (
        ("saleYear", "year"),
        ("saleMonth", "month"),
        ("saleDay", "day"),
        ("saleDayofweek", "dayofweek"),
        ("saleDayofyear", "dayofyear"),
    )

    enriched = df.copy()
    for new_column, dt_attribute in date_parts:
        enriched[new_column] = getattr(enriched[date_column].dt, dt_attribute)

    return enriched.drop(date_column, axis=1)

# Expand the sale date into date-part columns for both splits
train_df = add_datetime_features_to_df(df=train_df)
valid_df = add_datetime_features_to_df(df=valid_df)


# Separate the SalePrice label from the feature columns
X_train = train_df.drop(columns="SalePrice")
y_train = train_df["SalePrice"]
X_valid = valid_df.drop(columns="SalePrice")
y_valid = valid_df["SalePrice"]


# Group the feature columns by dtype (decided on X_train only)
numeric_features = [
    column_name
    for column_name, column in X_train.items()
    if pd.api.types.is_numeric_dtype(column)
]
categorical_features = [
    column_name
    for column_name, column in X_train.items()
    if not pd.api.types.is_numeric_dtype(column)
]


# Create preprocessing steps
# Numeric columns: fill missing values with the column median
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

# Categorical columns: the imputer has to run BEFORE the string conversion.
# Casting to str first turns missing values into the text "nan", leaving the
# imputer with nothing to fill.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), # fill missing values with the term "missing"
    ('string_converter', FunctionTransformer(lambda x: x.astype(str))), # make every value a string so each column has a single type
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)) # use OneHotEncoder instead of OrdinalEncoder
])


# Route each group of columns to its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    verbose_feature_names_out=False,  # Simplify feature names
)

# Full pipeline: preprocessing followed by a random forest with fewer trees
# (n_estimators=10) than the 90 used with ordinal encoding
model_one_hot_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            RandomForestRegressor(
                n_estimators=10,
                max_features=0.5,
                min_samples_split=5,
                min_samples_leaf=1,
                max_depth=None,
                max_samples=None,
                n_jobs=-1,
            ),
        ),
    ]
)


# Function to evaluate the pipeline
def evaluate_pipeline(pipeline, X_train, y_train, X_valid, y_valid):
    """Score a fitted regression pipeline on the training and validation data.

    Args:
        pipeline: Fitted pipeline/estimator exposing ``predict``.
        X_train: Training features.
        y_train: True training targets.
        X_valid: Validation features.
        y_valid: True validation targets.

    Returns:
        dict with MAE, RMSLE and R^2 for the training and validation data.
    """
    # Local import: only the MAE and RMSLE metrics are imported at the top of the cell.
    from sklearn.metrics import r2_score

    # Predict once per split and reuse the predictions for every metric.
    # A regressor's `.score` is the R^2 of its own predictions, so calling it
    # would push the data through the whole pipeline a second time.
    train_preds = pipeline.predict(X_train)
    valid_preds = pipeline.predict(X_valid)

    # Calculate scores
    scores = {
        "Training MAE": mean_absolute_error(y_train, train_preds),
        "Valid MAE": mean_absolute_error(y_valid, valid_preds),
        "Training RMSLE": root_mean_squared_log_error(y_train, train_preds),
        "Valid RMSLE": root_mean_squared_log_error(y_valid, valid_preds),
        "Training R^2": r2_score(y_train, train_preds),
        "Valid R^2": r2_score(y_valid, valid_preds),
    }

    return scores

# Train the whole pipeline (preprocessing + model) on the raw training features
print("[INFO] Fitting model with one hot encoded values...")
model_one_hot_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on both splits
print("[INFO] Evaluating model with one hot encoded values...")
pipeline_one_hot_scores = evaluate_pipeline(
    pipeline=model_one_hot_pipeline,
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
)

# Display the results
print("[INFO] Pipeline with one hot encoding scores:")
pipeline_one_hot_scores

[INFO] Fitting model with one hot encoded values...
[INFO] Evaluating model with one hot encoded values...
[INFO] Pipeline with one hot encoding scores:


{'Training MAE': 2118.909424271624,
 'Valid MAE': 6161.482251483096,
 'Training RMSLE': 0.10960067141291492,
 'Valid RMSLE': 0.2532635777630417,
 'Training R^2': 0.9763890510261058,
 'Valid R^2': 0.8706567646072217}