# Training Regression Models using sklearn pipelines

# Without Pipeline

## Importing Libraries

In [70]:
#importing Libraries
import pandas as pd
# import dvc.api
import os
import sys
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import mlflow
#import local libraries
#Adding scripts path
sys.path.append(os.path.abspath(os.path.join('..')))
from scripts.data_loader import load_df_from_csv
from scripts.ML_modelling_utils import *
from scripts.results_pickler import ResultPickler
from scripts.data_manipulation import DataManipulator
from scripts.data_information import DataInfo
from scripts.data_cleaner import DataCleaner

In [72]:
results = ResultPickler()

## Loading Clean Data

In [3]:
clean_data = load_df_from_csv('../data/train.csv')
y_values = clean_data['Sales']
x_values = clean_data.drop(['Sales'],axis=1)

## Training using Random Forest Regressor

In [4]:
# Splitting Data (60,20,20)
x_train, x_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.2, random_state=42)
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [5]:
mlflow.autolog(log_input_examples=True, log_model_signatures=True, log_models=True, silent=True)
# Create a based model
rf = RandomForestRegressor()
with mlflow.start_run() as run:
    rf.fit(x_train, y_train)

    train_score = rf.score(x_train, y_train)
    valid_score = rf.score(x_valid,y_valid)
    valid_metrics = calculate_metrics(y_valid,rf.predict(x_valid),"Validation ")
    test_score = rf.score(x_test,y_test)
    test_metrics = calculate_metrics(y_test, rf.predict(x_test), "Test ")

    mlflow.log_metric("Valid Score", valid_score)
    mlflow.log_metrics(valid_metrics)
    mlflow.log_metric("Test Score", test_score)
    mlflow.log_metrics(test_metrics)
    save_model(rf, test_metrics['Test RMSE Score'])


Modelling Utilities:INFO->Validation  RMSE Score is: 23.20366%
Modelling Utilities:INFO->Validation  R2 Square Score is: 94.64278%
Modelling Utilities:INFO->Validation  MAE Score is: 13.35542%
Modelling Utilities:INFO->Test  RMSE Score is: 23.24267%
Modelling Utilities:INFO->Test  R2 Square Score is: 94.58028%
Modelling Utilities:INFO->Test  MAE Score is: 13.36098%
Modelling Utilities:ERROR->Failed to save model
Traceback (most recent call last):
  File "d:\10Academy\Week 3\pharmacy_sales_prediction\scripts\ML_modelling_utils.py", line 40, in save_model
    with open(file_name, 'wb') as handle:
FileNotFoundError: [Errno 2] No such file or directory: '../model/31-07-2021-20-07-56-23.24%.pkl'


In [6]:
train_score

0.9918316294617459

In [7]:
valid_score

0.9464277876115708

In [8]:
valid_metrics

{'Validation RMSE Score': 0.2320365947326469,
 'Validation R2_Squared': 0.9464277876115708,
 'Validation MAE Score': 0.13355418370087555}

In [9]:
test_metrics


{'Test RMSE Score': 0.23242674862611581,
 'Test R2_Squared': 0.9458027905681076,
 'Test MAE Score': 0.133609822914164}

In [10]:
features = pd.DataFrame()
features["Feature"] = x_train.columns
features["Importance"] = rf.feature_importances_
features = features.sort_values(by='Importance', ascending=False)
results.add_data('rf-normal-features',features)
features

Unnamed: 0,Feature,Importance
7,Open,0.460066
15,CompetitionDistance,0.161513
8,Promo,0.073335
17,CompetitionOpenSinceYear,0.050575
16,CompetitionOpenSinceMonth,0.049606
0,DayOfWeek,0.033238
19,Promo2SinceWeek,0.024697
13,StoreType,0.024501
5,Day,0.022847
3,Month,0.019423


## Parameter Tunning

In [14]:
# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True, False],
    'criterion': ['mse'],
    'max_depth': [10, 15, 20, 25],
    'max_features': [2, 3, 4, 5],
    'n_estimators': [10, 15, 20],
    'warm_start': [True, False]
}

rf2 = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf2, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=0)


**MSE** is used to check how close estimates or forecasts are to actual values. Lower the MSE, the closer is forecast to actual.

In [6]:
# Fit the grid search to the data
mlflow.autolog(log_input_examples=True, log_model_signatures=True, log_models=True, silent=True)
with mlflow.start_run() as run:
    best_model = grid_search.fit(x_train, y_train)

    train_score = best_model.score(x_train,y_train)
    valid_score = best_model.score(x_valid, y_valid)
    valid_metrics = calculate_metrics(
        y_valid, best_model.predict(x_valid), "Validation ")
    test_score = best_model.score(x_test, y_test)
    test_metrics = calculate_metrics(
        y_test, best_model.predict(x_test), "Test ")

    mlflow.log_metric("Valid Score", valid_score)
    mlflow.log_metrics(valid_metrics)
    mlflow.log_metric("Test Score", test_score)
    mlflow.log_metrics(test_metrics)


Modelling Utilities:INFO->Validation  RMSE Score is: 32.79238%
Modelling Utilities:INFO->Validation  R2 Square Score is: 89.30028%
Modelling Utilities:INFO->Validation  MAE Score is: 20.01977%
Modelling Utilities:INFO->Test  RMSE Score is: 32.47726%
Modelling Utilities:INFO->Test  R2 Square Score is: 89.41810%
Modelling Utilities:INFO->Test  MAE Score is: 19.87374%


In [7]:
save_model(best_model, test_metrics['Test RMSE Score'])

In [8]:
train_score

0.9629807562885361

In [9]:
valid_score


0.8930027877918323

In [10]:
valid_metrics

{'Validation RMSE Score': 0.32792378910963305,
 'Validation R2_Squared': 0.8930027877918323,
 'Validation MAE Score': 0.2001976505097582}

In [11]:
test_metrics

{'Test RMSE Score': 0.32477261016869247,
 'Test R2_Squared': 0.8941810365453975,
 'Test MAE Score': 0.19873742418354615}

In [12]:
best_model.best_params_


{'bootstrap': False,
 'criterion': 'mse',
 'max_depth': 25,
 'max_features': 5,
 'n_estimators': 25,
 'warm_start': True}

In [13]:
grid_features = pd.DataFrame()
grid_features["Feature"] = x_train.columns
grid_features["Importance"] = best_model.best_estimator_.feature_importances_
features = grid_features.sort_values(by='Importance', ascending=False)
results.add_data('grid-rf-features', features)
features


Unnamed: 0,Feature,Importance
0,DayOfWeek,0.246533
7,Open,0.169213
15,CompetitionDistance,0.124757
8,Promo,0.090544
1,WeekDay,0.052159
17,CompetitionOpenSinceYear,0.052151
16,CompetitionOpenSinceMonth,0.048443
9,StateHoliday,0.03902
13,StoreType,0.029305
19,Promo2SinceWeek,0.022674


# Using Pipeline

## Loading just merged unclean data

In [53]:
merged_data = load_df_from_csv('../data/train.csv')
y_values = merged_data['Sales']
x_values = merged_data.drop(['Sales'], axis=1)

## Spliting Data Sets

In [54]:
# Splitting Data (60,20,20)
x_train, x_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.2, random_state=42)
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

## Creating Transformers for our numeric and categorical data

In [55]:
class CustomMaxImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()
        
    def fit(self, X, y=0):
        self.max_value = X.max()

        return self

    def transform(self, X, y=0):
        return np.where(X.isna(), self.max_value, X)


In [41]:
class CustomMostFrequentImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

    def fit(self, X, y=0):
        most_occuring = Counter(X.flat).most_common(1)
        self.mode_value = most_occuring[0][0]

        return self

    def transform(self, X, y=0):
        return np.where(X.isna(), self.mode_value, X)


In [56]:
numeric_transformer = Pipeline(steps=[
    ('custom_max', CustomMaxImputer()),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('custom_mode', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])


## Identifying our columns and passing it to a ColumnTransformer

In [57]:
# Scaling Sales column
merged_data['Sales'] = numeric_transformer.fit_transform(merged_data[["Sales"]])


In [58]:
numeric_features = merged_data.select_dtypes(include=['int64', 'float64','uint8','uint16','float32']).columns

categorical_features = merged_data.select_dtypes(include=['object']).columns

class make_present_col_selector_class:
    def __init__(self, selected_columns):
        self.selected_columns = selected_columns

    def __call__(self, df):
        return [col for col in df.columns if col in self.selected_columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, make_present_col_selector_class(numeric_features)), 
        ('categorical', categorical_transformer, make_present_col_selector_class(categorical_features))
    ])


## Creating our RandomForestClassifier Pipeline with our preprocessor

In [16]:
pipeline = Pipeline(steps=[
    # ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])


## Training Model using Pipeline

In [60]:
# Fit the pipeline with the data
mlflow.autolog(log_input_examples=True, disable_for_unsupported_versions=True, silent=True)
with mlflow.start_run() as run:
    best_model = pipeline.fit(x_train, y_train)

    train_score = best_model.score(x_train, y_train)
    valid_score = best_model.score(x_valid, y_valid)
    valid_metrics = calculate_metrics(y_valid, best_model.predict(x_valid))
    test_score = best_model.score(x_test, y_test)
    test_metrics = calculate_metrics(y_test, best_model.predict(x_test))

    mlflow.log_metric("Valid Score", valid_score)
    mlflow.log_metrics(valid_metrics)
    mlflow.log_metric("Test Score", test_score)
    mlflow.log_metrics(test_metrics)


## Parameter Tunning

In [18]:
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [{
    "regressor": [RandomForestRegressor()],
    "regressor__n_estimators": [10, 15],
    "regressor__max_depth":[5, 8, 15],
    "regressor__min_samples_leaf":[1, 2],
    "regressor__bootstrap": [True, False],
    "regressor__criterion": ['mse'],
    "regressor__max_leaf_nodes": [2, 5],
    "regressor__max_features": [2, 3],
    "regressor__warm_start": [True, False]
}]
     
# create a gridsearch of the pipeline, the fit the best model
grid_search_pipeline = GridSearchCV(
    pipeline, grid_param, cv=3, verbose=0, n_jobs=-1)  # Fit grid search


In [19]:
# Fit the grid search to the data
with mlflow.start_run() as run:
    best_model = grid_search_pipeline.fit(x_train, y_train)

    train_score = best_model.score(x_train, y_train)
    valid_score = best_model.score(x_valid, y_valid)
    valid_metrics = calculate_metrics(y_valid, best_model.predict(x_valid))
    test_score = best_model.score(x_test, y_test)
    test_metrics = calculate_metrics(y_test, best_model.predict(x_test))

    mlflow.log_metric("Valid Score", valid_score)
    mlflow.log_metrics(valid_metrics)
    mlflow.log_metric("Test Score", test_score)
    mlflow.log_metrics(test_metrics)

Modelling Utilities:INFO-> RMSE Score is: 74.64988%
Modelling Utilities:INFO-> R2 Square Score is: 44.55214%
Modelling Utilities:INFO-> MAE Score is: 54.91547%
Modelling Utilities:INFO-> RMSE Score is: 74.16835%
Modelling Utilities:INFO-> R2 Square Score is: 44.81235%
Modelling Utilities:INFO-> MAE Score is: 54.87279%


In [23]:
valid_metrics

{'RMSE Score': 0.7464987779476941,
 'R2_Squared': 0.445521364001473,
 'MAE Score': 0.5491547335224897}

In [22]:
test_metrics

{'RMSE Score': 0.7416834902581729,
 'R2_Squared': 0.44812345685733945,
 'MAE Score': 0.5487279166109762}

In [24]:
save_model(best_model, test_metrics['RMSE Score'])

In [20]:
best_model.best_params_

{'regressor': RandomForestRegressor(bootstrap=False, max_depth=15, max_features=3,
                       max_leaf_nodes=5, min_samples_leaf=2, n_estimators=10),
 'regressor__bootstrap': False,
 'regressor__criterion': 'mse',
 'regressor__max_depth': 15,
 'regressor__max_features': 3,
 'regressor__max_leaf_nodes': 5,
 'regressor__min_samples_leaf': 2,
 'regressor__n_estimators': 10,
 'regressor__warm_start': False}

## Saving Store Reference for prediction purpose

In [73]:
# Export Filled Store for model predicition use later
store = load_df_from_csv('../data/store.csv')
store_manipulatior = DataManipulator(store)
store_info = DataInfo(store)
store_cleaner = DataCleaner(store)

In [74]:
store_info.get_missing_description()


The total number of missing values is 2343
21.01 % missing values.


In [75]:
store_info.get_column_based_missing_percentage()

Unnamed: 0,total_missing_values,missing_percentage
Store,0,0.0 %
StoreType,0,0.0 %
Assortment,0,0.0 %
CompetitionDistance,3,0.27 %
CompetitionOpenSinceMonth,354,31.75 %
CompetitionOpenSinceYear,354,31.75 %
Promo2,0,0.0 %
Promo2SinceWeek,544,48.79 %
Promo2SinceYear,544,48.79 %
PromoInterval,544,48.79 %


In [76]:
# Fill missing numeric values
store_manipulatior.fill_columns_with_max(store_info.get_numeric_columns())
# Fill non-numeric values (categorical values)
store_manipulatior.fill_columns_with_most_frequent(store_info.get_object_columns())

In [77]:
store_info.get_missing_description()

The total number of missing values is 0
0.0 % missing values.


In [78]:
# Label Object Columns
store_manipulatior.label_columns(store_info.get_object_columns())

{'StoreType': LabelEncoder(),
 'Assortment': LabelEncoder(),
 'PromoInterval': LabelEncoder()}

In [82]:
store.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   uint16 
 1   StoreType                  1115 non-null   int32  
 2   Assortment                 1115 non-null   int32  
 3   CompetitionDistance        1115 non-null   float32
 4   CompetitionOpenSinceMonth  1115 non-null   float32
 5   CompetitionOpenSinceYear   1115 non-null   float32
 6   Promo2                     1115 non-null   uint8  
 7   Promo2SinceWeek            1115 non-null   float32
 8   Promo2SinceYear            1115 non-null   float32
 9   PromoInterval              1115 non-null   int32  
dtypes: float32(5), int32(3), uint16(1), uint8(1)
memory usage: 38.2 KB


In [80]:
# Save cleaned store in models for reference
store_cleaner.save_clean_data('../models/store_reference.csv')

## Saving Model Column Order Information For Prediciton Later

In [68]:
# Column Inputs Orders
results.add_data('model_input_columns',x_train.columns.to_list())

In [69]:
results.save_data('../models/column_reference.pkl')

# Prediciton Interval

In [15]:
# from sklearn.ensemble import GradientBoostingRegressor
# # Set lower and upper quantile
# LOWER_ALPHA = 0.1
# UPPER_ALPHA = 0.9
# # Each model has to be separate
# lower_model = GradientBoostingRegressor(loss="quantile",
#                                         alpha=LOWER_ALPHA)
# # The mid model will use the default loss
# mid_model = GradientBoostingRegressor(loss="ls")
# upper_model = GradientBoostingRegressor(loss="quantile",
#                                         alpha=UPPER_ALPHA)


In [17]:
# # Fit models
# lower_model.fit(x_train, y_train)
# mid_model.fit(x_train, y_train)
# upper_model.fit(x_train, y_train)

In [19]:
# # Record actual values on test set
# predictions = pd.DataFrame(best_model.predict(x_test),columns=['model'])
# # Predict
# predictions['lower'] = lower_model.predict(x_test)
# predictions['mid'] = mid_model.predict(x_test)
# predictions['upper'] = upper_model.predict(x_test)


In [29]:
# plt.figure(figsize=(20,10))
# a = predictions['lower']
# y = predictions.index
# plt.plot(y, a)
# plt.plot(predictions['model'], predicitons.index)
# plt.plot(predictions['upper'], predicitons.index)
