# BigStore Problem

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer
#import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
%matplotlib inline

## Read Data

In [2]:
# read the train data
train_data = pd.read_csv('https://goz39a.s3.eu-central-1.amazonaws.com/bigstore_train.csv')

# check for the null values
train_data.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [3]:
train_x = train_data.drop(columns=['Item_Outlet_Sales'])
train_y = train_data['Item_Outlet_Sales']

## Create a custom Encoder for binary columns
We will add three binary variable to the dataset (1 or 0 as value)
- outlet_grocery_store
- outlet_supermarket_3
- outlet_supermarket_3

Note that this class enherits from the BaseEstimator class since we need the functions `fit` and `transform`

In [6]:
# import the BaseEstimator
from sklearn.base import BaseEstimator

class OutletTypeEncoder(BaseEstimator):

    def __init__(self):
        pass

    def fit(self, documents, y=None):
        return self

    def transform(self, x_dataset):
        x_dataset['outlet_grocery_store'] = (x_dataset['Outlet_Type'] == 'Grocery Store')*1
        x_dataset['outlet_supermarket_3'] = (x_dataset['Outlet_Type'] == 'Supermarket Type3')*1
        x_dataset['outlet_supermarket_3'] = (x_dataset['Outlet_Identifier'] == 'OUT027')*1
        
        return x_dataset

## Construct Column transformer with the following tasks
- drop unneeded columns
- fill the missing values using an Imputer
- scale the numerical values in the feature matrix


In [4]:
pre_process = ColumnTransformer(remainder='passthrough',
                                transformers=[('drop_columns', 'drop', ['Item_Identifier',
                                                                        'Outlet_Identifier',
                                                                        'Item_Fat_Content',
                                                                        'Item_Type',
                                                                        'Outlet_Identifier',
                                                                        'Outlet_Size',
                                                                        'Outlet_Location_Type',
                                                                        'Outlet_Type'
                                                                       ]),
                                              ('impute_item_weight', SimpleImputer(strategy='mean'), ['Item_Weight']),
                                              ('scale_data', StandardScaler(),['Item_MRP'])])

## Assemble the Pipeline
Add the forecasting model at the end of the pipeline (RandomForest) with depth 10

In [7]:
model_pipeline = Pipeline(steps=[('get_outlet_binary_columns', OutletTypeEncoder()), 
                                 ('pre_processing',pre_process),
                                 ('random_forest', RandomForestRegressor(max_depth=10,random_state=0))
                                 ])

## Testing a Suite of Models

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

model_1 = RandomForestRegressor(max_depth=10,random_state=0)
model_2 = LinearRegression(fit_intercept=True)
model_3 = Ridge(alpha=5)
model_4 = Lasso(alpha=10)
MSE = []
for mymodels in [model_1,model_2,model_3,model_4]:
    model_pipeline = Pipeline(steps=[('get_outlet_binary_columns', OutletTypeEncoder()), 
                                 ('pre_processing',pre_process),
                                 ('model', mymodels)
                                 ])
    model_pipeline.fit(train_x,train_y)
    MSE.append(mean_squared_error(train_y,model_pipeline.predict(train_x))**0.5)
    
print(np.round(MSE,2))   

[ 930.17 1133.15 1133.16 1134.06]


In [9]:
model_pipeline.get_params()

{'memory': None,
 'steps': [('get_outlet_binary_columns', OutletTypeEncoder()),
  ('pre_processing',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('drop_columns', 'drop',
                                    ['Item_Identifier', 'Outlet_Identifier',
                                     'Item_Fat_Content', 'Item_Type',
                                     'Outlet_Identifier', 'Outlet_Size',
                                     'Outlet_Location_Type', 'Outlet_Type']),
                                   ('impute_item_weight', SimpleImputer(),
                                    ['Item_Weight']),
                                   ('scale_data', StandardScaler(), ['Item_MRP'])])),
  ('model', Lasso(alpha=10))],
 'verbose': False,
 'get_outlet_binary_columns': OutletTypeEncoder(),
 'pre_processing': ColumnTransformer(remainder='passthrough',
                   transformers=[('drop_columns', 'drop',
                                  ['Item_Identifier', 'Outl

In [10]:
transformer = model_pipeline.get_params()['steps'][1][1]

In [11]:
my_scaler = transformer.get_params()['transformers'][2][1]

In [12]:
my_scaler.get_params()

{'copy': True, 'with_mean': True, 'with_std': True}