In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, Ridge

In [2]:
col_types = {'Store': np.int8,
             'Date':str,
             'Weekly_Sales':np.float64,
             'Holiday_Flag':np.float16,
             'Temperature':np.float16,
             'Fuel_Price':np.float16,
             'CPI':np.float16,
            'Unemployment':np.float16,}
data = pd.read_csv('./../data/raw/Walmart_Store_sales.csv', date_parser=True, usecols=col_types.keys(), dtype=col_types)#,  usecols=col_types.keys(), dtype=col_types)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         150 non-null    int8   
 1   Date          132 non-null    object 
 2   Weekly_Sales  136 non-null    float64
 3   Holiday_Flag  138 non-null    float16
 4   Temperature   132 non-null    float16
 5   Fuel_Price    136 non-null    float16
 6   CPI           138 non-null    float16
 7   Unemployment  135 non-null    float16
dtypes: float16(5), float64(1), int8(1), object(1)
memory usage: 4.1+ KB


In [4]:
data.describe()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,150.0,136.0,138.0,132.0,136.0,138.0,135.0
mean,9.866667,1249536.0,0.079712,61.40625,3.322266,180.0,7.601562
std,6.231191,647463.0,0.271973,18.375,0.478271,40.28125,1.577148
min,1.0,268929.0,0.0,18.796875,2.513672,126.125,5.144531
25%,4.0,605075.7,0.0,45.59375,2.851562,132.0,6.597656
50%,9.0,1261424.0,0.0,63.0,3.451172,198.0,7.46875
75%,15.75,1806386.0,0.0,76.3125,3.707031,214.875,8.148438
max,20.0,2771397.0,1.0,91.625,4.191406,227.0,14.3125


### Missing values

In [5]:
def missing_values(data):
    return pd.concat([data.isna().sum(), 
        np.round(data.isna().sum()*100/data.shape[0])], 
        axis=1).rename({0:'count_missing', 1:'%_missing'}, axis=1)

In [6]:
missing_values(data)

Unnamed: 0,count_missing,%_missing
Store,0,0.0
Date,18,12.0
Weekly_Sales,14,9.0
Holiday_Flag,12,8.0
Temperature,18,12.0
Fuel_Price,14,9.0
CPI,12,8.0
Unemployment,15,10.0


### Change Store type to str

In [7]:
data = data.astype({'Store':'str'})

### Split in train and test set

In [8]:
# Drop empty dates
#data = data.dropna(subset=['Weekly_Sales', 'Date'], axis=0)
data = data.dropna(subset=['Weekly_Sales'], axis=0)
X= data.drop('Weekly_Sales', axis=1)
y= data.loc[:,'Weekly_Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [9]:
print(f'Train data')
X_train.shape

Train data


(108, 7)

In [10]:
print(f'Train data')
X_test.shape

Train data


(28, 7)

### Create new features from the date column

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin

#Create a Transformer to use it in the pipeline
class DateAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, date_ix = 1):
        self.date_ix = date_ix
        self.new_features = []
    def fit(self, X, y=None):
        return self #Nothing else to do

    def transform(self, X):
        '''
        X is an array-like of shape (n_samples, 1).  1D array
        '''
        if isinstance(X, pd.DataFrame):
            dates = pd.to_datetime(X.iloc[:, self.date_ix])
        else:
            dates = pd.to_datetime(X)
        #dates = pd.to_datetime(X) # old code --> dates = pd.to_datetime(X[self.date_ix])
        year=dates.dt.year #Because of null values, year could be NA
        month=dates.dt.month#.astype('Int16')
        day=dates.dt.day#.astype('Int16')
        dayofweek=dates.dt.dayofweek#.astype('Int16')
        weekday=dates.dt.strftime("%A")
        week=dates.dt.isocalendar().week.astype(np.float64)
        self.new_features=['year', 'month', 'day', 'dayofweek', 'week']
        X_without_Date = X.drop('Date', axis=1)
        return np.c_[X_without_Date, year.values, month.values, day.values, dayofweek.values, week.values]

In [13]:
def explode_date(df:pd.DataFrame):
    data = df.copy()
    if data.dtypes['Date'] == 'object':
        data['Date'] = pd.to_datetime(data['Date'])
    data['year']=data['Date'].dt.year #Because of null values,
    data['month']=data['Date'].dt.month#.astype(np.float16)
    data['day']=data['Date'].dt.day#.astype('Int16')
    data['dayOfWeek']=data['Date'].dt.dayofweek#.astype('Int16')
    data['weekday']=data['Date'].dt.strftime("%A")
    data['week']=data['Date'].dt.isocalendar().week.astype(np.float16)
    return data

### Pre/processing

In [14]:
num_idx = [2, 3, 4, 5, 6 ]
cat_idx = [0] #Store
date_idx = [1] #not used.
num_features = ['Holiday_Flag','Temperature', 'Fuel_Price', 'CPI', 'Unemployment']#, 'year', 'month','day', 'dayOfWeek', 'week']
cat_features = ['Store']
date_feature = ['Date']


In [15]:
# Transformer for extra features from Date
date_transformer = Pipeline([
       # ('imputer_date', SimpleImputer(strategy = 'most_frequent')),
        ('attr_adder', DateAttributesAdder(date_ix=0)),
        ('imputer_date', SimpleImputer(strategy = 'most_frequent')),
        ('scaler', StandardScaler())
    ])

# transformer for numerical features
num_transformer = Pipeline([
        ('imputer_num', SimpleImputer(strategy = 'median')),
        ('scaler', StandardScaler())
    ])
# transformer for categorical features
cat_transformer = Pipeline([
        ('imputer_cat', SimpleImputer(strategy = 'most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
        ('date', date_transformer, date_feature), #explode the Date feature into many
        ('categoricals', cat_transformer, cat_features),
        ('numericals', num_transformer, num_features)
    ],
    remainder = 'drop'
)

In [16]:
#trans_X_train = preprocessor.transformers[0][1][1].get_feature_names(['Store', 'Holiday_Flag'])

### Pipeline Preprocessing + Linear Regression

In [17]:
full_pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('lin_reg', LinearRegression())
    ])

In [18]:
full_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('date',
                                                  Pipeline(steps=[('attr_adder',
                                                                   DateAttributesAdder(date_ix=0)),
                                                                  ('imputer_date',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Date']),
                                                 ('categoricals',
                                                  Pipeline(steps=[('imputer_cat',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                    

In [19]:
y_pred = full_pipeline.predict(X_train)

In [20]:
from sklearn.metrics import r2_score

In [21]:
print(f'R2 score train')
print(r2_score(y_train, y_pred))

R2 score train
0.9730436431571775


In [22]:
print(f'R2 score test')
y_pred_test = full_pipeline.predict(X_test)
print(r2_score(y_test, y_pred_test))

R2 score test
0.9203041286997493


In [23]:
print('Overfitting')

Overfitting


In [24]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor

### Linear Regression results

In [27]:
full_pipeline.steps[0][1].named_transformers_

{'date': Pipeline(steps=[('attr_adder', DateAttributesAdder(date_ix=0)),
                 ('imputer_date', SimpleImputer(strategy='most_frequent')),
                 ('scaler', StandardScaler())]),
 'categoricals': Pipeline(steps=[('imputer_cat', SimpleImputer(strategy='most_frequent')),
                 ('onehot', OneHotEncoder(handle_unknown='ignore'))]),
 'numericals': Pipeline(steps=[('imputer_num', SimpleImputer(strategy='median')),
                 ('scaler', StandardScaler())])}

In [28]:
full_pipeline.steps[0][1].named_transformers_['date']

Pipeline(steps=[('attr_adder', DateAttributesAdder(date_ix=0)),
                ('imputer_date', SimpleImputer(strategy='most_frequent')),
                ('scaler', StandardScaler())])

In [38]:
cat_encoder = full_pipeline.named_steps['preprocessing']

#### Coefficients

In [29]:
extra_atrib = full_pipeline.steps[0][1].named_transformers_['date'][0].new_features
cat_encoder = full_pipeline.steps[0][1].named_transformers_['categoricals'][1]
cat_attr = list(cat_encoder.categories_[0])

attributes = extra_atrib + cat_attr + num_features
attributes
print('(Attribute, coefficient)')
sorted(zip(attributes, full_pipeline.steps[1][1].coef_))

(Attribute, coefficient)


[('1', 191349.44822725153),
 ('10', 854691.8726457651),
 ('11', 15582.657957382222),
 ('12', 387817.33901325124),
 ('13', 824076.1728437055),
 ('14', 853687.4335406728),
 ('15', -414695.6321716911),
 ('16', -838749.1773393239),
 ('17', -288846.75614638906),
 ('18', 80349.52139502822),
 ('19', 339620.2935463835),
 ('2', 622989.7821567365),
 ('20', 597847.6894229373),
 ('3', -990309.7443166486),
 ('4', 951063.7128448143),
 ('5', -1107780.0121493563),
 ('6', 206329.54215408748),
 ('7', -764377.3721606745),
 ('8', -577491.3481664141),
 ('9', -943155.4232975498),
 ('CPI', 129107.62941202147),
 ('Fuel_Price', -7548.7940198528895),
 ('Holiday_Flag', -17645.734993297257),
 ('Temperature', -25277.005882919908),
 ('Unemployment', -95055.33138529448),
 ('day', -31123.827691664654),
 ('dayofweek', -15030.710846604537),
 ('month', -52882.014998182145),
 ('week', 60169.953119468264),
 ('year', -43446.442041053015)]

### Regularization

In [43]:
regressors = [
    Ridge(alpha=1.5),
    Lasso(alpha =100, max_iter=500, tol=0.005),
    ]
for regressor in regressors:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', regressor)])
    pipe.fit(X_train, y_train)   
    print(regressor)
    print('----------------------------')
    print("model score train: %.4f" % pipe.score(X_train, y_train))
    print("model score test: %.4f" % pipe.score(X_test, y_test))
    print('\n')


Ridge(alpha=1.5)
----------------------------
model score train: 0.9209
model score test: 0.8459


Lasso(alpha=100, max_iter=500, tol=0.005)
----------------------------
model score train: 0.9730
model score test: 0.9196




### Try other regressors

In [44]:
regressors = [
    SGDRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    ]
for regressor in regressors:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', regressor)])
    pipe.fit(X_train, y_train)   
    print(regressor)
    print("model score train: %.3f" % pipe.score(X_train, y_train))
    print("model score test: %.3f" % pipe.score(X_test, y_test))



SGDRegressor()
model score train: 0.963
model score test: 0.910
RandomForestRegressor()
model score train: 0.953
model score test: 0.697
AdaBoostRegressor()
model score train: 0.664
model score test: 0.457


### Feature importances with RandomForest

In [72]:
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', RandomForestRegressor(max_depth=8, min_samples_split=6))])

In [73]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('date',
                                                  Pipeline(steps=[('attr_adder',
                                                                   DateAttributesAdder(date_ix=0)),
                                                                  ('imputer_date',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Date']),
                                                 ('categoricals',
                                                  Pipeline(steps=[('imputer_cat',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                     

In [74]:
y_pred = pipe.predict(X_train)

In [75]:
print(f'R2 score train')
print(r2_score(y_train, y_pred))

R2 score train
0.9095736381655434


In [76]:
print(f'R2 score test')
y_pred_test = pipe.predict(X_test)
print(r2_score(y_test, y_pred_test))

R2 score test
0.6412295883052653


In [77]:
sorted(zip(pipe.named_steps['regressor'].feature_importances_,attributes), reverse=True)

[(0.16623501756663167, '3'),
 (0.11394060595711955, 'CPI'),
 (0.10446279733891618, 'Fuel_Price'),
 (0.09057083239731245, '5'),
 (0.08735210120840266, '14'),
 (0.05999207980916713, 'Unemployment'),
 (0.05956160106191121, 'Temperature'),
 (0.032425425594753754, '16'),
 (0.03218470427036486, '7'),
 (0.028146016261330602, '13'),
 (0.025080386699702174, '15'),
 (0.020014165115044096, '8'),
 (0.017630996592048774, '17'),
 (0.017260387450303456, '9'),
 (0.01693752906508953, '2'),
 (0.016454504388181336, 'week'),
 (0.015396961438932964, 'month'),
 (0.014044919097871792, '4'),
 (0.012338109930828432, '1'),
 (0.012265944344787906, 'dayofweek'),
 (0.010836000836434037, '20'),
 (0.00957516989346944, '19'),
 (0.009390794217494197, 'day'),
 (0.008116432880060667, '10'),
 (0.006672980359420806, '6'),
 (0.003939796123277337, 'year'),
 (0.0038668486445038747, '12'),
 (0.002921635787942191, '11'),
 (0.001224060324798784, '18'),
 (0.0011611953438982535, 'Holiday_Flag')]

> To improve the model (either linear regression or RandomForest), try the options: <br>
    - Get more data, with only 150, it's difficult to have a good model <br>
    - Drop getting rid of uninformative features like the month, the Holiday_Flag, Stores with few data