In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, Ridge

In [2]:
col_types = {'Store': str,
             'Date':str,
             'Weekly_Sales':np.float64,
             'Holiday_Flag': np.float64,
             'Temperature':np.float16,
             'Fuel_Price':np.float16,
             'CPI':np.float16,
            'Unemployment':np.float16,}
data = pd.read_csv('./../data/raw/Walmart_Store_sales.csv', date_parser=True)#,  usecols=col_types.keys(), dtype=col_types)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         150 non-null    float64
 1   Date          132 non-null    object 
 2   Weekly_Sales  136 non-null    float64
 3   Holiday_Flag  138 non-null    float64
 4   Temperature   132 non-null    float64
 5   Fuel_Price    136 non-null    float64
 6   CPI           138 non-null    float64
 7   Unemployment  135 non-null    float64
dtypes: float64(7), object(1)
memory usage: 9.5+ KB


In [4]:
data.describe()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,150.0,136.0,138.0,132.0,136.0,138.0,135.0
mean,9.866667,1249536.0,0.07971,61.398106,3.320853,179.898509,7.59843
std,6.231191,647463.0,0.271831,18.378901,0.478149,40.274956,1.577173
min,1.0,268929.0,0.0,18.79,2.514,126.111903,5.143
25%,4.0,605075.7,0.0,45.5875,2.85225,131.970831,6.5975
50%,9.0,1261424.0,0.0,62.985,3.451,197.908893,7.47
75%,15.75,1806386.0,0.0,76.345,3.70625,214.934616,8.15
max,20.0,2771397.0,1.0,91.65,4.193,226.968844,14.313


### Missing values

In [5]:
def missing_values(data):
    return pd.concat([data.isna().sum(), 
        np.round(data.isna().sum()*100/data.shape[0])], 
        axis=1).rename({0:'count_missing', 1:'%_missing'}, axis=1)

In [6]:
missing_values(data)

Unnamed: 0,count_missing,%_missing
Store,0,0.0
Date,18,12.0
Weekly_Sales,14,9.0
Holiday_Flag,12,8.0
Temperature,18,12.0
Fuel_Price,14,9.0
CPI,12,8.0
Unemployment,15,10.0


### Split in train and test set

In [7]:
# Drop empty dates
data = data.dropna(subset=['Weekly_Sales', 'Date'], axis=0)
X= data.drop('Weekly_Sales', axis=1)
y= data.loc[:,'Weekly_Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

### Create new features from the date colum

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin


class DateAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, date_ix = 1):
        self.date_ix = date_ix
        self.new_features = []
    def fit(self, X, y=None):
        return self #Nothing else to do

    def transform(self, X):
        '''
        X is an array-like of shape (n_samples, 1).  1D array
        '''
        if isinstance(X, pd.DataFrame):
            dates = pd.to_datetime(X.iloc[:, 0])
        else:
            dates = pd.to_datetime(X)
        #dates = pd.to_datetime(X) # old code --> dates = pd.to_datetime(X[self.date_ix])
        year=dates.dt.year #Because of null values, year could be 
        month=dates.dt.month.astype(np.int8)
        day=dates.dt.day.astype(np.int8)
        dayofweek=dates.dt.dayofweek.astype(np.int8)
        weekday=dates.dt.strftime("%A")
        week=dates.dt.isocalendar().week.astype(np.int8)
        self.new_features=['year', 'month', 'day', 'dayofweek', 'week']
        return np.c_[year.values, month.values, day.values, dayofweek.values, week.values]



In [9]:
attr_adder = DateAttributesAdder(date_ix = 0)
attr_adder.transform(pd.DataFrame(X_train['Date']))


array([[2011,    4,   22,    4,   16],
       [2012,    1,   20,    4,    3],
       [2011,    7,   29,    4,   30],
       [2010,    9,   24,    4,   38],
       [2010,   12,    2,    3,   48],
       [2012,    3,    2,    4,    9],
       [2010,    2,   19,    4,    7],
       [2010,    7,   30,    4,   30],
       [2011,    8,   26,    4,   34],
       [2012,    4,   27,    4,   17],
       [2010,    6,   25,    4,   25],
       [2010,    9,    7,    1,   36],
       [2011,    5,   27,    4,   21],
       [2011,   12,   16,    4,   50],
       [2011,    6,    5,    6,   22],
       [2010,    6,   18,    4,   24],
       [2012,   10,    2,    1,   40],
       [2010,   12,    2,    3,   48],
       [2011,    3,   25,    4,   12],
       [2011,   12,    8,    3,   49],
       [2010,    6,   18,    4,   24],
       [2010,    6,   25,    4,   25],
       [2010,    8,   27,    4,   34],
       [2011,    5,   13,    4,   19],
       [2012,   10,   19,    4,   42],
       [2010,    2,    4,

In [10]:
def explode_date(df:pd.DataFrame):
    data = df.copy()
    if data.dtypes['Date'] == 'object':
        data['Date'] = pd.to_datetime(data['Date'])
    data['year']=data['Date'].dt.year #Because of null values, year could be 
    data['month']=data['Date'].dt.month.astype(np.int8)
    data['day']=data['Date'].dt.day.astype(np.int8)
    data['dayOfWeek']=data['Date'].dt.dayofweek.astype(np.int8)
    data['weekday']=data['Date'].dt.strftime("%A")
    data['week']=data['Date'].dt.isocalendar().week.astype(np.int8)
    return data

In [11]:
#X_train = explode_date(X_train)
#X_train.head()

### Transform holiday flat to int

In [12]:
def float_to_int(x):
    if not np.isnan(x):
        return int(x)
    return np.NaN

### Pre/processing

In [13]:
num_idx = [3, 4, 5, 6 ] #[3, 4, 5, 6, 7, 8, 9, 10, 12]
cat_idx = [0, 2]
date_idx = [1]
num_features = X_train.columns[num_idx] #['Temperature', 'Fuel_Price', 'CPI', Unemployment', 'year', 'month','day', 'dayOfWeek', 'week']
cat_features = X_train.columns[cat_idx] #['Store', 'Holiday_Flag']
date_feature = X_train.columns[date_idx]


In [20]:
# Transformer for extra features from Date
date_transformer = Pipeline([
        #('imputer_date', SimpleImputer(strategy = 'most_frequent')),
        ('attr_adder', DateAttributesAdder(date_ix=0)),
        ('scaler', StandardScaler())
    ])

# transformer for numerical features
num_transformer = Pipeline([
        ('imputer_num', SimpleImputer(strategy = 'median')),
        ('scaler', StandardScaler())
    ])
# transformer for categorical features
cat_transformer = Pipeline([
        ('imputer_cat', SimpleImputer(strategy = 'most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
        ('date', date_transformer, date_feature),
        ('categoricals', cat_transformer, cat_features),
        ('numericals', num_transformer, num_features)
    ],
    remainder = 'drop'
)

In [21]:
#trans_X_train = preprocessor.transformers[0][1][1].get_feature_names(['Store', 'Holiday_Flag'])

In [22]:
full_pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('lin_reg', LinearRegression())
    ])

In [23]:
full_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('date',
                                                  Pipeline(steps=[('attr_adder',
                                                                   DateAttributesAdder(date_ix=0)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['Date'], dtype='object')),
                                                 ('categoricals',
                                                  Pipeline(steps=[('imputer_cat',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                  

In [24]:
y_pred = full_pipeline.predict(X_train)

In [25]:
from sklearn.metrics import r2_score

In [26]:
print(r2_score(y_train, y_pred))

0.9732536818236963


In [28]:
#X_test = explode_date(X_test)
#X_test_tr = preprocessor.transform(X_test)
y_pred_test = full_pipeline.predict(X_test)
print(r2_score(y_test, y_pred_test))

0.9113268966033705


In [None]:
 from sklearn.linear_model import Ridge, Lasso
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.ensemble import AdaBoostClassifier

In [None]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoost(),
    GradientBoostingClassifier()
    ]
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])
    pipe.fit(X_train, y_train)   
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))