In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import KFold

from sklearn.pipeline import Pipeline

import mlflow

import logging

import sys
  
# setting path
sys.path.append('../scripts')
from preprocessing_functions import weekends, time_of_month, label_holidays

### Load Dataset

In [3]:
logging.info('Load Training Set')

In [4]:
import warnings
warnings.filterwarnings('ignore')
pd.set_option('max_column', None)
df_train = pd.read_csv('../data/train_clean.csv', na_values=['?', None, 'undefined'])
df_train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Holiday,Day,Month,Year
0,1,5,2015-07-31,5263,555,1,1,0,1,1,31,7,2015
1,2,5,2015-07-31,6064,625,1,1,0,1,1,31,7,2015
2,3,5,2015-07-31,8314,821,1,1,0,1,1,31,7,2015
3,4,5,2015-07-31,13995,1498,1,1,0,1,1,31,7,2015
4,5,5,2015-07-31,4822,559,1,1,0,1,1,31,7,2015


In [5]:
logging.info('Load Test Set')

In [6]:
df_test = pd.read_csv('../data/test_clean.csv', na_values=['?', None, 'undefined'])
df_test.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,Holiday,Day,Month,Year
0,1,1,4,2015-09-17,1.0,1,0,0,0,17,9,2015
1,2,3,4,2015-09-17,1.0,1,0,0,0,17,9,2015
2,3,7,4,2015-09-17,1.0,1,0,0,0,17,9,2015
3,4,8,4,2015-09-17,1.0,1,0,0,0,17,9,2015
4,5,9,4,2015-09-17,1.0,1,0,0,0,17,9,2015


In [7]:
logging.info('Load Store Info')

In [8]:
df_store = pd.read_csv('../data/store.csv', na_values=['?', None, 'undefined'])
df_store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


### Add some more features to dataset

In [9]:
df_train['Weekend'] = df_train['DayOfWeek'].apply(weekends)
df_test['Weekend'] = df_test['DayOfWeek'].apply(weekends)

df_train['TimeOfMonth'] = df_train['Day'].apply(time_of_month)
df_test['TimeOfMonth'] = df_test['Day'].apply(time_of_month)

df_train['Holiday'] = df_train['StateHoliday'].apply(label_holidays)
df_test['Holiday'] = df_test['StateHoliday'].apply(label_holidays)

df_train['Date'] = pd.DatetimeIndex(df_train['Date'])


In [10]:
df_train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Holiday,Day,Month,Year,Weekend,TimeOfMonth
0,1,5,2015-07-31,5263,555,1,1,0,1,1,31,7,2015,0,2
1,2,5,2015-07-31,6064,625,1,1,0,1,1,31,7,2015,0,2
2,3,5,2015-07-31,8314,821,1,1,0,1,1,31,7,2015,0,2
3,4,5,2015-07-31,13995,1498,1,1,0,1,1,31,7,2015,0,2
4,5,5,2015-07-31,4822,559,1,1,0,1,1,31,7,2015,0,2


In [28]:
# def days_to_holiday(x, holidays):
#     upcoming_holidays = holidays[holidays >= x]
#     passed_holidays = holidays[holidays <= x]
    

# def days_to_holiday(x, holidays):
#     upcoming_holidays = holidays[holidays >= x]
#     passed_holidays = holidays[holidays <= x]

In [29]:
# holidays = pd.Series(list(set(df_train[df_train['StateHoliday'] != '0']['Date'])))

In [30]:
x = df_train[['Store', 'DayOfWeek', 'Open', 'Promo', 'Holiday', 'SchoolHoliday', 'Day', 'Month', 'Year', 'Weekend' ,'TimeOfMonth']].values
y = df_train[['Sales']].values

### Linear Regression Pipeline with 5 fold cross validation

In [31]:

kf = KFold(n_splits=5)
for train_index, test_index in kf.split(x):
    with mlflow.start_run(run_name='linear_kfold', nested = True):
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        reg_pipe = Pipeline([('scaler', StandardScaler()), ('model', LinearRegression())])
        reg_pipe.fit(X_train, y_train)
        score = reg_pipe.score(X_test, y_test)
        print("score:", score)
        #log model params
        mlflow.log_param("lin_reg_score", score)

        # log model
        mlflow.sklearn.log_model(reg_pipe, "lin_regression_models")
        mlflow.end_run()

score: 0.5531988448600359
score: 0.5241824891184299
score: 0.5607267418284945
score: 0.5102658820341401
score: 0.5293602405887978


### Random Forest Regressor Pipeline with 5 fold cross validation

In [15]:
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(x):
    with mlflow.start_run(run_name='random_forest_reg_kfold', nested = True):
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        forest_pipe = Pipeline([('scaler', StandardScaler()), ('model', RandomForestRegressor(n_estimators = 10, max_depth=3))])
        
        forest_pipe.fit(X_train, y_train)
        score = forest_pipe.score(X_test, y_test)
        print("score:", score)
        #log model params
        mlflow.log_param("rand_forest_reg_score", score)

        # log model
        mlflow.sklearn.log_model(forest_pipe, "random_forest_regression_models")
        mlflow.end_run()

score: 0.5627426983988746
score: 0.5441932299555508
score: 0.5811780631077926
score: 0.5278162162487741
score: 0.5398298722348633
