In [130]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import KFold

import mlflow

In [118]:
import warnings

# Silence library warnings for the remainder of the notebook.
warnings.filterwarnings('ignore')

# Show every column when a DataFrame is displayed. The fully-qualified option
# name is required: the short pattern 'max_column' also matches
# 'styler.render.max_columns' on pandas >= 1.4 and raises OptionError.
pd.set_option('display.max_columns', None)
# Load the cleaned training data; the listed markers are parsed as missing values.
df_train = pd.read_csv(
    '../data/train_clean.csv',
    na_values=['?', None, 'undefined'],
)
# Preview the first five rows.
df_train.head(n=5)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Holiday,Day,Month,Year
0,1,5,2015-07-31,5263,555,1,1,0,1,1,31,7,2015
1,2,5,2015-07-31,6064,625,1,1,0,1,1,31,7,2015
2,3,5,2015-07-31,8314,821,1,1,0,1,1,31,7,2015
3,4,5,2015-07-31,13995,1498,1,1,0,1,1,31,7,2015
4,5,5,2015-07-31,4822,559,1,1,0,1,1,31,7,2015


In [119]:
# Load the cleaned test data with the same missing-value markers as the training set.
df_test = pd.read_csv(
    '../data/test_clean.csv',
    na_values=['?', None, 'undefined'],
)
# Preview the first five rows.
df_test.head(n=5)

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,Holiday,Day,Month,Year
0,1,1,4,2015-09-17,1.0,1,0,0,0,17,9,2015
1,2,3,4,2015-09-17,1.0,1,0,0,0,17,9,2015
2,3,7,4,2015-09-17,1.0,1,0,0,0,17,9,2015
3,4,8,4,2015-09-17,1.0,1,0,0,0,17,9,2015
4,5,9,4,2015-09-17,1.0,1,0,0,0,17,9,2015


In [120]:
# Load the per-store table with the same missing-value markers as the other files.
df_store = pd.read_csv(
    '../data/store.csv',
    na_values=['?', None, 'undefined'],
)
# Preview the first five rows.
df_store.head(n=5)

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


### Add more features to the dataset

In [121]:
def weekends(x):
    """Return 1 when the day-of-week number ``x`` is 6 or greater, otherwise 0."""
    # The comparison yields a boolean; int() turns it into the 0/1 flag.
    return int(x >= 6)

def time_of_month(x):
    """Bucket a day-of-month number into thirds of the month.

    Returns 0 for ``x <= 10``, 1 for ``x <= 20`` and 2 for everything else
    (including values that compare false against both limits, e.g. NaN).
    """
    # Walk the upper limits in ascending order; the first one that fits wins.
    for bucket, upper_limit in enumerate((10, 20)):
        if x <= upper_limit:
            return bucket
    return 2

def label_holidays(x):
    """Encode a ``StateHoliday`` value as an integer label.

    String codes keep their existing labels: ``'0'`` -> 1, ``'a'`` -> 2,
    ``'b'`` -> 3, ``'c'`` -> 4. Any other value maps to 5.

    A numeric zero is treated as the string ``'0'`` and also maps to 1.
    Previously integer 0 mapped to 0 while ``'0'`` mapped to 1, so the same
    "no holiday" category received two different labels whenever the CSV
    column was parsed with mixed or differing dtypes.
    """
    codes = {'0': 1, 'a': 2, 'b': 3, 'c': 4}
    if isinstance(x, str):
        return codes.get(x, 5)
    # Non-string input: only a numeric zero is a known category.
    if x == 0:
        return codes['0']
    return 5

# Derive the same engineered columns on both the training and the test frame.
for frame in (df_train, df_test):
    frame['Weekend'] = frame['DayOfWeek'].apply(weekends)
    frame['TimeOfMonth'] = frame['Day'].apply(time_of_month)
    # Overwrites the existing 'Holiday' column with the encoded StateHoliday label.
    frame['Holiday'] = frame['StateHoliday'].apply(label_holidays)

# NOTE(review): only the training frame's 'Date' is converted to datetime here;
# df_test['Date'] is left as read from the CSV.
df_train['Date'] = pd.DatetimeIndex(df_train['Date'])


In [122]:
# Preview the training frame with the newly added Weekend / TimeOfMonth columns.
df_train.head(n=5)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Holiday,Day,Month,Year,Weekend,TimeOfMonth
0,1,5,2015-07-31,5263,555,1,1,0,1,1,31,7,2015,0,2
1,2,5,2015-07-31,6064,625,1,1,0,1,1,31,7,2015,0,2
2,3,5,2015-07-31,8314,821,1,1,0,1,1,31,7,2015,0,2
3,4,5,2015-07-31,13995,1498,1,1,0,1,1,31,7,2015,0,2
4,5,5,2015-07-31,4822,559,1,1,0,1,1,31,7,2015,0,2


In [123]:
# TODO: unfinished feature idea (presumably days to the nearest upcoming / past
# holiday); the stub below was duplicated verbatim and is kept once.
# def days_to_holiday(x, holidays):
#     upcoming_holidays = holidays[holidays >= x]
#     passed_holidays = holidays[holidays <= x]

In [124]:
# holidays = pd.Series(list(set(df_train[df_train['StateHoliday'] != '0']['Date'])))

In [125]:
# Columns fed to the models as predictors.
feature_columns = [
    'Store', 'DayOfWeek', 'Open', 'Promo', 'Holiday', 'SchoolHoliday',
    'Day', 'Month', 'Year', 'Weekend', 'TimeOfMonth',
]
x = df_train[feature_columns]
# Double brackets keep the target as a one-column DataFrame.
y = df_train[['Sales']]

In [126]:
# Fit one min-max scaler for the predictors and one for the target, then
# transform each with its own scaler.
# NOTE(review): both scalers are fitted on the full training frame before the
# K-fold splits further down, so every fold's scaling uses held-out rows too.
x_scaler = MinMaxScaler().fit(x)
y_scaler = MinMaxScaler().fit(y)
X = x_scaler.transform(x)
Y = y_scaler.transform(y)

### Linear Regression

In [127]:

# 5-fold cross-validation of a linear regression on the scaled data.
# Each fold is tracked as its own MLflow run.
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(X):
    # The context manager ends the run when the block exits, so no explicit
    # mlflow.end_run() is needed; a second end_run() inside the block is
    # redundant and may close an enclosing run on some MLflow versions.
    with mlflow.start_run(run_name='linear_kfold', nested=True):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        reg = LinearRegression().fit(X_train, y_train)
        # score() is the R^2 of the fitted model on the held-out fold.
        score = reg.score(X_test, y_test)
        print("score:", score)

        # The fold score is a result, not a hyperparameter: log it as a metric.
        mlflow.log_metric("lin_reg_score", score)

        # Store the fitted model as a run artifact.
        mlflow.sklearn.log_model(reg, "lin_regression_models")

score: 0.5531988448600285
score: 0.524182489118358
score: 0.5607267418285327
score: 0.510265882034151
score: 0.5293602405887228


### Random Forest Regressor

In [132]:
# 5-fold cross-validation of a random forest regressor on the scaled data.
# Each fold is tracked as its own MLflow run.
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(X):
    # The context manager ends the run when the block exits, so no explicit
    # mlflow.end_run() is needed; a second end_run() inside the block is
    # redundant and may close an enclosing run on some MLflow versions.
    with mlflow.start_run(run_name='random_forest_reg_kfold', nested=True):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        # Y is a single-column 2-D array; the forest expects a 1-D target and
        # otherwise emits a DataConversionWarning (hidden by the global filter).
        reg = RandomForestRegressor(
            n_estimators=100, max_depth=5, random_state=0
        ).fit(X_train, y_train.ravel())
        # score() is the R^2 of the fitted model on the held-out fold.
        score = reg.score(X_test, y_test)
        print("score:", score)

        # The fold score is a result, not a hyperparameter: log it as a metric.
        mlflow.log_metric("rand_forest_reg_score", score)

        # Store the fitted model as a run artifact.
        mlflow.sklearn.log_model(reg, "random_forest_regression_models")

score: 0.5720855145927195
score: 0.5661129682887178
score: 0.5871392185035886
score: 0.5521004806810201
score: 0.5535721443610709
