In [1]:
# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import statsmodels.api as sm

import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

# special matplotlib argument for improved plots
from matplotlib import rcParams

In [2]:
def load_train_test(train_file, test_file):
    """
    Loads in training and test data set
    
    Args:
    Training dataset filename
    Test dataset filename
    
    Returns:
    Training dataframe
    Test dataframe
    """
    
    train_df = pd.read_csv(train_file, parse_dates='Date')
    test_df = pd.read_csv(test_file, parse_dates='Date')
    
    return (train_df, test_df)

In [3]:
# load train and test data
rossman_train_df, rossman_test_df = load_train_test('./data/train.csv', './data/test.csv')

  data = self._reader.read(nrows)


## Exploratory Data Analysis

In [5]:
rossman_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1017209 entries, 0 to 1017208
Data columns (total 9 columns):
Store            1017209 non-null int64
DayOfWeek        1017209 non-null int64
Date             1017209 non-null object
Sales            1017209 non-null int64
Customers        1017209 non-null int64
Open             1017209 non-null int64
Promo            1017209 non-null int64
StateHoliday     1017209 non-null object
SchoolHoliday    1017209 non-null int64
dtypes: int64(7), object(2)
memory usage: 77.6+ MB


In [6]:
rossman_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41088 entries, 0 to 41087
Data columns (total 8 columns):
Id               41088 non-null int64
Store            41088 non-null int64
DayOfWeek        41088 non-null int64
Date             41088 non-null object
Open             41077 non-null float64
Promo            41088 non-null int64
StateHoliday     41088 non-null object
SchoolHoliday    41088 non-null int64
dtypes: float64(1), int64(5), object(2)
memory usage: 2.8+ MB


In [8]:
# take a look at the data
rossman_train_df.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [10]:
# Number of unique stores in training set
print 'Number of unique stores  %d ' %(len(rossman_train_df.Store.unique()))

Number of unique stores  1115 


In [11]:
# take a look at test data
rossman_test_df.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1,1,0,0
1,2,3,4,2015-09-17,1,1,0,0
2,3,7,4,2015-09-17,1,1,0,0
3,4,8,4,2015-09-17,1,1,0,0
4,5,9,4,2015-09-17,1,1,0,0


In [13]:
# Number of unique stores in test set
print 'Number of unique stores in test set %d ' %(len(rossman_test_df.Store.unique()))

Number of unique stores in test set 856 


In [15]:
# Describe 5 number summary for training and test dataset
rossman_train_df.describe()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,SchoolHoliday
count,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0,1017209.0
mean,558.429727,3.998341,5773.818972,633.145946,0.830107,0.381515,0.178647
std,321.908651,1.997391,3849.926175,464.411734,0.375539,0.485759,0.383056
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,280.0,2.0,3727.0,405.0,1.0,0.0,0.0
50%,558.0,4.0,5744.0,609.0,1.0,0.0,0.0
75%,838.0,6.0,7856.0,837.0,1.0,1.0,0.0
max,1115.0,7.0,41551.0,7388.0,1.0,1.0,1.0


In [16]:
rossman_test_df.describe()

Unnamed: 0,Id,Store,DayOfWeek,Open,Promo,SchoolHoliday
count,41088.0,41088.0,41088.0,41077.0,41088.0,41088.0
mean,20544.5,555.899533,3.979167,0.854322,0.395833,0.443487
std,11861.228267,320.274496,2.015481,0.352787,0.489035,0.496802
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,10272.75,279.75,2.0,1.0,0.0,0.0
50%,20544.5,553.5,4.0,1.0,0.0,0.0
75%,30816.25,832.25,6.0,1.0,1.0,1.0
max,41088.0,1115.0,7.0,1.0,1.0,1.0


In [17]:
# Various features
rossman_train_df.columns

Index([u'Store', u'DayOfWeek', u'Date', u'Sales', u'Customers', u'Open',
       u'Promo', u'StateHoliday', u'SchoolHoliday'],
      dtype='object')

In [41]:
# lets look into correlations
rossman_train_df.corr()

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,SchoolHoliday
Store,1.0,-8e-06,0.005126,0.024325,-4.7e-05,5.8e-05,0.000641
DayOfWeek,-8e-06,1.0,-0.462125,-0.386445,-0.528963,-0.392925,-0.205388
Sales,0.005126,-0.462125,1.0,0.894711,0.678472,0.452345,0.085124
Customers,0.024325,-0.386445,0.894711,1.0,0.616768,0.316169,0.071568
Open,-4.7e-05,-0.528963,0.678472,0.616768,1.0,0.295042,0.086171
Promo,5.8e-05,-0.392925,0.452345,0.316169,0.295042,1.0,0.067483
SchoolHoliday,0.000641,-0.205388,0.085124,0.071568,0.086171,0.067483,1.0


In [113]:
# features to consider
features_to_consider = ['Store', 'DayOfWeek', 'Open', 'Promo']

## Modelling

In [114]:
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split

In [115]:
train_data = rossman_train_df[features_to_consider]
train_target = rossman_train_df.Sales

In [116]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_target, test_size=0.3)

In [117]:
print 'Shape of training set %d and test set %d ' %(X_train.shape[0], X_test.shape[0])

Shape of training set 712046 and test set 305163 


## Linear Regression

In [118]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [119]:
# prediction on the training examples
train_est = lr.predict(X_train)

In [128]:
train_est = [pred if pred > 0 else 0.0 for pred in train_est]

In [148]:
# performance on the training set
print 'RMSPE on the trainig set %f ' %(rmspe(y_train.values, train_est))

RMSPE on the trainig set 0.503594 


In [149]:
# prediction on the test examples
test_est = lr.predict(X_test)

In [151]:
# performance on the test set
print 'RMSPE on the test set %f ' %(rmspe(y_test.values, test_est))

RMSPE on the test set 0.472949 


## Random Forest Regressor

In [152]:
from sklearn.ensemble import RandomForestRegressor

In [153]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [154]:
# prediction on the training examples
train_est_rf = rf.predict(X_train)

In [155]:
# performance on the training set
print 'RMSPE on the training set %f ' %(rmspe(y_train.values, train_est_rf))

RMSPE on the training set 0.225401 


In [156]:
# predictions on the test examples
test_est_rf = rf.predict(X_test)

In [158]:
# performance on the test set
print 'RMSPE on the test set %f ' %(rmspe(y_test.values, test_est_rf))

RMSPE on the test set 0.166980 


## Final Model preparation

In [81]:
# train linear regression on full training examples
lr.fit(train_data, train_target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [159]:
# train random forest regressor on all training examples
rf.fit(train_data, train_target)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [160]:
test_df = rossman_test_df[features_to_consider]

In [161]:
# fill missing values in Open with 1
test_df.fillna(1, inplace=True)

In [89]:
linear_regression_preds = lr.predict(test_df)

In [163]:
rf_regression_preds = rf.predict(test_df)

## Loss Metric

In [146]:
from math import sqrt

def rmspe(y_true, y_pred):
    """
    Root Mean Square Percentage Error
    
    Args:
    y_true: true values for y
    y_pred: estimated values for y
    
    Returns: rmspe
    """
    
    n = len(y_true)
    e = []
    for i in range(n):
        if y_true[i] != 0:
            e.append((y_true[i] - y_pred[i]) / np.float(y_true[i]))
        else:
            e.append(0.0)
    
    e = np.array(e)
    e_squared = e ** 2
    return sqrt(np.sum(e_squared) / n)


## Create Submissions

In [14]:
def create_submission(ids, preds, filename):
    """
    Takes in ids and prediction for sales and submission filename
    
    Args:
    ids: Ids
    preds: Prediction for sales
    
    """
    
    submission_df = pd.DataFrame({'Id': ids, 'Sales': preds})
    submission_df.to_csv('./submissions/' + filename, index=False)
    

In [165]:
create_submission(rossman_test_df.Id.values, rf_regression_preds, 'rf_regression_preds.csv')