In [1]:
#Import data manip modules
import pandas as pd
import numpy as np

#Visualization libraries
from matplotlib import pyplot as plt
import seaborn as sns; sns.set()

In [2]:
#Read data in
dat = pd.read_csv('../Data/bike_sharing_hourly.csv', parse_dates=['dteday'])

In [3]:
# Convert to datetime format
dat['dteday'] = pd.to_datetime(dat.dteday) + dat.hr.astype('timedelta64[h]')

In [4]:
#Make dteday dataframe's index
dat.set_index('dteday', drop=True, inplace=True)

In [5]:
#Create new column with a log of counts
dat['log_cnt'] = np.log(dat.cnt) + 1 #add constant to change 0's to 1's

In [6]:
#Convert weathersit into multiple dummy variables
wsit_dummies = pd.get_dummies(dat.weathersit)
wsit_dummies = wsit_dummies.iloc[:,0:3] #Drop one to avoid dummy variables trap

#Join dummies to original df
dat = dat.join(wsit_dummies)

In [7]:
#Check columns of data
dat.tail(2)

Unnamed: 0_level_0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,log_cnt,1,2,3
dteday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2012-12-31 22:00:00,17378,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61,5.110874,1,0,0
2012-12-31 23:00:00,17379,1,1,12,23,0,1,1,1,0.26,0.2727,0.65,0.1343,12,37,49,4.89182,1,0,0


In [8]:
#Create dataframe of holidays
hol_df = pd.DataFrame({'holiday': 'hol', 'ds': dat[dat['holiday'] == 1].index})

## Visualizing Train and Test Splits

In [9]:
#Create train and test splits
train = dat['Jan 2011': 'Aug 2012']
val = dat['Sep 2012': 'Nov 2012']
test = dat['Dec 2012']

In [10]:
#Dates to create validation sets on
v_dates = ['Aug 2012', 'Sep 2012', 'Oct 2012', 'Nov 2012']

In [11]:
#Function to create train and test splits
def train_test_splits(dat, val_dates):
    
    '''
    Function that splits training data into training and validation 
    sets using an expanding window. 
    Inputs:       dat - Training data to be split
            val_dates - List of dates to create validation sets. Must include last month 
                        of training data
    Outputs: expanding_window_indices - List with tuple of train and validation set indices
    '''
    
    #Initialize counter
    pos = 0
    expanding_window_indices = []

    #Iterate over all values except the last in list of dates
    for date in val_dates[:-1]:
    
        #Create training set
        train = dat[ : date]
        val_len = len(val_dates) - 1 #Count starts from 0
    
        #Create variable to move to the next date
        next_date = pos+1 if (pos+1 < val_len) else val_len
    
        #Creat validation split
        val = dat[val_dates[next_date]]
    
        pos += 1#Update counter
    
        #Create train and test set indices
        expanding_window_indices.append(((train.index, val.index)))
        
    return expanding_window_indices   

In [13]:
splits = train_test_splits(dat, v_dates)

In [17]:
splits[0][1]

DatetimeIndex(['2012-09-01 00:00:00', '2012-09-01 01:00:00',
               '2012-09-01 02:00:00', '2012-09-01 03:00:00',
               '2012-09-01 04:00:00', '2012-09-01 05:00:00',
               '2012-09-01 06:00:00', '2012-09-01 07:00:00',
               '2012-09-01 08:00:00', '2012-09-01 09:00:00',
               ...
               '2012-09-30 14:00:00', '2012-09-30 15:00:00',
               '2012-09-30 16:00:00', '2012-09-30 17:00:00',
               '2012-09-30 18:00:00', '2012-09-30 19:00:00',
               '2012-09-30 20:00:00', '2012-09-30 21:00:00',
               '2012-09-30 22:00:00', '2012-09-30 23:00:00'],
              dtype='datetime64[ns]', name='dteday', length=720, freq=None)