In [1]:
import datetime as dt
import numpy    as np
import os.path
import pandas   as pd
import random   as rand
import seaborn  as sns
from pytz import *

# Render matplotlib figures inline, in the output area of the cell that draws them.
%matplotlib inline
# Ask the inline backend to emit figures in the 'retina' (high-DPI) format.
%config InlineBackend.figure_format = 'retina'

In [2]:
# Directory (relative to this notebook) that holds the raw rental exports.
wrm_raw_dir = '../../data/raw/wroclawski-rower-miejski/'

# Season 2015: full export and its small sample counterpart.
fulldata2015_filepath = wrm_raw_dir + 'wypozyczenia_wrm-sezon2015.xlsx'
sample2015_filepath   = wrm_raw_dir + 'wypozyczenia_wrm-sezon2015_sample.xlsx'

# Season 2016: full export and its small sample counterpart.
fulldata2016_filepath = wrm_raw_dir + 'wypozyczenia_wrm-sezon2016.xlsx'
sample2016_filepath   = wrm_raw_dir + 'wypozyczenia_wrm-sezon2016_sample.xlsx'

# Set to True to work on the sample files instead of the full exports.
use_sample = False

## Bike rentals

In [3]:
def create_sample(fulldata_filepath, sample_filepath, number_of_samples=98, seed=None):
    """
    Write a small sample of an Excel file to another Excel file.

    The sample always contains the first and the last record of the full
    dataset plus up to `number_of_samples` distinct records drawn at random
    from the rows in between. Sampled rows keep their original relative order.

    Parameters
    ----------
    fulldata_filepath : str
        Path of the Excel file to read with `pd.read_excel`.
    sample_filepath : str
        Path of the Excel file to write; the index column is not written.
    number_of_samples : int, default 98
        Number of random interior rows to draw. Capped at the number of
        interior rows available, so a file with fewer rows is sampled in
        full instead of raising ValueError.
    seed : int, optional
        When given, a dedicated `random.Random(seed)` generator is used so
        the same rows are picked on every run. When omitted, the shared
        state of the `random` module is used.
    """
    df = pd.read_excel(fulldata_filepath)
    n_rows = len(df)

    # Work with row positions (not index labels) because rows are picked via iloc.
    # The set collapses first/last into one position for a single-row frame.
    first_and_last = sorted({0, n_rows - 1}) if n_rows else []
    # Every row except the first and the last one; empty for frames of <= 2 rows.
    interior_positions = range(1, max(n_rows - 1, 1))

    rng = rand if seed is None else rand.Random(seed)
    n_random = min(number_of_samples, len(interior_positions))
    sample_positions = sorted(first_and_last + rng.sample(interior_positions, n_random))

    sample_df = df.iloc[sample_positions, :]
    sample_df.to_excel(sample_filepath, index=False)

# Choose which pair of Excel files feeds the rest of the notebook.
if use_sample:
    # A sample file is generated only when it does not exist yet.
    for fulldata_filepath, sample_filepath in (
        (fulldata2015_filepath, sample2015_filepath),
        (fulldata2016_filepath, sample2016_filepath),
    ):
        if not os.path.exists(sample_filepath):
            create_sample(fulldata_filepath, sample_filepath)

    rentals2015_filepath, rentals2016_filepath = sample2015_filepath, sample2016_filepath
else:
    rentals2015_filepath, rentals2016_filepath = fulldata2015_filepath, fulldata2016_filepath

# One frame per season, read from whichever files were selected above.
rentals2015_df = pd.read_excel(rentals2015_filepath)
rentals2016_df = pd.read_excel(rentals2016_filepath)

In [None]:
# Stack both seasons into a single frame. Each season keeps its own row
# labels (no ignore_index), so labels may repeat across the two seasons.
season_frames = [rentals2015_df, rentals2016_df]
rentals = pd.concat(season_frames)

#### Feature engineering

In [None]:
# Truncate every rental timestamp to the start of its hour (minutes and
# smaller units dropped). Vectorised floor replaces a per-row Python lambda:
# it is much faster on the full dataset and leaves missing timestamps as NaT
# instead of raising. pd.to_datetime is a no-op for datetime columns and
# guards against the column having been read with object dtype.
rentals['Godzina wynajmu'] = pd.to_datetime(rentals['Data wynajmu']).dt.floor('h')

In [None]:
# Number of rentals started in each hour (non-null bike numbers per hourly bucket).
hourly_counts = rentals.groupby('Godzina wynajmu')['Numer roweru'].count()
rentals_per_hour = hourly_counts.to_frame().reset_index()
rentals_per_hour.columns = ['Godzina wynajmu', 'Ilość wypożyczeń']

# Attach the hourly total to every rental row; join keeps the rows and index of `rentals`.
hourly_lookup = rentals_per_hour.set_index('Godzina wynajmu')
rentals = rentals.join(hourly_lookup, on='Godzina wynajmu', how='left')

# Rows whose hour found no match in the lookup get a count of 0.
missing_count = rentals['Ilość wypożyczeń'].isnull()
rentals.loc[missing_count, 'Ilość wypożyczeń'] = 0

# The hourly helper column was only needed for the join.
rentals = rentals.drop('Godzina wynajmu', axis=1)

In [None]:
def get_season_start_date_and_end_date(df):
    """
    Return the calendar dates on which a season starts and ends.

    The start is the date of "Data wynajmu" in the first row and the end is
    the date of "Data zwrotu" in the last row, so the frame is expected to be
    ordered chronologically.

    Rows are addressed by position rather than by index label, so the result
    does not depend on the frame having a default 0-based index (for example
    after filtering or concatenating frames).

    Parameters
    ----------
    df : pandas.DataFrame
        Rentals of a single season with datetime columns "Data wynajmu"
        and "Data zwrotu".

    Returns
    -------
    (datetime.date, datetime.date)
        Start date and end date of the season.

    Raises
    ------
    IndexError
        If `df` has no rows.
    """
    start_date = df["Data wynajmu"].iloc[0].date()
    end_date   = df["Data zwrotu"].iloc[-1].date()
    return start_date, end_date

# Season 2015: first rental day and last return day.
season2015_dates = get_season_start_date_and_end_date(rentals2015_df)
rentals2015_start_date = season2015_dates[0]
rentals2015_end_date   = season2015_dates[1]
print('Season 2015 started on: ', rentals2015_start_date)
print('Season 2015 ended on:   ', rentals2015_end_date)

# Season 2016: first rental day and last return day.
season2016_dates = get_season_start_date_and_end_date(rentals2016_df)
rentals2016_start_date = season2016_dates[0]
rentals2016_end_date   = season2016_dates[1]
print('Season 2016 started on: ', rentals2016_start_date)
print('Season 2016 ended on:   ', rentals2016_end_date)

In [None]:
# Our final dataset will have rentals data aggregated 'per hour'.
# Build one timestamp per hour for each season. date_range includes both
# endpoints, so each range runs from 00:00 on the season's first day through
# 00:00 on the day after its last day.
# The lowercase 'h' alias is used because the uppercase 'H' spelling is
# deprecated in recent pandas releases.
one_day = dt.timedelta(days=1)
year2015 = pd.date_range(rentals2015_start_date, rentals2015_end_date + one_day, freq='h')
year2016 = pd.date_range(rentals2016_start_date, rentals2016_end_date + one_day, freq='h')

# Single 'date' column holding the 2015 hours followed by the 2016 hours.
dates_frame = pd.DataFrame({'date': year2015.append(year2016)})