# Seasonal Decomposition

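Here I will perform a seasonal decomposition on the parking occupancy data. As preparation, the 5-minute readings are resampled to 15-minute means and cleaned: non-paid hours, Sundays, and US federal holidays are removed, and missing values are filled with the column mean.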


In [184]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.tsa.seasonal import seasonal_decompose

In [185]:
# List the contents of the data directory to find the file to load.
!ls 1.collect_data/data_files

2012.csv.zip                   2019.csv.zip
2012.pkl                       2019.pkl
2013.csv.zip                   48hrs.pkl
2013.pkl                       5min.2019.pkl
2014.csv.zip                   5min.pkl
2014.pkl                       Annual_Parking_Study_Data.csv
2015.csv.zip                   Blockface.csv
2015.pkl                       fixed.zip
2016.csv.zip                   kaggle_usholidays.csv
2016.pkl                       make_five_min_freq.py
2017.csv.zip                   test
2017.pkl                       transpose_and_trim_five_min.py
2018.csv.zip                   weather
2018.pkl


In [186]:
# Load the 5-minute occupancy readings.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
df = pd.read_pickle('1.collect_data/data_files/5min.pkl')

In [187]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PaidOccupancy,ParkingSpaceCount
SourceElementKey,OccupancyDateTime,Unnamed: 2_level_1,Unnamed: 3_level_1
1001,2012-01-03 09:00:00,0.0,7.0
1001,2012-01-03 09:05:00,0.0,7.0
1001,2012-01-03 09:10:00,0.4,7.0
1001,2012-01-03 09:15:00,1.0,7.0
1001,2012-01-03 09:20:00,1.0,7.0


In [188]:
# Add the occupied fraction: paid occupancy divided by the number of spaces.
# The ratio is stored as computed; an upper clip at 1 was tried but is not applied.
df['PercentOccupied'] = df['PaidOccupancy'].div(df['ParkingSpaceCount'])

In [189]:
# Preview the first rows again to check the new PercentOccupied column.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PaidOccupancy,ParkingSpaceCount,PercentOccupied
SourceElementKey,OccupancyDateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1001,2012-01-03 09:00:00,0.0,7.0,0.0
1001,2012-01-03 09:05:00,0.0,7.0,0.0
1001,2012-01-03 09:10:00,0.4,7.0,0.057143
1001,2012-01-03 09:15:00,1.0,7.0,0.142857
1001,2012-01-03 09:20:00,1.0,7.0,0.142857


In [190]:

from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

In [191]:
def _clean_block_year(year_df, holidays):
    """Resample one block's readings for one year to 15 minutes and clean them.

    Parameters
    ----------
    year_df : DataFrame indexed by timestamp, holding one block's rows for a
        single year (must be non-empty).
    holidays : DatetimeIndex of midnight-normalized holiday dates to drop.

    Returns
    -------
    DataFrame of 15-minute means restricted to the observed hour range, with
    Sundays and holidays removed, infinities treated as missing, and missing
    values filled with the column mean. May be empty if every row is filtered.
    """
    # Take the hour range from the raw readings, before resampling inserts
    # empty bins for the hours without data.
    min_hour = year_df.index.hour.min()
    max_hour = year_df.index.hour.max()

    cleaned = year_df.resample(rule='15min').mean()

    # remove non paying hours
    # NOTE(review): the upper bound is max_hour:00:00, so the :15/:30/:45 bins
    # of the last observed hour are dropped -- confirm this is intended.
    in_hours = cleaned.index.indexer_between_time('%d:00:00' % min_hour,
                                                  '%d:00:00' % max_hour,
                                                  include_start=True,
                                                  include_end=True)
    cleaned = cleaned.iloc[in_hours]

    # remove sundays (dayofweek: Monday=0 ... Sunday=6)
    cleaned = cleaned[cleaned.index.dayofweek != 6]

    # remove holidays (compare on the calendar date, ignoring time of day)
    cleaned = cleaned[~cleaned.index.normalize().isin(holidays)]

    # Treat +/-inf (division by a zero space count) as missing, then fill
    # every missing value with that column's mean.
    cleaned = cleaned.replace([np.inf, -np.inf], np.nan)
    return cleaned.fillna(cleaned.mean())


level_values = df.index.get_level_values
blocks = level_values(0).unique()

first_year, last_year = 2012, 2019

# Build the holiday list once for the whole period. Starting at midnight on
# Jan 1 matters: a start timestamp with a time of day (e.g. 09:00) would
# exclude a holiday that falls on that same first day.
cal = calendar()
holidays = cal.holidays(start='%d-01-01' % first_year, end='%d-12-31' % last_year)

all_block_dfs = []
for block_ind in blocks:
    # Select this block's rows once instead of once (twice) per year.
    block_df = df.loc[block_ind]
    block_years = block_df.index.year

    dfs = []
    for year in range(first_year, last_year + 1):
        current_year = block_df[block_years == year]
        if current_year.empty:
            continue

        current_year = _clean_block_year(current_year, holidays)
        if not current_year.empty:
            dfs.append(current_year)

    # pd.concat raises on an empty list; skip blocks with nothing left.
    if not dfs:
        continue

    # Re-attach the block id as the outer index level: (SourceElementKey, timestamp).
    block_dfs = (pd.concat(dfs)
                 .assign(SourceElementKey=block_ind)
                 .set_index('SourceElementKey', append=True)
                 .swaplevel(0, 1))
    all_block_dfs.append(block_dfs)



KeyboardInterrupt: 

In [None]:
# Stack the per-block frames into a single frame.
all_dfs = pd.concat(all_block_dfs)

In [None]:
# Save the combined 15-minute data next to the other data files.
all_dfs.to_pickle('1.collect_data/data_files/15min.pkl')