In [34]:
import calendar
import os
import sys
import warnings

import arviz as az
import numpy as np
import pandas as pd
# Put the parent of the current working directory on the import path so the
# project-local ``libs`` package below can be resolved.
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if dir1 not in sys.path:
    sys.path.append(dir1)

# Project-local helpers; these must come after the sys.path tweak above.
from libs.model import HGPforecaster
from libs.metrics import calculate_metrics, metrics_to_table
from libs.pre_processing import generate_groups_data
from libs.visual_analysis import visualize_fit, visualize_predict

# Notebook-wide presentation settings: ArviZ plot theme, and silence all warnings.
az.style.use('arviz-darkgrid')
warnings.filterwarnings('ignore')

# Prison

In [35]:
# Load the long-format prison counts and reshape them in a single chain:
# drop the stray CSV index column, turn 't' into a datetime 'Date' index,
# then give the four remaining columns readable names (positional rename).
prison = (
    pd.read_csv('../data/prisonLF.csv', sep=",")
    .drop(columns='Unnamed: 0')
    .assign(Date=lambda frame: frame['t'].astype('datetime64[ns]'))
    .drop(columns='t')
    .set_index('Date')
    .set_axis(['State', 'Gender', 'Legal', 'Count'], axis=1)
)

In [36]:
# Display the reshaped prison frame (Date index; State, Gender, Legal, Count columns).
prison

Unnamed: 0_level_0,State,Gender,Legal,Count
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-03-01,ACT,Female,Remanded,2
2005-06-01,ACT,Female,Remanded,4
2005-09-01,ACT,Female,Remanded,1
2005-12-01,ACT,Female,Remanded,4
2006-03-01,ACT,Female,Remanded,4
...,...,...,...,...
2015-12-01,WA,Male,Sentenced,3894
2016-03-01,WA,Male,Sentenced,3876
2016-06-01,WA,Male,Sentenced,3969
2016-09-01,WA,Male,Sentenced,4076


In [37]:
# Persist the reshaped prison frame as CSV (presumably for consumption from R, per the file name).
prison.to_csv('../data/prison_to_r.csv')

# Tourism

In [41]:
# Load the tourism table. 'Year' has gaps that are filled by carrying the
# last seen value forward (presumably the year is only written on the first
# row of each year -- verify against the CSV).
data = pd.read_csv('../data/TourismData_v3.csv')
# Series.ffill() replaces the deprecated fillna(method='ffill').
data['Year'] = data['Year'].ffill()

# Map English month names to their 1-based number. calendar.month_name[0] is
# the empty string, so '' maps to 0 and real months map to 1..12.
d = {name: number for number, name in enumerate(calendar.month_name)}
data['Month'] = data['Month'].map(d)

# Build a first-of-month timestamp from (Year, Month), use it as the index 't',
# and drop the now-redundant source columns.
data = data.assign(t=pd.to_datetime(data[['Year', 'Month']].assign(day=1))).set_index('t')
data = data.drop(['Year', 'Month'], axis=1)

# Group definitions passed to generate_groups_data. Each value is a
# [start, stop] pair -- presumably character positions within the series
# (column) names that identify the group; confirm in libs.pre_processing.
groups_input = {
    'state': [0,1],
    'zone': [0,2],
    'region': [0,3],
    'purpose': [3,6]
}

groups = generate_groups_data(data, groups_input, seasonality=12, h=8)

Number of groups: 4
	state: 7
	zone: 27
	region: 76
	purpose: 4
Total number of series: 304
Number of points per series for train: 220
Total number of points: 228
Seasonality: 12
Forecast horizon: 8


In [137]:
def prepare_data_to_r(groups, dates=None):
    """Flatten ``groups['predict']`` into a long, date-indexed DataFrame.

    Parameters
    ----------
    groups : dict
        Structure returned by ``generate_groups_data``. Only
        ``groups['predict']`` is read, through its ``'s'``, ``'groups_names'``,
        ``'groups_idx'`` and ``'data'`` entries.
    dates : array-like, optional
        Dates of a single series; they are repeated ``s`` times to label the
        rows. When omitted, the notebook-level ``data.index`` is used, which
        keeps existing ``prepare_data_to_r(groups)`` calls working.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date``, with one column per group followed by ``Count``.

    Notes
    -----
    Assumes ``groups['predict']['data']`` is laid out series after series
    (all dates of the first series, then the next) -- TODO confirm in
    ``libs.pre_processing``.
    """
    predict = groups['predict']

    if dates is None:
        # Backward-compatible fallback to the notebook global; pass ``dates``
        # explicitly to avoid depending on hidden kernel state.
        dates = data.index

    # For every group, expand its per-row index into the matching label array.
    names = list(predict['groups_names'])
    arrays = [
        predict['groups_names'][name][predict['groups_idx'][name]]
        for name in names
    ]
    index = pd.MultiIndex.from_arrays(arrays, names=names)

    df = pd.DataFrame(data=predict['data'], index=index, columns=['Count'])
    # Repeat the date axis once per series so each row gets its timestamp.
    df['Date'] = np.tile(dates, (predict['s'],))

    # Move the group labels back into ordinary columns and index by date.
    return df.reset_index().set_index('Date')

In [141]:
# Build the long-format tourism frame from the grouped data and display it.
df = prepare_data_to_r(groups)
df

Unnamed: 0_level_0,state,zone,region,purpose,Count
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1998-01-01,A,AA,AAA,Hol,2015.444457
1998-02-01,A,AA,AAA,Hol,514.337600
1998-03-01,A,AA,AAA,Hol,532.097470
1998-04-01,A,AA,AAA,Hol,534.059083
1998-05-01,A,AA,AAA,Hol,505.222272
...,...,...,...,...,...
2016-08-01,G,GB,GBD,Oth,0.000000
2016-09-01,G,GB,GBD,Oth,0.000000
2016-10-01,G,GB,GBD,Oth,0.000000
2016-11-01,G,GB,GBD,Oth,0.000000


In [120]:
# Persist the long-format tourism frame as CSV (presumably for consumption from R, per the file name).
df.to_csv('../data/tourism_to_r.csv')