In [2]:
# Third-party and standard-library dependencies used throughout the notebook.
import warnings
import pandas as pd
import arviz as az

import os, sys
# Put the parent of the current working directory on sys.path so that the
# project-local `libs` package imported below can be resolved.
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path: sys.path.append(dir1)
from libs.model import HGPforecaster
from libs.metrics import calculate_metrics, metrics_to_table
from libs.pre_processing import generate_groups_data
from libs.visual_analysis import visualize_fit, visualize_predict
import calendar
import numpy as np
# Notebook-wide settings: ArviZ plotting style.
az.style.use('arviz-darkgrid')
# NOTE(review): this silences *every* Python warning for the rest of the
# session, including deprecation warnings raised by pandas/numpy.
warnings.filterwarnings('ignore')

# Prison

In [35]:
# Load the prison data, drop the 'Unnamed: 0' column, turn the 't' column
# into a datetime index named 'Date', and give the four remaining columns
# readable names.
prison = (
    pd.read_csv('../data/prisonLF.csv', sep=",")
    .drop('Unnamed: 0', axis=1)
    .assign(Date=lambda frame: frame['t'].astype('datetime64[ns]'))
    .drop('t', axis=1)
    .set_index('Date')
)
prison.columns = ['State', 'Gender', 'Legal', 'Count']

In [36]:
# Display the reshaped frame: a 'Date' index with State, Gender, Legal and Count columns.
prison

Unnamed: 0_level_0,State,Gender,Legal,Count
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-03-01,ACT,Female,Remanded,2
2005-06-01,ACT,Female,Remanded,4
2005-09-01,ACT,Female,Remanded,1
2005-12-01,ACT,Female,Remanded,4
2006-03-01,ACT,Female,Remanded,4
...,...,...,...,...
2015-12-01,WA,Male,Sentenced,3894
2016-03-01,WA,Male,Sentenced,3876
2016-06-01,WA,Male,Sentenced,3969
2016-09-01,WA,Male,Sentenced,4076


In [37]:
# Write the reshaped prison frame (Date index included) to CSV.
# NOTE(review): the '_to_r' file name suggests it is consumed by an R script — confirm.
prison.to_csv('../data/prison_to_r.csv')

# Tourism

In [22]:
data = pd.read_csv('../data/TourismData_v3.csv')
# Fill missing 'Year' values from the closest preceding row.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling, whose
# FutureWarning would be hidden by a blanket warnings filter.
data['Year'] = data['Year'].ffill()

# Map month names (as listed by `calendar.month_name`) to their month number.
# Index 0 of `calendar.month_name` is the empty string, so '' maps to 0;
# names missing from the mapping become NaN.
d = {month_name: month_number for month_number, month_name in enumerate(calendar.month_name)}
data['Month'] = data['Month'].map(d)
# Assemble a timestamp for the first day of each (Year, Month) pair and use it
# as the index 't'; the two source columns are no longer needed afterwards.
data = data.assign(t=pd.to_datetime(data[['Year', 'Month']].assign(day=1))).set_index('t')
data = data.drop(['Year', 'Month'], axis=1)

# Group definitions passed to generate_groups_data.
# NOTE(review): each [start, end] pair presumably selects a character slice of
# the column names that encodes the group label — confirm against
# libs.pre_processing.generate_groups_data.
groups_input = {
    'State': [0,1],
    'Zone': [0,2],
    'Region': [0,3],
    'Purpose': [3,6]
}

groups = generate_groups_data(data, groups_input, seasonality=12, h=8)

Number of groups: 4
	State: 7
	Zone: 27
	Region: 76
	Purpose: 4
Total number of series: 304
Number of points per series for train: 220
Total number of points: 228
Seasonality: 12
Forecast horizon: 8


In [5]:
def prepare_data_to_R(groups, date_points):
    """Build a long-format DataFrame from the 'predict' part of `groups`.

    Parameters
    ----------
    groups : dict
        Must contain a 'predict' entry holding 'groups_names' (group -> array
        of labels), 'groups_idx' (group -> integer positions into those
        labels), 'data' (the values stored in the 'Count' column) and 's'
        (how many times `date_points` is repeated).
    date_points : array-like
        Dates that are tiled `s` times to form the 'Date' index.

    Returns
    -------
    pandas.DataFrame
        One column per group (in the order of 'groups_names'), followed by
        'Count', indexed by 'Date'.
    """
    predict = groups['predict']

    # Expand each group's label array with its per-observation positions.
    columns = {
        name: labels[predict['groups_idx'][name]]
        for name, labels in predict['groups_names'].items()
    }
    columns['Count'] = predict['data']
    columns['Date'] = np.tile(date_points, (predict['s'],))

    return pd.DataFrame.from_dict(columns).set_index('Date')

In [None]:
# Build the long-format tourism frame, using the index of the frame that was
# passed to generate_groups_data as the date points, and write it to CSV.
tourism = prepare_data_to_R(groups, data.index)
tourism.to_csv('../data/tourism_to_r.csv')

# Police

In [3]:
police = pd.read_excel('../data/NIBRSPublicView.Jan1-Nov30-2020.xlsx')

# Drop the columns that are not used, give the remaining nine short names,
# and keep only the rows whose City is 'HOUSTON'.
police = police.drop(['Occurrence\nHour', 'StreetName', 'Suffix', 'NIBRSDescription', 'Premise'], axis=1)
police.columns = ['Id','Date', 'Crime', 'Count', 'Beat', 'Block', 'Street', 'City', 'ZIP']
police = police.drop(['Id'], axis=1)
police = police.loc[police['City']=='HOUSTON']
police = police.drop(['City'], axis=1)

group_keys = ['Crime', 'Beat', 'Street', 'ZIP']

# Block is not used as a grouping key since there are 11901 blocks in the data.
# Only 'Count' is aggregated: summing the leftover 'Block' column as well would
# be meaningless (and concatenates or fails if the column is not numeric).
police = police.groupby(['Date'] + group_keys)[['Count']].sum().reset_index().set_index('Date')

police['ZIP'] = police.ZIP.astype(str)

# Select the 500 group combinations with the highest total counts.
group_totals = police.groupby(group_keys)[['Count']].sum()
top_groups = group_totals.sort_values(by=['Count'], ascending=False)[:500].index
police = police.reset_index().set_index(group_keys).loc[top_groups].reset_index().set_index('Date')


# One column per (Crime, Beat, Street, ZIP) combination, one row per date;
# dates on which a combination has no record are filled with 0.
police_pivot = police.reset_index().pivot(index='Date',columns=['Crime', 'Beat', 'Street', 'ZIP'], values='Count')
police_pivot = police_pivot.fillna(0)

# NOTE(review): each single-element list presumably names the column-index
# level that carries the group label — confirm against
# libs.pre_processing.generate_groups_data.
groups_input = {
    'Crime': [0],
    'Beat': [1],
    'Street': [2],
    'ZIP': [3]
}

groups = generate_groups_data(police_pivot, groups_input, seasonality=7, h=30)

Number of groups: 4
	Crime: 15
	Beat: 79
	Street: 9
	ZIP: 68
Total number of series: 500
Number of points per series for train: 275
Total number of points: 305
Seasonality: 7
Forecast horizon: 30


In [6]:
# Use the index of the frame that was passed to generate_groups_data
# (`police_pivot`) as the date points. `police.index.unique()` returns dates
# in order of first appearance in the long frame, which is not guaranteed to
# match the pivot's sorted date index and can misalign 'Date' with 'Count'.
# Note: `police` is rebound here to the long-format export frame.
police = prepare_data_to_R(groups, police_pivot.index)
police.to_csv('../data/police_to_r.csv')