This notebook processes the raw data and generates train and test sets and saves them in pickle diles.

In [1]:
import pickle

import yaml
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('./crime.csv')

In [3]:
df.head()

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,Latitude,Longitude
0,Other Theft,2003,5,12,16.0,15.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763
1,Other Theft,2003,5,7,15.0,20.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763
2,Other Theft,2003,4,23,16.0,40.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763
3,Other Theft,2003,4,20,11.0,15.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763
4,Other Theft,2003,4,12,17.0,45.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763


In [4]:
df['TYPE'].unique()

array(['Other Theft', 'Break and Enter Residential/Other', 'Mischief',
       'Break and Enter Commercial', 'Offence Against a Person',
       'Theft from Vehicle',
       'Vehicle Collision or Pedestrian Struck (with Injury)',
       'Vehicle Collision or Pedestrian Struck (with Fatality)',
       'Theft of Vehicle', 'Homicide', 'Theft of Bicycle'], dtype=object)

In [5]:
df['NEIGHBOURHOOD'].unique()

array(['Strathcona', 'Kerrisdale', 'Dunbar-Southlands',
       'Grandview-Woodland', 'Sunset', 'West End', nan,
       'Central Business District', 'Hastings-Sunrise',
       'Victoria-Fraserview', 'Fairview', 'Kensington-Cedar Cottage',
       'West Point Grey', 'Shaughnessy', 'Renfrew-Collingwood',
       'Killarney', 'Riley Park', 'Arbutus Ridge', 'Musqueam',
       'Mount Pleasant', 'Kitsilano', 'Stanley Park', 'South Cambie',
       'Marpole', 'Oakridge'], dtype=object)

For the dataset we only use "other theft" crime in one district, between May and December 2016

In [5]:
sub_df = df[df['NEIGHBOURHOOD'] == 'Central Business District'][df['TYPE'] == 'Other Theft']
sub_df = sub_df[sub_df['YEAR'] == 2016]
sub_df = sub_df[sub_df['MONTH'] > 5]
sub_df = sub_df[sub_df['MONTH'] < 12]
sub_df= sub_df.sort_values(['MONTH', 'DAY', 'HOUR', 'MINUTE'])

  """Entry point for launching an IPython kernel.


In [7]:
sub_df.head()

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,Latitude,Longitude
487981,Other Theft,2016,6,1,11.0,2.0,3XX ABBOTT ST,Central Business District,492213.87,5458862.65,49.282468,-123.107058
502886,Other Theft,2016,6,1,12.0,16.0,5XX W HASTINGS ST,Central Business District,491826.94,5459079.69,49.284415,-123.112383
489436,Other Theft,2016,6,1,13.0,49.0,7XX GRANVILLE ST,Central Business District,491394.12,5458844.26,49.282291,-123.118329
488187,Other Theft,2016,6,1,15.0,46.0,3XX ABBOTT ST,Central Business District,492213.87,5458862.65,49.282468,-123.107058
482261,Other Theft,2016,6,1,16.0,22.0,7XX GRANVILLE ST,Central Business District,491295.24,5458744.77,49.281395,-123.119687


In [12]:
sub_df['date'] = pd.to_datetime(sub_df[['YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE']])
events = sub_df['date'].astype(int).to_numpy()
events -= events[0]
events_s = events * 1e-9 # convert to seconds
events_h = events_s / 3600 # hours
events_d = events_h / 24 # days
events_d = np.round(events_d, 3)

In [13]:
# make sure that no two events have the exact same time
events_d[0] = 0.001
seen_events = []
c = 0
for event in events_d:
    if event in seen_events:
        event += 0.001
        c+=1
    seen_events.append(event)
events_d = np.array(seen_events)

In [14]:
# Use the first 519 events as train data
train_events = [events_d[:519]]
test_events = [events_d[519:] - events_d[519]]

In [20]:
with open('./train_data.p', 'wb') as f:
    pickle.dump(train_events, f)

with open('./test_data_.p', 'wb') as f:
    pickle.dump(test_events, f)

In [30]:
# prepare config file for the inference algorithm

config = {'adapt_grad_step_size': False,
'data': '../../data/crime_dataset/nypd/train_data.p',
'convergence_criteria': 1.0e-04,
'grad_step_size': 0.01,
'hyper_updates': 1,
'infer_hypers': True,
'infer_max_intensity': True,
'init_val_for_kernel_cov': 50,
'init_val_for_memory_decay': 1.5,
'init_val_for_kernel_s_amp': 5.,
'init_val_for_kernel_s_cov': 200.,
'intensity_bound': 3.,
'lambda_a_prior': 1.3,
'lambda_b_prior': 0.1,
'memory_decay': 10.0,
'min_num_iterations': 10,
'num_inducing_points': 300,
'num_integration_points': 3000,
'real_data': True,
'save_steps': True,
'start_from_given': True,
'time_bound': 366,
'use_history': True}

with open(f'./settings_for_vancouver_train_data.yml', 'w') as f:
    yaml.dump(config, f)