This notebook processes the raw data and generates train and test sets and saves them in pickle diles.

In [8]:
import pickle
import yaml
import pandas as pd
import numpy as np

In [12]:
def get_dates_arr_from_df(df):
    """
    This function converts the dates columns from a dataframe to an array of numbers
    """
    dates_2016 = []
    for i in range(10808):
        try:
            date = pd.datetime.combine(df.iloc[i, -1], df.iloc[i, 2])
            if date.year == 2016:
                dates_2016.append(date.timestamp())
        except:
            continue
    dates_2016_arr = np.array(dates_2016)
    dates_2016_so = np.sort(dates_2016_arr)
    dates_2016_so_0 = dates_2016_so - dates_2016_so[0]
    dates_2016_so_0_uni = np.unique(dates_2016_so_0)
    dates_2016_days = dates_2016_so_0_uni / 3600 / 24
    
    return dates_2016_days

In [9]:
# read the raw data
df = pd.read_excel('./NYPD_Complaint_Data_Current_YTD.xlsx', engine='openpyxl')

In [20]:
df.head()

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,RPT_DT,KY_CD,OFNS_DESC,PD_CD,PD_DESC,...,ADDR_PCT_CD,LOC_OF_OCCUR_DESC,PREM_TYP_DESC,PARKS_NM,HADEVELOPT,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon
0,647421007.0,1971-01-01 00:00:00,00:01:00,12/31/1972,23:59:00,2017-07-03 00:00:00,233.0,SEX CRIMES,175.0,"SEXUAL ABUSE 3,2",...,104.0,INSIDE,RESIDENCE-HOUSE,,,,,,,
1,490502733.0,1977-01-05 00:00:00,23:55:00,2017-02-05 00:00:00,00:01:00,2017-02-05 00:00:00,121.0,CRIMINAL MISCHIEF & RELATED OF,269.0,"MISCHIEF,CRIMINAL, UNCL 2ND",...,78.0,INSIDE,RESIDENCE - APT. HOUSE,,,989149.0,186412.0,40.678341,-73.982339,"(40.678341494, -73.98233917)"
2,287906177.0,1977-01-05 00:00:00,02:00:00,,,2017-06-04 00:00:00,104.0,RAPE,157.0,RAPE 1,...,17.0,INSIDE,RESIDENCE - APT. HOUSE,,,,,,,
3,153128750.0,1977-06-02 00:00:00,10:29:00,2017-06-02 00:00:00,10:30:00,2017-06-02 00:00:00,578.0,HARRASSMENT 2,637.0,"HARASSMENT,SUBD 1,CIVILIAN",...,70.0,INSIDE,COMMERCIAL BUILDING,,,991590.0,169049.0,40.630682,-73.973558,"(40.63068216, -73.973557573)"
4,715732292.0,1981-01-01 00:00:00,00:01:00,,,05/30/2017,104.0,RAPE,157.0,RAPE 1,...,109.0,INSIDE,RESIDENCE-HOUSE,,,,,,,


Only larceny crimes are included in the dataset. Crimes in Brooklyn are used as training data and crimes in Queens for testing.

In [13]:
sub_df = df[df['OFNS_DESC'] =='PETIT LARCENY']
train_df = sub_df[sub_df['BORO_NM'] == 'BROOKLYN']
test_df = sub_df[sub_df['BORO_NM'] == 'QUEENS']

In [14]:
# clean the date
train_df['clean_date'] = pd.to_datetime(train_df['CMPLNT_FR_DT'])
test_df['clean_date'] = pd.to_datetime(test_df['CMPLNT_FR_DT'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
train_data = get_dates_arr_from_df(train_df)
test_data = get_dates_arr_from_df(test_df)

  


In [17]:
# write the train nd test datasets into pickle files.

with open('./train_data.p', 'wb') as f:
    pickle.dump([train_data], f)
with open('./test_data.p', 'wb') as f:
    pickle.dump([test_data], f)

In [19]:
# prepare config file for the inference algorithm

config = {'adapt_grad_step_size': False,
'data': '../../data/crime_dataset/nypd/train_data.p',
'convergence_criteria': 1.0e-04,
'grad_step_size': 0.01,
'hyper_updates': 1,
'infer_hypers': True,
'infer_max_intensity': True,
'init_val_for_kernel_cov': 50,
'init_val_for_memory_decay': 1.5,
'init_val_for_kernel_s_amp': 5.,
'init_val_for_kernel_s_cov': 200.,
'intensity_bound': 3.,
'lambda_a_prior': 1.3,
'lambda_b_prior': 0.1,
'memory_decay': 10.0,
'min_num_iterations': 10,
'num_inducing_points': 300,
'num_integration_points': 3000,
'real_data': True,
'save_steps': True,
'start_from_given': True,
'time_bound': 366,
'use_history': True}

with open(f'./settings_for_nypd_train_data.yml', 'w') as f:
    yaml.dump(config, f)