# Create training, testing sets and historical set

In this notebook we select 

1. two **training sets** (for scenario 1 and 2) for the GA from the year **2018**: 
    - training_set_scenario1: 100 low-demand time periods (weekdays, 8am-4pm)
    - training_set_scenario2: 100 high-demand time periods (weekends, midnight-8am)

-> To train the GA


2. two **testing sets** (for scenario 1 and 2) for the GA (and benchmark configs for comparison) from the year **2019**: 
    - testing_set_scenario1: 100 low-demand time periods (weekdays, 8am-4pm)
    - testing_set_scenario2: 100 high-demand time periods (weekends, midnight-8am)

-> To evaluate strategy performance and compare with non-GA discovered strategies from ABM experiments






3. two **historical sets** (for scenario 1 and 2) for the GA (and benchmark configs for comparison) from the year **2017**. 
    - historical_set_scenario1: 100 low-demand time periods (weekdays, 8am-4pm)
    - historical_set_scenario2: 100 high-demand time periods (weekends, midnight-8am)

-> These sets are used in the ABM's Env to identify hot streets to patrol. The crime/CFS incidents from those time periods may influence: 

(1) the response time (historical CFS, for GA single obj learning and evaluation and its benchmark config evaluation) 

(2) the crime deterrence score (historical crimes, for GA multi obj learning and evaluation and its benchmark config evaluation)


To be used in the ABM experiments too!

Both testing and training set contain different shifts as they belong to different years!

In [1]:
import os
print(os.getcwd())

import osmnx as ox
import pandas as pd

from datetime import datetime, timedelta, time
import numpy as np
import scipy as sp

import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
import matplotlib as mpl
import pickle

import random

import warnings



/Users/natachachenevoy/Documents/GitHub/ABM-Detroit-Police-Dispatch/GA1_experiment


In [2]:
def getYearData(YEAR, csv_path="../Data/Incidents_new_preprocessed.csv"):
    """Load the preprocessed incidents file and keep only the rows of one year.

    Parameters
    ----------
    YEAR : int
        Calendar year to keep (this notebook calls it with 2017, 2018 and 2019).
    csv_path : str, optional
        Location of the incidents CSV. The file must contain a ``Date_Time``
        column that ``pd.to_datetime`` can parse. Defaults to the path this
        notebook has always read, so ``getYearData(YEAR)`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``Date_Time`` falls in ``YEAR``, with ``Date_Time``
        converted to timezone-naive datetimes.
    """
    data = pd.read_csv(csv_path)

    # Parse the timestamps, then strip any timezone information.
    data["Date_Time"] = pd.to_datetime(data["Date_Time"]).dt.tz_localize(None)

    # Keep only the requested year.
    return data[data["Date_Time"].dt.year == YEAR]


In [3]:
def get_start_end_shift(date, start_time, duration_hours=8):
    """Return the start and end datetimes of a shift that begins on ``date``.

    Parameters
    ----------
    date : datetime.date
        Calendar day on which the shift starts.
    start_time : int
        Hour of the day (0-23) at which the shift starts, on the hour.
    duration_hours : int or float, optional
        Length of the shift in hours. Defaults to 8, the length previously
        hard-coded in this function.

    Returns
    -------
    tuple of (datetime.datetime, datetime.datetime)
        ``(shift_start, shift_end)``. The end falls on the following day when
        ``start_time + duration_hours`` goes past midnight.
    """
    shift_start = datetime.combine(date, time(start_time, 0))
    shift_end = shift_start + timedelta(hours=duration_hours)
    return shift_start, shift_end

## Testing set (2019)

In [4]:
# Load the 2019 incidents; both testing sets below are sampled from this year.
data = getYearData(2019)

### Scenario 1

In [5]:
# Scenario 1 (2019): keep incidents logged Monday-Friday (weekday codes 0-4)
# between 08:00 and 15:59, using one combined boolean mask.
df_weekdays = data[
    (data.Date_Time.dt.weekday < 5)
    & (data.Date_Time.dt.hour >= 8)
    & (data.Date_Time.dt.hour < 16)
]

# Distinct calendar days, in chronological order, that have at least one such incident.
dates_uniques_weekdays = df_weekdays.Date_Time.sort_values().dt.date.unique()
print('There are {} unique dates in dataset'.format(len(dates_uniques_weekdays)))

# One (start, end) pair per day for the shift that starts at 08:00.
all_weekday_shifts = [
    get_start_end_shift(shift_date, 8) for shift_date in dates_uniques_weekdays
]

# Fixed seed so the same 100 shifts are drawn on every run.
random.seed(222)
testing_set_scenario1 = random.sample(all_weekday_shifts, 100)

There are 261 unique dates in dataset


### Scenario 2

In [6]:
# Scenario 2 (2019): keep incidents logged on Saturday or Sunday (weekday
# codes 5-6) between 00:00 and 07:59, using one combined boolean mask.
df_weekends = data[
    (data.Date_Time.dt.weekday >= 5)
    & (data.Date_Time.dt.hour < 8)
]

# Distinct calendar days, in chronological order, that have at least one such incident.
dates_uniques_weekends = df_weekends.Date_Time.sort_values().dt.date.unique()
print('There are {} unique dates in dataset'.format(len(dates_uniques_weekends)))

# One (start, end) pair per day for the shift that starts at midnight.
all_weekend_shifts = [
    get_start_end_shift(shift_date, 0) for shift_date in dates_uniques_weekends
]

# Fixed seed so the same 100 shifts are drawn on every run.
random.seed(222)
testing_set_scenario2 = random.sample(all_weekend_shifts, 100)

There are 104 unique dates in dataset


In [7]:
# Persist both 2019 testing sets next to the notebook, one pickle file per scenario.
import pickle

for pkl_path, shift_set in (
    ('./testing_set_scenario1.pkl', testing_set_scenario1),
    ('./testing_set_scenario2.pkl', testing_set_scenario2),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(shift_set, f)

In [8]:
# Read the scenario 1 testing set back from disk and display it.
# The pickle loaded here is the file this notebook wrote in the previous cell.
with open('testing_set_scenario1.pkl', 'rb') as f:
    testing_set_scenario1 = pickle.load(f)

testing_set_scenario1

[(datetime.datetime(2019, 3, 19, 8, 0), datetime.datetime(2019, 3, 19, 16, 0)),
 (datetime.datetime(2019, 6, 18, 8, 0), datetime.datetime(2019, 6, 18, 16, 0)),
 (datetime.datetime(2019, 7, 23, 8, 0), datetime.datetime(2019, 7, 23, 16, 0)),
 (datetime.datetime(2019, 8, 7, 8, 0), datetime.datetime(2019, 8, 7, 16, 0)),
 (datetime.datetime(2019, 1, 23, 8, 0), datetime.datetime(2019, 1, 23, 16, 0)),
 (datetime.datetime(2019, 1, 21, 8, 0), datetime.datetime(2019, 1, 21, 16, 0)),
 (datetime.datetime(2019, 6, 21, 8, 0), datetime.datetime(2019, 6, 21, 16, 0)),
 (datetime.datetime(2019, 3, 21, 8, 0), datetime.datetime(2019, 3, 21, 16, 0)),
 (datetime.datetime(2019, 6, 7, 8, 0), datetime.datetime(2019, 6, 7, 16, 0)),
 (datetime.datetime(2019, 9, 23, 8, 0), datetime.datetime(2019, 9, 23, 16, 0)),
 (datetime.datetime(2019, 9, 16, 8, 0), datetime.datetime(2019, 9, 16, 16, 0)),
 (datetime.datetime(2019, 6, 6, 8, 0), datetime.datetime(2019, 6, 6, 16, 0)),
 (datetime.datetime(2019, 6, 11, 8, 0), dateti

In [9]:
# Read the scenario 2 testing set back from disk and display it.
# The pickle loaded here is the file this notebook wrote earlier.
with open('testing_set_scenario2.pkl', 'rb') as f:
    testing_set_scenario2 = pickle.load(f)

testing_set_scenario2

[(datetime.datetime(2019, 12, 15, 0, 0),
  datetime.datetime(2019, 12, 15, 8, 0)),
 (datetime.datetime(2019, 2, 17, 0, 0), datetime.datetime(2019, 2, 17, 8, 0)),
 (datetime.datetime(2019, 4, 20, 0, 0), datetime.datetime(2019, 4, 20, 8, 0)),
 (datetime.datetime(2019, 5, 11, 0, 0), datetime.datetime(2019, 5, 11, 8, 0)),
 (datetime.datetime(2019, 5, 19, 0, 0), datetime.datetime(2019, 5, 19, 8, 0)),
 (datetime.datetime(2019, 1, 19, 0, 0), datetime.datetime(2019, 1, 19, 8, 0)),
 (datetime.datetime(2019, 1, 13, 0, 0), datetime.datetime(2019, 1, 13, 8, 0)),
 (datetime.datetime(2019, 8, 4, 0, 0), datetime.datetime(2019, 8, 4, 8, 0)),
 (datetime.datetime(2019, 4, 13, 0, 0), datetime.datetime(2019, 4, 13, 8, 0)),
 (datetime.datetime(2019, 7, 20, 0, 0), datetime.datetime(2019, 7, 20, 8, 0)),
 (datetime.datetime(2019, 11, 23, 0, 0),
  datetime.datetime(2019, 11, 23, 8, 0)),
 (datetime.datetime(2019, 11, 30, 0, 0),
  datetime.datetime(2019, 11, 30, 8, 0)),
 (datetime.datetime(2019, 7, 21, 0, 0), da

## Training set (2018)

In [10]:
# Load the 2018 incidents; both training sets below are sampled from this year.
data = getYearData(2018)

### Scenario 1

In [15]:
# Scenario 1 (2018): keep incidents logged Monday-Friday (weekday codes 0-4)
# between 08:00 and 15:59, using one combined boolean mask.
df_weekdays = data[
    (data.Date_Time.dt.weekday < 5)
    & (data.Date_Time.dt.hour >= 8)
    & (data.Date_Time.dt.hour < 16)
]

# Distinct calendar days, in chronological order, that have at least one such incident.
dates_uniques_weekdays = df_weekdays.Date_Time.sort_values().dt.date.unique()
print('There are {} unique dates in dataset'.format(len(dates_uniques_weekdays)))

# One (start, end) pair per day for the shift that starts at 08:00.
all_weekday_shifts = [
    get_start_end_shift(shift_date, 8) for shift_date in dates_uniques_weekdays
]

# Fixed seed so the same 100 shifts are drawn on every run.
random.seed(222)
training_set_scenario1 = random.sample(all_weekday_shifts, 100)

There are 261 unique dates in dataset


### Scenario 2

In [16]:
# Scenario 2 (2018): keep incidents logged on Saturday or Sunday (weekday
# codes 5-6) between 00:00 and 07:59, using one combined boolean mask.
df_weekends = data[
    (data.Date_Time.dt.weekday >= 5)
    & (data.Date_Time.dt.hour < 8)
]

# Distinct calendar days, in chronological order, that have at least one such incident.
dates_uniques_weekends = df_weekends.Date_Time.sort_values().dt.date.unique()
print('There are {} unique dates in dataset'.format(len(dates_uniques_weekends)))

# One (start, end) pair per day for the shift that starts at midnight.
all_weekend_shifts = [
    get_start_end_shift(shift_date, 0) for shift_date in dates_uniques_weekends
]

# Fixed seed so the same 100 shifts are drawn on every run.
random.seed(222)
training_set_scenario2 = random.sample(all_weekend_shifts, 100)

There are 104 unique dates in dataset


In [53]:
# Persist both 2018 training sets next to the notebook, one pickle file per scenario.
import pickle

for pkl_path, shift_set in (
    ('./training_set_scenario1.pkl', training_set_scenario1),
    ('./training_set_scenario2.pkl', training_set_scenario2),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(shift_set, f)


## Historical set (2017)

In [23]:
# Load the 2017 incidents; both historical sets below are sampled from this year.
data = getYearData(2017)

In [24]:
# Display (rows, columns) of the 2017 subset as a quick check that the year filter returned data.
data.shape

(45826, 10)

### Scenario 1

In [25]:
# Scenario 1 (2017): keep incidents logged Monday-Friday (weekday codes 0-4)
# between 08:00 and 15:59, using one combined boolean mask.
df_weekdays = data[
    (data.Date_Time.dt.weekday < 5)
    & (data.Date_Time.dt.hour >= 8)
    & (data.Date_Time.dt.hour < 16)
]

# Distinct calendar days, in chronological order, that have at least one such incident.
dates_uniques_weekdays = df_weekdays.Date_Time.sort_values().dt.date.unique()
print('There are {} unique dates in dataset'.format(len(dates_uniques_weekdays)))

# One (start, end) pair per day for the shift that starts at 08:00.
all_weekday_shifts = [
    get_start_end_shift(shift_date, 8) for shift_date in dates_uniques_weekdays
]

# Fixed seed so the same 100 shifts are drawn on every run.
random.seed(222)
historical_set_scenario1 = random.sample(all_weekday_shifts, 100)

There are 260 unique dates in dataset


### Scenario 2

In [26]:
# Scenario 2 (2017): keep incidents logged on Saturday or Sunday (weekday
# codes 5-6) between 00:00 and 07:59, using one combined boolean mask.
df_weekends = data[
    (data.Date_Time.dt.weekday >= 5)
    & (data.Date_Time.dt.hour < 8)
]

# Distinct calendar days, in chronological order, that have at least one such incident.
dates_uniques_weekends = df_weekends.Date_Time.sort_values().dt.date.unique()
print('There are {} unique dates in dataset'.format(len(dates_uniques_weekends)))

# One (start, end) pair per day for the shift that starts at midnight.
all_weekend_shifts = [
    get_start_end_shift(shift_date, 0) for shift_date in dates_uniques_weekends
]

# Fixed seed so the same 100 shifts are drawn on every run.
random.seed(222)
historical_set_scenario2 = random.sample(all_weekend_shifts, 100)

There are 105 unique dates in dataset


In [27]:
# Persist both 2017 historical sets next to the notebook, one pickle file per scenario.
# Fix: this cell previously dumped training_set_scenario1/2 (the 2018 shifts)
# into the historical_set_* files, so the 2017 samples built above were never saved.
import pickle
with open('./historical_set_scenario1.pkl', 'wb') as f:
    pickle.dump(historical_set_scenario1, f)
with open('./historical_set_scenario2.pkl', 'wb') as f:
    pickle.dump(historical_set_scenario2, f)