In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import scipy
from scipy.stats import zscore
from tqdm import tqdm
#from sklearn.experimental import enable_iterative_imputer
#from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing
from sklearn.metrics import mean_squared_log_error,r2_score,mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [3]:
"""Load DataFrame"""
df=pd.read_csv("NYPD_Calls_for_Service__Historic_.csv")
df=pd.DataFrame(df)

### 2.数据选择

In [4]:
df = df[["DISP_TS", "TYP_DESC","Latitude","Longitude"]]
df

Unnamed: 0,DISP_TS,TYP_DESC,Latitude,Longitude
0,02/17/2020 03:29:46 AM,VISIBILITY PATROL: DIRECTED,40.677802,-73.871348
1,02/17/2020 03:56:56 PM,VISIBILITY PATROL: DIRECTED,40.672100,-73.871114
2,02/17/2020 04:32:35 AM,VISIBILITY PATROL: DIRECTED,40.672943,-73.870835
3,02/17/2020 03:55:38 PM,VISIBILITY PATROL: DIRECTED,40.672943,-73.870835
4,02/17/2020 12:24:36 AM,VISIBILITY PATROL: DIRECTED,40.669050,-73.870359
...,...,...,...,...
1048570,04/17/2020 05:55:58 AM,VISIBILITY PATROL: DIRECTED,40.829882,-73.936758
1048571,04/17/2020 08:35:59 AM,VISIBILITY PATROL: DIRECTED,40.829882,-73.936758
1048572,04/17/2020 05:18:18 PM,VISIBILITY PATROL: DIRECTED,40.829882,-73.936758
1048573,04/17/2020 02:50:29 AM,VISIBILITY PATROL: DIRECTED,40.829882,-73.936758


In [5]:
# 统计每一列的种类
df.dtypes.value_counts()

object     2
float64    2
dtype: int64

In [6]:
object_data = df.select_dtypes( include=[ "object"])
object_data.sample(3)

Unnamed: 0,DISP_TS,TYP_DESC
25267,07/02/2020 10:32:14 AM,STATION INSPECTION BY TRANSIT BUREAU PERSONNEL
679213,05/16/2020 03:18:54 AM,VISIBILITY PATROL: DIRECTED
200021,06/18/2020 09:21:04 AM,VISIBILITY PATROL: DIRECTED


In [7]:
object_data["TYP_DESC"].value_counts()

VISIBILITY PATROL: DIRECTED                       274821
SEE COMPLAINANT: OTHER/INSIDE                     114576
STATION INSPECTION BY TRANSIT BUREAU PERSONNEL     72261
TRAIN RUN/MOBILE ORDER MAINTENANCE SWEEP           52319
INVESTIGATE/POSSIBLE CRIME: SERIOUS/OTHER          50255
                                                   ...  
OTHER CRIMES (IN PROGRESS): PANIC ALARM ATS            1
AMBULANCE CASE: OTHER/LTD ACC HWY                      1
DISPUTE: KNIFE/LTD ACC HWY                             1
FIRE: OTHER/LTD ACC HWY                                1
OTHER CRIMES (PAST): CRIM MISCHIEF/LTD ACC HWY         1
Name: TYP_DESC, Length: 350, dtype: int64

### 3. 缺失值处理

In [8]:
df.isnull().any()

DISP_TS      False
TYP_DESC     False
Latitude     False
Longitude    False
dtype: bool

In [9]:
df.dropna(inplace=True)

In [197]:
df

Unnamed: 0,DISP_TS,TYP_DESC,Latitude,Longitude
0,02/17/2020 03:29:46 AM,VISIBILITY PATROL: DIRECTED,40.677802,-73.871348
1,02/17/2020 03:56:56 PM,VISIBILITY PATROL: DIRECTED,40.672100,-73.871114
2,02/17/2020 04:32:35 AM,VISIBILITY PATROL: DIRECTED,40.672943,-73.870835
3,02/17/2020 03:55:38 PM,VISIBILITY PATROL: DIRECTED,40.672943,-73.870835
4,02/17/2020 12:24:36 AM,VISIBILITY PATROL: DIRECTED,40.669050,-73.870359
...,...,...,...,...
5451083,11/12/2020 05:09:57 PM,STATION INSPECTION BY TRANSIT BUREAU PERSONNEL,40.712178,-74.004391
5451084,11/12/2020 05:57:54 PM,STATION INSPECTION BY TRANSIT BUREAU PERSONNEL,40.712178,-74.004391
5451085,11/12/2020 11:52:01 AM,STATION INSPECTION BY TRANSIT BUREAU PERSONNEL,40.719188,-74.001582
5451086,11/12/2020 04:31:54 PM,STATION INSPECTION BY TRANSIT BUREAU PERSONNEL,40.719188,-74.001582


In [10]:
df = np.array(df)
df1 = df[0:1638982]
df2 = df[1638983:]
df = np.concatenate((df1,df2),axis = 0)
df = pd.DataFrame(df, columns= ["DISP_TS","TYP_DESC","Latitude","Longitude"])
# df.drop(axis=0,index=1638982 ,inplace=True)
df

Unnamed: 0,DISP_TS,TYP_DESC,Latitude,Longitude
0,02/17/2020 03:29:46 AM,VISIBILITY PATROL: DIRECTED,40.677802,-73.871348
1,02/17/2020 03:56:56 PM,VISIBILITY PATROL: DIRECTED,40.6721,-73.871114
2,02/17/2020 04:32:35 AM,VISIBILITY PATROL: DIRECTED,40.672943,-73.870835
3,02/17/2020 03:55:38 PM,VISIBILITY PATROL: DIRECTED,40.672943,-73.870835
4,02/17/2020 12:24:36 AM,VISIBILITY PATROL: DIRECTED,40.66905,-73.870359
...,...,...,...,...
1048570,04/17/2020 05:55:58 AM,VISIBILITY PATROL: DIRECTED,40.829882,-73.936758
1048571,04/17/2020 08:35:59 AM,VISIBILITY PATROL: DIRECTED,40.829882,-73.936758
1048572,04/17/2020 05:18:18 PM,VISIBILITY PATROL: DIRECTED,40.829882,-73.936758
1048573,04/17/2020 02:50:29 AM,VISIBILITY PATROL: DIRECTED,40.829882,-73.936758


### 4.只保留日和周

In [11]:
# 会自己转化为24小时制的
date= pd.to_datetime(df["DISP_TS"])
date

0         2020-02-17 03:29:46
1         2020-02-17 15:56:56
2         2020-02-17 04:32:35
3         2020-02-17 15:55:38
4         2020-02-17 00:24:36
                  ...        
1048570   2020-04-17 05:55:58
1048571   2020-04-17 08:35:59
1048572   2020-04-17 17:18:18
1048573   2020-04-17 02:50:29
1048574   2020-04-17 00:43:03
Name: DISP_TS, Length: 1048575, dtype: datetime64[ns]

In [16]:
df['date'] = date
df = df[['TYP_DESC','Latitude','Longitude','date']]
df

Unnamed: 0,TYP_DESC,Latitude,Longitude,date
0,VISIBILITY PATROL: DIRECTED,40.677802,-73.871348,2020-02-17 03:29:46
1,VISIBILITY PATROL: DIRECTED,40.6721,-73.871114,2020-02-17 15:56:56
2,VISIBILITY PATROL: DIRECTED,40.672943,-73.870835,2020-02-17 04:32:35
3,VISIBILITY PATROL: DIRECTED,40.672943,-73.870835,2020-02-17 15:55:38
4,VISIBILITY PATROL: DIRECTED,40.66905,-73.870359,2020-02-17 00:24:36
...,...,...,...,...
1048570,VISIBILITY PATROL: DIRECTED,40.829882,-73.936758,2020-04-17 05:55:58
1048571,VISIBILITY PATROL: DIRECTED,40.829882,-73.936758,2020-04-17 08:35:59
1048572,VISIBILITY PATROL: DIRECTED,40.829882,-73.936758,2020-04-17 17:18:18
1048573,VISIBILITY PATROL: DIRECTED,40.829882,-73.936758,2020-04-17 02:50:29


In [53]:
# Extract dow but remove specific date
import datetime
weektime = []
for i in range(len(df)):
    weektime.append(pd.Timestamp(year=2020, month=1, day=df.loc[i, 'dow'] + 6, hour=df.loc[i, 'date'].hour, minute=df.loc[i,'date'].minute, second=df.loc[i, 'date'].second))
df['weektime'] = weektime

In [95]:
df['original_index'] = df.index
df = df[['TYP_DESC','Latitude','Longitude','weektime', 'original_index']]
df

Unnamed: 0,TYP_DESC,Latitude,Longitude,weektime,original_index
0,VISIBILITY PATROL: DIRECTED,40.677802,-73.871348,2020-01-06 03:29:46,0
1,VISIBILITY PATROL: DIRECTED,40.672100,-73.871114,2020-01-06 15:56:56,1
2,VISIBILITY PATROL: DIRECTED,40.672943,-73.870835,2020-01-06 04:32:35,2
3,VISIBILITY PATROL: DIRECTED,40.672943,-73.870835,2020-01-06 15:55:38,3
4,VISIBILITY PATROL: DIRECTED,40.669050,-73.870359,2020-01-06 00:24:36,4
...,...,...,...,...,...
1048570,VISIBILITY PATROL: DIRECTED,40.829882,-73.936758,2020-01-10 05:55:58,1048570
1048571,VISIBILITY PATROL: DIRECTED,40.829882,-73.936758,2020-01-10 08:35:59,1048571
1048572,VISIBILITY PATROL: DIRECTED,40.829882,-73.936758,2020-01-10 17:18:18,1048572
1048573,VISIBILITY PATROL: DIRECTED,40.829882,-73.936758,2020-01-10 02:50:29,1048573


### 5. Sorting and dict extraction, dill cache

In [96]:
latsort = df.sort_values(by='Latitude', ascending=True, ignore_index=True)
lonsort = df.sort_values(by='Longitude', ascending=True, ignore_index=True)
timesort = df.sort_values(by='weektime', ascending=True, ignore_index=True)

In [97]:
latsort

Unnamed: 0,TYP_DESC,Latitude,Longitude,weektime,original_index
0,VISIBILITY PATROL: DIRECTED,40.499576,-74.241277,2020-01-06 02:32:17,809368
1,ALARMS: RESIDENTIAL/ HOLDUP,40.500391,-74.243919,2020-01-11 13:11:37,809850
2,AMBULANCE CASE: UNCONSCIOUS/INSIDE,40.500391,-74.243919,2020-01-11 08:37:53,809743
3,SEE COMPLAINANT: OTHER/INSIDE,40.500391,-74.243919,2020-01-07 16:52:06,809236
4,BURGLARY (IN PROGRESS): RESIDENCE,40.500918,-74.241253,2020-01-06 09:09:20,692073
...,...,...,...,...,...
1048570,SEE COMPLAINANT: OTHER/INSIDE,40.912536,-73.901390,2020-01-11 14:35:53,540509
1048571,VISIBILITY PATROL: FAMILY/HOME VISIT,40.912723,-73.902525,2020-01-06 12:46:06,288470
1048572,INVESTIGATE/POSSIBLE CRIME: SERIOUS/OTHER,40.912723,-73.902525,2020-01-08 11:25:23,287983
1048573,OTHER CRIMES (PAST): HARASSMENT/INSIDE,40.912723,-73.902525,2020-01-09 00:54:38,589524


In [98]:
lonsort

Unnamed: 0,TYP_DESC,Latitude,Longitude,weektime,original_index
0,ALARMS: COMMERCIAL/BURGLARY,40.508303,-74.254560,2020-01-11 05:09:00,810207
1,VERIFY AMB NEEDED,40.506569,-74.254377,2020-01-07 17:40:02,809417
2,INVESTIGATE/POSSIBLE CRIME: CALLS FOR HELP/INSIDE,40.502375,-74.253951,2020-01-08 08:40:18,693271
3,INVESTIGATE/POSSIBLE CRIME: SERIOUS OTHER/LTD ...,40.502375,-74.253951,2020-01-06 23:00:32,693268
4,INVESTIGATE/POSSIBLE CRIME: SERIOUS/OTHER,40.502375,-74.253951,2020-01-11 11:10:46,693267
...,...,...,...,...,...
1048570,VEHICLE ACCIDENT: DISPUTE,40.739814,-73.701324,2020-01-08 10:51:43,747697
1048571,VISIBILITY PATROL: DIRECTED,40.741990,-73.700957,2020-01-08 13:55:29,982737
1048572,VERIFY AMB NEEDED,40.739966,-73.700720,2020-01-07 22:10:16,748406
1048573,VISIBILITY PATROL: DIRECTED,40.739224,-73.700316,2020-01-11 16:39:49,983789


In [99]:
timesort

Unnamed: 0,TYP_DESC,Latitude,Longitude,weektime,original_index
0,INVESTIGATE/POSSIBLE CRIME: CALLS FOR HELP/OUT...,40.847064,-73.915070,2020-01-06 00:00:00,545181
1,DISPUTE: INSIDE,40.820780,-73.935986,2020-01-06 00:00:00,754163
2,AMBULANCE CASE: EDP/OUTSIDE,40.757095,-73.941844,2020-01-06 00:00:01,505728
3,VISIBILITY PATROL: DIRECTED,40.829598,-73.896548,2020-01-06 00:00:02,1001688
4,TRAIN RUN/MOBILE ORDER MAINTENANCE SWEEP,40.689453,-73.987229,2020-01-06 00:00:02,632191
...,...,...,...,...,...
1048570,SEE COMPLAINANT: OTHER/TRANSIT,40.684253,-73.979990,2020-01-12 23:59:58,973923
1048571,SEE COMPLAINANT: OTHER/INSIDE,40.700511,-73.941540,2020-01-12 23:59:58,602177
1048572,BURGLARY (PAST): RESIDENCE,40.834209,-73.925707,2020-01-12 23:59:59,281198
1048573,VISIBILITY PATROL:PUBLIC/PRIVATE EDUCATIONAL F...,40.715205,-73.853243,2020-01-12 23:59:59,79927


In [100]:
latsort.to_csv('latsort.csv')
lonsort.to_csv('lonsort.csv')
timesort.to_csv('timesort.csv')
df.to_csv('df.csv')

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import dill
from os.path import exists
from collections import defaultdict
%matplotlib inline

import scipy
from scipy.stats import zscore
from tqdm import tqdm
#from sklearn.experimental import enable_iterative_imputer
#from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing
from sklearn.metrics import mean_squared_log_error,r2_score,mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Re-read from disk for repeatability
latsort = pd.read_csv('latsort.csv')
lonsort = pd.read_csv('lonsort.csv')
timesort = pd.read_csv('timesort.csv')
df = pd.read_csv('df.csv')

In [None]:

for i in range(len(df)):
    timesort.loc[i, 'weektime'] = pd.Timestamp(timesort.loc[i, 'weektime'])
    df.loc[i, 'weektime'] = pd.Timestamp(df.loc[i, 'weektime'])

In [3]:
# Dont need to run
# Create mapping dicts for increased speed retrieving indexes later
index_to_lat_index = dict()
index_to_lon_index = dict()
index_to_time_index = dict()

lat_index_to_index = dict()
lon_index_to_index = dict()
time_index_to_index = dict()
for i in range(len(df)):
    index_to_lat_index[latsort.loc[i, 'original_index']] = i
    index_to_lon_index[lonsort.loc[i, 'original_index']] = i
    index_to_time_index[timesort.loc[i, 'original_index']] = i

    lat_index_to_index[i] = latsort.loc[i, 'original_index']
    lon_index_to_index[i] = lonsort.loc[i, 'original_index']
    time_index_to_index[i] = timesort.loc[i, 'original_index']


In [4]:
# Dont need to run
lat_index_to_lat = dict()
lon_index_to_lon = dict()
time_index_to_time = dict()

index_to_type = dict()
for i in range(len(df)):
    lat_index_to_lat[i] = latsort.loc[i, 'Latitude']
    lon_index_to_lon[i] = lonsort.loc[i, 'Longitude']
    time_index_to_time[i] = timesort.loc[i, 'weektime']

    index_to_type[i] = df.loc[i, 'TYP_DESC']

In [9]:
# Dont need to run
dill.dump(index_to_lat_index, open('index_to_lat_index.dat', 'wb'))
dill.dump(index_to_lon_index, open('index_to_lon_index.dat', 'wb'))
dill.dump(index_to_time_index, open('index_to_time_index.dat', 'wb'))
dill.dump(lat_index_to_index, open('lat_index_to_index.dat', 'wb'))
dill.dump(lon_index_to_index, open('lon_index_to_index.dat', 'wb'))
dill.dump(time_index_to_index, open('time_index_to_index.dat', 'wb'))
dill.dump(lat_index_to_lat, open('lat_index_to_lat.dat', 'wb'))
dill.dump(lon_index_to_lon, open('lon_index_to_lon.dat', 'wb'))
dill.dump(time_index_to_time, open('time_index_to_time.dat', 'wb'))

In [4]:
# NEED TO RUN FROM HERE DOWN. START HERE
index_to_lat_index = dill.load(open('index_to_lat_index.dat', 'rb'))
index_to_lon_index = dill.load(open('index_to_lon_index.dat', 'rb'))
index_to_time_index = dill.load(open('index_to_time_index.dat', 'rb'))
lat_index_to_index = dill.load(open('lat_index_to_index.dat', 'rb'))
lon_index_to_index = dill.load(open('lon_index_to_index.dat', 'rb'))
time_index_to_index = dill.load(open('time_index_to_index.dat', 'rb'))
lat_index_to_lat = dill.load(open('lat_index_to_lat.dat', 'rb'))
lon_index_to_lon = dill.load(open('lon_index_to_lon.dat', 'rb'))
time_index_to_time = dill.load(open('time_index_to_time.dat', 'rb'))

In [5]:
timesort

Unnamed: 0.1,Unnamed: 0,TYP_DESC,Latitude,Longitude,weektime,original_index
0,0,INVESTIGATE/POSSIBLE CRIME: CALLS FOR HELP/OUT...,40.847064,-73.915070,2020-01-06 00:00:00,545181
1,1,DISPUTE: INSIDE,40.820780,-73.935986,2020-01-06 00:00:00,754163
2,2,AMBULANCE CASE: EDP/OUTSIDE,40.757095,-73.941844,2020-01-06 00:00:01,505728
3,3,VISIBILITY PATROL: DIRECTED,40.829598,-73.896548,2020-01-06 00:00:02,1001688
4,4,TRAIN RUN/MOBILE ORDER MAINTENANCE SWEEP,40.689453,-73.987229,2020-01-06 00:00:02,632191
...,...,...,...,...,...,...
1048570,1048570,SEE COMPLAINANT: OTHER/TRANSIT,40.684253,-73.979990,2020-01-12 23:59:58,973923
1048571,1048571,SEE COMPLAINANT: OTHER/INSIDE,40.700511,-73.941540,2020-01-12 23:59:58,602177
1048572,1048572,BURGLARY (PAST): RESIDENCE,40.834209,-73.925707,2020-01-12 23:59:59,281198
1048573,1048573,VISIBILITY PATROL:PUBLIC/PRIVATE EDUCATIONAL F...,40.715205,-73.853243,2020-01-12 23:59:59,79927


### 6. Windowing algorithm

In [6]:
# Use multiprocess instead of multiprocessing to avoid hanging threadpool bugs
!pip install multiprocess
import dill
import math
from multiprocess import Pool



In [20]:
# A single sharded dist computation section
def compute_range_section(subset, dist_win, percent, boost, offset):
    dist_win_lat_sq = (dist_win / 111320) ** 2
    dist_win_lon_sq = (dist_win / (40075 * math.cos(40.7) / 360) / 1000) ** 2
    dist_window_selections = []
    for i in range((subset - 1) * int(100000 * percent) + offset, min((subset * int(100000 * percent) + offset), len(df))):
        # lat
        lat_indexes = set()
        lat_index = index_to_lat_index[i]
        target_lat = df.loc[i, 'Latitude']
        target_lon = df.loc[i, 'Longitude']
        search_radius = 1
        while lat_index + search_radius < len(df):
            search_index = lat_index + search_radius
            if (target_lat - lat_index_to_lat[search_index]) ** 2 < dist_win_lat_sq:
                if (target_lon - lon_index_to_lon[index_to_lon_index[lat_index_to_index[search_index]]]) ** 2 < dist_win_lon_sq:
                    lat_indexes.add(lat_index_to_index[search_index])
                search_radius += 1
            else:
                break
        search_radius = 1
        while lat_index - search_radius >= 0:
            search_index = lat_index - search_radius
            if (target_lat - lat_index_to_lat[search_index]) ** 2 < dist_win_lat_sq:
                if (target_lon - lon_index_to_lon[index_to_lon_index[lat_index_to_index[search_index]]]) ** 2 < dist_win_lon_sq:
                    lat_indexes.add(lat_index_to_index[search_index])
                search_radius += 1
            else:
                break
        dist_window_selections.append(lat_indexes)
    return dist_window_selections

# Entrypoint. Call this to start a dist run. Handles multithreaded run to increase speed.
def compute_loc_window(dist_win, percent, boost, offset, cores):
    with Pool(cores) as pool:
        sr = list(pool.starmap(compute_range_section, [(i, dist_win, percent, boost, offset) for i in range(1, 12)]))
        return [*sr[1],*sr[2],*sr[3],*sr[4],*sr[5],*sr[6],*sr[7],*sr[8],*sr[9],*sr[10]]

In [21]:
def get_dist_filename(dist_win, percent, offset):
    offset_str = ''
    if offset != 0: offset_str = str(offset) + '+'
    return 'dist_win_sel_' + str(dist_win) + '_' + offset_str + str(percent) + '.dat'

def get_time_filename(time_win, percent, offset):
    offset_str = ''
    if offset != 0: offset_str = str(offset) + '+'
    return 'time_win_sel_' + str(time_win) + '_' + offset_str  + str(percent) + '.dat'

In [22]:
# dist hyperparams
def process_dist(dist_params, percent, force, offset, cores):
    boost = None
    for dist_win in dist_params:
        print('Computing for dist radius of ' + str(dist_win))
        filename = get_dist_filename(dist_win, percent, offset)
        if exists(filename) and not force:
            print('  ' + filename + ' Already exists')
            continue
        dist_win_sel = compute_loc_window(dist_win, percent, boost, offset, cores)
        print('  dumping to ' + filename)
        dill.dump(dist_win_sel, open(filename, 'wb'))

In [23]:
# A single sharded time computation section
def compute_time_section(subset, time_win, percent, offset):
    time_win *= 60
    time_window_selections = []
    for i in range((subset - 1) * int(100000 * percent) + offset, min((subset * int(100000 * percent) + offset), len(df))):
        # Time
        time_indexes = set()
        time_index = index_to_time_index[i]
        target_time = time_index_to_time[time_index]
        search_radius = 1
        while time_index + search_radius < len(df):
            search_index = time_index + search_radius
            if (target_time - time_index_to_time[search_index]).seconds < time_win:
                time_indexes.add(time_index_to_index[search_index])
                search_radius += 1
            else:
                break
        search_radius = 1
        while time_index - search_radius >= 0:
            search_index = time_index - search_radius
            if (target_time - time_index_to_time[search_index]).seconds < time_win:
                time_indexes.add(time_index_to_index[search_index])
                search_radius += 1
            else:
                break
        time_window_selections.append(time_indexes)
    return time_window_selections

# Entrypoint. Call this to start a time run. Handles multithreaded run to increase speed.
def compute_time_window(time_win, percent, offset, cores):
    with Pool(cores) as pool:
        sr = list(pool.starmap(compute_time_section, [(i, time_win, percent, offset) for i in range(1, 12)]))
        return [*sr[1],*sr[2],*sr[3],*sr[4],*sr[5],*sr[6],*sr[7],*sr[8],*sr[9],*sr[10]]

In [24]:
# time hyperparams
def process_time(time_params, percent, force, offset, cores):
    for time_win in time_params:
        print('Computing for time radius of ' + str(time_win))
        filename = get_time_filename(time_win, percent, offset)
        if exists(filename) and not force:
            print('  ' + filename + ' Already exists')
            continue
        time_win_sel = compute_time_window(time_win, percent, offset, cores)
        print('  dumping to ' + filename)
        dill.dump(time_win_sel, open(filename, 'wb'))

In [25]:
# RUN THIS MODULE TO START
# Adjust cores to the number of CPU cores on your computer
# Offset is the number of previous rows already processed.
# Percent is the percentage of the full dataset to process this time
# force will allow overwriting of stored data on disk. Set to false to ensure this can be canceled and restarted.

# These runs can take a LONG time, with 10 cores, about 1 hour every 10k rows!
# You will need about 10gb of free disk space per 5% of total dataset to process.
dist_params = [800, 400, 200, 100, 50]
time_params = [60, 30, 20, 10]
percent = 0.05
force = True
row_count = int(100000 * percent) * 10
# offset = 0 # Set to zero to start a completely new preprocessing run.
offset = 50000 # Use offset to resume processing additional data in a separate instance.
cores = 7
print(row_count)
process_dist(dist_params, percent, force, offset, cores)
process_time(time_params, percent, force, offset, cores)


50000
Computing for dist radius of 800
  dumping to dist_win_sel_800_50000+0.05.dat


In [79]:
final_df = df.loc[:offset + row_count - 1,['TYP_DESC', 'Latitude','Longitude','weektime']]
final_df['weektime'] = pd.to_datetime(final_df['weektime'])
final_df['dow'] = final_df['weektime'].dt.day_of_week
final_df['hour'] = final_df['weektime'].dt.hour
final_df['minute'] = final_df['weektime'].dt.minute
final_df['second'] = final_df['weektime'].dt.second
final_df = final_df[['TYP_DESC', 'Latitude','Longitude','dow','hour','minute','second']]

for dist_win in dist_params:
    for time_win in time_params:
        print(str(dist_win) + ' | ' + str(time_win))
        dist_sel = dill.load(open(get_dist_filename(dist_win, percent), 'rb'))
        time_sel = dill.load(open(get_time_filename(time_win, percent), 'rb'))
        num = 0
        new_col = []
        col_name = 'type_dist_' + str(dist_win) + '_time_' + str(time_win)
        for i in range(len(dist_sel)):
            merged_set = set()
            for index in time_sel[i]:
                if index in dist_sel[i]:
                    merged_set.add(index)
            mode = df.loc[list(merged_set), 'TYP_DESC'].mode()
            if len(merged_set) == 0:
                new_col.append(df.loc[i, 'TYP_DESC'])
            else:
                new_col.append(df.loc[list(merged_set), 'TYP_DESC'].mode()[0])
            num += len(merged_set)
        final_df[col_name] = new_col

final_df

800 | 60
800 | 30
800 | 20
800 | 10
400 | 60
400 | 30
400 | 20
400 | 10
200 | 60
200 | 30
200 | 20
200 | 10
100 | 60
100 | 30
100 | 20
100 | 10
50 | 60
50 | 30
50 | 20
50 | 10


Unnamed: 0,TYP_DESC,Latitude,Longitude,dow,hour,minute,second,type_dist_800_time_60,type_dist_800_time_30,type_dist_800_time_20,...,type_dist_200_time_20,type_dist_200_time_10,type_dist_100_time_60,type_dist_100_time_30,type_dist_100_time_20,type_dist_100_time_10,type_dist_50_time_60,type_dist_50_time_30,type_dist_50_time_20,type_dist_50_time_10
0,VISIBILITY PATROL: DIRECTED,40.677802,-73.871348,0,3,29,46,STATION INSPECTION BY TRANSIT BUREAU PERSONNEL,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,...,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,ALL OUT DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,ALL OUT DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED
1,VISIBILITY PATROL: DIRECTED,40.672100,-73.871114,0,15,56,56,ALL OUT DIRECTED,ALL OUT DIRECTED,ALL OUT DIRECTED,...,ALL OUT DIRECTED,STATION INSPECTION BY TRANSIT BUREAU PERSONNEL,ALL OUT DIRECTED,ALL OUT DIRECTED,ALL OUT DIRECTED,VISIBILITY PATROL: DIRECTED,ALL OUT DIRECTED,ALL OUT DIRECTED,ALL OUT DIRECTED,VISIBILITY PATROL: DIRECTED
2,VISIBILITY PATROL: DIRECTED,40.672943,-73.870835,0,4,32,35,STATION INSPECTION BY TRANSIT BUREAU PERSONNEL,STATION INSPECTION BY TRANSIT BUREAU PERSONNEL,STATION INSPECTION BY TRANSIT BUREAU PERSONNEL,...,SEE COMPLAINANT: OTHER/INSIDE,SEE COMPLAINANT: OTHER/INSIDE,SEE COMPLAINANT: OTHER/INSIDE,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED
3,VISIBILITY PATROL: DIRECTED,40.672943,-73.870835,0,15,55,38,TRAIN RUN/MOBILE ORDER MAINTENANCE SWEEP,TRAIN RUN/MOBILE ORDER MAINTENANCE SWEEP,TRAIN RUN/MOBILE ORDER MAINTENANCE SWEEP,...,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED
4,VISIBILITY PATROL: DIRECTED,40.669050,-73.870359,0,0,24,36,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,SEE COMPLAINANT: OTHER/INSIDE,...,ALL OUT DIRECTED,ALL OUT DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,ALL OUT DIRECTED,ALL OUT DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,ALL OUT DIRECTED,ALL OUT DIRECTED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,VISIBILITY PATROL: DIRECTED,40.635980,-74.007272,1,13,21,37,AMBULANCE CASE: SERIOUS/INSIDE,AMBULANCE CASE: SERIOUS/INSIDE,AMBULANCE CASE: SERIOUS/INSIDE,...,SEE COMPLAINANT: OTHER/INSIDE,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED
49996,VISIBILITY PATROL: DIRECTED,40.636166,-74.010659,1,11,10,16,INVESTIGATE/POSSIBLE CRIME: SERIOUS/OTHER,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,...,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED
49997,VISIBILITY PATROL: DIRECTED,40.636166,-74.010659,1,14,26,23,STATION INSPECTION BY TRANSIT BUREAU PERSONNEL,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,...,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED
49998,VISIBILITY PATROL: DIRECTED,40.636166,-74.010659,1,14,48,23,SEE COMPLAINANT: OTHER/INSIDE,SEE COMPLAINANT: OTHER/INSIDE,SEE COMPLAINANT: OTHER/INSIDE,...,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED,VISIBILITY PATROL: DIRECTED


In [None]:
# Stack the previously trained rows with the newly trained rows.
if (offset != 0):
    processed_df = dill.load(open('processed_df_' + str(offset) + '.dat', 'rb'))
    final_df = pd.concat(processed_df, final_df)

In [80]:
# Save to disk
dill.dump(final_df, open('processed_df_' + str(len(final_df)) + '.dat', 'wb'))

In [60]:
import dill
processed_df = dill.load(open('processed_df_' + str(len(final_df)) + '.dat', 'rb'))