In [1]:
import os
from collections import defaultdict
from time import localtime, strftime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline
pd.set_option('display.max_rows', 200)

In [2]:
def read_orig_file(data_path=None, orig_file=None):
    """Read the original NYPD complaint CSV into a pandas DataFrame.

    Only the columns listed in ``usecols`` below are loaded. ``CMPLNT_NUM``
    becomes the index and ``RPT_DT`` is parsed as a datetime; the complaint
    date and time columns are kept as raw strings.

    Parameters
    ----------
    data_path : string, optional
        directory containing original file
        (default ``'../precrime_data/'``); a trailing separator is optional
    orig_file : string, optional
        filename of the original file
        (default ``'NYPD_Complaint_Data_Historic.csv'``)

    Returns
    -------
    raw_data : DataFrame

    """
    orig_file_defaults = {
        'data_path': '../precrime_data/',
        'orig_file': 'NYPD_Complaint_Data_Historic.csv',
        'dtype': {
            'CMPLNT_NUM': np.int64,
            'CMPLNT_FR_DT': str,
            'CMPLNT_FR_TM': str,
            'RPT_DT': str,
            'KY_CD': np.int32,
            'OFNS_DESC': str,
            'LAW_CAT_CD': str,
            'BORO_NM': str,
            'ADDR_PCT_CD': str,
            'Latitude': np.float64,
            'Longitude': np.float64
        },
        'index_col': 'CMPLNT_NUM',
        'usecols': [
            'CMPLNT_NUM',
            'CMPLNT_FR_DT',
            'CMPLNT_FR_TM',
            'RPT_DT',
            'KY_CD',
            'OFNS_DESC',
            'LAW_CAT_CD',
            'BORO_NM',
            'ADDR_PCT_CD',
            'Latitude',
            'Longitude'
        ],
        'parse_dates_cols': ['RPT_DT'],
    }

    if data_path is None:
        data_path = orig_file_defaults['data_path']
    if orig_file is None:
        orig_file = orig_file_defaults['orig_file']

    # os.path.join inserts the separator only when it is missing, so both
    # '../precrime_data' and '../precrime_data/' work as data_path.
    csv_path = os.path.join(data_path, orig_file)

    # infer_datetime_format is intentionally not passed: it is deprecated in
    # pandas 2.0, where format inference is the default behaviour.
    raw_data = pd.read_csv(
        filepath_or_buffer=csv_path,
        index_col=orig_file_defaults['index_col'],
        usecols=orig_file_defaults['usecols'],
        dtype=orig_file_defaults['dtype'],
        parse_dates=orig_file_defaults['parse_dates_cols'],
    )
    return raw_data
        

In [3]:
def filter_raw_data(raw_data, output_file=None):
    """Drop unusable rows and save the remaining felonies to CSV.

    Keeps only rows that have both a complaint date (``CMPLNT_FR_DT``) and a
    complaint time (``CMPLNT_FR_TM``), whose ``LAW_CAT_CD`` is ``'FELONY'``
    and whose ``ADDR_PCT_CD`` is numeric. The caller's DataFrame is left
    unmodified.

    Parameters
    ----------
    raw_data : DataFrame
    output_file : string, optional
        CSV file to write
        (default ``'../precrime_data/raw_dated_felonies.csv'``)

    Returns
    -------
    nypd_data : DataFrame
        The filtered rows, exactly as written to ``output_file``.

    """

    if output_file is None:
        output_file = '../precrime_data/raw_dated_felonies.csv'

    # dropna returns a new frame; the result must be kept, otherwise the
    # rows with a missing complaint date/time are never removed.
    dated = raw_data.dropna(subset=['CMPLNT_FR_DT', 'CMPLNT_FR_TM'])
    felonies = dated[dated['LAW_CAT_CD'] == 'FELONY']

    # Non-numeric or missing ADDR_PCT_CD values coerce to NaN and are
    # dropped; a literal -1 is dropped as well.
    pct_code = pd.to_numeric(felonies['ADDR_PCT_CD'], errors='coerce')
    nypd_data = felonies[pct_code.notna() & (pct_code != -1)]

    nypd_data.to_csv(output_file)
    return nypd_data
   

In [4]:
def save_dated_felonies(output_file=None):
    """Build the filtered felony CSV from the original data file.

    Reads the original file from its default location with
    ``read_orig_file`` and hands the result to ``filter_raw_data`` together
    with ``output_file``. A progress message stamped with the local time is
    printed before each step and once at the end.
    """
    stamp_format = "%Y-%m-%d %H:%M:%S"

    def timestamp():
        # Current local time, formatted for the progress messages.
        return strftime(stamp_format, localtime())

    print('Starting ({0})...'.format(timestamp()))
    unfiltered = read_orig_file()
    print('Saving filtered output ({0})...'.format(timestamp()))
    filter_raw_data(unfiltered, output_file)
    print('Done ({0})'.format(timestamp()))

In [5]:
def load_dated_felonies(data_path=None, filtered_file=None):
    """Load the filtered felony CSV and build proper datetime columns.

    ``CMPLNT_FR_DT`` and ``CMPLNT_FR_TM`` are combined into
    ``COMPLAINT_DATETIME`` and ``RPT_DT`` becomes ``REPORT_DATE``; the three
    source columns are dropped. Values that cannot be parsed become ``NaT``.
    Rows without a valid complaint datetime, or with a complaint datetime
    before 2006-01-02, are removed.

    Parameters
    ----------
    data_path : string, optional
        directory containing the filtered file
        (default ``'../precrime_data/'``); a trailing separator is optional
    filtered_file : string, optional
        filename of the filtered file
        (default ``'raw_dated_felonies.csv'``)

    Returns
    -------
    nypd_data : DataFrame
        Indexed by ``CMPLNT_NUM``; ``COMPLAINT_DATETIME`` and ``REPORT_DATE``
        are the first two columns.

    """
    filtered_file_defaults = {
        'data_path': '../precrime_data/',
        'filtered_file': 'raw_dated_felonies.csv',
        'dtype': {
            'CMPLNT_NUM': np.int64,
            'CMPLNT_FR_DT': str,
            'CMPLNT_FR_TM': str,
            'RPT_DT': str,
            'KY_CD': np.int32,
            'OFNS_DESC': str,
            'BORO_NM': str,
            'ADDR_PCT_CD': np.int32,
            'Latitude': np.float64,
            'Longitude': np.float64,
        },
        'index_col': 'CMPLNT_NUM',
        'usecols': [
            'CMPLNT_NUM',
            'CMPLNT_FR_DT',
            'CMPLNT_FR_TM',
            'RPT_DT',
            'KY_CD',
            'OFNS_DESC',
            'BORO_NM',
            'ADDR_PCT_CD',
            'Latitude',
            'Longitude',
        ],
    }

    if data_path is None:
        data_path = filtered_file_defaults['data_path']
    if filtered_file is None:
        filtered_file = filtered_file_defaults['filtered_file']

    # The date columns are read as plain strings and converted explicitly
    # below. Combining columns through a parse_dates dict is deprecated in
    # recent pandas releases, so it is not used here.
    nypd_data = pd.read_csv(
        filepath_or_buffer=os.path.join(data_path, filtered_file),
        index_col=filtered_file_defaults['index_col'],
        usecols=filtered_file_defaults['usecols'],
        dtype=filtered_file_defaults['dtype'],
    )

    complaint_date = nypd_data.pop('CMPLNT_FR_DT')
    complaint_time = nypd_data.pop('CMPLNT_FR_TM')
    report_date = nypd_data.pop('RPT_DT')

    # A missing date or time makes the concatenation NaN, which coerces to
    # NaT just like any unparseable "date time" string.
    nypd_data.insert(
        0,
        'COMPLAINT_DATETIME',
        pd.to_datetime(complaint_date + ' ' + complaint_time, errors='coerce'),
    )
    nypd_data.insert(
        1,
        'REPORT_DATE',
        pd.to_datetime(report_date, errors='coerce'),
    )

    # dropna returns a new frame, so the result has to be assigned.
    nypd_data = nypd_data.dropna(subset=['COMPLAINT_DATETIME'])

    # Weird data on 2006-01-01.
    first_kept = pd.Timestamp('2006-01-02 00:00:00')
    return nypd_data[nypd_data['COMPLAINT_DATETIME'] >= first_kept]

In [6]:
def save_clean_felonies(output_file=None):
    """Write the cleaned felony data to CSV.

    Loads the filtered felonies from their default location with
    ``load_dated_felonies`` and saves the result to ``output_file``.

    Parameters
    ----------
    output_file : string, optional
        CSV file to write (default ``'../precrime_data/clean_felonies.csv'``)

    """
    if output_file is None:
        output_file = '../precrime_data/clean_felonies.csv'
    print('Starting ({0})...'.format(strftime("%Y-%m-%d %H:%M:%S", localtime())))
    filtered_felonies = load_dated_felonies()
    # Write the file before announcing completion so that the 'Done'
    # timestamp also covers the time spent saving.
    filtered_felonies.to_csv(output_file)
    print('Done ({0})'.format(strftime("%Y-%m-%d %H:%M:%S", localtime())))

In [7]:
def load_clean_felonies(data_path=None, clean_file=None):
    """Load the cleaned felony CSV, sorted by complaint datetime.

    ``CMPLNT_NUM`` becomes the index; ``COMPLAINT_DATETIME`` and
    ``REPORT_DATE`` are parsed as datetimes.

    Parameters
    ----------
    data_path : string, optional
        directory containing the clean file
        (default ``'../precrime_data/'``); a trailing separator is optional
    clean_file : string, optional
        filename of the clean file (default ``'clean_felonies.csv'``)

    Returns
    -------
    nypd_data : DataFrame
        Rows ordered by ascending ``COMPLAINT_DATETIME``.

    """
    clean_file_defaults = {
        'data_path': '../precrime_data/',
        'clean_file': 'clean_felonies.csv',
        'dtype': {
            'CMPLNT_NUM': np.int64,
            'COMPLAINT_DATETIME': str,
            'REPORT_DATE': str,
            'KY_CD': np.int32,
            'OFNS_DESC': str,
            'BORO_NM': str,
            'ADDR_PCT_CD': np.int32,
            'Latitude': np.float64,
            'Longitude': np.float64,
        },
        'index_col': 'CMPLNT_NUM',
        'usecols': [
            'CMPLNT_NUM',
            'COMPLAINT_DATETIME',
            'REPORT_DATE',
            'KY_CD',
            'OFNS_DESC',
            'BORO_NM',
            'ADDR_PCT_CD',
            'Latitude',
            'Longitude',
        ],
        'parse_dates_cols': ['REPORT_DATE', 'COMPLAINT_DATETIME'],
    }

    if data_path is None:
        data_path = clean_file_defaults['data_path']
    if clean_file is None:
        clean_file = clean_file_defaults['clean_file']

    # infer_datetime_format is intentionally not passed: it is deprecated in
    # pandas 2.0, where format inference is the default behaviour.
    nypd_data = pd.read_csv(
        filepath_or_buffer=os.path.join(data_path, clean_file),
        index_col=clean_file_defaults['index_col'],
        usecols=clean_file_defaults['usecols'],
        dtype=clean_file_defaults['dtype'],
        parse_dates=clean_file_defaults['parse_dates_cols'],
    )

    return nypd_data.sort_values(by='COMPLAINT_DATETIME')

In [8]:
def add_offense_category(df):
    """Add an ``OFFENSE`` categorical column derived from ``KY_CD``.

    Each ``KY_CD`` code listed below is mapped to its offense label; every
    other code becomes ``'Other'``. The column is added to ``df`` in place
    and nothing is returned.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``KY_CD`` column.

    """
    offense_category = {
        101: 'Homicide',
        102: 'Homicide',
        103: 'Homicide',

        104: 'Rape',
        116: 'Rape',

        105: 'Robbery',           # Mugging
        106: 'FelonyAssault',
        107: 'Burglary',          # Breaking and entering
        109: 'GrandLarceny',
        110: 'GrandLarcenyAuto',

        112: 'Fraud',
        113: 'Forgery',
        114: 'Arson',
        117: 'Drugs',
        118: 'Weapons',
        121: 'CriminalMischief',  # Graffiti
    }

    # The category order fixes the column order of later pivot tables.
    category_order = [
        'Homicide', 'Rape', 'Robbery', 'FelonyAssault', 'Burglary', 'GrandLarceny', 'GrandLarcenyAuto',
        'Fraud', 'Forgery', 'Arson', 'Drugs', 'Weapons', 'CriminalMischief', 'Other'
    ]

    # Codes missing from the mapping come back as NaN and are labelled 'Other'.
    labels = df['KY_CD'].map(offense_category).fillna('Other')

    # Build the categorical with its final categories in one step:
    # ``cat.set_categories(..., inplace=True)`` is no longer available in
    # pandas 2.0 and relied on mutating a temporary Series.
    df['OFFENSE'] = pd.Categorical(labels, categories=category_order)


In [9]:
# Before running: execute "tar -xvf cf.tar.gz" from the root directory of the git repo,
# copy clean_felonies.csv into that directory,
# and unzip it.
# NOTE(review): load_clean_felonies() reads ../precrime_data/clean_felonies.csv by
# default, so the extracted file has to end up there -- confirm the steps above.
#
# To make the files from scratch (not necessary):
# save_dated_felonies()
# save_clean_felonies()


# Load the cleaned felonies (sorted by COMPLAINT_DATETIME) and add the OFFENSE column in place.
nypd_data = load_clean_felonies()
add_offense_category(nypd_data)

In [10]:
# Number of complaints per (year, month, ADDR_PCT_CD), one column per OFFENSE
# category. The vectorized .dt accessor replaces a per-row Python lambda and
# keeps the 'COMPLAINT_DATETIME' level names unchanged.
nypd_data.pivot_table(
    index=[
        nypd_data['COMPLAINT_DATETIME'].dt.year,
        nypd_data['COMPLAINT_DATETIME'].dt.month,
        # Optional finer-grained levels, disabled for now:
        #   nypd_data['COMPLAINT_DATETIME'].dt.day,
        #   nypd_data['COMPLAINT_DATETIME'].dt.hour,
        #   'BORO_NM',
        'ADDR_PCT_CD',   # These are not duplicated across boros.
    ],
    values='KY_CD',
    columns='OFFENSE',
    fill_value=0,
    aggfunc=len  # every cell holds the number of rows in its group
)

Unnamed: 0_level_0,Unnamed: 1_level_0,OFFENSE,Homicide,Rape,Robbery,FelonyAssault,Burglary,GrandLarceny,GrandLarcenyAuto,Fraud,Forgery,Arson,Drugs,Weapons,CriminalMischief,Other
COMPLAINT_DATETIME,COMPLAINT_DATETIME,ADDR_PCT_CD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2006,1,1,0,1,5,7,23,125,5,4,6,1,0,0,8,0
2006,1,5,0,2,16,12,14,52,2,0,25,0,1,3,3,0
2006,1,6,0,0,16,5,27,96,2,3,6,0,7,1,10,0
2006,1,7,1,1,17,4,9,26,3,2,7,0,6,2,3,0
2006,1,9,0,0,17,8,19,63,8,5,3,1,4,3,7,0
2006,1,10,0,2,7,12,16,62,11,7,9,0,13,3,1,0
2006,1,13,0,2,31,10,35,130,5,25,10,1,4,1,6,0
2006,1,14,0,2,29,15,49,234,3,7,68,1,6,3,10,0
2006,1,17,1,2,7,6,21,86,3,6,5,0,0,0,10,0
2006,1,18,0,0,13,10,26,177,5,6,7,2,1,0,9,0


In [23]:
# Inspect the ADDR_PCT_CD column of the loaded data (indexed by CMPLNT_NUM).
nypd_data['ADDR_PCT_CD']

CMPLNT_NUM
318229393     18
554420424     68
218039893      6
542037522     76
492019346     70
566020947     28
689081600     18
149555949    110
403560215     62
347348875     62
971121792     62
953322722     81
564089611     60
350136722    107
195884831    115
291323622     42
889376827    101
263290065     42
503091230     70
978501077     75
669057929     28
239796420     50
664040799     90
608867844    105
781173366     41
484715525     14
886457370      9
123052636     72
916488609     40
121708672     75
981655954     44
745435497     46
875247018    123
935745514     24
885900384     88
801641589     78
749988983    113
888491125     42
190276591     52
738667644     33
366882087    102
869589006     79
721071131     48
568644274    104
697451116     10
974691933     10
370210007     25
642237202    111
475142513     90
157301263     48
725927894     71
455395913     28
560298670     67
698936505    114
974080581    107
153188156     81
260553402     81
236109607      6
470

In [33]:
import pylicors

In [35]:
# Show the source lines of the imported pylicors module object, together with
# the line number where that source starts.
import inspect
inspect.getsourcelines(pylicors)

(['"""\n',
  'Includes modules for \n',
  '  - matrix_utils\n',
  '  - utils\n',
  '  - clustering\n',
  '  - tests\n',
  '"""\n',
  '\n',
  '\n',
  "__all__ = ['matrix_utils', 'utils', 'clustering']\n",
  'import matrix_utils\n',
  'import utils\n',
  'import clustering\n'],
 0)