In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

### Import and Transform Data Into Useable Format

In [49]:
# Load the raw incident records and the offense-code lookup table.
# Both CSV files are resolved relative to the notebook's working directory.
CRIME_CSV = 'denver_crime_8-26-2019.csv'
OFFENSE_CODES_CSV = 'denver_offense_codes.csv'

allcrime = pd.read_csv(CRIME_CSV)
offense_codes = pd.read_csv(OFFENSE_CODES_CSV)

In [89]:
# Preview the first five rows of the offense-code lookup table.
offense_codes.head(n=5)

Unnamed: 0,OFFENSE_CODE,OFFENSE_CODE_EXTENSION,OFFENSE_TYPE_ID,OFFENSE_TYPE_NAME,OFFENSE_CATEGORY_ID,OFFENSE_CATEGORY_NAME,IS_CRIME,IS_TRAFFIC
0,2804,1,stolen-property-possession,Possession of stolen property,all-other-crimes,All Other Crimes,1,0
1,2804,2,fraud-possess-financial-device,Possession of a financial device,all-other-crimes,All Other Crimes,1,0
2,2901,0,damaged-prop-bus,Damaged business property,public-disorder,Public Disorder,1,0
3,2902,0,criminal-mischief-private,Criminal mischief to private property,public-disorder,Public Disorder,1,0
4,2903,0,criminal-mischief-public,Criminal mischief to public property,public-disorder,Public Disorder,1,0


In [52]:
# Preview the first five incident rows as loaded (dates are still raw strings here).
allcrime.head(n=5)

Unnamed: 0,INCIDENT_ID,OFFENSE_ID,OFFENSE_CODE,OFFENSE_CODE_EXTENSION,OFFENSE_TYPE_ID,OFFENSE_CATEGORY_ID,FIRST_OCCURRENCE_DATE,LAST_OCCURRENCE_DATE,REPORTED_DATE,INCIDENT_ADDRESS,GEO_X,GEO_Y,GEO_LON,GEO_LAT,DISTRICT_ID,PRECINCT_ID,NEIGHBORHOOD_ID,IS_CRIME,IS_TRAFFIC
0,2016376978,2016376978521300,5213,0,weapon-unlawful-discharge-of,all-other-crimes,6/15/2016 11:31:00 PM,,6/15/2016 11:31:00 PM,,3193983.0,1707251.0,-104.809881,39.773188,5,521,montbello,1,0
1,20186000994,20186000994239900,2399,0,theft-other,larceny,10/11/2017 12:30:00 PM,10/11/2017 4:55:00 PM,1/29/2018 5:53:00 PM,,3201943.0,1711852.0,-104.781434,39.785649,5,522,gateway-green-valley-ranch,1,0
2,20166003953,20166003953230500,2305,0,theft-items-from-vehicle,theft-from-motor-vehicle,3/4/2016 8:00:00 PM,4/25/2016 8:00:00 AM,4/26/2016 9:02:00 PM,2932 S JOSEPHINE ST,3152762.0,1667011.0,-104.957381,39.66349,3,314,wellshire,1,0
3,201872333,201872333239900,2399,0,theft-other,larceny,1/30/2018 7:20:00 PM,,1/30/2018 10:29:00 PM,705 S COLORADO BLVD,3157162.0,1681320.0,-104.94144,39.702698,3,312,belcaro,1,0
4,2017411405,2017411405230300,2303,0,theft-shoplift,larceny,6/22/2017 8:53:00 PM,,6/23/2017 4:09:00 PM,2810 E 1ST AVE,3153211.0,1686545.0,-104.95537,39.717107,3,311,cherry-creek,1,0


In [53]:
# Inspect the column dtypes as loaded; the three *_DATE columns come in as object (strings).
allcrime.dtypes

INCIDENT_ID                 int64
OFFENSE_ID                  int64
OFFENSE_CODE                int64
OFFENSE_CODE_EXTENSION      int64
OFFENSE_TYPE_ID            object
OFFENSE_CATEGORY_ID        object
FIRST_OCCURRENCE_DATE      object
LAST_OCCURRENCE_DATE       object
REPORTED_DATE              object
INCIDENT_ADDRESS           object
GEO_X                     float64
GEO_Y                     float64
GEO_LON                   float64
GEO_LAT                   float64
DISTRICT_ID                 int64
PRECINCT_ID                 int64
NEIGHBORHOOD_ID            object
IS_CRIME                    int64
IS_TRAFFIC                  int64
dtype: object

Let's convert some columns to more useful datatypes so they are easier to work with, especially the date/time columns, which were read in as plain strings (`object`).

In [54]:
# Parse the first-occurrence timestamps with an explicit format rather than
# letting pandas infer one: the raw strings look like '6/15/2016 11:31:00 PM'
# (month/day/year, 12-hour clock with AM/PM). Pinning the format is much
# faster on a column of this size and raises on a malformed value.
# NOTE(review): assumes every non-null value in the column uses this layout - confirm.
allcrime['FIRST_OCCURRENCE_DATE'] = pd.to_datetime(
    allcrime['FIRST_OCCURRENCE_DATE'],
    format='%m/%d/%Y %I:%M:%S %p',
)

In [55]:
# Parse the last-occurrence timestamps with an explicit format rather than
# letting pandas infer one: the raw strings look like '10/11/2017 4:55:00 PM'
# (month/day/year, 12-hour clock with AM/PM). Missing values in this column
# become NaT. Pinning the format is much faster on a column of this size and
# raises on a malformed value.
# NOTE(review): assumes every non-null value in the column uses this layout - confirm.
allcrime['LAST_OCCURRENCE_DATE'] = pd.to_datetime(
    allcrime['LAST_OCCURRENCE_DATE'],
    format='%m/%d/%Y %I:%M:%S %p',
)

In [56]:
# Parse the report timestamps with an explicit format rather than letting
# pandas infer one: the raw strings look like '1/29/2018 5:53:00 PM'
# (month/day/year, 12-hour clock with AM/PM). Pinning the format is much
# faster on a column of this size and raises on a malformed value.
# NOTE(review): assumes every non-null value in the column uses this layout - confirm.
allcrime['REPORTED_DATE'] = pd.to_datetime(
    allcrime['REPORTED_DATE'],
    format='%m/%d/%Y %I:%M:%S %p',
)

In [57]:
# Re-check the dtypes after the conversions; the three *_DATE columns should now be datetime64[ns].
allcrime.dtypes

INCIDENT_ID                        int64
OFFENSE_ID                         int64
OFFENSE_CODE                       int64
OFFENSE_CODE_EXTENSION             int64
OFFENSE_TYPE_ID                   object
OFFENSE_CATEGORY_ID               object
FIRST_OCCURRENCE_DATE     datetime64[ns]
LAST_OCCURRENCE_DATE      datetime64[ns]
REPORTED_DATE             datetime64[ns]
INCIDENT_ADDRESS                  object
GEO_X                            float64
GEO_Y                            float64
GEO_LON                          float64
GEO_LAT                          float64
DISTRICT_ID                        int64
PRECINCT_ID                        int64
NEIGHBORHOOD_ID                   object
IS_CRIME                           int64
IS_TRAFFIC                         int64
dtype: object

We can eliminate a few things now. The documentation states that the crime data is most accurate once at least 30 days have passed since the crime occurred, and the dataset used here was last updated on August 26th, 2019.

In [62]:
# Date the dataset was last refreshed (midnight, timezone-naive), written in ISO form.
update_date = pd.Timestamp("2019-08-26")

In [63]:
# Records are treated as reliable only once they are at least this many days old.
ACCURACY_LAG_DAYS = 30

# Cutoff timestamp: the dataset's update date minus the accuracy lag.
latest_accurate_date = update_date - pd.Timedelta(days=ACCURACY_LAG_DAYS)

In [64]:
# Display the computed cutoff (update date minus 30 days).
latest_accurate_date

Timestamp('2019-07-27 00:00:00')

Knowing this, we can eliminate any rows where `FIRST_OCCURRENCE_DATE` is on or after July 27, 2019, keeping only incidents that first occurred before that date.

In [76]:
# Keep only incidents whose first occurrence is strictly before the cutoff.
# Rows with a missing (NaT) first-occurrence date compare False and are dropped too.
occurred_before_cutoff = allcrime['FIRST_OCCURRENCE_DATE'].lt(latest_accurate_date)
accurate_crime = allcrime.loc[occurred_before_cutoff]

In [78]:
# Preview the first five rows of the date-filtered data.
accurate_crime.head(n=5)

Unnamed: 0,INCIDENT_ID,OFFENSE_ID,OFFENSE_CODE,OFFENSE_CODE_EXTENSION,OFFENSE_TYPE_ID,OFFENSE_CATEGORY_ID,FIRST_OCCURRENCE_DATE,LAST_OCCURRENCE_DATE,REPORTED_DATE,INCIDENT_ADDRESS,GEO_X,GEO_Y,GEO_LON,GEO_LAT,DISTRICT_ID,PRECINCT_ID,NEIGHBORHOOD_ID,IS_CRIME,IS_TRAFFIC
0,2016376978,2016376978521300,5213,0,weapon-unlawful-discharge-of,all-other-crimes,2016-06-15 23:31:00,NaT,2016-06-15 23:31:00,,3193983.0,1707251.0,-104.809881,39.773188,5,521,montbello,1,0
1,20186000994,20186000994239900,2399,0,theft-other,larceny,2017-10-11 12:30:00,2017-10-11 16:55:00,2018-01-29 17:53:00,,3201943.0,1711852.0,-104.781434,39.785649,5,522,gateway-green-valley-ranch,1,0
2,20166003953,20166003953230500,2305,0,theft-items-from-vehicle,theft-from-motor-vehicle,2016-03-04 20:00:00,2016-04-25 08:00:00,2016-04-26 21:02:00,2932 S JOSEPHINE ST,3152762.0,1667011.0,-104.957381,39.66349,3,314,wellshire,1,0
3,201872333,201872333239900,2399,0,theft-other,larceny,2018-01-30 19:20:00,NaT,2018-01-30 22:29:00,705 S COLORADO BLVD,3157162.0,1681320.0,-104.94144,39.702698,3,312,belcaro,1,0
4,2017411405,2017411405230300,2303,0,theft-shoplift,larceny,2017-06-22 20:53:00,NaT,2017-06-23 16:09:00,2810 E 1ST AVE,3153211.0,1686545.0,-104.95537,39.717107,3,311,cherry-creek,1,0


In [82]:
# (rows, columns) of the full dataset before any filtering.
allcrime.shape

(507443, 19)

In [85]:
# (rows, columns) after keeping only incidents that first occurred before the cutoff date.
accurate_crime.shape

(500583, 19)

In [86]:
# Number of rows removed by the date filter (row count before minus row count after).
allcrime.shape[0] - accurate_crime.shape[0]

6860

Looks like 6,860 crimes and traffic violations first occurred in the final 30 days covered by the dataset alone (these are the rows we just dropped). For this analysis, though, we're more concerned with crimes than traffic violations. Let's remove non-crime events from our data (this will still include traffic crimes).

In [87]:
# Keep only the rows flagged as crimes (IS_CRIME == 1).
is_crime_row = accurate_crime['IS_CRIME'].eq(1)
crime = accurate_crime.loc[is_crime_row]

In [88]:
# (rows, columns) remaining once only IS_CRIME == 1 rows are kept.
crime.shape

(370474, 19)