In [1]:
import requests
import matplotlib.pyplot as plt
from IPython.display import Image
import pandas as pd
import json
import geopandas as gpd
from io import StringIO
from csv import reader
from functools import reduce

A dataset containing details about Metro Nashville Police Department reported incidents is available at https://data.nashville.gov/Police/Metro-Nashville-Police-Department-Incidents/2u6v-ujjs. Make use of the API to find all aggravated burglary incidents that were reported during the six month period from January 1, 2021 through June 30, 2021.

In [2]:
# Create endpoint
crime_endpoint = 'https://data.nashville.gov/resource/2u6v-ujjs.geojson'

# Create paramters
crime_params = {
    'offense_description': 'BURGLARY- AGGRAVATED',
    '$where': 'incident_reported between \'2021-01-01\' and \'2021-06-30\'',
    '$limit': 2000   

}

# Generate response
response = requests.get(crime_endpoint, params = crime_params)

# Convert json object to geopandas DataFrame
crime = (gpd.read_file(StringIO(response.text)))

# Drop duplicate incident numbers
crime = crime.drop_duplicates(subset = 'incident_number')

Download the 2019 census tract shapefiles for Tennessee from https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.2019.html. (The FIPS code for Tennessee is 47). Perform a spatial join to determine the census tract in which each burglary incident occurred. Which census tract had the highest number of burglaries? Warning - each incident can appear multiple times if there are multiple victims, so be sure that you aren't double-counting any incidents.

In [3]:
# Read in census tract shapefile
tract = (gpd.read_file('data/tl_2019_47_tract.shp'))

# Clean up shapefile, covert to proper geometry
tract = tract[tract['COUNTYFP'] == '037']
tract = tract.rename(columns = {'TRACTCE': 'tract'})
tract = tract.to_crs('EPSG:4326')

# Spatial join burglaries and census tract tables, keep relevant columns
crimeBYtract = gpd.sjoin(crime, tract, how='inner', op ='within')
crimeBYtract = crimeBYtract[['incident_number', 'incident_reported', 'geometry', 'tract']]

# Tract with highest number of burglaries
crimeBYtract.groupby('tract')['incident_number'].nunique().nlargest(1)

tract
016300    53
Name: incident_number, dtype: int64

For this part, you'll need to request a census API key. Using the 2019 American Community Survey API, obtain, for each census tract, the population (B01001_001E in the detailed tables) and the median income (S1901_C01_012E in the subject tables). Hint: Tennessee's FIPS code is 47 and Davidson County's FIPS code is 37. Merge this new data with the burglaries data above.

In [4]:
# Load API key for American Community Survey API

with open ('data/census.json') as fi:
    credentials = json.load(fi)
    
api_key = credentials['api_key']

In [5]:
# Create endpoint, parameters, and generate response for population data
pop_endpoint = 'https://api.census.gov/data/2019/acs/acs5'
pop_params = {
    'get': 'B01001_001E',
    'for': 'tract:*',
    'in': 'state:47 county:037',
    'key': api_key
}

pop_response = requests.get(pop_endpoint, params = pop_params)

# Convert response to pandas DataFrame and clean up

pop = pop_response.json()
pop = pd.DataFrame(data = pop[1:], columns = pop[0])
pop['B01001_001E'] = pop['B01001_001E'].astype(int)
pop = pop.rename(columns = {'B01001_001E': 'population'})

In [6]:
 #Create endpoint, parameters, and generate response for income data

inc_endpoint = 'https://api.census.gov/data/2019/acs/acs5/subject'
inc_params = {
    'get': 'S1901_C01_012E',
    'for': 'tract:*',
    'in': 'state:47 county:037',
    'key': api_key
}

inc_response = requests.get(inc_endpoint, params = inc_params)

# Convert response to pandas DataFrame and clean up

inc = inc_response.json()
inc = pd.DataFrame(data = inc[1:], columns = inc[0])
inc = inc.rename(columns={'S1901_C01_012E': 'median income'})
inc['median income'] = inc['median income'].astype(int)

In [13]:
# Merge all into one DataFrame and clean up

dfs = [crimeBYtract, pop, inc]
crime_pop_inc = reduce(lambda left, right: pd.merge(left, right, on ='tract'), dfs)
crime_pop_inc.geometry.to_crs('EPSG:4326')
crime_pop_inc = crime_pop_inc[['incident_number', 'incident_reported', 'tract', 'median income', 'population', 'geometry']]

In [14]:
crime_pop_inc.head()

Unnamed: 0,incident_number,incident_reported,tract,median income,population,geometry
0,20210249540,2021-05-05T00:45:00,16000,37083,945,POINT (-86.77000 36.15000)
1,20210187070,2021-04-02T20:04:00,16000,37083,945,POINT (-86.76000 36.14000)
2,20210006869,2021-01-05T03:11:00,16000,37083,945,POINT (-86.76000 36.14000)
3,20210223220,2021-04-21T10:00:00,16000,37083,945,POINT (-86.77000 36.15000)
4,20210013709,2021-01-08T02:35:00,16000,37083,945,POINT (-86.76000 36.14000)


Create a choropleth showing the number of burglaries per 1000 residents for each census tract.

In [16]:
crime_pop_inc.groupby('tract')['incident_number'].nunique()

tract
010104     5
010105     3
010106     7
010201     4
010301     1
          ..
019300    19
019400     9
019500    19
019600     5
980100     3
Name: incident_number, Length: 147, dtype: int64

In [None]:
choropleth = pd.DataFrame(crime_pop_inc.groupby(['tract', 'population', 'median income'],
                                                               as_index = False)['incident_number'].nunique())
choropleth['incident_rate'] = chloropleth['incident_number'] / chloropleth['population'] * 1000
choropleth = pd.merge(left = davidson_county, right = chloropleth, how = 'left')

Finally, we'll build some statistical models to see how well we can explain the number of aggravated burglaries using the median income of each census tract. For this, we'll be using the Generalized Linear Models module of the statsmodels library.

a. Build a "base model" - a Poisson regression model with just an intercept term with target variable the rate of burglaries per census tract. (Offset using the [log of the] population so that we are looking at the rate of burglaries per population instead of the number of burglaries.)

b. Now, build a Poisson regression model with target variable the rate of burglaries and predictor variable the median income. (Don't forget to offset by the population).

c. Finally, try out a negative binomial model. To get started with a negative binomial model, you can check out this tutorial.

d. How do your models compare? Hint: the fit models have an AIC attribute.