In [63]:
import geopandas as gp
import pandas as pd
import tensorflow as tf
import numpy as np
from matplotlib import pyplot
from shapely.geometry import Point, Polygon
import requests

In [None]:
# Building CNN Data
# Data that varies with Time and Location:
# - Crime (OUTPUT - YOLO with crime and location)
# - L entries (8 layers - one per line)
# - Businesses (5+ layers - types of businesses)
# - SBIF grants (1 layer)
# - MMRP Permits (21 layers - types of permits)
# - Life Expectancy (1 layer)
#
# Data that varies with Time Only:
# - Temperature (3 layers - MIN, MAX, and PRECIPITATION)
# - Date (43 layers - 12 for month, 31 for day)
# - Time (5 layers - one for each time slot)
#
# Data that varies with Location Only:
# - Buildings (10 layers - stories|units|sqfeet for sound|minor repair|major repair.  Also uninhabitable or not.)
# - Waterways (1 layer)
# - Major Streets (1 layer)
# - Libraries (1 layer)
# - Public Parks (1 layer)
# - Forests (1 layer)
# - Schools (1 layer)

# Total layers: 60 + (date layers)

In [227]:
####################
# HELPER FUNCTIONS #
####################

def geocode(address, region=None):
    """Look up the coordinates of an address with the Google Maps Geocoding API.

    Parameters
    ----------
    address : str
        Free-form address sent as the ``address`` query parameter.
    region : str, optional
        Accepted so existing callers keep working; it is currently NOT sent
        to the API.  NOTE(review): decide whether it should bias the lookup.

    Returns
    -------
    shapely.geometry.Point or None
        ``Point(longitude, latitude)`` of the first result, or ``None`` when
        the response carries no results.
    """
    GOOGLE_MAPS_API_URL = 'https://maps.googleapis.com/maps/api/geocode/json'

    # Read the key inside a context manager so the file handle is closed.
    with open('google_api.key') as key_file:
        api_key = key_file.read().strip()
    params = {'address': address, 'key': api_key}

    # Do the request and get the response data
    geo_response = requests.get(GOOGLE_MAPS_API_URL, params=params).json()

    # A missing 'results' entry is treated the same as an empty one.
    results = geo_response.get('results')
    if not results:
        return None

    location = results[0]['geometry']['location']
    # shapely points are (x, y), i.e. (longitude, latitude) -- the same order
    # as the EPSG:4326 geometries loaded from the shapefiles in this notebook.
    return Point(location['lng'], location['lat'])

In [192]:
# - MMRP Permits (21 layers)

MMRP_permits = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Unprocessed/MMRP Permits.csv')

# Keep only the activity date, the permit type and the location, under the
# shorter column names used by the other processed datasets, and drop rows
# with a missing value in any of the three.
permits_dataframe = (
    MMRP_permits
    .loc[:, ['Last Permit Activity Date/Time',
             'Permit Type Description',
             'Location']]
    .rename(columns={'Last Permit Activity Date/Time': 'Date',
                     'Permit Type Description': 'Permit Type'})
    .dropna()
)

# Write the PROCESSED frame (the raw MMRP_permits frame was written before,
# so the filtering above never reached the output file).  The context
# manager saves and closes the workbook, even if to_excel raises.
with pd.ExcelWriter('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Processed/MMRP Permits.xlsx') as writer:
    permits_dataframe.to_excel(writer)
print('MMRP permit dataset processed.')

MMRP permit dataset processed.


In [193]:
# - Buildings (10 layers)

buildings = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Unprocessed/Buildings.csv', low_memory=False)

# Planned layers (built downstream, not in this cell):
# For 'SOUND', 'NEEDS MAJOR REPAIR', and 'NEEDS MINOR REPAIR' in BLDG_CONDI:
# - Layer for NO_STORIES, NO_OF_UNIT, and BLDG_SQ_FO
# For 'UNNHABITABLE', 'UNINHABITABLE' in BLDG_CONDI:
# - 1 layer
# NOTE(review): an earlier comment said demolished buildings were removed,
# but no such filter is applied here -- presumably the source file was
# already filtered; confirm.

# Select the columns of interest, give them readable names, and drop rows
# with a missing value in any of them.
building_columns = {'BLDG_CONDI': 'Condition',
                    'NO_STORIES': 'Stories',
                    'NO_OF_UNIT': 'Units',
                    'BLDG_SQ_FO': 'Square Footage',
                    'the_geom': 'Footprint'}
buildings_dataframe = (
    buildings
    .loc[:, list(building_columns)]
    .rename(columns=building_columns)
    .dropna()
)

# The context manager saves and closes the workbook (replaces writer.save(),
# which no longer exists in pandas 2.x).
with pd.ExcelWriter('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Processed/Buildings.xlsx') as writer:
    buildings_dataframe.to_excel(writer)
print('Building dataset processed.')

Building dataset processed.


In [290]:
# Community Areas (metadata)

communities = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Unprocessed/Community Areas.csv')

# Keep the outline geometry, the community name and its area number, and
# drop rows where any of the three is missing.
community_columns = {'the_geom': 'Outline',
                     'COMMUNITY': 'Name',
                     'AREA_NUM_1': 'Number'}
communities_dataframe = (
    communities
    .loc[:, list(community_columns)]
    .rename(columns=community_columns)
    .dropna()
)

# The context manager saves and closes the workbook (replaces writer.save(),
# which no longer exists in pandas 2.x).
with pd.ExcelWriter('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Processed/Communities.xlsx') as writer:
    communities_dataframe.to_excel(writer)
print('Community dataset processed.')

Community dataset processed.


In [210]:
# Businesses (5 layers - types of businesses)

# Each output flag column is True when the licence's BUSINESS ACTIVITY is one
# of the activities listed for that category.
business_categories = {
    'Food Service': ['Retail Sales of Perishable Foods',
                     'Wholesale Food Sales',
                     'Sale of Food Prepared Onsite With Dining Area',
                     'Sale of Food Prepared Onsite Without Dining Area',
                     'Retail Sales of General Merchandise and Non-Perishable Food'],
    'Tobacco Sale': ['Retail Sale of Tobacco'],
    'Alcohol Consumption': ['Consumption of Liquor on Premises',
                            'Consumption of Liquor on Premises, Not for Profit',
                            'Special Event Beer & Wine',
                            'Sale of Liquor Until 4am, Monday - Saturday and 5am on Sunday',
                            'Tavern - Consumption of Liquor on Premise'],
    'Package Store': ['Retail Sales of Packaged Liquor',
                      'Sale of Liquor Outdoors on Private Property'],
    'Gas Station': ['Operation of a Fuel Filling Station'],
}

business_licenses = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Unprocessed/Business Licenses (Chicago).csv', low_memory=False)

# Keep the activity, the licence term and the location; drop incomplete rows.
business_dataframe = (
    business_licenses
    .loc[:, ['BUSINESS ACTIVITY',
             'LICENSE TERM START DATE',
             'LICENSE TERM EXPIRATION DATE',
             'LOCATION']]
    .rename(columns={'BUSINESS ACTIVITY': 'Activity',
                     'LICENSE TERM START DATE': 'Start Date',
                     'LICENSE TERM EXPIRATION DATE': 'End Date',
                     'LOCATION': 'Location'})
    .dropna()
)

# Categorize the data: one boolean column per category (exact string match).
activity = business_dataframe['Activity']
for category, activities in business_categories.items():
    business_dataframe[category] = activity.isin(activities)

# Drop the Activity column
business_dataframe = business_dataframe.drop(columns=['Activity'])

# Drop the rows which do not perform any of the activities specified above.
# reset_index() keeps the previous row labels as an 'index' column, as before.
in_any_category = business_dataframe[list(business_categories)].any(axis=1)
business_dataframe = business_dataframe.loc[in_any_category, :].reset_index()

# The context manager saves and closes the workbook (replaces writer.save(),
# which no longer exists in pandas 2.x).
with pd.ExcelWriter('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Processed/Businesses.xlsx') as writer:
    business_dataframe.to_excel(writer)
print('Business dataset processed.')

Business dataset processed.


In [250]:
# SBIF grants (1 layer)

SBIF_grants = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Unprocessed/SBIF Grants (Chicago).csv', low_memory=False)


def chicago_geocode(address):
    """Geocode one SBIF grant address, reporting addresses that are not found.

    Only the text after the last '-' in ``address`` is looked up.  Returns
    whatever ``geocode`` returns (``None`` when the address is not found).
    """
    # Chop up the address if it contains a '-'
    # NOTE(review): for '4201-A W Lawrence' this looks up 'A W Lawrence',
    # which a previous run failed to find -- the split rule may need work.
    fixed_address = address.split('-')[-1]
    # Look up the address near Chicago
    geo_point = geocode(fixed_address, region='Chicago, IL')
    if geo_point is None:
        print('Could not find address: '+ str(address))
    return geo_point


# The geocoding used to be commented out, so this cell only worked while
# SBIF_grant_locations was still alive in the kernel.  Reuse an existing
# result of matching length to avoid repeating the API calls; otherwise
# (fresh kernel) compute it.
# A previous run could not find: '228 W Washtenaw', '4201-A W Lawrence' (twice).
if 'SBIF_grant_locations' not in globals() or len(SBIF_grant_locations) != len(SBIF_grants):
    SBIF_grant_locations = SBIF_grants['Address'].apply(chicago_geocode)

SBIF_grants['Location'] = SBIF_grant_locations

# Filter important columns and rename; rows with a missing value (including
# addresses that could not be geocoded) are dropped.  reset_index() keeps the
# previous row labels as an 'index' column, as before.
SBIF_grant_dataframe = (
    SBIF_grants
    .loc[:, ['Completion Date', 'Location', 'Actual Grant']]
    .rename(columns={'Completion Date': 'Date',
                     'Actual Grant': 'Amount'})
    .dropna()
    .reset_index()
)

# Write to file.  The context manager saves and closes the workbook
# (replaces writer.save(), which no longer exists in pandas 2.x).
with pd.ExcelWriter('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Processed/SBIF Grants.xlsx') as writer:
    SBIF_grant_dataframe.to_excel(writer)
print('SBIF Grant dataset processed.')

SBIF Grant dataset processed.


In [375]:
# L entries (8 layers - one per line)

L_entries = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Unprocessed/L Station Entries (Chicago).csv', low_memory=False)
L_stations = gp.read_file('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Unprocessed/CTA_RailStations/CTA_RailStations.shp').to_crs(epsg=4326)

# Specify the L lines being considered.
L_lines = ['Green','Red','Brown','Purple','Yellow','Blue','Pink','Orange']

# Offset the shapefile's STATION_ID by 40000 so it can be joined against
# `station_id` in the entries file (presumably the two sources differ by this
# offset -- confirm).  The previous `.unique()+40000` only worked while every
# ID was distinct; with a repeated ID the array would be shorter than the
# frame and the assignment would raise.
L_stations['STATION_ID'] = L_stations['STATION_ID'] + 40000

# Merge L_station location with L_entries (inner join: entries without a
# matching station are dropped).
station_columns = L_stations.drop(columns=['LONGNAME', 'ADDRESS', 'ADA', 'PKNRD', 'POINT_X', 'POINT_Y', 'GTFS'])
L_entries_dataframe = L_entries.merge(station_columns, how='inner', left_on='station_id', right_on='STATION_ID')

# Retrieve and clean Line specification data: one '<colour> Line' column per
# line, set from whether the station's LINES text contains the colour name.
for line in L_lines:
    L_entries_dataframe[line + ' Line'] = L_entries_dataframe['LINES'].str.contains(line)

# Filter important columns and rename
L_entries_dataframe = (
    L_entries_dataframe
    .drop(columns=['station_id',
                   'stationname',
                   'daytype',
                   'STATION_ID',
                   'LINES'])
    .rename(columns={'date': 'Date',
                     'rides': 'Entries',
                     'geometry': 'Location'})
    .dropna()
)

# Write to file.  The context manager saves and closes the workbook
# (replaces writer.save(), which no longer exists in pandas 2.x).
with pd.ExcelWriter('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Processed/L Entries.xlsx') as writer:
    L_entries_dataframe.to_excel(writer)
print('L Entry dataset processed.')

L Entry dataset processed.


In [297]:
# Life Expectancy (1 layer)

life_expectancy_raw = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Unprocessed/Life Expectancy (Chicago).csv').dropna()

# Drop unnecessary columns (after dropna, so rows with a missing confidence
# bound are still excluded, as before).
life_expectancy_raw = life_expectancy_raw.drop(columns=['1990 Lower 95% CI', '1990 Upper 95% CI', '2000 Lower 95% CI', '2000 Upper 95% CI', '2010 Lower 95% CI', '2010 Upper 95% CI'])

# Per-community 2000 and 2010 values; the first row wins if a community
# number appears more than once (same as the previous `.values[0]` lookup).
first_per_community = (
    life_expectancy_raw
    .drop_duplicates(subset='Community Area Number')
    .set_index('Community Area Number')
)
age_2000 = first_per_community['2000 Life Expectancy'].to_dict()
age_2010 = first_per_community['2010 Life Expectancy'].to_dict()

# Build one row per (year, community) for 2001-2018 by extending the straight
# line through the 2000 and 2010 values.  The records are collected in a list
# and turned into a frame once: DataFrame.append in a loop was quadratic and
# no longer exists in pandas 2.x.
records = [
    {'Year': year,
     'Community Area': community,
     'Life Expectancy': ((age_2010[community]-age_2000[community])/(2010-2000)*(year-2000)+age_2000[community])}
    for year in range(2001, 2019)
    for community in life_expectancy_raw['Community Area Number']
]
life_expectancy_dataframe = pd.DataFrame(records, columns=['Year', 'Community Area', 'Life Expectancy'])

# Write to file.  The context manager saves and closes the workbook
# (replaces writer.save(), which no longer exists in pandas 2.x).
with pd.ExcelWriter('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Processed/Life Expectancy.xlsx') as writer:
    life_expectancy_dataframe.to_excel(writer)
print('Life Expectancy dataset processed.')

Life Expectancy dataset processed.


In [300]:
# Temperature (3 layers - MIN, MAX, and PRECIPITATION)

temperature_raw = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Unprocessed/Temperatures (Chicago).csv')

temperature_dataframe = (
    temperature_raw
    # Convert the Precipitation, max T, and min T columns to numbers and
    # reduce DATE to a plain calendar date.
    .assign(PRCP=pd.to_numeric(temperature_raw['PRCP']),
            TMAX=pd.to_numeric(temperature_raw['TMAX']),
            TMIN=pd.to_numeric(temperature_raw['TMIN']),
            DATE=pd.to_datetime(temperature_raw['DATE']).dt.date)
    # Drop the TAVG column because it has too many NaNs; it must go before
    # dropna() or it would take most rows with it.
    .drop(columns=['TAVG'])
    .rename(columns={'PRCP': 'Precipitation',
                     'TMAX': 'Max Temp',
                     'TMIN': 'Min Temp',
                     'DATE': 'Date'})
    .dropna()
)

# Write to file.  The context manager saves and closes the workbook
# (replaces writer.save(), which no longer exists in pandas 2.x).
with pd.ExcelWriter('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Processed/Temperature.xlsx') as writer:
    temperature_dataframe.to_excel(writer)
print('Temperature dataset processed.')

Temperature dataset processed.


In [309]:
# Waterways (1 layer)

waterways = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Unprocessed/Hydro.csv')

# Keep only the geometry column, renamed to 'Outline' for consistency with
# the other area datasets, and drop rows without a geometry.
waterway_outlines = waterways.loc[:, ['the_geom']]
waterways_dataframe = waterway_outlines.rename(columns={'the_geom': 'Outline'}).dropna()

# Write to file.  The context manager saves and closes the workbook
# (replaces writer.save(), which no longer exists in pandas 2.x).
with pd.ExcelWriter('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Processed/Waterways.xlsx') as writer:
    waterways_dataframe.to_excel(writer)
print('Waterway dataset processed.')

Waterways dataset processed.


In [362]:
# Major Streets (1 layer)

# Load the shapefile and reproject it to EPSG:4326.
streets = gp.read_file('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Unprocessed/Major_20Streets/Major_Streets.shp').to_crs(epsg=4326)

# Keep only the geometry column, renamed to 'Centerline', and drop rows
# without a geometry.
street_geometry = streets.loc[:, ['geometry']]
street_dataframe = street_geometry.rename(columns={'geometry': 'Centerline'}).dropna()

# Write to file.  The context manager saves and closes the workbook
# (replaces writer.save(), which no longer exists in pandas 2.x).
with pd.ExcelWriter('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Processed/Streets.xlsx') as writer:
    street_dataframe.to_excel(writer)
print('Street dataset processed.')

Street dataset processed.


In [363]:
# - Public Parks (1 layer)

# Load the shapefile and reproject it to EPSG:4326.
parks = gp.read_file('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Unprocessed/park boundaries/park boundaries.shp').to_crs(epsg=4326)

# Keep only the geometry column, renamed to 'Outline' for consistency with
# the other area datasets, and drop rows without a geometry.
park_geometry = parks.loc[:, ['geometry']]
park_dataframe = park_geometry.rename(columns={'geometry': 'Outline'}).dropna()

# Write to file.  The context manager saves and closes the workbook
# (replaces writer.save(), which no longer exists in pandas 2.x).
with pd.ExcelWriter('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Processed/Parks.xlsx') as writer:
    park_dataframe.to_excel(writer)
print('Park dataset processed.')

Park dataset processed.


In [364]:
# - Forests (1 layer)

# Load the shapefile and reproject it to EPSG:4326.
forests = gp.read_file('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Unprocessed/Forestry/Forestry.shp').to_crs(epsg=4326)

# Keep only the geometry column, renamed to 'Outline' for consistency with
# the other area datasets, and drop rows without a geometry.
forest_geometry = forests.loc[:, ['geometry']]
forest_dataframe = forest_geometry.rename(columns={'geometry': 'Outline'}).dropna()

# Write to file.  The context manager saves and closes the workbook
# (replaces writer.save(), which no longer exists in pandas 2.x).
with pd.ExcelWriter('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Processed/Forests.xlsx') as writer:
    forest_dataframe.to_excel(writer)
print('Forest dataset processed.')

Forest dataset processed.


In [365]:
# - Schools (1 layer)

# Load the shapefile and reproject it to EPSG:4326.
schools = gp.read_file('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Unprocessed/School_20Grounds/School_Grounds.shp').to_crs(epsg=4326)

# Keep only the geometry column, renamed to 'Outline' for consistency with
# the other area datasets, and drop rows without a geometry.
school_geometry = schools.loc[:, ['geometry']]
school_dataframe = school_geometry.rename(columns={'geometry': 'Outline'}).dropna()

# Write to file.  The context manager saves and closes the workbook
# (replaces writer.save(), which no longer exists in pandas 2.x).
with pd.ExcelWriter('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Processed/Schools.xlsx') as writer:
    school_dataframe.to_excel(writer)
print('School dataset processed.')

School dataset processed.


In [366]:
# - Libraries (1 layer)

libraries = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Unprocessed/libraries.csv')

# Keep only the location column, renamed to 'Location' for consistency with
# the other point datasets, and drop rows without a location.
library_locations = libraries.loc[:, ['LOCATION']]
library_dataframe = library_locations.rename(columns={'LOCATION': 'Location'}).dropna()

# Write to file.  The context manager saves and closes the workbook
# (replaces writer.save(), which no longer exists in pandas 2.x).
with pd.ExcelWriter('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Processed/Libraries.xlsx') as writer:
    library_dataframe.to_excel(writer)
print('Library dataset processed.')

Library dataset processed.
