In [39]:
import numpy as np
import pandas as pd
import math
import calendar

NUM_TIME_BINS_PER_DAY = 24

In [40]:
###################
# LOAD CRIME DATA #
###################
crime_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/Crimes (Chicago).csv')

In [41]:
########################
# CONDITION CRIME DATA #
########################
# Delete columns that are redundant or unhelpful
columns_to_delete = ['Case Number', 'Location Description', 'Block', 'Arrest', 'Domestic', 'FBI Code', 'IUCR', 'Description','X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Location']
final_dataframe = crime_dataframe.drop(columns=columns_to_delete).copy().dropna()
print('Length: %d' % len(final_dataframe))

Length: 6059764


In [42]:
###################################
# CONVERT CRIME STATS TO READABLE #
###################################
final_dataframe['Beat'] = final_dataframe['Beat'].astype(int).astype(str).apply(lambda x: 'BEAT_'+x)
final_dataframe['District'] = final_dataframe['District'].astype(int).astype(str).apply(lambda x: 'DISTRICT_'+x)
final_dataframe['Ward'] = final_dataframe['Ward'].astype(int).astype(str).apply(lambda x: 'WARD_'+x)
final_dataframe['Community Area'] = final_dataframe['Community Area'].astype(int).astype(str).apply(lambda x: 'COMMUNITY_'+x)
print('Length: %d' % len(final_dataframe))

Length: 6059764


In [43]:
##################
# CONDITION DATE #
##################
# Convert crime dates to YEAR, MONTH, DAY, HOUR, MINUTE, and weekday columns
# Convert those to one-hot and concat with final dataframe
final_dataframe['Date'] = pd.to_datetime(crime_dataframe['Date'])
final_dataframe['YEAR'] = final_dataframe['Date'].dt.year.astype(str).apply(lambda x: 'YEAR_'+x)
final_dataframe['MONTH'] = final_dataframe['Date'].dt.month.apply(lambda x: calendar.month_abbr[x])
final_dataframe['DAY'] = final_dataframe['Date'].dt.day.astype(str).apply(lambda x: 'DAY_'+x)
final_dataframe['WEEKDAY'] = final_dataframe['Date'].dt.weekday.apply(lambda x: calendar.day_name[x])
hours = final_dataframe['Date'].dt.hour
minutes = final_dataframe['Date'].dt.minute
final_dataframe['TIME_OF_DAY'] = ((hours + minutes / 60.) / 24. * NUM_TIME_BINS_PER_DAY).astype(int).astype(str).apply(lambda x: 'TIME_SLOT_'+x)
print('Length: %d' % len(final_dataframe))

Length: 6059764


In [45]:
####################
# JOIN TEMPERATURE #
####################
temperature_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/Temperatures (Chicago).csv')
# Drop the TAVG column because it has too many NaNs
temperature_dataframe = temperature_dataframe.drop(columns=['TAVG'])
# Convert the Precipitation, max T, and min T columns to float
temperature_dataframe['PRCP'] = pd.to_numeric(temperature_dataframe['PRCP'])
temperature_dataframe['TMAX'] = pd.to_numeric(temperature_dataframe['TMAX'])
temperature_dataframe['TMIN'] = pd.to_numeric(temperature_dataframe['TMIN'])
temperature_dataframe = temperature_dataframe.rename(columns={'PRCP':'PRECIPITATION', 'TMAX':'MAX TEMP', 'TMIN':'MIN TEMP'})
# Join with the final dataframe
temperature_dataframe['date_join'] = pd.to_datetime(temperature_dataframe['DATE']).dt.date
# Drop unnecessary columns before merge
temperature_dataframe = temperature_dataframe.drop(columns=['DATE', 'STATION', 'NAME'])
final_dataframe['date_join'] = final_dataframe['Date'].dt.date
final_dataframe = final_dataframe.merge(temperature_dataframe, on=['date_join'], how='left').drop(columns=['date_join']).dropna()
print('Length: %d' % len(final_dataframe))

Length: 6018887


In [73]:
#################################
# JOIN SOCIOECONOMIC INDICATORS #
#################################
socioeconomic_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/Socioeconomic Indicators (Chicago).csv')
# Remove unhelpful data
socioeconomic_dataframe = socioeconomic_dataframe.drop(columns=['COMMUNITY AREA NAME']).dropna()
# This data is only for 2008 - 2012, so filter for that interval
final_dataframe = final_dataframe[(final_dataframe['Date'] >= '2008-01-01') & (final_dataframe['Date'] <= '2012-12-31')]['Date']
socioeconomic_dataframe = socioeconomic_dataframe.rename(columns={'Community Area Number':'Community Area'})
socioeconomic_dataframe['Community Area'] = socioeconomic_dataframe['Community Area'].astype(int).astype(str).apply(lambda x: 'COMMUNITY_'+x)
final_dataframe = final_dataframe.merge(socioeconomic_dataframe, on=['Community Area'], how='left').dropna()
print('Length: %d' % len(final_dataframe))

AttributeError: 'Series' object has no attribute 'merge'

# Write the new crime data to a temporary file in my workspace
writer = pd.ExcelWriter('/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/25_November.xlsx')
final_dataframe.to_excel(writer)
writer.save()

In [70]:
########################
# JOIN LIFE EXPECTANCY #
########################
life_expectancy_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/Life Expectancy (Chicago).csv')

In [71]:
life_expectancy_dataframe.columns

Index(['Community Area Number', 'Community Area', '1990 Life Expectancy',
       '1990 Lower 95% CI', '1990 Upper 95% CI', '2000 Life Expectancy',
       '2000 Lower 95% CI', '2000 Upper 95% CI', '2010 Life Expectancy',
       '2010 Lower 95% CI', '2010 Upper 95% CI'],
      dtype='object')

In [63]:
##################
# JOIN L ENTRIES #
##################
L_entry_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/L Station Entries (Chicago).csv')

In [64]:
L_entry_dataframe.columns

Index(['station_id', 'stationname', 'date', 'daytype', 'rides'], dtype='object')

In [52]:
####################
# JOIN SBIF GRANTS #
####################
SBIF_grant_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/SBIF Grants (Chicago).csv')

In [67]:
SBIF_grant_dataframe.columns

Index(['Company', 'Address', 'TIF District', 'Completion Date', 'Actual Costs',
       'Actual Grant', 'Work Items'],
      dtype='object')

In [50]:
##########################
# JOIN BUSINESS LICENSES #
##########################
business_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/Business Licenses (Chicago).csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [72]:
business_dataframe.columns

Index(['ID', 'LICENSE ID', 'ACCOUNT NUMBER', 'SITE NUMBER', 'LEGAL NAME',
       'DOING BUSINESS AS NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP CODE',
       'WARD', 'PRECINCT', 'WARD PRECINCT', 'POLICE DISTRICT', 'LICENSE CODE',
       'LICENSE DESCRIPTION', 'BUSINESS ACTIVITY ID', 'BUSINESS ACTIVITY',
       'LICENSE NUMBER', 'APPLICATION TYPE', 'APPLICATION CREATED DATE',
       'APPLICATION REQUIREMENTS COMPLETE', 'PAYMENT DATE',
       'CONDITIONAL APPROVAL', 'LICENSE TERM START DATE',
       'LICENSE TERM EXPIRATION DATE', 'LICENSE APPROVED FOR ISSUANCE',
       'DATE ISSUED', 'LICENSE STATUS', 'LICENSE STATUS CHANGE DATE', 'SSA',
       'LATITUDE', 'LONGITUDE', 'LOCATION'],
      dtype='object')

In [None]:
# Dataframes not yet joined:
#  business licenses
#  socioeconomic indicators
#  SBIF
#  Life Expectancy
#  L Entries by Station

In [None]:
# AHS_2001_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2001.csv').dropna()
# AHS_2003_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2003.csv').dropna()
# AHS_2005_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2005.csv').dropna()
# AHS_2007_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2007.csv').dropna()
# AHS_2009_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2009.csv').dropna()
# AHS_2011_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2011.csv').dropna()
# AHS_2013_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2013.csv').dropna()
# AHS_2015_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2015.csv').dropna()