In [2]:
import numpy as np
import pandas as pd
import math
import calendar

In [3]:
###################
# LOAD CRIME DATA #
###################
# Read the raw 'Crimes (Chicago)' CSV into memory; every later cell derives
# its data from this frame.
crime_dataframe = pd.read_csv(
    '/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/Crimes (Chicago).csv'
)

In [4]:
########################
# CONDITION CRIME DATA #
########################
# Columns treated as redundant or unhelpful; they are removed from the raw
# crime frame before any further processing.
columns_to_delete = [
    'Case Number',
    'Location Description',
    'Block',
    'Arrest',
    'Domestic',
    'FBI Code',
    'Primary Type',
    'Description',
    'X Coordinate',
    'Y Coordinate',
    'Year',
    'Updated On',
    'Location',
]
# Work on an independent copy of the trimmed frame and discard every row that
# still contains a missing value in any remaining column.
final_dataframe = (
    crime_dataframe
    .drop(columns=columns_to_delete)
    .copy()
    .dropna()
)
print('Length: %d' % len(final_dataframe))

Length: 6059764


In [5]:
##################################
# CONVERT CRIME STATS TO ONE-HOT #
##################################
# Convert 'Beat', 'District', 'Ward', and 'Community Area' to one-hot vectors.
#
# Every indicator column is prefixed with the name of the column it came from
# (e.g. 'District_1', 'Ward_1'). Without a prefix, get_dummies labels each
# indicator with the bare integer code, so any code that occurs in more than
# one of these columns (a district and a ward can both be numbered 1) yields
# several identically named columns after the concat, and the features can no
# longer be told apart or selected by name.
categorical_columns = ['Beat', 'District', 'Ward', 'Community Area']
# Cast to int first so labels read 'Ward_1' rather than 'Ward_1.0'.
beat_one_hot = pd.get_dummies(final_dataframe['Beat'].astype(int), prefix='Beat')
district_one_hot = pd.get_dummies(final_dataframe['District'].astype(int), prefix='District')
ward_one_hot = pd.get_dummies(final_dataframe['Ward'].astype(int), prefix='Ward')
community_one_hot = pd.get_dummies(final_dataframe['Community Area'].astype(int), prefix='Community Area')
# Replace the four categorical columns with their one-hot encodings; all frames
# share final_dataframe's index, so the column-wise concat lines up row by row.
final_dataframe = pd.concat([final_dataframe.drop(columns=categorical_columns),
                            beat_one_hot,
                            district_one_hot,
                            ward_one_hot,
                            community_one_hot], axis=1).dropna()
print('Length: %d' % len(final_dataframe))

Length: 6059764


In [7]:
##################
# CONDITION DATE #
##################
# Convert crime dates to YEAR, MONTH, DAY, HOUR, MINUTE, and weekday columns
# Convert those to one-hot and concat with final dataframe
#
# Parse the 'Date' column that final_dataframe already carries instead of
# re-reading it from the raw crime frame: the result no longer depends on the
# two frames' indexes still lining up, and the cell needs only final_dataframe.
final_dataframe['Date'] = pd.to_datetime(final_dataframe['Date'])
crime_dates = final_dataframe['Date'].dt
# Build the labels with vectorized string concatenation rather than a per-row
# Python lambda; the resulting labels are the same, just produced faster.
year_one_hot = pd.get_dummies('YEAR_' + crime_dates.year.astype(str))
month_one_hot = pd.get_dummies(crime_dates.month.map(lambda month: calendar.month_abbr[month]))
day_one_hot = pd.get_dummies('DAY_' + crime_dates.day.astype(str))
hour_one_hot = pd.get_dummies('HOUR_' + crime_dates.hour.astype(str))
# Minute labels use the same '_' separator as YEAR_/DAY_/HOUR_ (they were
# previously written as 'MINUTE-<n>', the only label family using a dash).
minute_one_hot = pd.get_dummies('MINUTE_' + crime_dates.minute.astype(str))
weekday_one_hot = pd.get_dummies(crime_dates.weekday.map(lambda weekday: calendar.day_name[weekday]))
# The parsed 'Date' column is kept in the frame alongside the indicators.
final_dataframe = pd.concat([final_dataframe,
                            year_one_hot,
                            month_one_hot,
                            day_one_hot,
                            hour_one_hot,
                            minute_one_hot,
                            weekday_one_hot], axis=1).dropna()
print('Length: %d' % len(final_dataframe))

Length: 6059764


In [12]:
####################
# JOIN TEMPERATURE #
####################
temperature_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/Temperatures (Chicago).csv')
# Drop the TAVG column because it has too many NaNs
temperature_dataframe = temperature_dataframe.drop(columns=['TAVG'])
# Convert the Precipitation, max T, and min T columns to a numeric dtype
for weather_column in ['PRCP', 'TMAX', 'TMIN']:
    temperature_dataframe[weather_column] = pd.to_numeric(temperature_dataframe[weather_column])
# DataFrame.rename returns a new frame; the result must be assigned back or
# the column silently keeps its original 'PRCP' name.
temperature_dataframe = temperature_dataframe.rename(columns={'PRCP': 'PRECIPITATION'})
# Join with the final dataframe
# Both frames get temporary DAY / MONTH / YEAR key columns so each crime row
# can be matched to the weather reading for the same calendar day.
temperature_dataframe['DATE'] = pd.to_datetime(temperature_dataframe['DATE'])
temperature_dataframe['DAY'] = temperature_dataframe['DATE'].dt.day
temperature_dataframe['MONTH'] = temperature_dataframe['DATE'].dt.month
temperature_dataframe['YEAR'] = temperature_dataframe['DATE'].dt.year
final_dataframe['DAY'] = final_dataframe['Date'].dt.day
final_dataframe['MONTH'] = final_dataframe['Date'].dt.month
final_dataframe['YEAR'] = final_dataframe['Date'].dt.year
# Left merge keeps every crime row; the trailing dropna then removes rows that
# found no weather reading (or have any other missing value).
# NOTE(review): the weather file has STATION / NAME columns - if it holds more
# than one station per date, this merge duplicates crime rows. Confirm the
# file has exactly one row per date.
join_keys = ['DAY', 'MONTH', 'YEAR']
final_dataframe = (
    final_dataframe
    .merge(temperature_dataframe, on=join_keys, how='left')
    .drop(columns=['DATE', 'STATION', 'NAME', 'DAY', 'MONTH', 'YEAR'])
    .dropna()
)
print('Length: %d' % len(final_dataframe))

Length: 6018887


In [None]:
########################
# JOIN LIFE EXPECTANCY #
########################
# Only the load step is present in this cell; the frame is read from CSV and
# not merged into final_dataframe here.
life_expectancy_dataframe = pd.read_csv(
    '/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/Life Expectancy (Chicago).csv'
)

In [None]:
##################
# JOIN L ENTRIES #
##################
# Only the load step is present in this cell; the frame is read from CSV and
# not merged into final_dataframe here.
L_entry_dataframe = pd.read_csv(
    '/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/L Station Entries (Chicago).csv'
)

In [None]:
####################
# JOIN SBIF GRANTS #
####################
# Only the load step is present in this cell; the frame is read from CSV and
# not merged into final_dataframe here.
SBIF_grant_dataframe = pd.read_csv(
    '/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/SBIF Grants (Chicago).csv'
)

In [None]:
##########################
# JOIN BUSINESS LICENSES #
##########################
# Only the load step is present in this cell; the frame is read from CSV and
# not merged into final_dataframe here.
business_dataframe = pd.read_csv(
    '/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/Business Licenses (Chicago).csv'
)

In [None]:
#################################
# JOIN SOCIOECONOMIC INDICATORS #
#################################
# Only the load step is present in this cell; the frame is read from CSV and
# not merged into final_dataframe here.
socioeconomic_dataframe = pd.read_csv(
    '/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/Socioeconomic Indicators (Chicago).csv'
)

In [None]:
# Write the new crime data to a temporary file in my workspace
output_path = '/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/23_November.xlsx'

# Hard limits of a single .xlsx worksheet; to_excel raises when the frame
# exceeds them.
EXCEL_MAX_ROWS = 1048576
EXCEL_MAX_COLUMNS = 16384
# to_excel also writes the column header row(s) and the index column(s), so
# they count against the sheet limits as well.
rows_needed = len(final_dataframe) + final_dataframe.columns.nlevels
columns_needed = len(final_dataframe.columns) + final_dataframe.index.nlevels

if rows_needed <= EXCEL_MAX_ROWS and columns_needed <= EXCEL_MAX_COLUMNS:
    # The context manager saves and closes the workbook even if to_excel
    # fails; the explicit writer.save() call is deprecated and no longer
    # exists in pandas 2.x.
    with pd.ExcelWriter(output_path) as writer:
        final_dataframe.to_excel(writer)
else:
    # Frame is too large for one worksheet: write a CSV with the same name
    # next to the intended workbook instead of failing.
    csv_path = output_path.rsplit('.', 1)[0] + '.csv'
    print('Frame is %d x %d, too large for an Excel sheet; writing %s instead'
          % (rows_needed, columns_needed, csv_path))
    final_dataframe.to_csv(csv_path)

In [None]:
# Dataframes not yet joined:
#  business licenses
#  socioeconomic indicators
#  SBIF
#  Life Expectancy
#  L Entries by Station

In [None]:
# AHS_2001_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2001.csv').dropna()
# AHS_2003_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2003.csv').dropna()
# AHS_2005_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2005.csv').dropna()
# AHS_2007_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2007.csv').dropna()
# AHS_2009_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2009.csv').dropna()
# AHS_2011_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2011.csv').dropna()
# AHS_2013_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2013.csv').dropna()
# AHS_2015_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2015.csv').dropna()