In [3]:
import numpy as np
import pandas as pd
import math
import calendar

# Number of equal-width time-of-day bins in one 24-hour day. The date
# conditioning cell scales each crime's fractional hour by this value to pick
# its TIME_SLOT_<n> label.
NUM_TIME_BINS_PER_DAY = 24

In [17]:
###################
# LOAD CRIME DATA #
###################
# Read the raw Chicago crime export; the cleaning cells below derive
# final_dataframe from this frame.
crime_dataframe = pd.read_csv(
    '/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/Crimes (Chicago).csv'
)

In [24]:
########################
# CONDITION CRIME DATA #
########################
# Columns removed because they are redundant or unhelpful
columns_to_delete = [
    'Case Number', 'Location Description', 'Block', 'Arrest', 'Domestic',
    'FBI Code', 'Primary Type', 'Description', 'X Coordinate', 'Y Coordinate',
    'Year', 'Updated On', 'Location',
]
# Keep the remaining columns, then discard every row that has a missing value
final_dataframe = (
    crime_dataframe
    .drop(columns=columns_to_delete)
    .copy()
    .dropna()
)
print('Length: %d' % final_dataframe.shape[0])

Length: 6059764


In [25]:
###################################
# CONVERT CRIME STATS TO READABLE #
###################################
# Rewrite each geographic code as a labelled string, e.g. BEAT_<code>
label_prefixes = {
    'Beat': 'BEAT_',
    'District': 'DISTRICT_',
    'Ward': 'WARD_',
    'Community Area': 'COMMUNITY_',
}
for geo_column, label_prefix in label_prefixes.items():
    final_dataframe[geo_column] = label_prefix + final_dataframe[geo_column].astype(str)
print('Length: %d' % final_dataframe.shape[0])

Length: 6059764


In [None]:
##################
# CONDITION DATE #
##################
# Derive YEAR, MONTH, DAY, WEEKDAY and TIME_OF_DAY label columns from the crime
# timestamp. The one-hot cell further down expands these labels.
# Parse the rows that survived cleaning directly instead of re-parsing the raw
# frame and relying on index alignment to throw the dropped rows away again.
final_dataframe['Date'] = pd.to_datetime(final_dataframe['Date'])
crime_dates = final_dataframe['Date']
final_dataframe['YEAR'] = 'YEAR_' + crime_dates.dt.year.astype(str)
final_dataframe['MONTH'] = crime_dates.dt.month.apply(lambda month: calendar.month_abbr[month])
final_dataframe['DAY'] = 'DAY_' + crime_dates.dt.day.astype(str)
final_dataframe['WEEKDAY'] = crime_dates.dt.weekday.apply(lambda weekday: calendar.day_name[weekday])
# Bin the time of day with whole-minute integer arithmetic: slot =
# floor(minutes_since_midnight * bins / 1440). Seconds are ignored. Doing this
# in floating point and truncating can put a time that sits exactly on a bin
# boundary into the previous slot for some bin counts.
minutes_since_midnight = crime_dates.dt.hour * 60 + crime_dates.dt.minute
time_slots = (minutes_since_midnight * NUM_TIME_BINS_PER_DAY // (24 * 60)).astype(int)
final_dataframe['TIME_OF_DAY'] = 'TIME_SLOT_' + time_slots.astype(str)
print('Length: %d' % len(final_dataframe))

In [None]:
####################
# JOIN TEMPERATURE #
####################
temperature_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/Temperatures (Chicago).csv')
# Drop the TAVG column because it has too many NaNs
temperature_dataframe = temperature_dataframe.drop(columns=['TAVG'])
# Convert the precipitation, max T, and min T columns to numeric values
for weather_column in ['PRCP', 'TMAX', 'TMIN']:
    temperature_dataframe[weather_column] = pd.to_numeric(temperature_dataframe[weather_column])
# rename() returns a new frame, so the result must be assigned back for the
# PRECIPITATION name to reach the joined data.
temperature_dataframe = temperature_dataframe.rename(columns={'PRCP': 'PRECIPITATION'})
# Join with the final dataframe on the calendar date of each crime
# NOTE(review): assumes one temperature row per calendar date; several rows for
# the same date (e.g. multiple stations) would duplicate crime rows - confirm.
temperature_dataframe['date_join'] = pd.to_datetime(temperature_dataframe['DATE']).dt.date
final_dataframe['date_join'] = final_dataframe['Date'].dt.date
# Left join keeps every crime; the trailing dropna() then removes rows that
# have any missing value, including crimes with no matching weather record.
final_dataframe = (
    final_dataframe
    .merge(temperature_dataframe, on=['date_join'], how='left')
    .drop(columns=['DATE', 'STATION', 'NAME', 'date_join'])
    .dropna()
)
print('Length: %d' % len(final_dataframe))

In [None]:
########################
# JOIN LIFE EXPECTANCY #
########################
# Only the raw table is read here; this cell does not merge it into
# final_dataframe.
life_expectancy_dataframe = pd.read_csv(
    '/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/Life Expectancy (Chicago).csv'
)

In [None]:
##################
# JOIN L ENTRIES #
##################
# Only the raw table is read here; this cell does not merge it into
# final_dataframe.
L_entry_dataframe = pd.read_csv(
    '/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/L Station Entries (Chicago).csv'
)

In [None]:
####################
# JOIN SBIF GRANTS #
####################
# Only the raw table is read here; this cell does not merge it into
# final_dataframe.
SBIF_grant_dataframe = pd.read_csv(
    '/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/SBIF Grants (Chicago).csv'
)

In [None]:
##########################
# JOIN BUSINESS LICENSES #
##########################
# Only the raw table is read here; this cell does not merge it into
# final_dataframe.
business_dataframe = pd.read_csv(
    '/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/Business Licenses (Chicago).csv'
)

In [None]:
#################################
# JOIN SOCIOECONOMIC INDICATORS #
#################################
# Only the raw table is read here; this cell does not merge it into
# final_dataframe.
socioeconomic_dataframe = pd.read_csv(
    '/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/Socioeconomic Indicators (Chicago).csv'
)

In [None]:
######################
# CONVERT TO ONE-HOT #
######################
# Expand every categorical label column (location and date labels alike) into
# one indicator column per distinct value, then replace the label columns with
# those indicators.
one_hot_columns = ['Beat', 'District', 'Ward', 'Community Area', 'YEAR', 'MONTH', 'DAY', 'WEEKDAY', 'TIME_OF_DAY']
one_hot_frames = [pd.get_dummies(final_dataframe[column_name]) for column_name in one_hot_columns]
# drop() returns a new frame, so its result has to be kept; a bare call leaves
# the label columns in final_dataframe. Concatenating once also avoids
# re-copying the whole frame for every encoded column.
final_dataframe = pd.concat(
    [final_dataframe.drop(columns=one_hot_columns)] + one_hot_frames,
    axis=1,
).dropna()
# Show a small preview rather than the entire frame
final_dataframe.head()

In [None]:
# Write the new crime data to a temporary file in my workspace
# An Excel worksheet holds at most 1,048,576 rows and one of them is the header,
# so the frame is written in slices of that size to consecutive sheets
# (Sheet1, Sheet2, ...). A frame that fits on one sheet still goes to 'Sheet1'.
MAX_DATA_ROWS_PER_SHEET = 1048576 - 1
output_path = '/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/25_November.xlsx'
# The context manager saves and closes the workbook on exit, also when a write
# fails; ExcelWriter.save() is not available in pandas 2.x.
with pd.ExcelWriter(output_path) as writer:
    # max(..., 1) still produces one header-only sheet for an empty frame
    slice_starts = range(0, max(len(final_dataframe), 1), MAX_DATA_ROWS_PER_SHEET)
    for sheet_number, first_row in enumerate(slice_starts, start=1):
        sheet_rows = final_dataframe.iloc[first_row:first_row + MAX_DATA_ROWS_PER_SHEET]
        sheet_rows.to_excel(writer, sheet_name='Sheet%d' % sheet_number)

In [None]:
# Dataframes not yet joined:
#  business licenses
#  socioeconomic indicators
#  SBIF
#  Life Expectancy
#  L Entries by Station

In [None]:
# AHS_2001_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2001.csv').dropna()
# AHS_2003_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2003.csv').dropna()
# AHS_2005_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2005.csv').dropna()
# AHS_2007_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2007.csv').dropna()
# AHS_2009_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2009.csv').dropna()
# AHS_2011_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2011.csv').dropna()
# AHS_2013_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2013.csv').dropna()
# AHS_2015_dataframe = pd.read_csv('/Volumes/GoogleDrive/My Drive/Crime Data/Raw Data/AHS/2015.csv').dropna()