In [73]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as img
# Shapely
import shapely.wkt as wkt
import shapely.geometry as geo
import shapely.ops as ops
import shapely.affinity as aff
from shapely.prepared import prep
import time
import datetime as dtm
import json

In [113]:
#############
# CONSTANTS #
#############
# Raster frame size in pixels; the frame is square.
X_MAX_PIXELS = 2048
Y_MAX_PIXELS = X_MAX_PIXELS

# 'L' rail lines; the L-entries cell looks up a '<name> Line' column for each.
L_LINES = [
    'Green', 'Red', 'Brown', 'Purple',
    'Yellow', 'Blue', 'Pink', 'Orange',
]

# Boolean business-category columns that each get their own frame.
BUSINESS_CATEGORIES = [
    'Food Service',
    'Tobacco Sale',
    'Alcohol Consumption',
    'Package Store',
    'Gas Station',
]

# Socioeconomic columns rasterized one layer each (names must match the sheet exactly).
SOCIO_INDICATORS = [
    'PERCENT OF HOUSING CROWDED',
    'PERCENT HOUSEHOLDS BELOW POVERTY',
    'PERCENT AGED 16+ UNEMPLOYED',
    'PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA',
    'PERCENT AGED UNDER 18 OR OVER 64',
    'PER CAPITA INCOME ',
    'HARDSHIP INDEX',
]

# Day indices are counted from START_DATE; FINAL_DATE is the exclusive end.
START_DATE, FINAL_DATE = dtm.date(2001, 1, 1), dtm.date(2018, 1, 1)
NUM_DAYS = (FINAL_DATE - START_DATE).days
# Number of equal slices each day is divided into.
NUM_TIME_SLOTS = 12

# Output directory for the generated frames, and '%s' templates for the inputs.
CNN_final_path = '/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/CNN Final/'
processed_dataset_paths_xlsx = '/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Processed/%s.xlsx'
processed_dataset_paths_csv = '/Volumes/GoogleDrive/My Drive/Crime Data/Composite Data/Sean Workspace/Processed/%s.csv'

In [75]:
# UNUSED OR UNPROCESSED DATASETS

# MMRP_permits = pd.read_excel(processed_dataset_paths_xlsx % 'MMRP Permits')
# SBIF_grants = pd.read_excel(processed_dataset_paths_xlsx % 'SBIF Grants')
# weather = pd.read_excel(processed_dataset_paths_xlsx % 'Weather')

In [76]:
# Building CNN Data
# Data that varies with Time and Location:
# - Crime (OUTPUT - YOLO with crime and location)
# - L entries (8 layers - one per line)
# - SBIF grants (1 layer)
# - MMRP Permits (21 layers - types of permits)
# - Life Expectancy (1 layer)
#
# Data that varies with Time Only:
# - Temperature (3 layers - MIN, MAX, and PRECIPITATION)
# - Date (43 layers - 12 for month, 31 for day)
# - Time (5 layers - one for each time slot)
#
# Data that varies with Location Only:
# - Businesses (5+ layers - types of businesses)
# - Buildings (10 layers - stories|units|sqfeet for sound|minor repair|major repair.  Also uninhabitable or not.)
# - Waterways (1 layer)
# - Major Streets (1 layer)
# - Libraries (1 layer)
# - Public Parks (1 layer)
# - Forests (1 layer)
# - Schools (1 layer)

# Total layers: 60 + (date layers)

In [77]:
#####################
# UTILITY FUNCTIONS #
#####################
communities = pd.read_csv(processed_dataset_paths_csv % 'Communities')

# Preprocess necessary parameters for the transformations:
# the union of all community outlines defines the extent of the frame.
polygons = []
for encoding in communities.Outline:
    polygons.extend(wkt.loads(encoding).geoms)
chicago = geo.MultiPolygon(polygons)
# Scale so the bounding box spans X_MAX_PIXELS x Y_MAX_PIXELS, then shift so
# the box's lower-left corner lands on the origin.
x_scale = float(X_MAX_PIXELS) / (chicago.bounds[2] - chicago.bounds[0])
y_scale = float(Y_MAX_PIXELS) / (chicago.bounds[3] - chicago.bounds[1])
x_off = -chicago.bounds[0] * x_scale
y_off = -chicago.bounds[1] * y_scale

# Transform geometry to the current frame
def transform_latlong_to_frame(geometry):
    """Map a shapely geometry into frame (pixel) coordinates.

    Applies x' = x_scale * x + x_off and y' = y_scale * y + y_off using the
    module-level parameters computed above, and returns the new geometry.
    """
    return aff.affine_transform(geometry, [x_scale, 0, 0, y_scale, x_off, y_off])

# Return the points within the input polygon
def points_within(polygon):
    """Return the integer grid points contained in ``polygon``.

    Every integer (x, y) inside the polygon's bounding box is tested, and the
    hits are returned as two parallel lists ``(pixel_x, pixel_y)``. An empty
    geometry yields ``([], [])``.
    """
    # An empty geometry has no usable bounds, so there is nothing to scan.
    if polygon.is_empty:
        return ([], [])
    min_x, min_y, max_x, max_y = polygon.bounds
    first_x = int(np.floor(min_x))
    first_y = int(np.floor(min_y))
    last_x = int(np.ceil(max_x))
    last_y = int(np.ceil(max_y))
    # Prepare the geometry once: the same containment test is repeated for
    # every candidate pixel of the bounding box.
    contains = prep(polygon).contains
    pixel_x = []
    pixel_y = []
    for x in range(first_x, last_x):
        for y in range(first_y, last_y):
            if contains(geo.Point(x, y)):
                pixel_x.append(x)
                pixel_y.append(y)
    return (pixel_x, pixel_y)

# Rasterize the communities.
# This has to run after the transform parameters and both helpers above exist;
# running it first raises NameError on a fresh kernel.
rastered_communities = communities.copy()
rastered_communities['Coordinates'] = communities.Outline.apply(lambda x: points_within(transform_latlong_to_frame(wkt.loads(x))))

# Apply the value in column_name of the given record at given coordinates of given frame 
def column_value_at_coordinate(record, column_name, frame):
    """Write ``record[column_name]`` into ``frame`` at the record's pixels.

    ``record.Coordinates`` holds two parallel index sequences (first axis,
    second axis); every addressed cell of ``frame`` is overwritten in place.
    Returns None.
    """
    value = record[column_name]
    first_axis = record.Coordinates[0]
    second_axis = record.Coordinates[1]
    frame[first_axis, second_axis] = value

# Convert a pair "(XXXX, XXXX)" to a geographic point
def convert_pair_to_point(latlong):
    """Parse a "(lat, long)" string into a shapely Point.

    The point is built as (longitude, latitude), i.e. x is the longitude.
    """
    without_parens = latlong.replace('(', '').replace(')', '')
    latitude_text, longitude_text = without_parens.split(',')
    return geo.Point(float(longitude_text), float(latitude_text))

# Function for converting and placing weird format latlong into frame pixels
def mark_at_latlong(latlong, frame):
    """Set the frame pixel under a "(lat, long)" string to True.

    The string is parsed, projected into frame coordinates and truncated to
    integer pixel indices. Locations that fall outside the frame are ignored:
    a negative index would otherwise wrap around and mark a pixel on the
    opposite edge, and an index past the end would raise IndexError.
    """
    location = transform_latlong_to_frame(convert_pair_to_point(latlong))
    pixel_x, pixel_y = int(location.x), int(location.y)
    if 0 <= pixel_x < len(frame) and 0 <= pixel_y < len(frame[pixel_x]):
        frame[pixel_x][pixel_y] = True

In [12]:
################
# EMPTY PIXELS #
################
# One shapely Point per frame pixel, ordered x-major (all y for x=0, then x=1, ...),
# which is the order the later reshape((X_MAX_PIXELS, Y_MAX_PIXELS)) calls rely on.
pixel_points = [
    geo.Point(pixel_x, pixel_y)
    for pixel_x in range(X_MAX_PIXELS)
    for pixel_y in range(Y_MAX_PIXELS)
]

In [13]:
#######################
# CREATE STREET FRAME #
#######################
streets = pd.read_excel(processed_dataset_paths_xlsx % 'Streets')

# Parse every centerline and project the whole network into frame coordinates
street_lines = [wkt.loads(centerline) for centerline in streets.Centerline]
chicago_streets = transform_latlong_to_frame(geo.MultiLineString(street_lines))
# Lines have no area, so widen them by 0.5 frame units before testing pixels
buffered_streets = chicago_streets.buffer(0.5)
# A pixel is set when its point lies inside the buffered network
on_street = prep(buffered_streets).contains
street_frame = np.array([on_street(pixel) for pixel in pixel_points]).reshape((X_MAX_PIXELS, Y_MAX_PIXELS))

np.savez_compressed(CNN_final_path + 'Streets Frame', street_frame=street_frame)
# Release the large intermediates; the kernel keeps them alive otherwise
del streets, street_lines, chicago_streets, buffered_streets, street_frame, on_street
print('Street frame created and saved.')

Street frame created and saved.


In [14]:
##########################
# CREATE WATERWAYS FRAME #
##########################
waterways = pd.read_csv(processed_dataset_paths_csv % 'Waterways')

# Flatten every multi-part outline into its member polygons, then project them
waterway_outlines = [
    polygon
    for outline in waterways.Outline
    for polygon in wkt.loads(outline).geoms
]
chicago_waterways = transform_latlong_to_frame(geo.MultiPolygon(waterway_outlines))
# A pixel is set when its point lies inside any waterway polygon
in_waterway = prep(chicago_waterways).contains
waterway_frame = np.array([in_waterway(pixel) for pixel in pixel_points]).reshape((X_MAX_PIXELS, Y_MAX_PIXELS))

np.savez_compressed(CNN_final_path + 'Waterway Frame', waterway_frame=waterway_frame)
del waterways, waterway_outlines, chicago_waterways, waterway_frame, in_waterway
print('Waterway frame created and saved.')

Waterway frame created and saved.


In [15]:
###############
# PARKS FRAME #
###############
parks = pd.read_csv(processed_dataset_paths_csv % 'Parks')

# Parse each park outline and project the collection into frame coordinates
park_outlines = [wkt.loads(outline) for outline in parks.Outline]
chicago_parks = transform_latlong_to_frame(geo.MultiPolygon(park_outlines))
# A pixel is set when its point lies inside any park polygon
in_park = prep(chicago_parks).contains
park_frame = np.array([in_park(pixel) for pixel in pixel_points]).reshape((X_MAX_PIXELS, Y_MAX_PIXELS))

np.savez_compressed(CNN_final_path + 'Park Frame', park_frame=park_frame)
del parks, park_outlines, chicago_parks, park_frame, in_park
print('Park frame created and saved.')

Park frame created and saved.


In [16]:
#################
# FORESTS FRAME #
#################
forests = pd.read_csv(processed_dataset_paths_csv % 'Forests')

# Collect plain polygons: multi-part outlines are flattened, single polygons
# are kept as they are, and any other geometry type is dropped.
forest_outlines = []
for outline in forests.Outline:
    forest = wkt.loads(outline)
    if type(forest) is geo.MultiPolygon:
        forest_outlines.extend(forest.geoms)
    elif type(forest) is geo.Polygon:
        forest_outlines.append(forest)
chicago_forests = transform_latlong_to_frame(geo.MultiPolygon(forest_outlines))
# A pixel is set when its point lies inside any forest polygon
in_forest = prep(chicago_forests).contains
forest_frame = np.array([in_forest(pixel) for pixel in pixel_points]).reshape((X_MAX_PIXELS, Y_MAX_PIXELS))

np.savez_compressed(CNN_final_path + 'Forest Frame', forest_frame=forest_frame)
del forests, forest_outlines, chicago_forests, forest_frame, in_forest
print('Forest frame created and saved.')

Forest frame created and saved.


In [17]:
#################
# SCHOOLS FRAME #
#################
schools = pd.read_csv(processed_dataset_paths_csv % 'Schools')

# Parse each school outline and project the collection into frame coordinates
school_outlines = [wkt.loads(outline) for outline in schools.Outline]
chicago_schools = transform_latlong_to_frame(geo.MultiPolygon(school_outlines))
# A pixel is set when its point lies inside any school polygon
in_school = prep(chicago_schools).contains
school_frame = np.array([in_school(pixel) for pixel in pixel_points]).reshape((X_MAX_PIXELS, Y_MAX_PIXELS))

np.savez_compressed(CNN_final_path + 'School Frame', school_frame=school_frame)
del schools, school_outlines, chicago_schools, school_frame, in_school
print('Schools frame created and saved.')

Schools frame created and saved.


In [18]:
###################
# BUILDINGS FRAME #
###################
buildings = pd.read_excel(processed_dataset_paths_xlsx % 'Buildings')

# Uninhabitable buildings become a single boolean mask: flatten every footprint
# into plain polygons, merge them, and test each pixel against the merged shape.
uninhabitable_building_outlines = []
for footprint_wkt in buildings.loc[buildings.Condition == 'UNINHABITABLE'].Footprint:
    footprint = wkt.loads(footprint_wkt)
    if type(footprint) is geo.MultiPolygon:
        uninhabitable_building_outlines.extend(footprint.geoms)
    elif type(footprint) is geo.Polygon:
        uninhabitable_building_outlines.append(footprint)
chicago_uninhabitable_buildings = transform_latlong_to_frame(geo.MultiPolygon(uninhabitable_building_outlines))
inside_uninhabitable = prep(chicago_uninhabitable_buildings).contains
# Frames are collected in the order they are written to the archive
building_frames = {
    'uninhabitable_building_frame': np.array([inside_uninhabitable(pixel) for pixel in pixel_points]).reshape((X_MAX_PIXELS, Y_MAX_PIXELS)),
}
print('Uninhabitable buildings frame created.')

# Habitable buildings: for each condition, find the pixels inside every
# building footprint, then paint three numeric layers (stories, units, area).
# Each entry is (Condition value, frame-name suffix, progress message or None).
condition_groups = (
    ('SOUND', 'sound', 'Sound building frames created.'),
    ('NEEDS MINOR REPAIR', 'minor_repair', 'Minor repair building frames created.'),
    ('NEEDS MAJOR REPAIR', 'major_repair', None),
)
# Each entry is (frame-name prefix, source column).
attribute_layers = (
    ('stories', 'Stories'),
    ('units', 'Units'),
    ('area', 'Square Footage'),
)
for condition, frame_suffix, progress_message in condition_groups:
    condition_buildings = buildings[buildings.Condition == condition].copy()
    condition_buildings['Coordinates'] = condition_buildings.Footprint.apply(lambda encoded: points_within(transform_latlong_to_frame(wkt.loads(encoded))))
    for frame_prefix, column_name in attribute_layers:
        attribute_frame = np.zeros((X_MAX_PIXELS, Y_MAX_PIXELS))
        condition_buildings.apply(lambda building: column_value_at_coordinate(building, column_name, attribute_frame), axis=1)
        building_frames['%s_of_%s_buildings_frame' % (frame_prefix, frame_suffix)] = attribute_frame
    if progress_message is not None:
        print(progress_message)

np.savez_compressed(CNN_final_path + 'Building Frames', **building_frames)
# Release everything this cell allocated
del attribute_frame, condition_buildings, building_frames
del inside_uninhabitable
del chicago_uninhabitable_buildings
del uninhabitable_building_outlines
del buildings
print('Major repair building frames created and saved.')

Uninhabitable buildings frame created.
Sound building frames created.
Minor repair building frames created.
Major repair building frames created and saved.


In [19]:
###################
# LIBRARIES FRAME #
###################
libraries = pd.read_excel(processed_dataset_paths_xlsx % 'Libraries')

# Start from an all-zero frame and mark the pixel under each library location
library_frame = np.zeros((X_MAX_PIXELS, Y_MAX_PIXELS))
for library_location in libraries.Location:
    mark_at_latlong(library_location, library_frame)

np.savez_compressed(CNN_final_path + 'Library Frame', library_frame=library_frame)
del libraries, library_frame
print('Libraries frame created and saved.')

Libraries frame created and saved.


In [21]:
#########################
# LIFE EXPECTANCY FRAME #
#########################
life_expectancy = pd.read_excel(processed_dataset_paths_xlsx % 'Life Expectancy')

# Attach each community's rasterized pixel coordinates to its yearly records
chicago_life_expectancy = life_expectancy.merge(rastered_communities, left_on='Community Area', right_on='Number')
# One frame per distinct year present in the data, in chronological order.
# Frame i holds the i-th year of this sorted list (year - 2001 when the data
# covers consecutive years starting at 2001).
life_expectancy_years = sorted(chicago_life_expectancy.Year.unique())
life_expectancy_frame = np.zeros((len(life_expectancy_years), X_MAX_PIXELS, Y_MAX_PIXELS))
# Apply the data to the frames.
# The index must count upward from the first year: the previous `2001-year`
# produced 0, -1, -2, ... which wrapped around and stored the years reversed.
for year_index, year in enumerate(life_expectancy_years):
    chicago_life_expectancy[chicago_life_expectancy.Year == year].apply(lambda record: column_value_at_coordinate(record, 'Life Expectancy', life_expectancy_frame[year_index]), axis=1)

np.savez_compressed(CNN_final_path + 'Life Expectancy Frames', life_expectancy_frame=life_expectancy_frame)
del life_expectancy, chicago_life_expectancy, life_expectancy_frame, life_expectancy_years
print('Life expectancy frame created and saved.')

Life expectancy frame created and saved.


In [None]:
##################
# BUSINESS FRAME #
##################
businesses = pd.read_excel(processed_dataset_paths_xlsx % 'Businesses')

# One all-zero frame per business category, keyed by the category column name
business_frames = {
    category: np.zeros((X_MAX_PIXELS, Y_MAX_PIXELS))
    for category in BUSINESS_CATEGORIES
}
# Mark the pixel under every business flagged True for the category
for category, category_frame in business_frames.items():
    flagged_businesses = businesses[businesses[category] == True]
    for business_location in flagged_businesses.Location:
        mark_at_latlong(business_location, category_frame)

np.savez_compressed(CNN_final_path + 'Business Frames', **business_frames)
del businesses, business_frames
print('Business frames created and saved.')

Business frames created and saved.


In [72]:
###################
# L ENTRIES FRAME #
###################
L_entries = pd.read_excel(processed_dataset_paths_xlsx % 'L Entries')

# Transform points from latlong to frame, and dates to day indices counted
# from START_DATE
translated_L_entries = L_entries.copy()
translated_L_entries['Location'] = L_entries.Location.apply(lambda point: transform_latlong_to_frame(wkt.loads(point)))
translated_L_entries['Date'] = pd.to_datetime(L_entries.Date).apply(lambda this_row_timestamp: (this_row_timestamp.date()-START_DATE).days)
num_days = (FINAL_DATE - START_DATE).days

# Output layout (pandas dataframe written to CSV):
#  column is L line
#  row is day number
#  Cell is a JSON-encoded 3-row array:
#    row 1 is x coordinate of rail station
#    row 2 is y coordinate of rail station
#    row 3 is number of entries for rail station
# Days with no in-frame station for a line hold three empty rows.
empty_day = json.dumps([[], [], []])
encoded_entries = {}

# Loop over every line
for line in L_LINES:
    line_filtered_entries = translated_L_entries.loc[translated_L_entries[line + ' Line']]
    # Pixel coordinates are computed once per line instead of once per day
    station_x = line_filtered_entries['Location'].apply(lambda point: int(point.x))
    station_y = line_filtered_entries['Location'].apply(lambda point: int(point.y))
    # Keep only stations that fall inside the frame
    inside_frame = (station_x >= 0) & (station_x < X_MAX_PIXELS) & (station_y >= 0) & (station_y < Y_MAX_PIXELS)
    # All masks and columns come from line_filtered_entries itself, so nothing
    # depends on index alignment with the unfiltered table.
    line_stations = pd.DataFrame({
        'Date': line_filtered_entries['Date'],
        'x': station_x,
        'y': station_y,
        'Entries': line_filtered_entries['Entries'],
    }).loc[inside_frame]

    # One pass over the line's records, grouped by day; rows keep their
    # original order inside each group.
    line_column = [empty_day] * num_days
    for day_index, day_stations in line_stations.groupby('Date'):
        # Days outside [0, num_days) are not part of the output
        if 0 <= day_index < num_days:
            line_column[int(day_index)] = json.dumps(np.array([
                day_stations['x'].values,
                day_stations['y'].values,
                day_stations['Entries'].values,
            ]).tolist())
    encoded_entries[line] = line_column

# Build the table in one step rather than enlarging it one cell at a time
entries_dataframe = pd.DataFrame(encoded_entries, columns=L_LINES)

entries_dataframe.to_csv(CNN_final_path + 'L Entries.csv')
del L_entries, translated_L_entries, entries_dataframe, encoded_entries
print('L entry frames created and saved.')

L entry frames created and saved.


In [98]:
##################################
# SOCIOECONOMIC INDICATORS FRAME #
##################################

socioeconomics = pd.read_excel(processed_dataset_paths_xlsx % 'Socioeconomics')

# Attach each community's rasterized pixel coordinates to its indicator record
chicago_socioeconomics = socioeconomics.merge(rastered_communities, left_on='Community Area', right_on='Number')
# One all-zero layer per socioeconomic indicator
socioeconomic_frame = np.zeros((len(SOCIO_INDICATORS), X_MAX_PIXELS, Y_MAX_PIXELS))
# Paint every community's value for each indicator onto that indicator's layer
for layer_index in range(len(SOCIO_INDICATORS)):
    indicator = SOCIO_INDICATORS[layer_index]
    for _, community in chicago_socioeconomics.iterrows():
        column_value_at_coordinate(community, indicator, socioeconomic_frame[layer_index])

np.savez_compressed(CNN_final_path + 'Socioeconomic Frames', socioeconomic_frame=socioeconomic_frame)
del socioeconomics, chicago_socioeconomics, socioeconomic_frame
print('Socioeconomic frames created and saved.')

Socioeconomic frames created and saved.


In [125]:
######################
# CRIME OUTPUT FRAME #
######################

crimes = pd.read_csv(processed_dataset_paths_csv % 'Crimes')

# Parse timestamps, then replace each category label with its integer code;
# `categories` keeps the labels so a code can be mapped back to its name.
crimes['Date'] = pd.to_datetime(crimes.Date)
category_codes, categories = crimes['Category'].factorize()
crimes['Category'] = category_codes
# Target layout of the crime numpy array:
# - axis 0: day index
# - axis 1: time slot
# - axis 2: crime category
# - axis 3: 0 is x locations, 1 is y locations
# - axis 4: crime locations

In [None]:
def place_crime_in_frame(crime_record, frame):
    """Store one crime's frame coordinates in the next free slot of ``frame``.

    ``frame`` is indexed as [day, time slot, category, 0=x / 1=y, slot] and
    uses NaN to mark unused slots. The record's 'Date' selects the day and time
    slot, 'Category' selects the category, and 'Location' (a "(lat, long)"
    string) is projected into frame coordinates.

    Records dated outside the frame's day range are skipped. Returns None.

    Raises:
        ValueError: if every slot for that day / time slot / category is
            already occupied.
    """
    # Calculate the necessary indices
    crime_timestamp = crime_record['Date']
    day_index = (crime_timestamp.date() - START_DATE).days
    # A negative day index would wrap around and land in the wrong day.
    if not 0 <= day_index < frame.shape[0]:
        return
    time_slot = int((crime_timestamp.hour + crime_timestamp.minute/60.)/24.*NUM_TIME_SLOTS)
    # Create the crime location
    location = transform_latlong_to_frame(convert_pair_to_point(crime_record['Location']))
    # View of shape (2, slots): row 0 holds x values, row 1 holds y values
    slots = frame[day_index, time_slot, crime_record['Category']]
    # Find the first unused slot. NaN never compares equal to anything (not
    # even itself), so this has to be np.isnan rather than `== np.nan`.
    free_slots = np.flatnonzero(np.isnan(slots[0]))
    if free_slots.size == 0:
        raise ValueError('No free crime slot left for day %d, time slot %d, category %s'
                         % (day_index, time_slot, crime_record['Category']))
    next_location_index = free_slots[0]
    # Store the crime
    slots[0, next_location_index] = location.x
    slots[1, next_location_index] = location.y
    
# Initially, assume 200 crimes per time slot in Chicago.
# Shape: (day, time slot, category, x/y, slot); NaN marks an unused slot.
crime_frame_shape = (NUM_DAYS, NUM_TIME_SLOTS, len(crimes['Category'].unique()), 2, 200)
crime_frame = np.full(crime_frame_shape, np.nan)
# Place all the crimes in the crime frame, one record at a time
crimes.apply(lambda crime_record: place_crime_in_frame(crime_record, crime_frame), axis=1)

# Saving is currently disabled:
# np.savez_compressed(CNN_final_path + 'Crimes', crime_frame=crime_frame)
# del crimes, crime_frame
# print('Crime frames created and saved.')

In [97]:
# plt.imshow(socioeconomic_frame[6])
# img.imsave('waterway.png', waterway_frame)

In [None]:
# NOTE(review): presumably meant to report the highest occupied slot index
# along axis 4 of crime_frame, as a check on the 200-slots assumption - confirm.
# The previous form compared the `crimes` DataFrame to NaN with `==`, which is
# always False, and indexed column 4 of a 2-column result, so it always raised.
np.max(np.flatnonzero((~np.isnan(crime_frame)).any(axis=(0, 1, 2, 3))))

In [139]:
# Scratch check: show the Python type of the Category value stored under label 0.
type(crimes['Category'][0])

str