In [None]:
import pandas as pd

import matplotlib.pyplot as plt

import numpy as np

In [None]:
# Columns of the permit data that the analysis uses, written in lower case
# (the spelling used by the JSON API at
# https://data.cityofchicago.org/resource/building-permits.json).
# The per-category fee columns (building/zoning/other/subtotal for
# paid/unpaid/waived) as well as census_tract and ward are intentionally
# left out.
used_columns = [
    'permit_type',
    'review_type',
    'application_start_date',
    'processing_time',
    'total_fee',
    'reported_cost',
    'community_area',
    'latitude',
    'longitude',
    'location',
]

# The CSV is read with upper-case column names, so convert the list before
# handing it to `usecols`.
used_columns = list(map(str.upper, used_columns))

permit_df = pd.read_csv('Building_Permits.csv', usecols=used_columns)

# Switch the frame's labels back to lower case for the rest of the notebook.
permit_df.columns = permit_df.columns.str.lower()

In [None]:
# Preview the first rows of the freshly loaded permit data.
permit_df.head()

## Data cleaning

In [None]:
# Clean the permit data step by step, reporting the remaining row count after
# each filter, then load the community-area number -> name lookup.

permit_df_shape = permit_df.shape
print(f"There are {permit_df_shape[0]} total permits in data set.")

# Remove any rows with an empty application_start_date.
permit_df = permit_df[permit_df['application_start_date'].notna()]

permit_df_shape = permit_df.shape
print(f"There are {permit_df_shape[0]} total permits in data with nonempty start date.")

# Remove any rows with an empty community_area and convert to integers.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the previous frame (SettingWithCopyWarning).
permit_df = permit_df[permit_df['community_area'].notna()].copy()
permit_df['community_area'] = permit_df['community_area'].astype('int')

permit_df_shape = permit_df.shape
print(f"There are {permit_df_shape[0]} total permits in data with nonempty start date and nonempty community area.")

# Remove any rows with the nonexistent community_area 0.
permit_df = permit_df[permit_df['community_area'] != 0].copy()

permit_df_shape = permit_df.shape
print(f"There are {permit_df_shape[0]} total permits in data with nonempty start date and nonzero, nonempty community area.")

# Parse the start date and derive the month of application.
permit_df['application_start_date'] = pd.to_datetime(permit_df['application_start_date'])
permit_df['start_month'] = permit_df['application_start_date'].dt.month

# Remove the first "PERMIT - " from each permit type. The vectorised string
# method leaves missing values untouched instead of raising on them.
permit_df['permit_type'] = permit_df['permit_type'].str.replace('PERMIT - ', '', n=1, regex=False)

# Dictionary for later use: community area number -> value after the ';'
# (used as the area label in later cells).
community_areas_dict = {}
with open('data/community_areas_dict') as f:
    for line in f:
        line = line.rstrip("\n")
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Split on the first ';' only so a label containing ';' stays intact.
        key, val = line.split(";", 1)
        community_areas_dict[int(key)] = val

permit_df.head()

In [None]:
# Count two subsets of interest. Neither subset is removed from permit_df;
# the frames are only built to report their sizes.

# Permits whose type is SIGNS.
is_sign_permit = permit_df['permit_type'] == 'SIGNS'
sign_permit_df = permit_df.loc[is_sign_permit]
print(f'There are {len(sign_permit_df)} sign permits')

# Permits reviewed through EASY PERMIT WEB.
is_easy_web_permit = permit_df['review_type'] == 'EASY PERMIT WEB'
easy_permit_df = permit_df.loc[is_easy_web_permit]
print(f'There are {len(easy_permit_df)} easy web permits')

## Initial plot data

In [None]:
# Number of applications per permit type.
# count() tallies the non-null processing_time values in each group; that
# tally is what gets reported as the number of applications.
permit_proc_gb = permit_df[['permit_type', 'processing_time']].groupby(['permit_type'], as_index=False)

perm_type_num_applications_df = pd.DataFrame(permit_proc_gb.count()).rename(
    columns={"processing_time": "num_of_applications"}
)

# CSV export of this table is currently disabled:
# perm_type_num_applications_df.to_csv('plot_data/num_app_to_permit_type_bar_graph.csv', index=True)

perm_type_num_applications_df

In [None]:
# Order permit types from fewest to most applications for plotting.
sorted_app_to_permit_df = perm_type_num_applications_df.sort_values('num_of_applications')

In [None]:
# Bar chart: number of applications for each permit type (ascending).
fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(
    sorted_app_to_permit_df['permit_type'],
    sorted_app_to_permit_df['num_of_applications'],
    color='b',
    alpha=0.75,
    align="center",
)

ax.set_title("Number of Apps by Permit Type")
ax.set_xlabel("Permit Type")
ax.set_ylabel("Number of Apps")
# Permit type names are long; turn them vertical so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90)
fig.savefig('figures/01_obj_0_number_of_apps_by_permit_type.png')

In [None]:
# Number of applications per start month.
# The groupby object is kept in its own variable because a later cell reuses
# it to compute mean processing times.
month_proc_time_gb = permit_df[['start_month', 'processing_time']].groupby(['start_month'], as_index=False)

# count() tallies non-null processing_time values per month.
month_num_applications_df = pd.DataFrame(month_proc_time_gb.count()).rename(
    columns={"processing_time": "num_of_applications"}
)

# Persist the table that backs the bar chart.
month_num_applications_df.to_csv('plot_data/num_app_to_s_month_bar_graph.csv', index=True)

month_num_applications_df

In [None]:
# Order months from fewest to most applications.
sorted_app_to_s_month_df = month_num_applications_df.sort_values('num_of_applications')

In [None]:
# Bar chart: number of applications per start month, on a log-scaled y axis.
fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(
    sorted_app_to_s_month_df['start_month'],
    sorted_app_to_s_month_df['num_of_applications'],
    color='b',
    alpha=0.75,
    align="center",
)

ax.set_title("Number of Apps by Start Month")
ax.set_xlabel("Start Month")
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_yscale('log')
ax.set_ylabel("Number of Apps")
fig.savefig('figures/02_obj_0_number_of_apps_by_start_month.png')

In [None]:
# Number of applications per community area.
comm_area_gb = permit_df[['community_area', 'processing_time']].groupby(['community_area'], as_index=False)

# count() tallies non-null processing_time values per area; the area numbers
# are then swapped for their labels from community_areas_dict.
comm_area_num_applications_df = (
    pd.DataFrame(comm_area_gb.count())
    .rename(columns={"processing_time": "num_of_applications"})
    .replace({"community_area": community_areas_dict})
)

# Persist the table that backs the bar chart.
comm_area_num_applications_df.to_csv('plot_data/num_app_to_com_areas_bar_graph.csv', index=False)

comm_area_num_applications_df

In [None]:
# Order community areas from fewest to most applications.
sorted_app_to_comm_df = comm_area_num_applications_df.sort_values('num_of_applications')

In [None]:
# Horizontal bar chart: number of applications per community area (ascending).
fig, ax = plt.subplots(figsize=(10, 15))
ax.barh(
    sorted_app_to_comm_df['community_area'],
    sorted_app_to_comm_df['num_of_applications'],
    color='b',
    alpha=0.75,
    align="center",
)

ax.set_title("Number of Apps by Community Area")
ax.set_xlabel("Number of Apps")
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_ylabel("Community Area")
fig.savefig('figures/03_obj_0_number_of_apps_by_community_area.png')

## Objective 1: How does start month affect processing time?

In [None]:
# Mean processing time per start month. month_proc_time_gb is the groupby
# object built in the initial-plots section (all permits, zeros included).
month_proc_time_df = pd.DataFrame(month_proc_time_gb.mean()).rename(
    columns={"processing_time": "ave_proc_time"}
)

# Temporary frame for the boxplot: only permits whose processing time is
# not zero.
has_nonzero_proc_time = permit_df['processing_time'] != 0
temp_df = permit_df.loc[has_nonzero_proc_time, ['start_month', 'processing_time']]

# One box per application month to compare the distributions.
temp_df.boxplot(column='processing_time', by='start_month', figsize=(20, 10))

plt.title('Time to approval vs. Application month')
# pandas adds an automatic "grouped by" suptitle; blank it out.
plt.suptitle('')
plt.xlabel('Application month')
plt.ylabel('Time to approval')
# Fixed y window; values outside (-100, 150) are not shown.
plt.ylim((-100, 150))
plt.savefig('figures/04_obj_1_start_month_proc_time_box_plot.png')

month_proc_time_df

In [None]:
# Todo ANOVA?

## Objective 2: How does community area affect processing time?

In [None]:
# Per capita income by community area.
# Data from https://data.cityofchicago.org/Health-Human-Services/Census-Data-Selected-socioeconomic-indicators-in-C/kn9c-c2s2/data

# File to load (remember to change this if the data moves).
file_to_load = "data/Per_Capita_Income.csv"

# Read the income file into a pandas DataFrame.
income_data = pd.read_csv(file_to_load)
# Drop the row without a community area number (it was for all of Chicago).
income_data = income_data[income_data['Community Area Number'].notna()]

# Community areas ordered by ascending per capita income.
# Note: the income column name really does end with a space.
temp_df = income_data[['Community Area Number', 'PER CAPITA INCOME ']].sort_values(by=['PER CAPITA INCOME '])

# Area numbers (as ints) and their labels, both in income order.
community_list_srt_by_inc_idx = [int(number) for number in temp_df['Community Area Number'].to_list()]
community_list_srt_by_inc = [community_areas_dict[number] for number in community_list_srt_by_inc_idx]

temp_df.head()

In [None]:
# Median processing time and median total fee per community area, using only
# permits whose processing time is not zero.
has_nonzero_proc_time = permit_df['processing_time'] != 0
temp_df = permit_df.loc[has_nonzero_proc_time, ['community_area', 'processing_time', 'total_fee']]

# The groupby object is reused by later cells (counts and means per area).
com_ar_proc_time_gb = temp_df.groupby(['community_area'], as_index=False)

# Medians per area, with area numbers swapped for their labels.
com_ar_proc_time_df = pd.DataFrame(com_ar_proc_time_gb.median()).replace(
    {"community_area": community_areas_dict}
)

# Could do boxplots again.
com_ar_proc_time_df

In [None]:
# Scatter plot of the median processing time of each community area, with the
# areas ordered along the x axis by ascending per capita income.
plt.figure(figsize=(20,3))

# Look the medians up by community area name instead of by row position, so
# an area that is missing from com_ar_proc_time_df yields NaN (and is simply
# not drawn) rather than silently shifting every following value.
median_proc_time_by_area = com_ar_proc_time_df.set_index("community_area")["processing_time"]
y_values = median_proc_time_by_area.reindex(community_list_srt_by_inc).to_list()

plt.scatter(community_list_srt_by_inc, y_values, marker="o", facecolors="blue",
            edgecolors="black", alpha=1);

# Labels describe what is actually plotted (median per community area).
plt.title("Median Processing Time by Community Area (Ordered by Per Capita Income)")
plt.xlabel("Community Area (lowest to highest per capita income)")
plt.xticks(rotation=90)
plt.ylabel("Median Processing Time (Days)")
plt.ylim(0, 35);
# File name kept unchanged so existing references to the figure still work.
plt.savefig('figures/05_obj_2_avg_proc_tm_by_mnth.png')

## Objective 3: How do permits per capita compare across all community areas?

In [None]:
# 2010 census data summarized to Chicago community areas.
# Data from https://datahub.cmap.illinois.gov/dataset/2010-census-data-summarized-to-chicago-community-areas

# File to load (remember to change this if the data moves)
file_to_load = "data/CCASF12010CMAP.csv"

# Read the census file and store it in a pandas DataFrame
population_data = pd.read_csv(file_to_load)
population_data.head()

In [None]:
# Permits per person for each community area.
# com_ar_proc_time_gb groups the permits with nonzero processing time by
# community area, so the 'processing_time' count below is the number of such
# permits in each area.
com_ar_ct_df = pd.DataFrame(com_ar_proc_time_gb.count())

# Fetch each area's population in one vectorised lookup. The community area
# number is used as the row POSITION in population_data, exactly as before.
# NOTE(review): this assumes row k of population_data belongs to community
# area k -- confirm against the CSV layout.
area_positions = com_ar_ct_df['community_area'].astype(int).to_numpy()
# Whole-column assignment replaces the element-wise `df[col].iloc[j] = ...`
# writes, which are chained assignments and are not guaranteed to reach the
# frame. to_numpy() drops the population index so values are assigned by
# position.
com_ar_ct_df['Total Population'] = pd.to_numeric(
    population_data['P0050001'].iloc[area_positions].to_numpy()
)

com_ar_ct_df['perm_p_person'] = com_ar_ct_df['processing_time'] / com_ar_ct_df['Total Population']

# Apply community area labels
com_ar_ct_df = com_ar_ct_df.replace({"community_area": community_areas_dict})

com_ar_ct_df

In [None]:
# Horizontal bar chart: permits per 1,000 residents in each community area.
fig, ax = plt.subplots(figsize=(10, 15))
areas = com_ar_ct_df['community_area']
# Scale the per-person rate to a per-1,000-people rate.
permits = com_ar_ct_df['perm_p_person'] * 1000
ax.barh(areas, permits)
ax.set_xlabel('Permits', fontsize=15, color='green')
ax.set_ylabel('Community Area', fontsize=15, color='green')
ax.set_title('Chicago Areas Permits Per 1,000 People')
fig.savefig('figures/06_obj_3_chi_are_perm_p_thou.png')

## Objective 4: What is the average permit fee for all projects in a community area?

In [None]:
# Mean processing time and mean total fee per community area (computed on the
# same grouped data as the medians), with area numbers swapped for labels.
com_ar_fee_df = pd.DataFrame(com_ar_proc_time_gb.mean()).replace(
    {"community_area": community_areas_dict}
)

com_ar_fee_df

In [None]:
# Horizontal bar chart of the mean permit fee per community area, with the
# areas ordered by ascending per capita income.
plt.figure(figsize=(10,15))
areas = community_list_srt_by_inc
# Look the fees up by community area name instead of by row position, so an
# area that is missing from com_ar_fee_df yields NaN (no bar) rather than
# silently shifting every following value onto the wrong area.
mean_fee_by_area = com_ar_fee_df.set_index('community_area')['total_fee']
fees = mean_fee_by_area.reindex(areas).to_list()
plt.barh(areas, fees)
plt.xlabel('Fees', fontsize=15, color='green')
plt.ylabel('Area', fontsize=15, color='green')
plt.title('Chicago Areas Average Permit Fees')
# Fees span orders of magnitude, hence the log-scaled x axis.
plt.xscale('log')
# Red reference line at a fee of 1000.
plt.axvline(x=1000, color='red')
plt.savefig('figures/07_obj_4_chi_are_perm_fees.png')

## Objective 5: How do types of permits applied for differ between the community areas with the highest and lowest per capita income?


In [None]:
# Store the community area numbers as integers.
income_data = income_data.astype({'Community Area Number': 'int'})

income_data.head()

In [None]:
# List the income table's column names, one per line, so exact spellings
# (including any trailing spaces) can be checked.
for column_name in income_data.columns:
    print(column_name)

In [None]:
# Community area NUMBERS of the areas with the highest and the lowest per
# capita income. idxmax()/idxmin() return the row label of the extreme value,
# not the community area number, so the number is read from that row.
per_capita_income = income_data['PER CAPITA INCOME ']
max_index = int(income_data.loc[per_capita_income.idxmax(), 'Community Area Number'])
min_index = int(income_data.loc[per_capita_income.idxmin(), 'Community Area Number'])
print(f'Highest per capita income in {community_areas_dict[max_index]}, community number {max_index}.')
print(f'Lowest per capita income in {community_areas_dict[min_index]}, community number {min_index}.')

In [None]:
# Share of each permit type among the permits of the highest-income
# community area (the area whose number is max_index).
in_high_income_area = permit_df['community_area'] == max_index
high_inc_gb = permit_df.loc[in_high_income_area, ['permit_type', 'community_area']].groupby(
    ['permit_type'], as_index=False
)

# Permit counts per type, most frequent first; the count column becomes the
# proportion column once it is divided by the area's total below.
high_inc_df = (
    pd.DataFrame(high_inc_gb.count())
    .sort_values(by=['community_area'], ascending=False)
    .rename(columns={'community_area': 'high_proportion'})
)

high_inc_total = high_inc_df['high_proportion'].sum()
high_inc_df['high_proportion'] = high_inc_df['high_proportion'] / high_inc_total

high_inc_df

In [None]:
# Share of each permit type among the permits of the lowest-income
# community area (the area whose number is min_index).
in_low_income_area = permit_df['community_area'] == min_index
low_inc_gb = permit_df.loc[in_low_income_area, ['permit_type', 'community_area']].groupby(
    ['permit_type'], as_index=False
)

# Permit counts per type, most frequent first; the count column becomes the
# proportion column once it is divided by the area's total below.
low_inc_df = (
    pd.DataFrame(low_inc_gb.count())
    .sort_values(by=['community_area'], ascending=False)
    .rename(columns={'community_area': 'low_proportion'})
)

low_inc_total = low_inc_df['low_proportion'].sum()
low_inc_df['low_proportion'] = low_inc_df['low_proportion'] / low_inc_total

low_inc_df

In [None]:
# Put the high- and low-income proportions side by side, one row per permit
# type. The default (inner) merge keeps only permit types that occur in both
# areas.
high_low_df = high_inc_df.merge(low_inc_df, on='permit_type')

# Persist the table that backs the comparison chart.
high_low_df.to_csv('plot_data/high_low_income_permit_type_proportions.csv', index=False)

high_low_df

In [None]:
# Grouped horizontal bar chart comparing permit type proportions in the
# highest- and lowest-income community areas.
high_prop_list = high_low_df['high_proportion'].to_list()
low_prop_list = high_low_df['low_proportion'].to_list()
permit_list = high_low_df['permit_type'].to_list()

# One y position per permit type; an array so the offset below is plain
# element-wise arithmetic.
y_data = np.arange(len(high_prop_list))

fig = plt.figure(figsize=(15,10))
ax = fig.add_subplot(111)
width = 0.35  # bar thickness, also the offset between the two bars of a pair
ax.barh(y_data, low_prop_list, width, color='royalblue', label='low income')
ax.barh(y_data + width, high_prop_list, width, color='seagreen', label='high income')
# Fix the tick positions first and only then attach the labels; labels set
# before the positions are fixed may end up at unexpected ticks.
ax.set_yticks(y_data)
ax.set_yticklabels(permit_list)
plt.xlabel('Proportion of permits')
plt.ylabel('Permit type')
plt.legend()
plt.title('Permit type proportions in highest and lowest income community area')
plt.savefig('figures/08_obj_5_chi_are_perm_fees.png')

## Which location paid the highest permit fee in each community area?

In [None]:
# Community area numbers 1 through 77.
com_area_indexes = range(1, 78)

# For every community area, the permit_df row label of the permit with the
# largest total_fee.
max_indices = [
    permit_df.loc[permit_df['community_area'] == area_number, 'total_fee'].idxmax()
    for area_number in com_area_indexes
]

In [None]:
# One row per community area describing its highest-fee permit: the rows of
# permit_df whose labels are listed in max_indices, restricted to the columns
# of interest. Selecting them in one step replaces growing the frame row by
# row and keeps the numeric dtypes of the source columns.
max_fee_columns = ['community_area', 'permit_type', 'total_fee', 'latitude', 'longitude']
max_fee_loc_df = permit_df.loc[max_indices, max_fee_columns].reset_index(drop=True)

# Add the community area label as its own column. A single Series is mapped
# through the dictionary here; assigning a whole replaced DataFrame to one
# column is not a valid single-column assignment.
max_fee_loc_df['community_area_name'] = max_fee_loc_df['community_area'].map(community_areas_dict)

# Keep the frame ordered by community area number; later cells rely on the
# row order when assigning colours.
max_fee_loc_df = max_fee_loc_df.sort_values(by=['community_area'])

# Display the locations from smallest to largest maximum fee.
max_fee_loc_df.sort_values(by=['total_fee'])

In [None]:
# How often does each permit type appear among the per-area maximum-fee
# permits? The tally counts the non-null latitude values per permit type and
# is shown under the column name 'count'.
max_fee_per_type_gb = max_fee_loc_df[['permit_type', 'latitude']].groupby(['permit_type'])

max_fee_type_counts = pd.DataFrame(max_fee_per_type_gb.count())
max_fee_type_counts.rename(columns={'latitude': 'count'})

In [None]:
import gmaps

import json
import requests

# Google developer API key
from api_keys import g_key

# Load the community area GeoJSON file into a Python object.
with open('Comm_Areas.geojson') as geojson_file:
    boundaries = json.load(geojson_file)

#boundaries = json.loads('Comm_Areas.geojson')

In [None]:
import pprint
import matplotlib.colors as mcol
import matplotlib.cm as cm

# Community area numbers in the order in which the first 77 features appear
# in the geojson.
features = boundaries['features']
boundaries_com_idx_list = [int(features[k]['properties']['area_num_1']) for k in range(77)]

# An earlier approach normalised the fees linearly onto a 'rainbow' colormap;
# it was dropped because the maximum is very large compared to the rest of
# the data. Colours are assigned by RANK of the fee instead.

# User-defined colormap running from blue to red.
cmap = mcol.LinearSegmentedColormap.from_list("MyCmapName", ["b", "r"])
max_fee_list = max_fee_loc_df['total_fee'].to_list()
# Row positions of max_fee_list ordered by ascending fee: color_numbers[r] is
# the position holding the r-th smallest fee.
color_numbers = sorted(range(len(max_fee_list)), key=max_fee_list.__getitem__)

# Colours indexed by row position: the r-th smallest fee gets the colour at
# fraction r/76 of the colormap, as a hex string.
com_area_colors2 = [''] * 77
for rank in range(77):
    com_area_colors2[color_numbers[rank]] = mcol.rgb2hex(cmap(rank / 76)[:3])

# Re-order the colours to follow the geojson feature order; area number n is
# taken from position n-1.
com_area_colors = [com_area_colors2[area_number - 1] for area_number in boundaries_com_idx_list]

In [None]:
# Authenticate gmaps with the Google API key.
gmaps.configure(api_key=g_key)

# Size and framing of the map widget.
figure_layout = dict(
    width='800px',
    height='1000px',
    border='1px solid black',
    padding='1px',
    margin='0 auto 0 auto',
)

# Coordinates of the highest-fee permit of every community area.
marker_locations = max_fee_loc_df[['latitude', 'longitude']]

fig2 = gmaps.figure(layout=figure_layout)

import random

# Marker layer for the permit locations, and a layer with the community area
# outlines filled with the colours computed for each area.
markers = gmaps.marker_layer(marker_locations)
geojson = gmaps.geojson_layer(
    boundaries,
    fill_color=com_area_colors,
    fill_opacity=0.7,
    stroke_weight=0.5,
)

# Markers are added first, then the boundary layer.
fig2.add_layer(markers)
fig2.add_layer(geojson)

fig2

In [None]:
# Total processing time and total fee per permit type. The groupby object is
# kept because the next cell reuses it for counts.
per_type_gb = permit_df[['permit_type', 'processing_time', 'total_fee']].groupby(
    ['permit_type'], as_index=False
)

per_type_totals = per_type_gb.sum()
per_type_sum_df = pd.DataFrame(per_type_totals)

per_type_sum_df

In [None]:
# Number of non-null processing_time and total_fee values per permit type.
per_type_counts = per_type_gb.count()
per_type_ct_df = pd.DataFrame(per_type_counts)

per_type_ct_df

In [None]:
# Counts per (permit type, start month) pair.
per_type_mth_gb = permit_df[['permit_type', 'start_month', 'processing_time', 'total_fee']].groupby(
    ['permit_type', 'start_month'], as_index=False)

per_type_mth_ct_df = pd.DataFrame(per_type_mth_gb.count())

# Monthly counts for the easy permit process. The "PERMIT - " prefix was
# stripped from permit_type during data cleaning, so the label is matched
# without it (with the prefix the filter never matches anything).
per_type_mth_ct_df[per_type_mth_ct_df['permit_type'] == 'EASY PERMIT PROCESS']

In [None]:

# Monthly counts for renovation/alteration permits. The "PERMIT - " prefix
# was stripped from permit_type during data cleaning, so the label is matched
# without it (with the prefix the filter never matches anything).
per_type_mth_ct_df[per_type_mth_ct_df['permit_type'] == 'RENOVATION/ALTERATION']

In [None]:
# Full table of counts for every (permit type, start month) pair.
per_type_mth_counts = per_type_mth_gb.count()
per_type_mth_ct_df = pd.DataFrame(per_type_mth_counts)
per_type_mth_ct_df