# Crime Analysis in Chicago (2001 vs. 2018)

## Descriptive Data Analysis

### Setup

In [1]:
# Imports
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (30, 15) #increase figure size

In [2]:
# Load all datasets
df_2003 = pd.read_csv("data/crime_2003.csv")
df_2003.name = "Crime in 2003"
df_2003.year = "2003"

df_2007 = pd.read_csv("data/crime_2007.csv")
df_2007.name = "Crime in 2007"
df_2007.year = "2007"

df_2016 = pd.read_csv("data/crime_2016.csv")
df_2016.name = "Crime in 2016"
df_2016.year = "2016"

datasets = [df_2003,
            df_2007,
            df_2016]

df_2008 = pd.read_csv("data/crime_2008.csv")
df_2008.name = "Crime in 2008"
df_2008.year = "2008"

df_2009 = pd.read_csv("data/crime_2009.csv")
df_2009.name = "Crime in 2009"
df_2009.year = "2009"

df_2010 = pd.read_csv("data/crime_2010.csv")
df_2010.name = "Crime in 2010"
df_2010.year = "2010"

df_2011 = pd.read_csv("data/crime_2011.csv")
df_2011.name = "Crime in 2011"
df_2011.year = "2011"

df_2012 = pd.read_csv("data/crime_2012.csv")
df_2012.name = "Crime in 2012"
df_2012.year = "2012"


datasets_by_area = [df_2008,
                    df_2009,
                    df_2010,
                    df_2011,
                    df_2012]

  interactivity=interactivity, compiler=compiler, result=result)


### Size

In [6]:
for dataset in datasets:
    print("Dataset: {}, Shape: {}".format(dataset.name, dataset.shape))
    print("test")

Dataset: Crime in 2003, Shape: (475913, 22)
test
Dataset: Crime in 2007, Shape: (621848, 22)
test
Dataset: Crime in 2016, Shape: (265462, 22)
test


### Missingness

In [7]:
for dataset in datasets:
    print("Dataset: {}".format(dataset.name))
    print(dataset.isnull().sum())
    print('\n')

Dataset: Crime in 2003
ID                         0
Case Number                0
Date                       0
Block                      0
IUCR                       0
Primary Type               0
Description                0
Location Description       2
Arrest                     0
Domestic                   0
Beat                       0
District                   0
Ward                      19
Community Area            50
FBI Code                   0
X Coordinate            4728
Y Coordinate            4728
Year                       0
Updated On                 0
Latitude                4728
Longitude               4728
Location                4728
dtype: int64


Dataset: Crime in 2007
ID                         0
Case Number                0
Date                       0
Block                      0
IUCR                       0
Primary Type               0
Description                0
Location Description       5
Arrest                     0
Domestic                   0
Beat       

### Remove rows with missing values

In [8]:
for dataset in datasets:
    dataset['Date'] = pd.to_datetime(dataset['Date'])
    dataset.set_index('Date', inplace=True)
    dataset.sort_index(inplace=True)
    print(dataset.name, "done")

for dataset in datasets_by_area:
    dataset['Date'] = pd.to_datetime(dataset['Date'])
    dataset.set_index('Date', inplace=True)
    dataset.sort_index(inplace=True)
    print(dataset.name, "done")

Crime in 2003 done
Crime in 2007 done
Crime in 2016 done
Crime in 2008 done
Crime in 2009 done
Crime in 2010 done
Crime in 2011 done
Crime in 2012 done


In [9]:
for dataset in datasets:
    dataset.dropna(inplace=True)
    
for dataset in datasets_by_area:
    dataset.dropna(inplace=True)

In [None]:
for dataset in datasets:
    dataset.drop(columns=['ID', 'Case Number', 'Block', 'IUCR', 'Beat', 'Ward', 'FBI Code', 'Updated On', 'Latitude', 'Longitude'], axis=1, inplace=True)

for dataset in datasets_by_area:
    dataset.drop(columns=['ID', 'Case Number', 'Block', 'IUCR', 'Beat', 'Ward', 'FBI Code', 'Updated On', 'Latitude', 'Longitude'], axis=1, inplace=True)

### Number of crimes per year

In [None]:
years = []
data_len = []
for dataset in datasets:
    years.append(dataset.year)
    data_len.append(len(dataset))

df_num = pd.DataFrame({'Year': years,'Num Crimes': data_len}).set_index('Year')
df_num.plot(kind='bar', rot=0);

In [None]:
print("this is a test - github pls work")

### Crime per community area (2003, 2007, 2016)

In [None]:
df_areas = pd.DataFrame()
df_areas['2003'] = df_2003['Community Area'].value_counts()
df_areas['2007'] = df_2007['Community Area'].value_counts()
df_areas['2016'] = df_2016['Community Area'].value_counts()
df_areas.plot(kind='bar', rot=90);

## Latitude & Longtitude

In [None]:
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon

In [None]:
crs={'init':'epsg:4326'}

# zip the Longtitude and Latitude together. Each point represents a row in Data frame
Geo_2003= [Point(xy) for xy in zip (df_2003["X Coordinate"],df_2003["Y Coordinate"])]
Geo_2007= [Point(xy) for xy in zip (df_2007["X Coordinate"],df_2007["Y Coordinate"])]
Geo_2016= [Point(xy) for xy in zip (df_2016["X Coordinate"],df_2016["Y Coordinate"])]

Geo_df_2003= gpd.GeoDataFrame(df_2003,crs=crs,geometry=Geo_2003)
Geo_df_2007= gpd.GeoDataFrame(df_2007,crs=crs,geometry=Geo_2007)
Geo_df_2016= gpd.GeoDataFrame(df_2016,crs=crs,geometry=Geo_2016)

In [None]:
fig,ax= plt.subplots(figsize=(8,8))
Geo_df_2003.plot(ax=ax,markersize=0.25)
ax.set_title('Crimes happened in Chicago in 2003')

In [None]:
fig,ax= plt.subplots(figsize=(8,8))
Geo_df_2007.plot(ax=ax,markersize=0.25)
ax.set_title('Crimes happened in Chicago in 2007')
plt.show()

In [None]:
fig,ax= plt.subplots(figsize=(8,8))
Geo_df_2016.plot(ax=ax,markersize=0.25)
ax.set_title('Crimes happened in Chicago in 2016')
plt.show()

In [None]:
## You can change Primary Type to see different types.

In [None]:
fig,ax= plt.subplots(figsize=(8,8))
Geo_df_2003[Geo_df_2003['Primary Type']=='THEFT'].plot(ax=ax,markersize=0.25)
ax.set_title('Homicide happened in Chicago in 2003')

In [None]:
fig,ax= plt.subplots(figsize=(8,8))
Geo_df_2007[Geo_df_2007['Primary Type']=='THEFT'].plot(ax=ax,markersize=0.25)
ax.set_title('Homicide happened in Chicago in 2007')

In [None]:
fig,ax= plt.subplots(figsize=(8,8))
Geo_df_2016[Geo_df_2016['Primary Type']=='THEFT'].plot(ax=ax,markersize=0.25)
ax.set_title('Homicide happened in Chicago in 2016')

## Choropleth 

In [None]:
fd='data/PoliceDistrict copy.shp'
map_Chicago=gpd.read_file(fd)
map_Chicago.drop(columns=['DIST_LABEL'],inplace=True)
map_Chicago["DIST_NUM"]=map_Chicago["DIST_NUM"].apply(pd.to_numeric)

district_2003=pd.DataFrame(df_2003['District'].value_counts())
district_2007=pd.DataFrame(df_2007['District'].value_counts())
district_2016=pd.DataFrame(df_2016['District'].value_counts())


district_2003=district_2003.reset_index().rename(columns={'index':'DIST_NUM','District':'CRIME_NUM'})
district_2007=district_2007.reset_index().rename(columns={'index':'DIST_NUM','District':'CRIME_NUM'})
district_2016=district_2016.reset_index().rename(columns={'index':'DIST_NUM','District':'CRIME_NUM'})

map_2003=map_Chicago.merge(district_2003,on='DIST_NUM')
map_2007=map_Chicago.merge(district_2007,on='DIST_NUM')
map_2016=map_Chicago.merge(district_2016,on='DIST_NUM')



In [None]:
SMALL_SIZE = 15
MEDIUM_SIZE = 18
BIGGER_SIZE = 22

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

plt.rcParams['figure.figsize'] = (25, 10) #increase figure size

In [None]:
fig, axs = plt.subplots(1,3)

sm = plt.cm.ScalarMappable(cmap='Greens')
sm._A = []
cbar = fig.colorbar(sm)
ax.axis('off')
ax.set_title('Crime happened in Chicago in 2003')


map_2003.plot(column="DIST_NUM", cmap='Greens', linewidth=0.8, ax=axs[0], edgecolor='0.6')
map_2007.plot(column="DIST_NUM", cmap='Greens', linewidth=0.8, ax=axs[1], edgecolor='0.6')
map_2016.plot(column="DIST_NUM", cmap='Greens', linewidth=0.8, ax=axs[2], edgecolor='0.6')


## Get Data

In [22]:
df_2008_2012

Unnamed: 0_level_0,ID,Case Number,Block,IUCR,Primary Type,Description,Location Description,Beat,Ward,Community Area,FBI Code,Updated On,Latitude,Longitude,Location,Num Crimes
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2008-01-01 00:00:00,6211723,HP299683,106XX S STATE LINE RD,0840,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,432,10.0,52.0,06,02/04/2016 06:33:39 AM,41.702015,-87.524532,"(41.7020149, -87.524532163)",1
2008-01-01 00:00:00,6211723,HP299683,106XX S STATE LINE RD,0840,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,432,10.0,52.0,06,02/04/2016 06:33:39 AM,41.702015,-87.524532,"(41.7020149, -87.524532163)",1
2008-01-01 00:00:00,6212119,HP299612,053XX W CONGRESS PKWY,2820,OTHER OFFENSE,TELEPHONE THREAT,RESIDENCE,1522,29.0,25.0,26,02/04/2016 06:33:39 AM,41.873905,-87.758120,"(41.873905286, -87.758119625)",1
2008-01-01 00:00:00,6112974,HP208079,092XX S BRANDON AVE,0840,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,424,10.0,46.0,06,02/04/2016 06:33:39 AM,41.727522,-87.546863,"(41.727521881, -87.546862958)",1
2008-01-01 00:00:00,5999804,HP107869,015XX W 95TH ST,0810,THEFT,OVER $500,CHURCH/SYNAGOGUE/PLACE OF WORSHIP,2221,21.0,73.0,06,02/04/2016 06:33:39 AM,41.721242,-87.660354,"(41.721241868, -87.660354127)",1
2008-01-01 00:00:00,6740434,HR156944,045XX N MAGNOLIA AVE,0840,THEFT,FINANCIAL ID THEFT: OVER $300,APARTMENT,2311,46.0,3.0,06,02/04/2016 06:33:39 AM,41.964352,-87.661001,"(41.964351639, -87.661000678)",1
2008-01-01 00:00:00,5992244,HP101403,005XX W DEMING PL,0810,THEFT,OVER $500,STREET,1933,43.0,7.0,06,02/04/2016 06:33:39 AM,41.928280,-87.643119,"(41.928279878, -87.643119246)",1
2008-01-01 00:00:00,6212213,HP300014,027XX N CLARK ST,0840,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,2333,43.0,7.0,06,02/04/2016 06:33:39 AM,41.932167,-87.644738,"(41.932167184, -87.644738471)",1
2008-01-01 00:00:00,6254976,HP343482,002XX S HOYNE AVE,0265,CRIM SEXUAL ASSAULT,AGGRAVATED: OTHER,APARTMENT,1211,2.0,28.0,02,02/04/2016 06:33:39 AM,41.878416,-87.679014,"(41.878415984, -87.679013713)",1
2008-01-01 00:00:00,6059783,HP160371,015XX S CANAL ST,0810,THEFT,OVER $500,OTHER,131,2.0,28.0,06,02/04/2016 06:33:39 AM,41.860958,-87.639035,"(41.860958336, -87.639034715)",1


In [23]:
df_2008_2012 = pd.concat(datasets_by_area)

# Get count of number of crimes
df_2008_2012['Num Crimes'] = 1

# Drop unnecessary columns
df_2008_2012.drop(columns=['Arrest', 'Domestic', 'District', 'X Coordinate', 'Y Coordinate', 'Year'], axis=1, inplace=True)

# Drop unnecessary first row
# df_2008_2012.drop([0], inplace=True)

#import income data
df_income = pd.read_csv("./data/Per_Capita_Income.csv", index_col=0)
df_income.index.names = ['Community Area']

"""
# Join Dataframes
df_2008_2012 = df_2008_2012.join(df_income, how='outer')
df_2008_2012.columns

# Rename column (to remove space)
df_2008_2012.rename(columns={'Num Crimes':'NUM',"PER CAPITA INCOME ":'INCOME',"HARDSHIP INDEX":"HARDSHIP_INDEX"}, inplace=True)
"""


'\n# Join Dataframes\ndf_2008_2012 = df_2008_2012.join(df_income, how=\'outer\')\ndf_2008_2012.columns\n\n# Rename column (to remove space)\ndf_2008_2012.rename(columns={\'Num Crimes\':\'NUM\',"PER CAPITA INCOME ":\'INCOME\',"HARDSHIP INDEX":"HARDSHIP_INDEX"}, inplace=True)\n'

In [26]:
# Groupby Crime Type & Community area
df_2008_2012_crime={}

for crime_type in list(set(df_2008_2012['Primary Type'])):
    df_2008_2012_crime.update({crime_type:df_2008_2012[df_2008_2012['Primary Type']==crime_type].groupby(['Community Area']).sum()})
    
    df_2008_2012_crime[crime_type].join(df_income, how='outer')
    
    
    
    

In [27]:
df_2008_2012_crime['THEFT']

Unnamed: 0_level_0,ID,Beat,Ward,Latitude,Longitude,Num Crimes
Community Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,69595134,5668,380.0,4.189029e+02,-8.771492e+02,10
1.0,65979830751,21853607,429280.0,3.784390e+05,-7.897332e+05,9008
2.0,66306679632,21591618,438167.0,3.830000e+05,-7.996703e+05,9119
3.0,77596201355,23057645,492201.0,4.450025e+05,-9.295170e+05,10604
4.0,42194677711,11414911,254845.0,2.415859e+05,-5.047280e+05,5756
5.0,44722523117,11779834,249270.0,2.578932e+05,-5.390741e+05,6148
6.0,174658662851,49819500,1027073.0,1.000007e+06,-2.089860e+06,23842
7.0,157550198548,40763890,852490.0,9.078614e+05,-1.898155e+06,21656
8.0,302521777231,75404792,1645557.0,1.726928e+06,-3.611945e+06,41218
9.0,4913409398,1098925,27962.0,2.864807e+04,-5.988884e+04,682


In [None]:
plt.hist(df_2008_2012['Income'],bins=10)

In [None]:
Income10= pd.DataFrame({'Income10':np.log(df_2008_2012["Income"])})
df_2008_2012=df_2008_2012.join(Income10)
plt.hist(df_2008_2012['Income10'],bins=10)

In [None]:
plt.scatter(df_2008_2012['Income'],df_2008_2012['Num'])
plt.xlabel('Income')
plt.ylabel('Num')

In [None]:
plt.scatter(df_2008_2012['Income10'],df_2008_2012['Num'])
plt.xlabel('Income10')
plt.ylabel('Num')

In [None]:
import patsy
import statsmodels.api as sm

In [None]:
outcome, predictors = patsy.dmatrices('Num ~ Income', df_2008_2012)
mod = sm.OLS(outcome, predictors)
res = mod.fit()

## look at the results
print(res.summary())

In [None]:
outcome, predictors = patsy.dmatrices('Num ~ Income10', df_2008_2012)
mod = sm.OLS(outcome, predictors)
res = mod.fit()

## look at the results
print(res.summary())