In [2]:
from IPython.display import display
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import math
import warnings
from sklearn.decomposition import PCA
import statsmodels.formula.api as smf

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.5f}'.format

# Suppress annoying harmless error.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)

# Introduction

Since I live in Tampa, looking closer at Florida crime data seems rewarding! On the FBI's site, we are able to download crime data for the past 23 years ([here](https://ucr.fbi.gov/crime-in-the-u.s), so we grabbed the [2016](https://ucr.fbi.gov/crime-in-the-u.s/2016/crime-in-the-u.s.-2016/tables/table-6/table-6-state-cuts/florida.xls) and [2017](https://ucr.fbi.gov/crime-in-the-u.s/2017/crime-in-the-u.s.-2017/tables/table-8/table-8-state-cuts/florida.xls) data.


## Read Data

In [3]:
# Read in the crime data for Florida for 2016, listed by city
florida_crime = pd.read_excel('./florida_2016.xls')
florida_crime_2017 = pd.read_excel('./florida_2017.xls')
florida_crime.columns = ['city', 'population', 'violent_crime', 'murder', 'rape', 'robbery',
       'assault', 'property_crime', 'burglary', 'larceny_theft',
       'motor_vehicle_theft', 'arson']
florida_crime_2017.columns = florida_crime.columns
florida_crime.set_index('city', inplace=True)
florida_crime_2017.set_index('city', inplace=True)

# lets make the predicted value be the inverse of property crime per capita
# in other words, for how many people does it take for one property crime to be committed?
florida_crime["odds"] = florida_crime["population"] / florida_crime["property_crime"]
florida_crime_2017["odds"] = florida_crime_2017["population"] / florida_crime_2017["property_crime"]

# our model cannot handle inf and nan values so we need to replace them with something
florida_crime.loc["Center Hill","odds"] = 0.0

## Transformations

1. Log Population
2. Indicator Variables
  1. Crimes
  2. Population

In [6]:
# 1. Log Transform population
florida_crime["population_log"] = np.log(florida_crime.population)

# 2. Indicator Variables

## Crimes
florida_crime["has_violent_crime"] = np.where(florida_crime["violent_crime"] > 0, 1, 0)
florida_crime["has_murder"] = np.where(florida_crime["murder"] > 0, 1, 0)
florida_crime["has_rape"] = np.where(florida_crime["rape"] > 0, 1, 0)
florida_crime["has_robbery"] = np.where(florida_crime["robbery"] > 0, 1, 0)
florida_crime["has_assault"] = np.where(florida_crime["assault"] > 0, 1, 0)
florida_crime["has_property_crime"] = np.where(florida_crime["property_crime"] > 0, 1, 0)
florida_crime["has_bulglary"] = np.where(florida_crime["burglary"] > 0, 1, 0)
florida_crime["has_larceny_theft"] = np.where(florida_crime["larceny_theft"] > 0, 1, 0)
florida_crime["has_motor_vehicle_theft"] = np.where(florida_crime["motor_vehicle_theft"] > 0, 1, 0)
florida_crime["has_arson"] = np.where(florida_crime["arson"] > 0, 1, 0)

## Population
threshold_low = florida_crime["population_log"].quantile(0.1)
threshold_medium = florida_crime["population_log"].quantile(0.9)

florida_crime["population_low"] = (florida_crime["population_log"] < threshold_low).values # the 10% quantile
florida_crime["population_medium"] = (florida_crime["population_log"] < threshold_medium).values # the 90% quantile
florida_crime["population_high"] = (florida_crime["population_log"] >= threshold_medium).values
# there are 29 cities that fall in the low category, 255 in the medium and another 29 in the high

In [7]:
florida_crime.corr()

Unnamed: 0,population,violent_crime,murder,rape,robbery,assault,property_crime,burglary,larceny_theft,motor_vehicle_theft,...,has_violent_crime,has_murder,has_rape,has_robbery,has_assault,has_property_crime,has_bulglary,has_larceny_theft,has_motor_vehicle_theft,has_arson
population,1.0,0.92963,0.85436,0.87604,0.89801,0.92466,0.94536,0.91969,0.94329,0.93626,...,0.09542,0.36497,0.26556,0.19635,0.11073,0.02718,0.04804,0.03896,0.10957,0.33679
violent_crime,0.92963,1.0,0.91698,0.89322,0.97546,0.99657,0.9681,0.95369,0.96128,0.97281,...,0.07691,0.35471,0.22553,0.1641,0.08932,0.0214,0.03762,0.03058,0.0873,0.30207
murder,0.85436,0.91698,1.0,0.84962,0.86326,0.91997,0.89037,0.93608,0.86996,0.89326,...,0.05603,0.30741,0.16659,0.1208,0.06518,0.01586,0.02756,0.02247,0.06518,0.23052
rape,0.87604,0.89322,0.84962,1.0,0.81405,0.8903,0.91037,0.93606,0.89697,0.89497,...,0.07399,0.32294,0.23217,0.15731,0.08608,0.01943,0.03552,0.0286,0.0853,0.29046
robbery,0.89801,0.97546,0.86326,0.81405,1.0,0.95734,0.94939,0.90744,0.94838,0.96242,...,0.07055,0.33384,0.2089,0.15337,0.08154,0.01997,0.0347,0.02829,0.08101,0.28369
assault,0.92466,0.99657,0.91997,0.8903,0.95734,1.0,0.95635,0.94852,0.94823,0.9596,...,0.07854,0.35905,0.22728,0.16625,0.09136,0.02183,0.03841,0.03121,0.08849,0.30499
property_crime,0.94536,0.9681,0.89037,0.91037,0.94939,0.95635,1.0,0.97275,0.99817,0.98769,...,0.08808,0.36963,0.25035,0.18634,0.10207,0.02524,0.04379,0.03572,0.10095,0.33171
burglary,0.91969,0.95369,0.93608,0.93606,0.90744,0.94852,0.97275,1.0,0.95775,0.96198,...,0.08105,0.34887,0.22853,0.16996,0.09334,0.02325,0.04041,0.03285,0.0892,0.30364
larceny_theft,0.94329,0.96128,0.86996,0.89697,0.94838,0.94823,0.99817,0.95775,1.0,0.98244,...,0.09061,0.37575,0.25747,0.19201,0.10514,0.02596,0.04503,0.03678,0.10448,0.34048
motor_vehicle_theft,0.93626,0.97281,0.89326,0.89497,0.96242,0.9596,0.98769,0.96198,0.98244,1.0,...,0.07488,0.33236,0.21666,0.15869,0.08676,0.02136,0.03713,0.0301,0.0878,0.29075


## PCA

In [11]:
# In order to explain 99% of the variance in all predictors, we are going to need 9 components
# Now PCA all features (crimes + the log of population)
# not trying to reduce dimensionality just want to eliminate collinearity
crime_cols = ['violent_crime', 'murder', 'rape', 'robbery', 'assault', 
              'property_crime', 'burglary', 'larceny_theft', 'motor_vehicle_theft',
              'arson', 'population_log', 'population_low', 
              'population_medium', 'population_high']

pca = PCA(n_components=9)
pca.fit(florida_crime[crime_cols].values)
X_new = pca.transform(florida_crime[crime_cols].values)
X_new = pd.DataFrame(X_new, columns=["PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9"])
X_new.corr()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
PC1,1.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0
PC2,0.0,1.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0
PC3,0.0,-0.0,1.0,-0.0,-0.0,0.0,-0.0,0.0,0.0
PC4,-0.0,0.0,-0.0,1.0,-0.0,0.0,-0.0,0.0,0.0
PC5,-0.0,0.0,-0.0,-0.0,1.0,-0.0,0.0,-0.0,-0.0
PC6,0.0,-0.0,0.0,0.0,-0.0,1.0,-0.0,0.0,0.0
PC7,-0.0,0.0,-0.0,-0.0,0.0,-0.0,1.0,-0.0,0.0
PC8,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,1.0,-0.0
PC9,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,1.0


In [21]:
X_new["population_log"] = florida_crime["population_log"].values
X_new["population_low"] = florida_crime["population_low"].values
X_new["population_medium"] = florida_crime["population_medium"].values
X_new["population_high"] = florida_crime["population_high"].values
X_new["property_crime"] = florida_crime["property_crime"].values

# Model

## Model 1

Model that simply includes all of the principle components created above, as well as the population indicator variables.

In [59]:
formula = "property_crime ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + population_low + population_medium + population_high"
# Fit the model to our data using the formula.
lm1 = smf.ols(formula=formula, data=X_new).fit()
print("Model Parameters\n===========================================")
print(lm1.params)
print("p-values\n===========================================")
print(lm1.pvalues)
print("\n\nR-Squared: {}".format(lm1.rsquared))

Model Parameters
Intercept                   789.80042
population_low[T.True]        0.00003
population_medium[T.True]   394.90030
population_high[T.True]     394.90012
PC1                           0.78599
PC2                          -0.12883
PC3                          -0.25045
PC4                           0.21106
PC5                           0.08734
PC6                          -0.02634
PC7                           0.00157
PC8                           0.00572
PC9                          -0.00078
dtype: float64
p-values
Intercept                   0.00000
population_low[T.True]      0.00000
population_medium[T.True]   0.00000
population_high[T.True]     0.00000
PC1                         0.00000
PC2                         0.00000
PC3                         0.00000
PC4                         0.00000
PC5                         0.00000
PC6                         0.00000
PC7                         0.00000
PC8                         0.00000
PC9                         0.000

This implies overfitting, so let's reduce the number of predictors we include in the model.

In [24]:
formula_new = "property_crime ~ PC1 + PC2 + population_medium + population_high"
# Fit the model to our data using the formula.
lm_new = smf.ols(formula=formula_new, data=X_new).fit()
print("Model Parameters\n===========================================")
print(lm_new.params)
print("p-values\n===========================================")
print(lm_new.pvalues)
print("\n\nR-Squared: {}".format(lm_new.rsquared))

Model Parameters
Intercept                   792.42148
population_medium[T.True]   391.27025
population_high[T.True]     401.15123
PC1                           0.78543
PC2                          -0.13320
dtype: float64
p-values
Intercept                   0.00000
population_medium[T.True]   0.00000
population_high[T.True]     0.00000
PC1                         0.00000
PC2                         0.00000
dtype: float64


R-Squared: 0.9998505611723576


In [91]:
y_pred = pd.DataFrame(lm_new.predict(X_new), columns=["pred"])
y_pred["actual"] = florida_crime["property_crime"].values
y_pred["resid"] = lm_new.resid
y_pred.index = florida_crime.index
y_pred.head()

Unnamed: 0_level_0,pred,actual,resid
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alachua,301.21758,306,4.78242
Altamonte Springs,1523.30044,1535,11.69956
Apalachicola,21.39071,26,4.60929
Apopka,1966.72546,2008,41.27454
Arcadia,185.62493,189,3.37507


Now lets make some predictions for the next year (2017).

## Evaluate

Load up the crime data for 2017.

In [44]:
florida_crime_2017 = pd.read_excel('./florida_2017.xls')
florida_crime_2017.set_index('City', inplace=True)
florida_crime_2017.columns = florida_crime.columns[:11]
florida_crime_2017.head()

Unnamed: 0_level_0,population,violent_crime,murder,rape,robbery,assault,property_crime,burglary,larceny_theft,motor_vehicle_theft,arson
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alachua,10037,40,0,4,6,30,266,45,211,10,2
Altamonte Springs,43833,136,1,16,29,90,1369,147,1147,75,2
Apalachicola,2345,3,0,0,0,3,5,1,4,0,0
Apopka,50833,225,1,22,51,151,1881,350,1387,144,2
Arcadia,7992,71,5,4,6,56,175,82,83,10,1


In [45]:
florida_crime_2017["population_log"] = np.log(florida_crime_2017.population)

threshold_low = florida_crime_2017["population_log"].quantile(0.1)
threshold_medium = florida_crime_2017["population_log"].quantile(0.9)

florida_crime_2017["population_low"] = (florida_crime_2017["population_log"] < threshold_low).values # the 10% quantile
florida_crime_2017["population_medium"] = (florida_crime_2017["population_log"] < threshold_medium).values # the 90% quantile
florida_crime_2017["population_high"] = (florida_crime_2017["population_log"] >= threshold_medium).values

**PCA**
Now transform the 2017 data into it's 9 components.

In [46]:
pca2 = PCA(n_components=9)
pca2.fit(florida_crime_2017[crime_cols].values)
X_new_2017 = pca.transform(florida_crime_2017[crime_cols].values)
X_new_2017 = pd.DataFrame(X_new, columns=["PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9"])
X_new_2017.corr()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
PC1,1.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0
PC2,0.0,1.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0
PC3,0.0,-0.0,1.0,-0.0,-0.0,0.0,-0.0,0.0,0.0
PC4,-0.0,0.0,-0.0,1.0,-0.0,0.0,-0.0,0.0,0.0
PC5,-0.0,0.0,-0.0,-0.0,1.0,-0.0,0.0,-0.0,-0.0
PC6,0.0,-0.0,0.0,0.0,-0.0,1.0,-0.0,0.0,0.0
PC7,-0.0,0.0,-0.0,-0.0,0.0,-0.0,1.0,-0.0,0.0
PC8,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,1.0,-0.0
PC9,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,1.0


Now we use our previous model to make predictions on this transformed data.

In [48]:
X_new_2017["population_log"] = florida_crime_2017["population_log"].values
X_new_2017["population_low"] = florida_crime_2017["population_low"].values
X_new_2017["population_medium"] = florida_crime_2017["population_medium"].values
X_new_2017["population_high"] = florida_crime_2017["population_high"].values
X_new_2017["property_crime"] = florida_crime_2017["property_crime"].values

In [56]:
pred = pd.DataFrame(lm_new.predict(X_new_2017), columns= ["pred"])
pred["actual"] = florida_crime_2017["property_crime"].values

In [75]:
pred.index = florida_crime_2017.index

In [76]:
# calculate R Squared for 2017
resid = (pred.pred - pred.actual)