# **Predicting U.S. Crime Rates**

## **Data-joining**

In this notebook, we join all the predictor data with the target data.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
crimes = pd.read_csv('../data/crimes_by_state.csv')
unemployment = pd.read_csv('../data/bls_state_unemployment.csv')

In [7]:
crimes.head()

Unnamed: 0,state_abbr,year,population,violent_crime,homicide,rape,robbery,aggravated_assault,property_crime,burglary,...,violent_crime_1000,homicide_1000,rape_1000,robbery_1000,aggravated_assault_1000,property_crime_1000,burglary_1000,larceny_1000,motor_vehicle_theft_1000,arson_1000
0,AL,1979,3769000,15578,496,1037,4127,9918,144372,48517,...,4.133192,0.1316,0.275139,1.094985,2.631467,38.305121,12.872645,22.231626,3.200849,0.067392
1,AL,1980,3861466,17320,509,1158,5102,10551,173191,58952,...,4.485343,0.131815,0.299886,1.32126,2.732382,44.851101,15.266741,26.422348,3.162012,0.287197
2,AL,1981,3916000,18423,465,1021,4952,11985,173411,56811,...,4.704545,0.118744,0.260725,1.264556,3.060521,44.282686,14.507406,26.93335,2.841931,0.305158
3,AL,1982,3943000,17653,417,1026,4417,11793,165048,49531,...,4.477048,0.105757,0.260208,1.120213,2.99087,41.858483,12.561755,26.56353,2.733198,0.273396
4,AL,1983,3959000,16471,364,931,3895,11281,145890,42485,...,4.160394,0.091942,0.23516,0.983834,2.849457,36.850215,10.731245,23.813842,2.305128,0.24779


In [8]:
unemployment.head()

Unnamed: 0,state,year,avg_unemployment_rate
0,Alabama,1979,7.225
1,Alabama,1980,8.816667
2,Alabama,1981,10.691667
3,Alabama,1982,13.95
4,Alabama,1983,13.816667


In [9]:
us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
}

In [10]:
unemployment['state_abbr'] = unemployment['state'].map(lambda x: us_state_to_abbrev[x])
unemployment.drop(columns = 'state', inplace = True)
unemployment.head()

Unnamed: 0,year,avg_unemployment_rate,state_abbr
0,1979,7.225,AL
1,1980,8.816667,AL
2,1981,10.691667,AL
3,1982,13.95,AL
4,1983,13.816667,AL


In [11]:
crimes = pd.merge(crimes, unemployment, how = 'inner', on = ['state_abbr', 'year'])
crimes.head()

Unnamed: 0,state_abbr,year,population,violent_crime,homicide,rape,robbery,aggravated_assault,property_crime,burglary,...,homicide_1000,rape_1000,robbery_1000,aggravated_assault_1000,property_crime_1000,burglary_1000,larceny_1000,motor_vehicle_theft_1000,arson_1000,avg_unemployment_rate
0,AL,1979,3769000,15578,496,1037,4127,9918,144372,48517,...,0.1316,0.275139,1.094985,2.631467,38.305121,12.872645,22.231626,3.200849,0.067392,7.225
1,AL,1980,3861466,17320,509,1158,5102,10551,173191,58952,...,0.131815,0.299886,1.32126,2.732382,44.851101,15.266741,26.422348,3.162012,0.287197,8.816667
2,AL,1981,3916000,18423,465,1021,4952,11985,173411,56811,...,0.118744,0.260725,1.264556,3.060521,44.282686,14.507406,26.93335,2.841931,0.305158,10.691667
3,AL,1982,3943000,17653,417,1026,4417,11793,165048,49531,...,0.105757,0.260208,1.120213,2.99087,41.858483,12.561755,26.56353,2.733198,0.273396,13.95
4,AL,1983,3959000,16471,364,931,3895,11281,145890,42485,...,0.091942,0.23516,0.983834,2.849457,36.850215,10.731245,23.813842,2.305128,0.24779,13.816667


In [13]:
cpi = pd.read_csv('../data/bls_cpi.csv')
cpi.head()

Unnamed: 0,year,avg_CPI
0,1979,72.583333
1,1980,82.383333
2,1981,90.933333
3,1982,96.533333
4,1983,99.583333


In [14]:
crimes['avg_CPI'] = crimes['year'].map(lambda x: cpi['avg_CPI'][x-1979])
crimes.head()

Unnamed: 0,state_abbr,year,population,violent_crime,homicide,rape,robbery,aggravated_assault,property_crime,burglary,...,rape_1000,robbery_1000,aggravated_assault_1000,property_crime_1000,burglary_1000,larceny_1000,motor_vehicle_theft_1000,arson_1000,avg_unemployment_rate,avg_CPI
0,AL,1979,3769000,15578,496,1037,4127,9918,144372,48517,...,0.275139,1.094985,2.631467,38.305121,12.872645,22.231626,3.200849,0.067392,7.225,72.583333
1,AL,1980,3861466,17320,509,1158,5102,10551,173191,58952,...,0.299886,1.32126,2.732382,44.851101,15.266741,26.422348,3.162012,0.287197,8.816667,82.383333
2,AL,1981,3916000,18423,465,1021,4952,11985,173411,56811,...,0.260725,1.264556,3.060521,44.282686,14.507406,26.93335,2.841931,0.305158,10.691667,90.933333
3,AL,1982,3943000,17653,417,1026,4417,11793,165048,49531,...,0.260208,1.120213,2.99087,41.858483,12.561755,26.56353,2.733198,0.273396,13.95,96.533333
4,AL,1983,3959000,16471,364,931,3895,11281,145890,42485,...,0.23516,0.983834,2.849457,36.850215,10.731245,23.813842,2.305128,0.24779,13.816667,99.583333


In [15]:
crimes.tail()

Unnamed: 0,state_abbr,year,population,violent_crime,homicide,rape,robbery,aggravated_assault,property_crime,burglary,...,rape_1000,robbery_1000,aggravated_assault_1000,property_crime_1000,burglary_1000,larceny_1000,motor_vehicle_theft_1000,arson_1000,avg_unemployment_rate,avg_CPI
2137,DC,2016,684336,8236,136,533,3500,4067,32377,2361,...,0.778857,5.114447,5.942987,47.311555,3.45006,39.584356,4.277139,0.301022,6.233333,240.005417
2138,DC,2017,695691,6976,116,444,2625,3791,29736,1809,...,0.638214,3.773227,5.449258,42.743114,2.600292,36.424217,3.718605,0.321982,6.133333,245.1355
2139,DC,2018,701547,6995,160,450,2415,3970,30726,1788,...,0.64144,3.442392,5.658922,43.797493,2.548653,37.552723,3.696117,0.276532,5.7,251.102333
2140,DC,2019,708253,7403,166,345,2714,4178,30819,1843,...,0.487114,3.831964,5.899022,43.514111,2.602177,37.617913,3.294021,0.228732,5.325,255.652583
2141,DC,2020,712816,7127,201,311,2373,4242,24899,1964,...,0.436298,3.32905,5.951045,34.930473,2.755269,27.406792,4.768411,0.290398,8.066667,258.844083


In [16]:
crimes.shape

(2142, 25)

In [17]:
crimes.to_csv('../data/predictors_and_targets.csv', index=False)

In [18]:
training_set = crimes[crimes['year'] < 2013]
training_set.shape

(1734, 25)

In [19]:
test_set = crimes[crimes['year'] >= 2013]
test_set.shape

(408, 25)

In [20]:
training_set.reset_index()
test_set.reset_index()

Unnamed: 0,index,state_abbr,year,population,violent_crime,homicide,rape,robbery,aggravated_assault,property_crime,...,rape_1000,robbery_1000,aggravated_assault_1000,property_crime_1000,burglary_1000,larceny_1000,motor_vehicle_theft_1000,arson_1000,avg_unemployment_rate,avg_CPI
0,34,AL,2013,4833996,20834,346,2055,4645,13788,161835,...,0.425114,0.960903,2.852299,33.478513,8.773280,22.520085,2.185149,0.157427,7.325000,232.951750
1,35,AL,2014,4846411,20727,276,2005,4702,13744,154087,...,0.413708,0.970202,2.835913,31.794043,8.196375,21.505192,2.092476,0.256891,6.733333,236.715000
2,36,AL,2015,4853875,22957,348,2037,4612,15960,144785,...,0.419665,0.950169,3.288095,29.828745,7.265329,20.433571,2.129845,0.158636,6.125000,237.001750
3,37,AL,2016,4860545,25878,407,1915,4687,18869,143259,...,0.393989,0.964295,3.882075,29.473855,7.004359,20.059067,2.410429,0.216848,5.908333,240.005417
4,38,AL,2017,4875120,25469,419,2001,4233,18816,143774,...,0.410451,0.868286,3.859597,29.491377,6.414201,20.452215,2.624961,0.144202,4.583333,245.135500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,2137,DC,2016,684336,8236,136,533,3500,4067,32377,...,0.778857,5.114447,5.942987,47.311555,3.450060,39.584356,4.277139,0.301022,6.233333,240.005417
404,2138,DC,2017,695691,6976,116,444,2625,3791,29736,...,0.638214,3.773227,5.449258,42.743114,2.600292,36.424217,3.718605,0.321982,6.133333,245.135500
405,2139,DC,2018,701547,6995,160,450,2415,3970,30726,...,0.641440,3.442392,5.658922,43.797493,2.548653,37.552723,3.696117,0.276532,5.700000,251.102333
406,2140,DC,2019,708253,7403,166,345,2714,4178,30819,...,0.487114,3.831964,5.899022,43.514111,2.602177,37.617913,3.294021,0.228732,5.325000,255.652583


In [21]:
training_set.head()

Unnamed: 0,state_abbr,year,population,violent_crime,homicide,rape,robbery,aggravated_assault,property_crime,burglary,...,rape_1000,robbery_1000,aggravated_assault_1000,property_crime_1000,burglary_1000,larceny_1000,motor_vehicle_theft_1000,arson_1000,avg_unemployment_rate,avg_CPI
0,AL,1979,3769000,15578,496,1037,4127,9918,144372,48517,...,0.275139,1.094985,2.631467,38.305121,12.872645,22.231626,3.200849,0.067392,7.225,72.583333
1,AL,1980,3861466,17320,509,1158,5102,10551,173191,58952,...,0.299886,1.32126,2.732382,44.851101,15.266741,26.422348,3.162012,0.287197,8.816667,82.383333
2,AL,1981,3916000,18423,465,1021,4952,11985,173411,56811,...,0.260725,1.264556,3.060521,44.282686,14.507406,26.93335,2.841931,0.305158,10.691667,90.933333
3,AL,1982,3943000,17653,417,1026,4417,11793,165048,49531,...,0.260208,1.120213,2.99087,41.858483,12.561755,26.56353,2.733198,0.273396,13.95,96.533333
4,AL,1983,3959000,16471,364,931,3895,11281,145890,42485,...,0.23516,0.983834,2.849457,36.850215,10.731245,23.813842,2.305128,0.24779,13.816667,99.583333


In [22]:
test_set.head()

Unnamed: 0,state_abbr,year,population,violent_crime,homicide,rape,robbery,aggravated_assault,property_crime,burglary,...,rape_1000,robbery_1000,aggravated_assault_1000,property_crime_1000,burglary_1000,larceny_1000,motor_vehicle_theft_1000,arson_1000,avg_unemployment_rate,avg_CPI
34,AL,2013,4833996,20834,346,2055,4645,13788,161835,42410,...,0.425114,0.960903,2.852299,33.478513,8.77328,22.520085,2.185149,0.157427,7.325,232.95175
35,AL,2014,4846411,20727,276,2005,4702,13744,154087,39723,...,0.413708,0.970202,2.835913,31.794043,8.196375,21.505192,2.092476,0.256891,6.733333,236.715
36,AL,2015,4853875,22957,348,2037,4612,15960,144785,35265,...,0.419665,0.950169,3.288095,29.828745,7.265329,20.433571,2.129845,0.158636,6.125,237.00175
37,AL,2016,4860545,25878,407,1915,4687,18869,143259,34045,...,0.393989,0.964295,3.882075,29.473855,7.004359,20.059067,2.410429,0.216848,5.908333,240.005417
38,AL,2017,4875120,25469,419,2001,4233,18816,143774,31270,...,0.410451,0.868286,3.859597,29.491377,6.414201,20.452215,2.624961,0.144202,4.583333,245.1355


In [23]:
training_set.to_csv('../data/training_set.csv', index=False)
test_set.to_csv('../data/test_set.csv', index=False)