# Violent and non-violent crime prediciton for Chicago beats for next week

Plan:
1. EDA and feature engineering and extraction
2. recent data is usually more predictive then aged data, tentatively limit the data from past 3 years 2012 - 2015
3. join table with SP500 daily price table from 2012 - present by date
4. features to extract:
   - year, month, week of the year
   - beat number 
   - num of Arrest, num of Domestic
   - IUCR
   - Primary Type_violent, Primary Type_nonviolent (split Primary Type into two groups for violent and nonviolent crime)
   - average weekly SP500 price
5. split extracted fields into two subset RDD by primary type - violentRDD, nonviolentRDD
6. prepare data parallelly for two RDDs and enter random forest regressor in MLlib
7. eveluate model performance by calculating mean square error on testdata for both violent and nonviolent models
8. make prediction for next week's violent and nonviolent crime using new data
   
   
ref: 
http://pbpython.com/categorical-encoding.html
https://pypi.python.org/pypi/geopy
https://stackoverflow.com/questions/31257077/how-do-you-perform-basic-joins-of-two-rdd-tables-in-spark-using-python
https://stackoverflow.com/questions/39699107/spark-rdd-to-dataframe-python
https://sparkour.urizone.net/recipes/controlling-schema/#02
https://github.com/jhlch/ds-for-telco/blob/master/ds-for-telco.ipynb

In [42]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import csv
#from geopy.geocoders import Nominatim
#geolocator = Nominatim()https://sparkour.urizone.net/recipes/controlling-schema/#02

In [1]:
from pyspark.sql.functions import *
from pyspark.context import SparkContext 
from pyspark.sql import SQLContext
from pyspark.sql.types import DoubleType
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StringType
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
import pyspark.rdd
from datetime import datetime, date
import re
from pyspark.mllib.stat import Statistics
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.util import MLUtils


sc = SparkContext.getOrCreate()

In [250]:
#use when done with current pyspark context
sc.stop()

In [2]:
sqlContext = SQLContext(sc)
#sc.setLogLevel("ERROR")

In [3]:
crime = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('Crimes_-_2001_to_present.csv')
crime.show(3)

+--------+-----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|                Date|               Block|IUCR|        Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|10078659|   HY267429|05/19/2015 11:57:...|     010XX E 79TH ST|143A|   WEAPONS VIOLATION|UNLAWFUL POSS OF ...|             

In [4]:
#create column month, day, year from column Date 
#substring(Column str, int pos, int len)
crime=crime.withColumn("month", substring(col("Date"),1,2))
crime=crime.withColumn("day", substring(col("Date"),4,2))

In [6]:
crime.show(3)

+--------+-----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+-----+---+
|      ID|Case Number|                Date|               Block|IUCR|        Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|month|day|
+--------+-----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+-----+---+
|10078659|   HY267429|05/19/2015 11:57:...|     010XX E 79TH ST|143A|   WEAPONS VIOLATION|UNLA

In [5]:
#read in SP500 2001 to 2015 daily price dataset:
SP500 = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('SP500_2001_2015.csv')
SP500.show(3)

+--------------------+-----------+-----------+-----------+-----------+-----------+----------+
|                Date|       Open|       High|        Low|      Close|  Adj Close|    Volume|
+--------------------+-----------+-----------+-----------+-----------+-----------+----------+
|2001-01-02 00:00:...|1320.280029|1320.280029|1276.050049| 1283.27002| 1283.27002|1129400000|
|2001-01-03 00:00:...| 1283.27002| 1347.76001|1274.619995|1347.560059|1347.560059|1880700000|
|2001-01-04 00:00:...|1347.560059| 1350.23999|1329.140015|1333.339966|1333.339966|2131000000|
+--------------------+-----------+-----------+-----------+-----------+-----------+----------+
only showing top 3 rows



In [6]:
#create column month, day, year from column Date for table SP500
SP500=SP500.withColumn("year", substring(col("Date"),1,4))
SP500=SP500.withColumn("month", substring(col("Date"),6,2))
SP500=SP500.withColumn("day", substring(col("Date"),9,2))
SP500=SP500.drop(SP500.Date)
SP500.show(3)

+-----------+-----------+-----------+-----------+-----------+----------+----+-----+---+
|       Open|       High|        Low|      Close|  Adj Close|    Volume|year|month|day|
+-----------+-----------+-----------+-----------+-----------+----------+----+-----+---+
|1320.280029|1320.280029|1276.050049| 1283.27002| 1283.27002|1129400000|2001|   01| 02|
| 1283.27002| 1347.76001|1274.619995|1347.560059|1347.560059|1880700000|2001|   01| 03|
|1347.560059| 1350.23999|1329.140015|1333.339966|1333.339966|2131000000|2001|   01| 04|
+-----------+-----------+-----------+-----------+-----------+----------+----+-----+---+
only showing top 3 rows



In [7]:
#Inner join crime table with SP500 table by {year, month,day}
joined=crime.join(SP500, (crime.Year == SP500.year) & (crime.month == SP500.month) & (crime.day == SP500.day)).drop(SP500.year).drop(SP500.month).drop(SP500.day)
#sqlContext.sql('SELECT * FROM df1 JOIN df2 ON df1.k = df2.k')
joined.take(3)

[Row(ID=10078659, Case Number='HY267429', Date='05/19/2015 11:57:00 PM', Block='010XX E 79TH ST', IUCR='143A', Primary Type='WEAPONS VIOLATION', Description='UNLAWFUL POSS OF HANDGUN', Location Description='STREET', Arrest=True, Domestic=False, Beat=624, District=6, Ward=8, Community Area='44', FBI Code='15', X Coordinate=1184626, Y Coordinate=1852799, Year=2015, Updated On='05/26/2015 12:42:06 PM', Latitude=41.751242944, Longitude=-87.599004724, Location='(41.751242944, -87.599004724)', month='05', day='19', Open=2129.449951, High=2133.02002, Low=2124.5, Close=2127.830078, Adj Close=2127.830078, Volume=3296030000),
 Row(ID=10078598, Case Number='HY267408', Date='05/19/2015 11:50:00 PM', Block='067XX N SHERIDAN RD', IUCR='3731', Primary Type='INTERFERENCE WITH PUBLIC OFFICER', Description='OBSTRUCTING IDENTIFICATION', Location Description='STREET', Arrest=True, Domestic=False, Beat=2432, District=24, Ward=49, Community Area='1', FBI Code='24', X Coordinate=1167071, Y Coordinate=1944859

In [8]:
#feature creation 1: week of the month
joined=joined.withColumn("weekOfMonth", col("day").cast(IntegerType())/7)
#feature creation 2: shortdate, time
joined=joined.withColumn("trimDate", substring(col("Date"),1,10))
joined=joined.withColumn("time", substring(col("Date"),12,11))
joined.show(3)
#type(joined)

+--------+-----------+--------------------+-------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+-----+---+-----------+----------+------+-----------+-----------+----------+------------------+----------+-----------+
|      ID|Case Number|                Date|              Block|IUCR|        Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|month|day|       Open|      High|   Low|      Close|  Adj Close|    Volume|       weekOfMonth|  trimDate|       time|
+--------+-----------+--------------------+-------------------+----+--------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+---

In [9]:
#convert spark joined dataframe back to RDD
rdd = joined.rdd.map(list)
rdd.take(3)

[[10078659,
  'HY267429',
  '05/19/2015 11:57:00 PM',
  '010XX E 79TH ST',
  '143A',
  'WEAPONS VIOLATION',
  'UNLAWFUL POSS OF HANDGUN',
  'STREET',
  True,
  False,
  624,
  6,
  8,
  '44',
  '15',
  1184626,
  1852799,
  2015,
  '05/26/2015 12:42:06 PM',
  41.751242944,
  -87.599004724,
  '(41.751242944, -87.599004724)',
  '05',
  '19',
  2129.449951,
  2133.02002,
  2124.5,
  2127.830078,
  2127.830078,
  3296030000,
  2.7142857142857144,
  '05/19/2015',
  '11:57:00 PM'],
 [10078598,
  'HY267408',
  '05/19/2015 11:50:00 PM',
  '067XX N SHERIDAN RD',
  '3731',
  'INTERFERENCE WITH PUBLIC OFFICER',
  'OBSTRUCTING IDENTIFICATION',
  'STREET',
  True,
  False,
  2432,
  24,
  49,
  '1',
  '24',
  1167071,
  1944859,
  2015,
  '05/26/2015 12:42:06 PM',
  42.004255918,
  -87.660691083,
  '(42.004255918, -87.660691083)',
  '05',
  '19',
  2129.449951,
  2133.02002,
  2124.5,
  2127.830078,
  2127.830078,
  3296030000,
  2.7142857142857144,
  '05/19/2015',
  '11:50:00 PM'],
 [10078625,
  '

In [10]:
#filter and subset to get data in recent 3 years 2013-2015
rdd =rdd.filter(lambda x: int(x[17]) >=2013)
rdd.take(3)

[[10078659,
  'HY267429',
  '05/19/2015 11:57:00 PM',
  '010XX E 79TH ST',
  '143A',
  'WEAPONS VIOLATION',
  'UNLAWFUL POSS OF HANDGUN',
  'STREET',
  True,
  False,
  624,
  6,
  8,
  '44',
  '15',
  1184626,
  1852799,
  2015,
  '05/26/2015 12:42:06 PM',
  41.751242944,
  -87.599004724,
  '(41.751242944, -87.599004724)',
  '05',
  '19',
  2129.449951,
  2133.02002,
  2124.5,
  2127.830078,
  2127.830078,
  3296030000,
  2.7142857142857144,
  '05/19/2015',
  '11:57:00 PM'],
 [10078598,
  'HY267408',
  '05/19/2015 11:50:00 PM',
  '067XX N SHERIDAN RD',
  '3731',
  'INTERFERENCE WITH PUBLIC OFFICER',
  'OBSTRUCTING IDENTIFICATION',
  'STREET',
  True,
  False,
  2432,
  24,
  49,
  '1',
  '24',
  1167071,
  1944859,
  2015,
  '05/26/2015 12:42:06 PM',
  42.004255918,
  -87.660691083,
  '(42.004255918, -87.660691083)',
  '05',
  '19',
  2129.449951,
  2133.02002,
  2124.5,
  2127.830078,
  2127.830078,
  3296030000,
  2.7142857142857144,
  '05/19/2015',
  '11:50:00 PM'],
 [10078625,
  '

In [11]:
#use regular expression to filter and get rows with beat - 3-4 digits number
rdd = rdd.filter( lambda x: int(x[10])>=100 and int(x[10])<=9999)
rdd.take(3)

[[10078659,
  'HY267429',
  '05/19/2015 11:57:00 PM',
  '010XX E 79TH ST',
  '143A',
  'WEAPONS VIOLATION',
  'UNLAWFUL POSS OF HANDGUN',
  'STREET',
  True,
  False,
  624,
  6,
  8,
  '44',
  '15',
  1184626,
  1852799,
  2015,
  '05/26/2015 12:42:06 PM',
  41.751242944,
  -87.599004724,
  '(41.751242944, -87.599004724)',
  '05',
  '19',
  2129.449951,
  2133.02002,
  2124.5,
  2127.830078,
  2127.830078,
  3296030000,
  2.7142857142857144,
  '05/19/2015',
  '11:57:00 PM'],
 [10078598,
  'HY267408',
  '05/19/2015 11:50:00 PM',
  '067XX N SHERIDAN RD',
  '3731',
  'INTERFERENCE WITH PUBLIC OFFICER',
  'OBSTRUCTING IDENTIFICATION',
  'STREET',
  True,
  False,
  2432,
  24,
  49,
  '1',
  '24',
  1167071,
  1944859,
  2015,
  '05/26/2015 12:42:06 PM',
  42.004255918,
  -87.660691083,
  '(42.004255918, -87.660691083)',
  '05',
  '19',
  2129.449951,
  2133.02002,
  2124.5,
  2127.830078,
  2127.830078,
  3296030000,
  2.7142857142857144,
  '05/19/2015',
  '11:50:00 PM'],
 [10078625,
  '

In [12]:
#create new feature week of the year from field 31 trimDate
#https://stackoverflow.com/questions/2600775/how-to-get-week-number-in-python
#(ID, weekOfYear)
weekOfYear=rdd.map(lambda x: (x[0],datetime.strptime(x[2],'%m/%d/%Y %I:%M:%S %p').date().isocalendar()[1]))
#weekOfYear=rdd.map(lambda x: (x[1],datetime.strptime(x[31],'%m/%d/%y').date().isocalendar()[1]))

weekOfYear.take(3)

[(10078659, 21), (10078598, 21), (10078625, 21)]

In [13]:
#split rdd3 into violent crime rdd, and non-violent crime rdd:
#define violent crime to be Primary Type = "Battery", "Assault", "BURGLARY", "Robbery" and "Homicide"
violentRDD = rdd.filter(lambda x: x[5] in ('ASSAULT', 'BATTERY','BURGLARY','ROBBERY','HOMICIDE'))
violentRDD.take(5)

[[10078662,
  'HY267423',
  '05/19/2015 11:46:00 PM',
  '015XX E 62ND ST',
  '051A',
  'ASSAULT',
  'AGGRAVATED: HANDGUN',
  'APARTMENT',
  False,
  True,
  314,
  3,
  5,
  '42',
  '04A',
  1187377,
  1864316,
  2015,
  '05/26/2015 12:42:06 PM',
  41.782781732,
  -87.588558362,
  '(41.782781732, -87.588558362)',
  '05',
  '19',
  2129.449951,
  2133.02002,
  2124.5,
  2127.830078,
  2127.830078,
  3296030000,
  2.7142857142857144,
  '05/19/2015',
  '11:46:00 PM'],
 [10078629,
  'HY267393',
  '05/19/2015 11:40:00 PM',
  '013XX S LAWNDALE AVE',
  '0454',
  'BATTERY',
  'AGG PO HANDS NO/MIN INJURY',
  'STREET',
  True,
  False,
  1011,
  10,
  24,
  '29',
  '08B',
  1151957,
  1893696,
  2015,
  '05/26/2015 12:42:06 PM',
  41.864172884,
  -87.717647622,
  '(41.864172884, -87.717647622)',
  '05',
  '19',
  2129.449951,
  2133.02002,
  2124.5,
  2127.830078,
  2127.830078,
  3296030000,
  2.7142857142857144,
  '05/19/2015',
  '11:40:00 PM'],
 [10079225,
  'HY267395',
  '05/19/2015 11:30:00

In [14]:
nonviolentRDD=rdd.filter(lambda x: x[5] not in ('ASSAULT', 'BATTERY','BURGLARY','ROBBERY','HOMICIDE'))
nonviolentRDD.take(5)

[[10078659,
  'HY267429',
  '05/19/2015 11:57:00 PM',
  '010XX E 79TH ST',
  '143A',
  'WEAPONS VIOLATION',
  'UNLAWFUL POSS OF HANDGUN',
  'STREET',
  True,
  False,
  624,
  6,
  8,
  '44',
  '15',
  1184626,
  1852799,
  2015,
  '05/26/2015 12:42:06 PM',
  41.751242944,
  -87.599004724,
  '(41.751242944, -87.599004724)',
  '05',
  '19',
  2129.449951,
  2133.02002,
  2124.5,
  2127.830078,
  2127.830078,
  3296030000,
  2.7142857142857144,
  '05/19/2015',
  '11:57:00 PM'],
 [10078598,
  'HY267408',
  '05/19/2015 11:50:00 PM',
  '067XX N SHERIDAN RD',
  '3731',
  'INTERFERENCE WITH PUBLIC OFFICER',
  'OBSTRUCTING IDENTIFICATION',
  'STREET',
  True,
  False,
  2432,
  24,
  49,
  '1',
  '24',
  1167071,
  1944859,
  2015,
  '05/26/2015 12:42:06 PM',
  42.004255918,
  -87.660691083,
  '(42.004255918, -87.660691083)',
  '05',
  '19',
  2129.449951,
  2133.02002,
  2124.5,
  2127.830078,
  2127.830078,
  3296030000,
  2.7142857142857144,
  '05/19/2015',
  '11:50:00 PM'],
 [10078625,
  '

# Weekly Violent Crime Prediction for Chicago Beats in the past three years 

In [16]:
#select useful features in violentRDD: ID|Case Number|Date|IUCR|Primary Type|Arrest|Domestic|Beat|Year|adj close
violentRDD2=violentRDD.map(lambda x: (x[0],(x[1],x[2],x[4],x[5],x[8],x[9],x[10],x[17],x[28])))
violentRDD2.take(5)

[(10078662,
  ('HY267423',
   '05/19/2015 11:46:00 PM',
   '051A',
   'ASSAULT',
   False,
   True,
   314,
   2015,
   2127.830078)),
 (10078629,
  ('HY267393',
   '05/19/2015 11:40:00 PM',
   '0454',
   'BATTERY',
   True,
   False,
   1011,
   2015,
   2127.830078)),
 (10079225,
  ('HY267395',
   '05/19/2015 11:30:00 PM',
   '0497',
   'BATTERY',
   False,
   True,
   725,
   2015,
   2127.830078)),
 (10078618,
  ('HY267392',
   '05/19/2015 11:30:00 PM',
   '0320',
   'ROBBERY',
   False,
   False,
   813,
   2015,
   2127.830078)),
 (10078652,
  ('HY267402',
   '05/19/2015 11:20:00 PM',
   '0320',
   'ROBBERY',
   False,
   False,
   2423,
   2015,
   2127.830078))]

In [18]:
#joined weekOfYear back to violentRDD2:
violentRDD3=violentRDD2.join(weekOfYear)
violentRDD3.take(5)

[(9775788,
  (('HX425435',
    '09/12/2014 10:50:00 AM',
    '0486',
    'BATTERY',
    False,
    True,
    222,
    2014,
    1985.540039),
   37)),
 (10059786,
  (('HY248556',
    '05/04/2015 01:30:00 PM',
    '0460',
    'BATTERY',
    False,
    False,
    1021,
    2015,
    2114.48999),
   19)),
 (9895952,
  (('HX546318',
    '12/17/2014 10:40:00 PM',
    '0460',
    'BATTERY',
    True,
    False,
    2023,
    2014,
    2012.890015),
   51)),
 (9601042,
  (('HX251469',
    '05/06/2014 07:40:00 PM',
    '0460',
    'BATTERY',
    False,
    False,
    1034,
    2014,
    1867.719971),
   19)),
 (9207836,
  (('HW353936',
    '07/08/2013 08:08:00 PM',
    '0486',
    'BATTERY',
    True,
    False,
    831,
    2013,
    1640.459961),
   28))]

In [19]:
#calculate number of crimes by (beat, year, weekOfYear) as response variable
pairs=violentRDD3.map(lambda x: ((x[1][0][6],x[1][0][7],x[1][1]),1))
#pairs.take(3)
counts=pairs.reduceByKey(lambda x,y: int(x)+int(y))
counts.take(3)

[((2431, 2013, 7), 5), ((1234, 2014, 25), 4), ((1911, 2013, 9), 5)]

In [20]:
#calculate number of arrest by (beat,year, weekOfYear) as predictor
pairs2 = violentRDD3.filter(lambda x: x[1][0][4]==True).map(lambda x:((x[1][0][6],x[1][0][7],x[1][1]),1))
#pairs2.take(3)
counts2=pairs2.reduceByKey(lambda x,y: int(x)+int(y))
counts2.take(3)

[((715, 2013, 17), 1), ((2431, 2013, 7), 2), ((124, 2013, 24), 1)]

In [21]:
#calculate number of domestic by (beat,year, weekOfYear) as predictor
pairs3 = violentRDD3.filter(lambda x: x[1][0][5]==True).map(lambda x:((x[1][0][6],x[1][0][7],x[1][1]),1))
#pairs3.take(3)
counts3=pairs3.reduceByKey(lambda x,y: int(x)+int(y))
counts3.take(3)

[((715, 2013, 17), 1), ((2431, 2013, 7), 2), ((1613, 2014, 38), 1)]

In [22]:
#calculate average adj close SP500 by (year, weekOfYear)
#calculate counts
pairs4=violentRDD3.map(lambda x: ((x[1][0][7],x[1][1]),1))
#pairs.take(3)
counts4=pairs4.reduceByKey(lambda x,y: int(x)+int(y))
#counts4.take(3)

#calculate sum
pairs4_2 =violentRDD3.map(lambda x: ((x[1][0][7],x[1][1]),x[1][0][8]))
#pairs4_2.take(3)
sum=pairs4_2.reduceByKey(lambda x,y: int(x)+int(y))
#sum.take(3)

#[(year, weekOfYear),weeklyAvePrice]
weeklyjoin=sum.join(counts4)
#weeklyjoin.take(3)
weeklyAvePrice=weeklyjoin.map(lambda x: (x[0],x[1][0]/x[1][1]))
weeklyAvePrice.take(3)
#weeklyAvePrice.count() #125

[((2013, 33), 1676.6430107526883),
 ((2014, 28), 1968.7817294281729),
 ((2014, 10), 1869.6590229312064)]

In [23]:
#joined all the features into one rdd: [(beat,year,weekOfYear), num of crime, num of arrest, num of domestic]
featuresRDD=counts.join(counts2).join(counts3)
featuresRDD.take(3)

[((715, 2013, 17), ((3, 1), 1)),
 ((2431, 2013, 7), ((5, 2), 2)),
 ((2522, 2014, 49), ((4, 1), 1))]

In [24]:
#re-arrange featuresRDD to to join with weeklyAvePrice: 
#[(year, weekOfYear),beat,num of crime, num of arrest, num of domestic]
featuresRDD=featuresRDD.map(lambda x: ((x[0][1],x[0][2]),(x[0][0],x[1][0][0],x[1][0][1],x[1][1])))
featuresRDD.take(3)
#featuresRDD.count() #13729

[((2013, 17), (715, 3, 1, 1)),
 ((2013, 7), (2431, 5, 2, 2)),
 ((2014, 49), (2522, 4, 1, 1))]

In [25]:
featuresRDD=featuresRDD.join(weeklyAvePrice)
#featuresRDD.count() #13729
featuresRDD.take(3)

[((2013, 33), ((2223, 9, 2, 2), 1676.6430107526883)),
 ((2013, 33), ((2533, 7, 1, 3), 1676.6430107526883)),
 ((2013, 33), ((631, 19, 3, 8), 1676.6430107526883))]

In [26]:
#re-arrange fields and drop year field to enter MLlib
#[number of crimes, weekOfYear, beat, num of arrest, num of domestic, weeklyAvePrice]
featuresRDD=featuresRDD.map(lambda x: (x[1][0][1],x[0][1],x[1][0][0],x[1][0][2],x[1][0][3],x[1][1]))
featuresRDD.take(3)

[(9, 33, 2223, 2, 2, 1676.6430107526883),
 (7, 33, 2533, 1, 3, 1676.6430107526883),
 (19, 33, 631, 3, 8, 1676.6430107526883)]

In [27]:
#Create dictionary starting at index = 0, ending at index = n-1, for categorical features
#where n is the number of categories for feature
#trees for categorical features require start at index 0

#create category index for weekOfYear starting at index 0:
weekIndex = dict(zip(range(1,54), range(0,53)))

In [28]:
#create category index for best starting at index 0:
beat = featuresRDD.map(lambda x: x[2]).distinct().sortBy(lambda x: x ) # 270 records
#beat.count()
beatIndex = dict(beat.zipWithIndex().map(lambda x: (x[0],x[1])).collect())

In [29]:
#[(crimes, [weekIndex, beatIndex, num of arrest, num of domestic, weeklyAvePrice])]
features = featuresRDD.map(lambda x: (x[0],[weekIndex[x[1]],beatIndex[x[2]],x[3],x[4],x[5]]))
features = features.filter(lambda x: re.findall("[0-9]+", str(x[1][1])))
#features.map(lambda x:x[1][1]).take(3)
features.take(3)

[(9, [32, 242, 2, 2, 1676.6430107526883]),
 (7, [32, 267, 1, 3, 1676.6430107526883]),
 (19, [32, 67, 3, 8, 1676.6430107526883])]

In [30]:
# convert to labeled point, (response, [predictors])
featuresLP = features.map(lambda x: LabeledPoint(x[0],x[1]))

In [31]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = featuresLP.randomSplit([0.7, 0.3])

In [32]:
featuresIndex = {0:270}
#maxBins = max(len(beatIndex),len(weekIndex)) 
#ecisionTree requires maxBins >= max categories in categorical features (270)


In [33]:
### random forest fit for violent crime prediction
violentModel = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo=featuresIndex,
                                    numTrees=5, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=5, maxBins=270)

In [34]:
### Evalute violent crime prediction model
# Evaluate model on test instances and compute test error
predictions = violentModel.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1]))\
    .sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
#print('Learned regression forest model:')

Test Mean Squared Error = 5.5053853821143495


In [37]:
#Prediction
#Predict number of violent crime in chicago beats for next week
nextWeek = datetime.now().isocalendar()[1]
nextWeekPrice=np.mean([2,429.33,2,436.10,2,439.07,2,430.06,2,411.80]) #average SP500 adj close price over past 5 days
#use average num of arrest and average num of domestic as input: 25,23
dataNextWeek = beat.map(lambda x: [weekIndex[nextWeek],beatIndex[x],25,23,nextWeekPrice])

violentCrimeNextWeek = violentModel.predict(dataNextWeek)

# [((week, prediction), beat)]
violentPredictions = violentCrimeNextWeek.map(lambda x: (nextWeek, x)).zip(beat)


In [38]:
# re-arrange into [[week, beat, prediction]]
violentPredictions = violentPredictions.map(lambda x: [x[0][0], x[1], x[0][1]])
violentPredictions.take(5)

[[23, 111, 13.369984891724021],
 [23, 112, 13.369984891724021],
 [23, 113, 13.369984891724021],
 [23, 114, 13.369984891724021],
 [23, 121, 13.369984891724021]]

In [46]:
#write prediction results to local:
violentOutput = violentPredictions.collect()

with open("violentOutput.csv", 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['Test MSE: {0}'.format(testMSE)])
    writer.writerow(['**************'])
    writer.writerow(["week", "beat", "prediction"])
    writer.writerows(violentOutput)

# Weekly Non-violent Crime Prediction for Chicago Beats in the past three years


In [47]:
#select useful features in nonviolentRDD: ID|Case Number|Date|IUCR|Primary Type|Arrest|Domestic|Beat|Year|adj close
nonviolentRDD2=nonviolentRDD.map(lambda x: (x[0],(x[1],x[2],x[4],x[5],x[8],x[9],x[10],x[17],x[28])))
nonviolentRDD2.take(5)

[(10078659,
  ('HY267429',
   '05/19/2015 11:57:00 PM',
   '143A',
   'WEAPONS VIOLATION',
   True,
   False,
   624,
   2015,
   2127.830078)),
 (10078598,
  ('HY267408',
   '05/19/2015 11:50:00 PM',
   '3731',
   'INTERFERENCE WITH PUBLIC OFFICER',
   True,
   False,
   2432,
   2015,
   2127.830078)),
 (10078625,
  ('HY267417',
   '05/19/2015 11:47:00 PM',
   '2170',
   'NARCOTICS',
   True,
   False,
   421,
   2015,
   2127.830078)),
 (10078584,
  ('HY267397',
   '05/19/2015 11:45:00 PM',
   '4625',
   'OTHER OFFENSE',
   True,
   False,
   935,
   2015,
   2127.830078)),
 (10078594,
  ('HY267388',
   '05/19/2015 11:30:00 PM',
   '1305',
   'CRIMINAL DAMAGE',
   True,
   False,
   1434,
   2015,
   2127.830078))]

In [48]:
#joined weekOfYear back to nonviolentRDD2:
nonviolentRDD3=nonviolentRDD2.join(weekOfYear)
nonviolentRDD3.take(5)

[(9076738,
  (('HW221405',
    '04/05/2013 10:30:00 PM',
    '0820',
    'THEFT',
    False,
    False,
    2222,
    2013,
    1553.280029),
   14)),
 (9404450,
  (('HW548048',
    '11/25/2013 06:00:00 PM',
    '0880',
    'THEFT',
    False,
    False,
    1813,
    2013,
    1802.47998),
   48)),
 (9502724,
  (('HX157814',
    '02/18/2014 12:01:00 AM',
    '0820',
    'THEFT',
    False,
    False,
    813,
    2014,
    1840.76001),
   8)),
 (9213270,
  (('HW359283',
    '07/12/2013 01:15:00 PM',
    '0860',
    'THEFT',
    True,
    False,
    411,
    2013,
    1680.189941),
   28)),
 (9956012,
  (('HY144959',
    '02/09/2015 07:00:00 AM',
    '1330',
    'CRIMINAL TRESPASS',
    True,
    False,
    2514,
    2015,
    2046.73999),
   7))]

In [49]:
#calculate number of crimes by (beat, year, weekOfYear) as response variable
pairsNV=nonviolentRDD3.map(lambda x: ((x[1][0][6],x[1][0][7],x[1][1]),1))
#pairsNV.take(3)
countsNV=pairsNV.reduceByKey(lambda x,y: int(x)+int(y))
countsNV.take(3)

[((1511, 2015, 1), 2), ((131, 2014, 2), 5), ((223, 2014, 12), 7)]

In [53]:
#calculate number of arrest by (beat,year, weekOfYear) as predictor
pairsNV2 = nonviolentRDD3.filter(lambda x: x[1][0][4]==True).map(lambda x:((x[1][0][6],x[1][0][7],x[1][1]),1))
#pairsNV2.take(3)
countsNV2=pairsNV2.reduceByKey(lambda x,y: int(x)+int(y))
countsNV2.take(3)

[((1833, 2014, 38), 1), ((233, 2014, 12), 1), ((1122, 2014, 3), 10)]

In [50]:
#calculate number of domestic by (beat,year, weekOfYear) as predictor
pairsNV3 = nonviolentRDD3.filter(lambda x: x[1][0][5]==True).map(lambda x:((x[1][0][6],x[1][0][7],x[1][1]),1))
#pairsNV3.take(3)
countsNV3=pairsNV3.reduceByKey(lambda x,y: int(x)+int(y))
countsNV3.take(3)

[((1511, 2013, 31), 2), ((726, 2013, 4), 1), ((1832, 2014, 3), 1)]

In [51]:
#calculate average adj close SP500 by (year, weekOfYear)
#calculate counts
pairsNV4=nonviolentRDD3.map(lambda x: ((x[1][0][7],x[1][1]),1))
#pairsNV4.take(3)
countsNV4=pairsNV4.reduceByKey(lambda x,y: int(x)+int(y))
#countsNV4.take(3)

#calculate sum
pairsNV4_2 =nonviolentRDD3.map(lambda x: ((x[1][0][7],x[1][1]),x[1][0][8]))
#pairsNV4_2.take(3)
sumNV=pairsNV4_2.reduceByKey(lambda x,y: int(x)+int(y))
#sumNV.take(3)

#[(year, weekOfYear),weeklyAvePrice]
weeklyjoinNV=sumNV.join(countsNV4)
#weeklyjoinNV.take(3)
weeklyAvePriceNV=weeklyjoinNV.map(lambda x: (x[0],x[1][0]/x[1][1]))
weeklyAvePriceNV.take(3)
#weeklyAvePriceNV.count() #125

[((2013, 33), 1676.815995001562),
 ((2014, 28), 1968.725626740947),
 ((2014, 10), 1870.0199291408326)]

In [54]:
#joined all the features into one rdd: [(beat,year,weekOfYear), num of crime, num of arrest, num of domestic]
NVfeaturesRDD=countsNV.join(countsNV2).join(countsNV3)
NVfeaturesRDD.take(3)

[((1533, 2015, 7), ((22, 16), 1)),
 ((1234, 2014, 25), ((9, 2), 1)),
 ((735, 2013, 23), ((10, 5), 1))]

In [55]:
#re-arrange NVfeaturesRDD to to join with weeklyAvePriceNV: 
#[(year, weekOfYear),beat,num of crime, num of arrest, num of domestic]
NVfeaturesRDD=NVfeaturesRDD.map(lambda x: ((x[0][1],x[0][2]),(x[0][0],x[1][0][0],x[1][0][1],x[1][1])))
NVfeaturesRDD.take(3)
#NVfeaturesRDD.count() #13729

[((2015, 7), (1533, 22, 16, 1)),
 ((2014, 25), (1234, 9, 2, 1)),
 ((2013, 23), (735, 10, 5, 1))]

In [56]:
NVfeaturesRDD=NVfeaturesRDD.join(weeklyAvePriceNV)
#NVfeaturesRDD.count() #13729
NVfeaturesRDD.take(3)

[((2013, 33), ((2223, 15, 1, 1), 1676.815995001562)),
 ((2013, 33), ((1215, 10, 1, 1), 1676.815995001562)),
 ((2013, 33), ((631, 16, 9, 1), 1676.815995001562))]

In [57]:
#re-arrange fields and drop year field to enter MLlib
#[number of crimes, weekOfYear, beat, num of arrest, num of domestic, weeklyAvePrice]
NVfeaturesRDD=NVfeaturesRDD.map(lambda x: (x[1][0][1],x[0][1],x[1][0][0],x[1][0][2],x[1][0][3],x[1][1]))
NVfeaturesRDD.take(3)

[(15, 33, 2223, 1, 1, 1676.815995001562),
 (10, 33, 1215, 1, 1, 1676.815995001562),
 (16, 33, 631, 9, 1, 1676.815995001562)]

In [58]:
#create category index for weekOfYear starting at index 0:
weekIndex = dict(zip(range(1,54), range(0,53)))

In [59]:
#create category index for best starting at index 0:
beat2 = NVfeaturesRDD.map(lambda x: x[2]).distinct().sortBy(lambda x: x ) 
#beat2.count() # 271 records
beatIndex2 = dict(beat2.zipWithIndex().map(lambda x: (x[0],x[1])).collect())

In [60]:
#[(crimes, [weekIndex, beatIndex, num of arrest, num of domestic, weeklyAvePrice])]
NVfeatures = NVfeaturesRDD.map(lambda x: (x[0],[weekIndex[x[1]],beatIndex2[x[2]],x[3],x[4],x[5]]))
NVfeatures = NVfeatures.filter(lambda x: re.findall("[0-9]+", str(x[1][1])))
#NVfeatures.map(lambda x:x[1][1]).take(3)
NVfeatures.take(3)

[(15, [32, 243, 1, 1, 1676.815995001562]),
 (10, [32, 147, 1, 1, 1676.815995001562]),
 (16, [32, 67, 9, 1, 1676.815995001562])]

In [61]:
#convert to labeled point, (response, [predictors])
NVfeaturesLP = NVfeatures.map(lambda x: LabeledPoint(x[0],x[1]))

In [62]:
#Split the data into training and test sets (30% held out for testing)
(trainingData2, testData2) = NVfeaturesLP.randomSplit([0.7, 0.3])

In [63]:
NVfeaturesIndex = {0:271}
#maxBins = max(len(beatIndex),len(weekIndex)) 
#ecisionTree requires maxBins >= max categories in categorical features (270)

In [64]:
### random forest fit for nonviolent crime prediction
nonviolentModel = RandomForest.trainRegressor(trainingData2, categoricalFeaturesInfo=NVfeaturesIndex,
                                    numTrees=5, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=5, maxBins=271)

In [65]:
### Evalute nonviolent crime prediction model
# Evaluate model on test instances and compute test error
predictions2 = nonviolentModel.predict(testData2.map(lambda x: x.features))
labelsAndPredictions2 = testData2.map(lambda lp: lp.label).zip(predictions2)
testMSE2 = labelsAndPredictions2.map(lambda v_p1: (v_p1[0] - v_p1[1]) * (v_p1[0] - v_p1[1]))\
    .sum() / float(testData2.count())
print('Test Mean Squared Error = ' + str(testMSE2))
#print('Learned regression forest model:')

Test Mean Squared Error = 11.96188556156307


In [66]:
#Prediction
#Predict number of nonviolent crime in chicago beats for next week
nextWeek = datetime.now().isocalendar()[1]
nextWeekPrice=np.mean([2,429.33,2,436.10,2,439.07,2,430.06,2,411.80]) #average SP500 adj close price over past 5 days
#use average num of arrest and average num of domestic as input: 25,23
dataNextWeek2 = beat2.map(lambda x: [weekIndex[nextWeek],beatIndex2[x],25,23,nextWeekPrice])

nonviolentCrimeNextWeek = nonviolentModel.predict(dataNextWeek2)

# [((week, prediction), beat)]
nonviolentPredictions = nonviolentCrimeNextWeek.map(lambda x: (nextWeek, x)).zip(beat2)

In [67]:
# re-arrange into [[week, beat, prediction]]
nonviolentPredictions = nonviolentPredictions.map(lambda x: [x[0][0], x[1], x[0][1]])
nonviolentPredictions.take(5)

[[23, 111, 24.97424652073861],
 [23, 112, 24.97424652073861],
 [23, 113, 24.97424652073861],
 [23, 114, 24.97424652073861],
 [23, 121, 24.97424652073861]]

In [69]:
#write prediction results to local:
nonviolentOutput = nonviolentPredictions.collect()

with open("nonviolentOutput.csv", 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['Test MSE: {0}'.format(testMSE2)])
    writer.writerow(['**************'])
    writer.writerow(["week", "beat", "prediction"])
    writer.writerows(nonviolentOutput)

In [121]:
#build schema for featuresRDD and convert it back to spark dataframe
#https://sparkour.urizone.net/recipes/controlling-schema/#02
def build_schema():
    """Build and return a schema to use for the sample data."""
    schema = StructType(
        [
            StructField("numOfCrime", IntegerType(), True),
            StructField("weekOfYear", StringType(), True),
            StructField("beat", StringType(), True),
            StructField("numOfArrest", IntegerType(), True),
            StructField("numOfDomestic", IntegerType(), True),
            StructField("weeklyAvePrice", DoubleType(), True),
        ]
    )
    return schema

In [156]:
# convert featuresRDD back to dataframe, specifying schema.
featuresDF = spark.createDataFrame(featuresRDD, schema=build_schema())
featuresDF.printSchema()
featuresDF.show(3)

root
 |-- numOfCrime: integer (nullable = true)
 |-- weekOfYear: string (nullable = true)
 |-- beat: string (nullable = true)
 |-- numOfArrest: integer (nullable = true)
 |-- numOfDomestic: integer (nullable = true)
 |-- weeklyAvePrice: double (nullable = true)

+----------+----------+----+-----------+-------------+------------------+
|numOfCrime|weekOfYear|beat|numOfArrest|numOfDomestic|    weeklyAvePrice|
+----------+----------+----+-----------+-------------+------------------+
|         9|        33|2223|          2|            2|1676.6430107526883|
|         7|        33|2533|          1|            3|1676.6430107526883|
|        19|        33| 631|          3|            8|1676.6430107526883|
+----------+----------+----+-----------+-------------+------------------+
only showing top 3 rows



In [14]:
crime=pd.read_csv('Crimes_-_2001_to_present.csv')


  interactivity=interactivity, compiler=compiler, result=result)


In [52]:
crime.iloc[:,10:13]
#crime[:5]
#crime[-1:]
#crime.iloc[[0, 10, 35549], :]
#crime[]
#crime.iloc[:,0:10].head(5)
#crime[[3]]
#crime[['Ward','District','Community Area']]
#crime[[5]]

Unnamed: 0,Beat,District,Ward
0,624,6.0,8.0
1,2432,24.0,49.0
2,421,4.0,7.0
3,314,3.0,5.0
4,935,9.0,3.0
5,1011,10.0,24.0
6,725,7.0,17.0
7,1434,14.0,32.0
8,1111,11.0,37.0
9,813,8.0,13.0


In [5]:
crime.describe()
list(crime)
crime.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5801844 entries, 0 to 5801843
Data columns (total 22 columns):
ID                      int64
Case Number             object
Date                    object
Block                   object
IUCR                    object
Primary Type            object
Description             object
Location Description    object
Arrest                  bool
Domestic                bool
Beat                    int64
District                float64
Ward                    float64
Community Area          object
FBI Code                object
X Coordinate            float64
Y Coordinate            float64
Year                    int64
Updated On              object
Latitude                float64
Longitude               float64
Location                object
dtypes: bool(2), float64(6), int64(3), object(11)
memory usage: 896.4+ MB


In [8]:
crime["IUCR"].value_counts()
crime["Primary Type"].value_counts()

THEFT                                1197662
BATTERY                              1057801
CRIMINAL DAMAGE                       668132
NARCOTICS                             658978
OTHER OFFENSE                         358098
ASSAULT                               351696
BURGLARY                              341420
MOTOR VEHICLE THEFT                   276437
ROBBERY                               216743
DECEPTIVE PRACTICE                    195123
CRIMINAL TRESPASS                     170286
PROSTITUTION                           65431
WEAPONS VIOLATION                      55637
PUBLIC PEACE VIOLATION                 42005
OFFENSE INVOLVING CHILDREN             36672
SEX OFFENSE                            21125
CRIM SEXUAL ASSAULT                    21119
GAMBLING                               13622
LIQUOR LAW VIOLATION                   13211
INTERFERENCE WITH PUBLIC OFFICER       11158
ARSON                                   9565
HOMICIDE                                7090
KIDNAPPING

In [1]:
#reserse-geocoding trial
coords = crime[['Latitude','Longitude']].head(100)
coords['coords']= coords.Latitude.map(str) + ", " + coords.Longitude.map(str)
locations = geolocator.reverse(coords.iloc[0,:],coords.iloc[0,:])
#print(location.address)
#dir(location)
#coords[[0]]

NameError: name 'crime' is not defined

In [6]:
#from pyspark import SparkContext
from pyspark import SparkContext
import csv
import re
from operator import add
from pyspark.mllib.stat import Statistics
import math
from scipy import stats
#from pyspark.sql import SQLContext

sc = SparkContext.getOrCreate()

In [None]:
#use when done with current pyspark context
sc.stop()

In [None]:
crimeFile = sc.textFile("Crimes_-_2001_to_present.csv")
crimeFile.take(5)