In [1]:
import findspark
findspark.init("/usr/local/spark/spark-2.2.1-bin-hadoop2.7")

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('New York Crime').getOrCreate()

In [3]:
df = spark.read.csv('NYCrime.csv', header= True, inferSchema=True)
df.head(10)

[Row(OBJECTID=1, Identifier='f070032d', Occurrence Date='09/06/1940 07:30:00 PM', Day of Week='Friday', Occurrence Month='Sep', Occurrence Day=6, Occurrence Year=1940, Occurrence Hour='19', CompStat Month='9', CompStat Day=7, CompStat Year=2010, Offense='BURGLARY', Offense Classification='FELONY', Sector='D', Precinct='66', Borough='BROOKLYN', Jurisdiction='N.Y. POLICE DEPT', XCoordinate=987478, YCoordinate=166141, Location 1='(40.6227027620001, -73.9883732929999)'),
 Row(OBJECTID=2, Identifier='c6245d4d', Occurrence Date='12/14/1968 12:20:00 AM', Day of Week='Saturday', Occurrence Month='Dec', Occurrence Day=14, Occurrence Year=1968, Occurrence Hour='0', CompStat Month='12', CompStat Day=14, CompStat Year=2008, Offense='GRAND LARCENY', Offense Classification='FELONY', Sector='G', Precinct='28', Borough='MANHATTAN', Jurisdiction='N.Y. POLICE DEPT', XCoordinate=996470, YCoordinate=232106, Location 1='(40.8037530600001, -73.955861904)'),
 Row(OBJECTID=3, Identifier='716dbc6f', Occurrence

In [5]:
df.printSchema()

root
 |-- OBJECTID: integer (nullable = true)
 |-- Identifier: string (nullable = true)
 |-- Occurrence Date: string (nullable = true)
 |-- Day of Week: string (nullable = true)
 |-- Occurrence Month: string (nullable = true)
 |-- Occurrence Day: integer (nullable = true)
 |-- Occurrence Year: integer (nullable = true)
 |-- Occurrence Hour: string (nullable = true)
 |-- CompStat Month: string (nullable = true)
 |-- CompStat Day: integer (nullable = true)
 |-- CompStat Year: integer (nullable = true)
 |-- Offense: string (nullable = true)
 |-- Offense Classification: string (nullable = true)
 |-- Sector: string (nullable = true)
 |-- Precinct: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Jurisdiction: string (nullable = true)
 |-- XCoordinate: integer (nullable = true)
 |-- YCoordinate: integer (nullable = true)
 |-- Location 1: string (nullable = true)



In [6]:
df.show()

+--------+----------+--------------------+-----------+----------------+--------------+---------------+---------------+--------------+------------+-------------+--------------------+----------------------+------+--------+---------+-------------------+-----------+-----------+--------------------+
|OBJECTID|Identifier|     Occurrence Date|Day of Week|Occurrence Month|Occurrence Day|Occurrence Year|Occurrence Hour|CompStat Month|CompStat Day|CompStat Year|             Offense|Offense Classification|Sector|Precinct|  Borough|       Jurisdiction|XCoordinate|YCoordinate|          Location 1|
+--------+----------+--------------------+-----------+----------------+--------------+---------------+---------------+--------------+------------+-------------+--------------------+----------------------+------+--------+---------+-------------------+-----------+-----------+--------------------+
|       1|  f070032d|09/06/1940 07:30:...|     Friday|             Sep|             6|           1940|          

In [7]:
df.select('Identifier').head(1)[0][0]

'f070032d'

In [8]:
df.select('Offense').head(1)[0][0]

'BURGLARY'

#Displaying number of crime cases of different types

In [9]:
df.groupBy(df.Offense).count().show()

+--------------------+------+
|             Offense| count|
+--------------------+------+
|      FELONY ASSAULT|184042|
|                  NA|     1|
|MURDER & NON-NEGL...|  4574|
|             ROBBERY|198744|
|GRAND LARCENY OF ...|101963|
|                RAPE| 13779|
|       GRAND LARCENY|428993|
|            BURGLARY|191369|
+--------------------+------+



Number of Cases per Year 

In [10]:
df.groupBy(df['Occurrence Year']).count().orderBy(df['Occurrence Year']).show()

+---------------+-----+
|Occurrence Year|count|
+---------------+-----+
|           null|  244|
|           1905|    2|
|           1908|    1|
|           1910|    3|
|           1911|    1|
|           1912|    1|
|           1913|    4|
|           1914|    2|
|           1915|    3|
|           1920|    1|
|           1940|    1|
|           1945|    2|
|           1946|    1|
|           1950|    1|
|           1954|    1|
|           1955|    1|
|           1956|    1|
|           1958|    1|
|           1959|    1|
|           1960|    1|
+---------------+-----+
only showing top 20 rows



Removing Null Values from our data

In [11]:
filteredDF = df.na.drop(subset = ['Offense', 'Occurrence Year'])

#getting cases that happen after 2006

crimesFiltered = filteredDF.filter(df['Occurrence Year'] >= 2006 )

In [12]:
#grouping count of cases happen after 2006
crimesFiltered.groupBy(df['Occurrence Year']).count().orderBy(df['Occurrence Year']).show()

+---------------+------+
|Occurrence Year| count|
+---------------+------+
|           2006|127887|
|           2007|120554|
|           2008|117375|
|           2009|106018|
|           2010|105643|
|           2011|107206|
|           2012|111798|
|           2013|111286|
|           2014|106849|
|           2015|102657|
+---------------+------+



In [13]:
maxloc = df.agg({"Location 1": "max"}).collect()[0][0]
maxloc

'(59.5805088160001, -73.8954321749999)'

In [14]:
minloc = df.agg({"Location 1": "min"}).collect()[0][0]
minloc

'(40.112709974, -77.519206334)'

In [15]:
#filtering data for New York Location only
crimesFiltered = crimesFiltered.filter(df['Location 1'] >= '(40.477399, -74.25909)')
crimesFiltered = crimesFiltered.filter(df['Location 1'] <= '(40.917577, -73.700009)')

In [16]:
#Filtering cases for Burglary only for year >= 2006
filterBurglary = crimesFiltered.filter(df['Offense'] == 'BURGLARY')
filterBurglary.groupBy(df['Occurrence Year']).count().orderBy(df['Occurrence Year']).show()

+---------------+-----+
|Occurrence Year|count|
+---------------+-----+
|           2006|23069|
|           2007|21716|
|           2008|20732|
|           2009|19441|
|           2010|18700|
|           2011|18860|
|           2012|19309|
|           2013|17419|
|           2014|16832|
|           2015|14967|
+---------------+-----+

