In [41]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import pyspark as ps


In [49]:
spark = (ps.sql.SparkSession
         .builder
         .master('local[4]')
         .appName('lecture')
         .getOrCreate()
        )
sc = spark.sparkContext

In [50]:
# Read in as CSV...it's A LOT of data and takes a long time in pandas
crime_raw = pd.read_csv('Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv')
crime_raw


Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,...,Civic Center Harm Reduction Project Boundary,Fix It Zones as of 2017-11-06,DELETE - HSOC Zones,Fix It Zones as of 2018-02-07,"CBD, BID and GBD Boundaries as of 2017","Areas of Vulnerability, 2016",Central Market/Tenderloin Boundary,Central Market/Tenderloin Boundary Polygon - Updated,HSOC Zones as of 2018-06-05,OWED Public Spaces
0,146196161,NON-CRIMINAL,LOST PROPERTY,Tuesday,09/23/2014,01:00,SOUTHERN,NONE,800 Block of BRYANT ST,-122.403405,...,,,,,,2.0,,,,
1,150045675,ASSAULT,BATTERY,Thursday,01/15/2015,17:00,TARAVAL,NONE,1800 Block of VICENTE ST,-122.485604,...,,,,,,1.0,,,,
2,140632022,SUSPICIOUS OCC,INVESTIGATIVE DETENTION,Wednesday,07/30/2014,09:32,BAYVIEW,NONE,100 Block of GILLETTE AV,-122.396535,...,,,,,,1.0,,,,
3,150383259,ASSAULT,BATTERY,Saturday,05/02/2015,23:10,BAYVIEW,"ARREST, BOOKED",2400 Block of PHELPS ST,-122.400131,...,,,,,,2.0,,,,
4,40753980,OTHER OFFENSES,RECKLESS DRIVING,Friday,07/02/2004,13:43,BAYVIEW,NONE,I-280 / CESAR CHAVEZ ST,-120.500000,...,,,,,,,,,,
5,40855122,SUICIDE,SUICIDE BY JUMPING,Tuesday,07/27/2004,15:19,SOUTHERN,NONE,500 Block of I-80,-122.386667,...,,,,,,,,,,
6,66085191,NON-CRIMINAL,LOST PROPERTY,Sunday,11/19/2006,17:45,BAYVIEW,NONE,0 Block of GIANTS DR,-122.387501,...,,,,,,2.0,,,,
7,50908404,VEHICLE THEFT,STOLEN AUTOMOBILE,Saturday,08/13/2005,17:00,TENDERLOIN,NONE,JENNINGS CT / INGALLS ST,-120.500000,...,,,,,,,,,,
8,90768064,ARSON,ARSON OF A VEHICLE,Tuesday,07/28/2009,23:26,BAYVIEW,NONE,SELBY ST / OAKDALE AV,-122.399686,...,,,,,,2.0,,,,
9,111027676,ASSAULT,BATTERY,Saturday,12/24/2011,07:00,SOUTHERN,NONE,0 Block of DORE ST,-122.412933,...,1.0,,1.0,,,2.0,,,1.0,


In [51]:
crime_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2215024 entries, 0 to 2215023
Data columns (total 33 columns):
IncidntNum                                              int64
Category                                                object
Descript                                                object
DayOfWeek                                               object
Date                                                    object
Time                                                    object
PdDistrict                                              object
Resolution                                              object
Address                                                 object
X                                                       float64
Y                                                       float64
Location                                                object
PdId                                                    int64
SF Find Neighborhoods                                   float64
Curr

In [52]:
crime_raw['Descript'].value_counts()

GRAND THEFT FROM LOCKED AUTO                              178836
LOST PROPERTY                                              77956
BATTERY                                                    67654
STOLEN AUTOMOBILE                                          64763
DRIVERS LICENSE, SUSPENDED OR REVOKED                      62534
AIDED CASE, MENTAL DISTURBED                               56313
WARRANT ARREST                                             56230
SUSPICIOUS OCCURRENCE                                      52490
PETTY THEFT FROM LOCKED AUTO                               51946
PETTY THEFT OF PROPERTY                                    46114
MALICIOUS MISCHIEF, VANDALISM                              45238
MALICIOUS MISCHIEF, VANDALISM OF VEHICLES                  44290
TRAFFIC VIOLATION                                          38105
THREATS AGAINST LIFE                                       34670
FOUND PROPERTY                                             33383
GRAND THEFT OF PROPERTY  

In [53]:
# Check out some value counts for each columns
columns = crime_raw.columns
for col in columns:
    print(col)
    print(crime_raw[col].value_counts(dropna=False))

IncidntNum
180187811    23
140135145    19
171000725    18
160629623    17
170559220    16
160174703    15
160354262    14
170186986    14
151121761    14
160434193    14
170265847    13
130994393    13
160696202    13
150109908    13
151038144    13
160263431    13
120772983    13
170614846    13
120571133    12
180269635    12
81314301     12
120931559    12
120932648    12
140628203    12
81309841     12
180010911    12
130247982    12
160538949    11
170784869    11
180089217    11
             ..
61174181      1
136168697     1
40182191      1
166058416     1
40225202      1
140900788     1
40057196      1
106032094     1
50664270      1
90390883      1
60887337      1
60885290      1
140583215     1
90306868      1
40751138      1
90282296      1
60916027      1
160650571     1
60928317      1
60926270      1
150437618     1
60965187      1
116150250     1
170025287     1
156140904     1
160429536     1
40022365      1
166064742     1
61032802      1
160819761     1
Name: Incidnt

# Read data in as Spark DF and create temp SQL for initial queries. Once data is pared down, then transfer to pandas and plot

In [54]:
crime = spark.read.csv('Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv',
                        header=True,
                        quote='"',
                        sep=',',
                        inferSchema=True)


In [55]:
crime.printSchema()

root
 |-- IncidntNum: integer (nullable = true)
 |-- Category: string (nullable = true)
 |-- Descript: string (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- PdDistrict: string (nullable = true)
 |-- Resolution: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- PdId: long (nullable = true)
 |-- SF Find Neighborhoods: integer (nullable = true)
 |-- Current Police Districts: integer (nullable = true)
 |-- Current Supervisor Districts: integer (nullable = true)
 |-- Analysis Neighborhoods: integer (nullable = true)
 |-- DELETE - Fire Prevention Districts: integer (nullable = true)
 |-- DELETE - Police Districts: integer (nullable = true)
 |-- DELETE - Supervisor Districts: integer (nullable = true)
 |-- DELETE - Zip Codes: integer (nullable = true)
 |-- DELETE - Neighborhoods: in

In [83]:
# Drop additional columns
print(columns)
columns_to_drop = list(columns[17:])
columns_to_drop += ['IndicntNum', 'PdId', 'Current Police Districts',
                    'Current Supervisor Districts','Location']
columns_to_drop

['DELETE - Fire Prevention Districts',
 'DELETE - Police Districts',
 'DELETE - Supervisor Districts',
 'DELETE - Zip Codes',
 'DELETE - Neighborhoods',
 'DELETE - 2017 Fix It Zones',
 'Civic Center Harm Reduction Project Boundary',
 'Fix It Zones as of 2017-11-06 ',
 'DELETE - HSOC Zones',
 'Fix It Zones as of 2018-02-07',
 'CBD, BID and GBD Boundaries as of 2017',
 'Areas of Vulnerability, 2016',
 'Central Market/Tenderloin Boundary',
 'Central Market/Tenderloin Boundary Polygon - Updated',
 'HSOC Zones as of 2018-06-05',
 'OWED Public Spaces',
 'IndicntNum',
 'PdId',
 'Current Police Districts',
 'Current Supervisor Districts',
 'Location']

In [84]:
for col in columns_to_drop:
    crime = crime.drop(col)
crime.printSchema()

root
 |-- Category: string (nullable = true)
 |-- Descript: string (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- PdDistrict: string (nullable = true)
 |-- Resolution: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- SF Find Neighborhoods: integer (nullable = true)
 |-- Analysis Neighborhoods: integer (nullable = true)

