In [145]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import pyspark as ps
import pyspark.sql.functions as F


In [250]:
def auto_theft_filter(df):
    """Count grand and petty thefts from vehicles in a Spark DataFrame.

    Inputs:
    df -- Spark DataFrame with a 'Descript' column holding the incident
          description

    Returns:
    tuple (total_count, grand_counts, petty_counts, auto_fraction) where
      total_count   -- number of rows in df
      grand_counts  -- rows described as grand theft from a locked or
                       unlocked auto
      petty_counts  -- rows described as petty theft from a locked or
                       unlocked auto
      auto_fraction -- (petty_counts + grand_counts) / total_count, or 0.0
                       when df has no rows
    """
    descript = F.col('Descript')

    # Every .count() is a separate Spark action, so the total is computed
    # once and reused for both the fraction and the returned value.
    total_count = df.count()

    petty_counts = df.filter(descript.isin("PETTY THEFT FROM LOCKED AUTO",
                                           "PETTY THEFT FROM UNLOCKED AUTO")).count()

    grand_counts = df.filter(descript.isin("GRAND THEFT FROM LOCKED AUTO",
                                           "GRAND THEFT FROM UNLOCKED AUTO")).count()

    # Guard against an empty DataFrame instead of raising ZeroDivisionError.
    if total_count:
        auto_fraction = (petty_counts + grand_counts) / total_count
    else:
        auto_fraction = 0.0

    return total_count, grand_counts, petty_counts, auto_fraction


def auto_to_pandas(df):
    """Collect the theft-from-auto rows of a Spark DataFrame into pandas.

    Keeps the rows whose 'Descript' equals one of the four grand/petty
    theft from locked/unlocked auto descriptions and returns the result
    of calling toPandas() on that filtered DataFrame.
    """
    auto_theft_descriptions = (
        "GRAND THEFT FROM LOCKED AUTO",
        "PETTY THEFT FROM LOCKED AUTO",
        "GRAND THEFT FROM UNLOCKED AUTO",
        "PETTY THEFT FROM UNLOCKED AUTO",
    )

    # OR together one equality test per description, in the listed order.
    is_auto_theft = None
    for description in auto_theft_descriptions:
        matches = F.col('Descript') == description
        is_auto_theft = matches if is_auto_theft is None else is_auto_theft | matches

    return df.filter(is_auto_theft).toPandas()
    

In [12]:
# Start (or reuse) a Spark session on the 'local[4]' master under the
# application name 'lecture', and keep a handle to its SparkContext.
session_builder = ps.sql.SparkSession.builder.master('local[4]').appName('lecture')
spark = session_builder.getOrCreate()
sc = spark.sparkContext

In [162]:
# Read the full incident-report CSV with pandas. The file is large
# (crime_raw.info() reports 2,215,024 rows and 33 columns), so this cell
# takes a long time to run. The path is relative to the working directory.
crime_raw = pd.read_csv('Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv')


In [164]:
# Summarise the pandas frame: row count plus the name and dtype of every column.
crime_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2215024 entries, 0 to 2215023
Data columns (total 33 columns):
IncidntNum                                              int64
Category                                                object
Descript                                                object
DayOfWeek                                               object
Date                                                    object
Time                                                    object
PdDistrict                                              object
Resolution                                              object
Address                                                 object
X                                                       float64
Y                                                       float64
Location                                                object
PdId                                                    int64
SF Find Neighborhoods                                   float64
Curr

In [163]:
# Frequency of each incident description, most common first.
crime_raw['Descript'].value_counts()

GRAND THEFT FROM LOCKED AUTO                              178836
LOST PROPERTY                                              77956
BATTERY                                                    67654
STOLEN AUTOMOBILE                                          64763
DRIVERS LICENSE, SUSPENDED OR REVOKED                      62534
AIDED CASE, MENTAL DISTURBED                               56313
WARRANT ARREST                                             56230
SUSPICIOUS OCCURRENCE                                      52490
PETTY THEFT FROM LOCKED AUTO                               51946
PETTY THEFT OF PROPERTY                                    46114
MALICIOUS MISCHIEF, VANDALISM                              45238
MALICIOUS MISCHIEF, VANDALISM OF VEHICLES                  44290
TRAFFIC VIOLATION                                          38105
THREATS AGAINST LIFE                                       34670
FOUND PROPERTY                                             33383
GRAND THEFT OF PROPERTY  

In [15]:
# Keep the pandas column Index; it is reused further down to build the
# list of columns to drop from the Spark DataFrame.
columns = crime_raw.columns
# Optional exploration (left disabled): value counts for every column.
# for col in columns:
#     print(col)
#     print(crime_raw[col].value_counts(dropna=False))

## Read data in as Spark DF and remove extraneous columns

Is there any useful data in SF Find Neighborhoods or Analysis Neighborhoods?

In [131]:
# Load the same CSV as a Spark DataFrame. The header row supplies the
# column names and inferSchema asks Spark to work out the column types.
csv_options = dict(header=True, quote='"', sep=',', inferSchema=True)
crime = spark.read.csv('Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv',
                       **csv_options)


In [132]:
# Inspect the column names and the types Spark inferred for them.
crime.printSchema()

root
 |-- IncidntNum: integer (nullable = true)
 |-- Category: string (nullable = true)
 |-- Descript: string (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- PdDistrict: string (nullable = true)
 |-- Resolution: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- PdId: long (nullable = true)
 |-- SF Find Neighborhoods: integer (nullable = true)
 |-- Current Police Districts: integer (nullable = true)
 |-- Current Supervisor Districts: integer (nullable = true)
 |-- Analysis Neighborhoods: integer (nullable = true)
 |-- DELETE - Fire Prevention Districts: integer (nullable = true)
 |-- DELETE - Police Districts: integer (nullable = true)
 |-- DELETE - Supervisor Districts: integer (nullable = true)
 |-- DELETE - Zip Codes: integer (nullable = true)
 |-- DELETE - Neighborhoods: in

In [133]:
# Build the list of columns to remove: everything from position 17 onward
# in the pandas column Index, plus a handful of named columns.
print(columns)
columns_to_drop = [
    *columns[17:],
    'IncidntNum',
    'PdId',
    'Current Police Districts',
    'Current Supervisor Districts',
    'Location',
    'SF Find Neighborhoods',
    'Analysis Neighborhoods',
]
columns_to_drop

Index(['IncidntNum', 'Category', 'Descript', 'DayOfWeek', 'Date', 'Time',
       'PdDistrict', 'Resolution', 'Address', 'X', 'Y', 'Location', 'PdId',
       'SF Find Neighborhoods', 'Current Police Districts',
       'Current Supervisor Districts', 'Analysis Neighborhoods',
       'DELETE - Fire Prevention Districts', 'DELETE - Police Districts',
       'DELETE - Supervisor Districts', 'DELETE - Zip Codes',
       'DELETE - Neighborhoods', 'DELETE - 2017 Fix It Zones',
       'Civic Center Harm Reduction Project Boundary',
       'Fix It Zones as of 2017-11-06 ', 'DELETE - HSOC Zones',
       'Fix It Zones as of 2018-02-07',
       'CBD, BID and GBD Boundaries as of 2017',
       'Areas of Vulnerability, 2016', 'Central Market/Tenderloin Boundary',
       'Central Market/Tenderloin Boundary Polygon - Updated',
       'HSOC Zones as of 2018-06-05', 'OWED Public Spaces'],
      dtype='object')


['DELETE - Fire Prevention Districts',
 'DELETE - Police Districts',
 'DELETE - Supervisor Districts',
 'DELETE - Zip Codes',
 'DELETE - Neighborhoods',
 'DELETE - 2017 Fix It Zones',
 'Civic Center Harm Reduction Project Boundary',
 'Fix It Zones as of 2017-11-06 ',
 'DELETE - HSOC Zones',
 'Fix It Zones as of 2018-02-07',
 'CBD, BID and GBD Boundaries as of 2017',
 'Areas of Vulnerability, 2016',
 'Central Market/Tenderloin Boundary',
 'Central Market/Tenderloin Boundary Polygon - Updated',
 'HSOC Zones as of 2018-06-05',
 'OWED Public Spaces',
 'IncidntNum',
 'PdId',
 'Current Police Districts',
 'Current Supervisor Districts',
 'Location',
 'SF Find Neighborhoods',
 'Analysis Neighborhoods']

In [134]:
# Remove every unwanted column in one call, then confirm what is left.
crime = crime.drop(*columns_to_drop)
crime.printSchema()

root
 |-- Category: string (nullable = true)
 |-- Descript: string (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- PdDistrict: string (nullable = true)
 |-- Resolution: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)



## Now drop all rows with a NaN in them

Apparently no rows contain NaN values: the row count is unchanged after `dropna`, so the data is already clean once the unnecessary columns have been removed.

In [135]:
# Row count before dropping nulls (presumably kept to compare with the
# count after dropna; it is not referenced again in the visible cells).
num_rows = crime.count()

In [136]:
# Drop every row that has a null in any of the remaining columns, then
# recount so the result can be compared with the pre-drop row count.
crime = crime.dropna(how='any')
crime.count()

2215024

## Figure out which categories and descriptions fit auto theft. 

In [137]:
# Rank incident categories, then descriptions, by how often they occur.
for group_column in ('Category', 'Descript'):
    crime.groupBy(group_column).count().orderBy('count', ascending=False).show()

+--------------------+------+
|            Category| count|
+--------------------+------+
|       LARCENY/THEFT|480448|
|      OTHER OFFENSES|309358|
|        NON-CRIMINAL|238323|
|             ASSAULT|194694|
|       VEHICLE THEFT|126602|
|       DRUG/NARCOTIC|119628|
|           VANDALISM|116059|
|            WARRANTS|101379|
|            BURGLARY| 91543|
|      SUSPICIOUS OCC| 80444|
|      MISSING PERSON| 64961|
|             ROBBERY| 55867|
|               FRAUD| 41542|
|     SECONDARY CODES| 25831|
|FORGERY/COUNTERFE...| 23050|
|         WEAPON LAWS| 22234|
|            TRESPASS| 19449|
|        PROSTITUTION| 16701|
|     STOLEN PROPERTY| 11891|
|SEX OFFENSES, FOR...| 11742|
+--------------------+------+
only showing top 20 rows

+--------------------+------+
|            Descript| count|
+--------------------+------+
|GRAND THEFT FROM ...|178836|
|       LOST PROPERTY| 77956|
|             BATTERY| 67654|
|   STOLEN AUTOMOBILE| 64763|
|DRIVERS LICENSE, ...| 62534|
|AIDED CASE, M

In [154]:
# Turn date into something useable
from pyspark.sql.functions import unix_timestamp, from_unixtime

# Parse the 'Date' strings with the month/day/four-digit-year pattern and
# write them back as timestamp strings, which sort chronologically and so
# can be compared against 'yyyy-MM-dd' bounds in the per-year filters.
# withColumn already names the result 'Date', so no alias is needed.
crime2 = crime.withColumn('Date', from_unixtime(unix_timestamp('Date', 'MM/dd/yyyy')))
crime2.count()

2215024

In [158]:
# Create dataframe for each year
def _crimes_in_year(df, year):
    """Return the rows of df whose 'Date' falls inside calendar `year`.

    Inputs:
    df   -- Spark DataFrame with a 'Date' column
    year -- calendar year as an int, e.g. 2003

    Returns:
    Spark DataFrame filtered to 'Date' strictly after December 31 of the
    previous year and strictly before January 1 of the following year.
    """
    lower_bound = '{}-12-31'.format(year - 1)
    upper_bound = '{}-01-01'.format(year + 1)
    return df.filter((F.col('Date') > lower_bound) & (F.col('Date') < upper_bound))


# One named DataFrame per year; later cells refer to these names directly.
crime_2003 = _crimes_in_year(crime2, 2003)
crime_2004 = _crimes_in_year(crime2, 2004)
crime_2005 = _crimes_in_year(crime2, 2005)
crime_2006 = _crimes_in_year(crime2, 2006)
crime_2007 = _crimes_in_year(crime2, 2007)
crime_2008 = _crimes_in_year(crime2, 2008)
crime_2009 = _crimes_in_year(crime2, 2009)
crime_2010 = _crimes_in_year(crime2, 2010)
crime_2011 = _crimes_in_year(crime2, 2011)
crime_2012 = _crimes_in_year(crime2, 2012)
crime_2013 = _crimes_in_year(crime2, 2013)
crime_2014 = _crimes_in_year(crime2, 2014)
crime_2015 = _crimes_in_year(crime2, 2015)
crime_2016 = _crimes_in_year(crime2, 2016)
crime_2017 = _crimes_in_year(crime2, 2017)


In [161]:
# Year labels '2003' .. '2017', in the same order as the DataFrames below.
years = [str(calendar_year) for calendar_year in range(2003, 2018)]

crime_years = [
    crime_2003, crime_2004, crime_2005, crime_2006, crime_2007,
    crime_2008, crime_2009, crime_2010, crime_2011, crime_2012,
    crime_2013, crime_2014, crime_2015, crime_2016, crime_2017,
]

# Total number of incidents recorded in each year, keyed by year label.
crimes_per_year = {label: yearly_df.count()
                   for yearly_df, label in zip(crime_years, years)}

crimes_per_year

{'2003': 149176,
 '2004': 148471,
 '2005': 142591,
 '2006': 138247,
 '2007': 138006,
 '2008': 141670,
 '2009': 140215,
 '2010': 133868,
 '2011': 133094,
 '2012': 141267,
 '2013': 153166,
 '2014': 150518,
 '2015': 156927,
 '2016': 151297,
 '2017': 155254}

In [212]:
# Count the 2013 incidents described as grand or petty theft from a
# locked or unlocked auto.
descript = F.col('Descript')
is_auto_theft = ((descript == "GRAND THEFT FROM LOCKED AUTO")
                 | (descript == "PETTY THEFT FROM LOCKED AUTO")
                 | (descript == "GRAND THEFT FROM UNLOCKED AUTO")
                 | (descript == "PETTY THEFT FROM UNLOCKED AUTO"))
crime_2013.filter(is_auto_theft).count()

18704

In [251]:
# Try out my helper function!
total_crimes, grand_crimes, petty_crimes, auto_theft_fraction = auto_theft_filter(crime_2013)

# End on the fraction so the cell displays it; a bare assignment shows nothing.
auto_theft_fraction

0.12211587428019273

In [255]:
# One row per year: [year, total crimes, grand auto thefts,
# petty auto thefts, auto-theft fraction].
crime_data_by_year = [
    [year, *auto_theft_filter(dataset)]
    for year, dataset in zip(years, crime_years)
]

crime_data_by_year

[['2003', 149176, 6895, 5276, 0.08158819112994048],
 ['2004', 148471, 6735, 4922, 0.07851364913013316],
 ['2005', 142591, 8989, 4045, 0.09140829365107195],
 ['2006', 138247, 10835, 4604, 0.11167692608157863],
 ['2007', 138006, 10196, 3578, 0.09980725475703955],
 ['2008', 141670, 9273, 3225, 0.08821910072704171],
 ['2009', 140215, 9197, 2587, 0.08404236351317619],
 ['2010', 133868, 8301, 1697, 0.07468551110048705],
 ['2011', 133094, 8890, 1719, 0.07971058049198311],
 ['2012', 141267, 10456, 3058, 0.09566282288149391],
 ['2013', 153166, 14578, 4126, 0.12211587428019273],
 ['2014', 150518, 17375, 4232, 0.14355093742941044],
 ['2015', 156927, 20562, 4968, 0.1626871092928559],
 ['2016', 151297, 19714, 4590, 0.1606376861405051],
 ['2017', 155254, 25685, 5114, 0.19837814162598066]]

## Use SQL to look for similar descriptions and double check counts

In [256]:
# Register the 2013 subset as the temp view 'crime' for spark.sql queries.
# NOTE: queries against 'crime' therefore see only 2013 incidents, not the
# full dataset.
crime_2013.createOrReplaceTempView('crime')

In [196]:
# List the distinct descriptions in the 'crime' view that contain the
# word BATTERY, to see which variants of a description exist.
query = '''SELECT DISTINCT(Descript)
           FROM crime
           WHERE Descript LIKE '%BATTERY%'
           '''
results_battery = spark.sql(query)

In [260]:
# Count every row in the 'crime' view whose description mentions THEFT
# anywhere, not only thefts from autos.
query_all_theft = '''SELECT COUNT(*)
                     FROM crime
                     WHERE Descript LIKE '%THEFT%'
                     '''
results_all_theft = spark.sql(query_all_theft)

In [262]:
# Count the rows in the 'crime' view whose description contains one of
# the four theft-from-auto phrases; this cross-checks the DataFrame-API
# count computed earlier for the same year.
query_auto_theft = '''SELECT COUNT(*)
                 FROM crime
                 WHERE Descript LIKE '%GRAND THEFT FROM LOCKED AUTO%'
                 OR Descript LIKE '%PETTY THEFT FROM LOCKED AUTO%'
                 OR Descript LIKE '%GRAND THEFT FROM UNLOCKED AUTO%'
                 OR Descript LIKE '%PETTY THEFT FROM UNLOCKED AUTO%'
                 '''
# Run the query defined above (the old name `query_theft` is not defined
# anywhere in this notebook and fails on a fresh kernel).
results_auto_theft = spark.sql(query_auto_theft)

In [263]:
# Show the auto-theft count produced by the SQL query; it should match
# the count obtained with the DataFrame filter for the same year.
results_auto_theft.show()

+--------+
|count(1)|
+--------+
|   18704|
+--------+

