In [1]:
import pyspark
import warnings
import matplotlib
import matplotlib.pyplot as plt

from pyspark.sql import SparkSession

matplotlib.rcParams['figure.dpi'] = 300

!sed -i 's/hadoop.root.logger=INFO,console/hadoop.root.logger=WARN,console/' /usr/hadoop-3.3.2/etc/hadoop/log4j.properties
warnings.filterwarnings('ignore')

In [2]:
# Configure a local Spark application named "EDA" that runs on two worker
# threads, then create the session (or reuse one that already exists).
conf = pyspark.SparkConf().setMaster('local[2]').setAppName('EDA')
spark = SparkSession.builder.config(conf=conf).getOrCreate()

2023-05-06 00:17:18,209 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2023-05-06 00:17:19,278 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
# Read the crime CSV, treating the first row as column names, and keep only
# rows from before 2023 so that the analysis covers completed years only.
df = (
    spark.read
    .csv('file:///home/work/Project/crime.csv', header=True)
    .filter("Year < 2023")
)

#### Features

In [9]:
# Show every column name on one line, separated by a visible delimiter.
' *--* '.join(df.columns)

'ID *--* Case Number *--* Date *--* Block *--* IUCR *--* Primary Type *--* Description *--* Location Description *--* Arrest *--* Domestic *--* Beat *--* District *--* Ward *--* Community Area *--* FBI Code *--* X Coordinate *--* Y Coordinate *--* Year *--* Updated On *--* Latitude *--* Longitude *--* Location'

#### Feature: Primary Categories

In [None]:
# Lookup from a variant category label to the single label it should be
# merged into.
# NOTE(review): presumably applied to the `Primary Type` column; no usage is
# visible in this section -- confirm.
new_cats = dict([
    ('CRIM SEXUAL ASSAULT', 'CRIMINAL SEXUAL ASSAULT'),
    ('NON-CRIMINAL (SUBJECT SPECIFIED)', 'NON-CRIMINAL'),
    ('NON - CRIMINAL', 'NON-CRIMINAL'),
    ('OTHER NARCOTIC VIOLATION', 'NARCOTICS'),
    ('PUBLIC INDECENCY', 'PUBLIC INDECENCY/OBSCENITY'),
    ('OBSCENITY', 'PUBLIC INDECENCY/OBSCENITY'),
])

                                                                                

In [4]:
# Expose the DataFrame to Spark SQL under the view name `crime`, replacing
# any view already registered under that name.
df.createOrReplaceTempView(name='crime')

['ID',
 'Case Number',
 'Date',
 'Block',
 'IUCR',
 'Primary Type',
 'Description',
 'Location Description',
 'Arrest',
 'Domestic',
 'Beat',
 'District',
 'Ward',
 'Community Area',
 'FBI Code',
 'X Coordinate',
 'Y Coordinate',
 'Year',
 'Updated On',
 'Latitude',
 'Longitude',
 'Location']

In [23]:
# Count the crimes recorded in each ward (rows without a ward are skipped)
# and export the result as `map_ward.csv`.
query = '''
SELECT CAST(`Ward` AS INT) AS ward
    , COUNT(*) AS n_crimes
FROM crime
WHERE `Ward` IS NOT NULL
GROUP BY `Ward`
'''
# `DataFrame.to_csv(path)` returns None, so its result must not be assigned
# to `df`: doing so would replace the Spark DataFrame with None and break
# any cell that uses `df` afterwards (including re-running earlier cells).
ward_counts = spark.sql(query).toPandas()
ward_counts.to_csv('map_ward.csv', index=False)

                                                                                

In [25]:
# Arrest rate per ward: for each ward, count the rows whose `Arrest` value is
# 'true', divide by the number of rows with a non-null `Arrest`, and export
# the result as `map_arrest_rate.csv`.
# Rows whose `Arrest` is neither 'true' nor 'false' produce NULL in the CASE
# and are therefore ignored by SUM.
query = '''
WITH cte (
    SELECT `Ward`
        , SUM(CASE Arrest
            WHEN 'false' THEN 0
            WHEN 'true' THEN 1 END) AS count_true
        , COUNT(`Arrest`) AS total
    FROM crime
    WHERE Year < 2023
    GROUP BY `Ward`
)
SELECT *
    , count_true/total AS proportion_arrested
FROM cte
'''
# Keep the pandas DataFrame in `tdf` and write it in a separate step:
# `to_csv(path)` returns None, so chaining it into the assignment would leave
# `tdf` set to None.
tdf = spark.sql(query).toPandas()
tdf.to_csv('map_arrest_rate.csv', index=False)

                                                                                

In [9]:
# Arrest rate per primary crime type: for each `Primary Type`, count the rows
# whose `Arrest` value is 'true', divide by the number of rows with a
# non-null `Arrest`, and export the result as `arrest_rates.csv`.
# Rows whose `Arrest` is neither 'true' nor 'false' produce NULL in the CASE
# and are therefore ignored by SUM.
query = '''
WITH cte (
    SELECT `Primary Type`
        , SUM(CASE Arrest
            WHEN 'false' THEN 0
            WHEN 'true' THEN 1 END) AS count_true
        , COUNT(`Arrest`) AS total
    FROM crime
    WHERE Year < 2023
    GROUP BY `Primary Type`
)
SELECT *
    , count_true/total AS proportion_arrested
FROM cte
'''
# Keep the pandas DataFrame in `tdf` so later cells can inspect it:
# `to_csv(path)` returns None, so chaining it into the assignment would leave
# `tdf` set to None.
tdf = spark.sql(query).toPandas()
# The file extension was previously misspelled as `.csc`.
tdf.to_csv('arrest_rates.csv', index=False)

                                                                                

In [None]:
# Display the first 30 rows of `tdf`.
tdf.head(30)

In [23]:
# Write `tdf` to `year_type_counts.csv` without the index column.
# NOTE(review): the queries visible in this section that assign `tdf` compute
# arrest rates, not year/type counts -- confirm that `tdf` holds the intended
# data before running this cell.
tdf.to_csv('year_type_counts.csv', index=False)

In [28]:
# Shut down the Spark session; cells that use `spark` cannot run after this
# until the session is created again.
spark.stop()

In [None]:
# Lookup from a variant category label to the single label it should be
# merged into.
# NOTE(review): the name `ew_cats` looks like a truncated `new_cats`, and the
# contents match the `new_cats` mapping defined earlier in this notebook --
# confirm whether this cell is still needed.
ew_cats = {
    'CRIM SEXUAL ASSAULT': 'CRIMINAL SEXUAL ASSAULT',
    'NON-CRIMINAL (SUBJECT SPECIFIED)': 'NON-CRIMINAL',
    'NON - CRIMINAL': 'NON-CRIMINAL',
    'OTHER NARCOTIC VIOLATION': 'NARCOTICS',
    'PUBLIC INDECENCY': 'PUBLIC INDECENCY/OBSCENITY',
    'OBSCENITY': 'PUBLIC INDECENCY/OBSCENITY',
}
