In [1]:
# pip
# !pip install tqdm
# !pip install dask
# !pip install pyspark findspark haversine

In [2]:
# conda
# !conda install tqdm
# !conda install dask
# !conda install -c conda-forge pyspark findspark haversine

### Importing necessary libraries

In [3]:
# configuration, worked on using python@3.10.9 
import os
import urllib
import json
from threading import Thread, Lock
from tqdm import tqdm
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, BooleanType
import findspark
from haversine import haversine, Unit

### Dataset links and filename

In [4]:
# folder (relative to the working directory) that holds the downloaded files
data_dir = 'data'

# one entry per dataset: where to fetch it from and the local file name to use
historic_arrest_loc = dict(
    url='https://data.cityofnewyork.us/resource/8h9b-rp9u.json?$limit=15000000',
    filename='arrest.json',
)
historic_complaint_loc = dict(
    url='https://data.cityofnewyork.us/resource/qgea-i56i.json?$limit=15000000',
    filename='complaint.json',
)
historic_court_summons_loc = dict(
    url='https://data.cityofnewyork.us/resource/sv2w-rv3k.json?$limit=15000000',
    filename='summons.json',
)
traffic_speed_loc = dict(
    url='https://data.cityofnewyork.us/resource/i4gi-tjb9.json?$limit=15000000',
    filename='speed.json',
)
turnstile_loc = dict(
    url='https://data.ny.gov/resource/i55r-43gk.json?$limit=15000000',
    filename='turnstile.json',
)
subway_loc = dict(
    url='http://web.mta.info/developers/data/nyct/subway/Stations.csv?$limit=10000',
    filename='subway.csv',
)

### Dataset: Downloading handler

In [5]:
# download flags
downloadflag = True   # master switch: when False nothing is downloaded
redownload = False    # when True, files that already exist are fetched again

# held for the whole download so only one transfer runs at a time
thread_lock = Lock()

# download utils
def download_dataset_thread(loc, folder):
    """Download one dataset into ``folder`` unless it is already there.

    Args:
        loc: dict with a ``'url'`` to fetch and the local ``'filename'`` to
            store it under.
        folder: destination directory; created when missing.

    The download is skipped when ``downloadflag`` is False, or when the target
    file exists and ``redownload`` is False. Data is written to a ``.part``
    file first and moved into place only after the transfer completes, so an
    interrupted download is never mistaken for a finished one on the next run.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises (for example
        ``urllib.error.URLError``); the partial file is removed first.
    """
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    target = os.path.join(folder, loc['filename'])
    with thread_lock:
        if not downloadflag:
            return
        if os.path.exists(target) and not redownload:
            return

        os.makedirs(folder, exist_ok=True)
        partial = target + '.part'

        with tqdm(unit="B", unit_scale=True, desc=loc['filename'], miniters=1) as progress_bar:
            def report(block_num, block_size, total_size):
                # total_size is not positive when the server sends no length
                if total_size > 0:
                    progress_bar.total = total_size
                # The hook is first called with block_num == 0, before any data
                # arrives, and the last block may be short, so advance the bar to
                # the absolute position instead of adding block_size every time.
                done = block_num * block_size
                if total_size > 0:
                    done = min(done, total_size)
                progress_bar.update(done - progress_bar.n)

            try:
                urllib.request.urlretrieve(loc['url'], partial, report)
            except BaseException:
                # do not leave a truncated file behind
                if os.path.exists(partial):
                    os.remove(partial)
                raise

        # replaces any previous copy in a single step
        os.replace(partial, target)
        
def download_dataset(loc, folder):
    """Run ``download_dataset_thread`` for one dataset on a worker thread.

    The worker is joined right after it is started, so this call returns only
    once the worker has finished.
    """
    worker = Thread(
        target=download_dataset_thread,
        kwargs={'loc': loc, 'folder': folder},
    )
    worker.start()
    worker.join()
        

### Dataset: Downloading

In [6]:
# fetch the datasets one after another into data_dir
datasets_to_fetch = (
    historic_arrest_loc,
    historic_complaint_loc,
    historic_court_summons_loc,
    turnstile_loc,
    subway_loc,
)
for dataset in datasets_to_fetch:
    download_dataset(dataset, data_dir)

### Providing Apache Spark backend

In [7]:
# Locate the Spark installation. SPARK_HOME takes precedence so the notebook is
# not tied to one machine's layout; the Homebrew path remains the fallback.
spark_home = os.environ.get('SPARK_HOME', '/opt/homebrew/Cellar/apache-spark/3.4.0/libexec')
findspark.init(spark_home)
findspark.find()

'/opt/homebrew/Cellar/apache-spark/3.4.0/libexec'

### Creating spark session using SparkSession builder

In [8]:
# spark session initialization: local master with 5 worker threads
spark_options = {
    "spark.sql.debug.maxToStringFields": 50,
    "spark.driver.memory": '8g',
    "spark.executor.instances": 4,
    "spark.executor.cores": 5,
}

session_builder = SparkSession.builder.master("local[5]").appName("main")
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)
spark = session_builder.getOrCreate()

spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/07 03:32:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [9]:
# read the executor-cores option back from the session's runtime configuration
spark.conf.get("spark.executor.cores")

'5'

### Initializing spark dataframes

In [10]:
# dataframes
def _dataset_path(loc):
    """Return the path of a dataset file inside ``data_dir``."""
    return os.path.join(data_dir, loc['filename'])


def _read_json_dataset(loc):
    """Read one JSON dataset in multiLine mode and spread it over 5 partitions."""
    # NOTE(review): presumably a multiLine file is parsed as a single unit before
    # the repartition takes effect, which is costly for very large files - confirm
    return spark.read.json(_dataset_path(loc), multiLine=True).repartition(5)


arrest_df = _read_json_dataset(historic_arrest_loc)
complaint_df = _read_json_dataset(historic_complaint_loc)
summons_df = _read_json_dataset(historic_court_summons_loc)
turnstile_df = _read_json_dataset(turnstile_loc)
subway_df = spark.read.csv(_dataset_path(subway_loc), header=True, inferSchema=True).repartition(5)

23/05/07 03:33:01 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

### Dataset: Analysis & Cleansing

#### 1. Subway and Turnstile dataset analysis

In [11]:
# normalize udf for station id in subway and turnstile
def normalize(unit):
    """Strip leading zeros from the numeric part of a station/unit id.

    The first character is kept as the prefix (even when it is a digit) and
    the remainder is parsed as an integer, e.g. ``'R001' -> 'R1'``.

    Args:
        unit: the id to normalize; may be ``None`` for a null cell.

    Returns:
        The normalized id, ``None`` when ``unit`` is ``None``, or the id
        unchanged (as a string) when the part after the first character is not
        an integer. Returning instead of raising keeps one malformed row from
        failing the whole Spark job when this runs as a UDF.
    """
    if unit is None:
        return None
    unit = str(unit)
    prefix, digits = unit[:1], unit[1:]
    try:
        return prefix + str(int(digits))
    except ValueError:
        # empty or non-numeric suffix: nothing to normalize
        return unit

# Spark UDF wrapper around normalize, declared to return a string
nUdf = F.udf(normalize, StringType())

In [12]:
# replace the subway 'GTFS Stop ID' column with its normalized form
stop_id_column = 'GTFS Stop ID'
subway_df = subway_df.withColumn(stop_id_column, nUdf(F.col(stop_id_column)))

In [13]:
# replace the turnstile 'unit' column with its normalized form
unit_column = 'unit'
turnstile_df = turnstile_df.withColumn(unit_column, nUdf(F.col(unit_column)))

In [14]:
# distinct station ids of each dataset, collected to the driver as flat arrays
subway_ids = subway_df.select(F.col('GTFS Stop ID')).distinct()
turnstile_ids = turnstile_df.select(F.col('unit')).distinct()

s_list = subway_ids.toPandas().values.flatten()
t_list = turnstile_ids.toPandas().values.flatten()

                                                                                

In [15]:
# station ids present in both datasets, in the order they appear in t_list
s_lookup = set(s_list)  # constant-time membership instead of scanning s_list per id
ts_intersect = [value for value in t_list if value in s_lookup]
ts_intersect

['R16',
 'R13',
 'R6',
 'R30',
 'R8',
 'R40',
 'R23',
 'R45',
 'R21',
 'R20',
 'R43',
 'R3',
 'R36',
 'R15',
 'R24',
 'R29',
 'R33',
 'R39',
 'R4',
 'R41',
 'R27',
 'R19',
 'R5',
 'R44',
 'R22',
 'R34',
 'R32',
 'R31',
 'R42',
 'R9',
 'R11',
 'R1',
 'R25',
 'R18',
 'R14',
 'R35',
 'R17',
 'R28']

### Dataset: Consolidating

1. Combine Subway to Turnstile

In [16]:
# total entries and exits per turnstile unit
# NOTE(review): if entries/exits are cumulative counter readings, these sums
# overstate ridership - confirm against the dataset description
gt_df = turnstile_df.groupBy('unit').agg(
    F.sum('entries').alias('entries'),
    F.sum('exits').alias('exits'),
)
gt_df.show()

[Stage 20:>                                                         (0 + 5) / 5]

+----+----------------+----------------+
|unit|         entries|           exits|
+----+----------------+----------------+
|R159| 9.7658670459E10| 3.6752142042E10|
|R143|6.16250900125E11|7.75773326471E11|
|R469| 1.0314052482E10|    9.79342886E8|
| R16| 7.9053401996E10| 9.4031973197E10|
|R167| 1.6874102816E11|1.96763060056E11|
|R100| 3.8355082149E10| 4.2813609167E10|
|R177|2.21718982614E11|1.91598461153E11|
|R290| 7.7285465068E10| 9.9919615555E10|
| R13| 9.9819375916E10|1.70757933631E11|
| R37| 2.8851327936E11| 1.8206496699E10|
|R384|  2.283725805E10| 2.2146211814E10|
|R432| 1.5507768821E10|   7.016570014E9|
| R80|2.69169144454E11| 1.5734215062E11|
|R541| 2.0813861412E10| 1.6886431668E10|
|R412|   9.997534418E9|    3.88309983E9|
|R430|   9.052436119E9|  1.126470821E10|
|R294| 4.0256229752E10| 2.4555010956E10|
| R95| 4.8301616614E10| 4.9211442273E10|
|  R6| 7.1331940846E10| 2.6958063183E10|
|R329| 1.1117120569E10|   5.684716085E9|
+----+----------------+----------------+
only showing top

                                                                                

In [17]:
# attach the per-unit entry/exit totals to every subway station with a matching id
station_aliases = {
    'GTFS Stop ID': 'id',
    'Line': 'line',
    'Stop Name': 'stop_name',
    'Borough': 'borough',
    'GTFS Latitude': 'lat',
    'GTFS Longitude': 'long',
    'North Direction Label': 'n_label',
    'South Direction Label': 's_label',
}

stations_with_totals = subway_df.join(gt_df, F.col('GTFS Stop ID') == F.col('unit'))
st_df = stations_with_totals.select(
    *[F.col(source).alias(alias) for source, alias in station_aliases.items()],
    F.col('entries'),
    F.col('exits'),
)
st_df.show()

[Stage 29:>                                                         (0 + 5) / 5]

+---+-------------------+-----------------+-------+---------+----------+--------------------+--------------------+-----------------+-----------------+
| id|               line|        stop_name|borough|      lat|      long|             n_label|             s_label|          entries|            exits|
+---+-------------------+-----------------+-------+---------+----------+--------------------+--------------------+-----------------+-----------------+
|R16|Broadway - Brighton|   Times Sq-42 St|      M|40.754672|-73.986754|     Uptown & Queens| Downtown & Brooklyn|  7.9053401996E10|  9.4031973197E10|
|R13|            Astoria|       5 Av/59 St|      M|40.764811|-73.973347|              Queens| Downtown & Brooklyn|  9.9819375916E10| 1.70757933631E11|
| R6|            Astoria|            36 Av|      Q|40.756804|-73.929575|Astoria - Ditmars...|           Manhattan|  7.1331940846E10|  2.6958063183E10|
|R30|Broadway - Brighton|        DeKalb Av|     Bk|40.690635|-73.981824|           Manhattan|C

                                                                                

2. Combine Subway to Arrest

In [18]:
def withinRange(slat, slong, dlat, dlong, max_miles=0.1):
    """Tell whether two coordinates lie within ``max_miles`` of each other.

    Args:
        slat, slong: latitude / longitude of the first point.
        dlat, dlong: latitude / longitude of the second point.
        max_miles: exclusive distance threshold in miles (default 0.1).

    Returns:
        True when the haversine distance is below ``max_miles``. False when it
        is not, or when any coordinate is missing or cannot be converted to a
        float, so a single bad row does not abort a Spark job using this as a UDF.
    """
    try:
        # values may arrive as strings or None; convert before measuring
        srs = (float(slat), float(slong))
        dst = (float(dlat), float(dlong))
    except (TypeError, ValueError):
        return False
    distance = float(haversine(srs, dst, unit=Unit.MILES))
    return distance < max_miles
    
# Spark UDF wrapper around withinRange, declared to return a boolean
withinRangeUdf = F.udf(withinRange, BooleanType())

In [19]:
# list the column names of the arrest dataframe
arrest_df.columns

[':@computed_region_92fq_4b7q',
 ':@computed_region_efsh_h5xi',
 ':@computed_region_f5dn_yrer',
 ':@computed_region_sbqj_enih',
 ':@computed_region_yeji_bk3q',
 'age_group',
 'arrest_boro',
 'arrest_date',
 'arrest_key',
 'arrest_precinct',
 'jurisdiction_code',
 'ky_cd',
 'latitude',
 'law_cat_cd',
 'law_code',
 'lon_lat',
 'longitude',
 'ofns_desc',
 'pd_cd',
 'pd_desc',
 'perp_race',
 'perp_sex',
 'x_coord_cd',
 'y_coord_cd']

In [20]:
# NOTE(review): the recorded run of this cell ended in java.lang.OutOfMemoryError
# (Java heap space) inside the JSON parser; presumably the multiLine read of the
# full arrest file does not fit in the configured driver memory - confirm
arrest_df.show()

23/05/07 03:39:45 ERROR Executor: Exception in task 0.0 in stage 33.0 (TID 40)1]
java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.sql.catalyst.NoopFilters.<init>(StructFilters.scala:163)
	at org.apache.spark.sql.catalyst.json.JacksonParser.org$apache$spark$sql$catalyst$json$JacksonParser$$convertObject$default$4(JacksonParser.scala:444)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$$nestedInanonfun$makeConverter$19$1.applyOrElse(JacksonParser.scala:360)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$$nestedInanonfun$makeConverter$19$1.applyOrElse(JacksonParser.scala:359)
	at org.apache.spark.sql.catalyst.json.JacksonParser.parseJsonToken(JacksonParser.scala:404)
	at org.apache.spark.sql.catalyst.json.JacksonParser.$anonfun$makeConverter$19(JacksonParser.scala:359)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$Lambda$4114/0x000000080200c7a0.apply(Unknown Source)
	at org.apache.spark.sql.catalyst.json.JacksonParser.org$apache$spark$s

Py4JError: An error occurred while calling z:py4j.reflection.TypeUtil.isInstanceOf

In [None]:
# Pair every subway station with the arrests recorded close to it.
# The join condition must be built from Column expressions, so the registered
# UDF (withinRangeUdf) is used here rather than the plain Python function.

# cast defensively in case the coordinates were read as strings
station_lat = F.col('GTFS Latitude').cast('double')
station_long = F.col('GTFS Longitude').cast('double')
arrest_lat = F.col('latitude').cast('double')
arrest_long = F.col('longitude').cast('double')

# Cheap native pre-filter evaluated before the Python UDF: 0.003 degrees is
# about 0.2 miles of latitude and stays above 0.1 miles of longitude for
# latitudes below roughly 61 degrees, so no pair within the UDF's 0.1-mile
# radius is lost. It also discards rows with null coordinates.
box_degrees = 0.003
inside_box = (
    (F.abs(station_lat - arrest_lat) < box_degrees)
    & (F.abs(station_long - arrest_long) < box_degrees)
)

# the station table is the small side, so hint Spark to broadcast it
sa_df = F.broadcast(subway_df).join(
    arrest_df,
    inside_box & withinRangeUdf(station_lat, station_long, arrest_lat, arrest_long),
    'cross')
sa_df.show()

3. Combine Subway to Complaint

4. Combine Subway to Summons

### Conclusion