In [None]:
# pip
# !pip install tqdm
# !pip install dask
# !pip install pyspark findspark haversine

In [None]:
# conda
# !conda install tqdm
# !conda install dask
# !conda install -c conda-forge pyspark findspark haversine

### Importing necessary libraries

In [None]:
# configuration, worked on using python@3.10.9 
import json
import os
import urllib
import urllib.request
from threading import Thread, Lock

import findspark
import pyspark.sql.functions as F
from haversine import haversine, Unit
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, BooleanType
from tqdm import tqdm

### Dataset links and filenames

In [None]:
# folder that downloaded files are written to and read from
data_dir = 'data'

# one descriptor per dataset: where to fetch it and the local filename to store it under
historic_arrest_loc = {
    'url': 'https://data.cityofnewyork.us/resource/8h9b-rp9u.json?$limit=15000000',
    'filename': 'arrest.json',
}
historic_complaint_loc = {
    'url': 'https://data.cityofnewyork.us/resource/qgea-i56i.json?$limit=15000000',
    'filename': 'complaint.json',
}
historic_court_summons_loc = {
    'url': 'https://data.cityofnewyork.us/resource/sv2w-rv3k.json?$limit=15000000',
    'filename': 'summons.json',
}
traffic_speed_loc = {
    'url': 'https://data.cityofnewyork.us/resource/i4gi-tjb9.json?$limit=15000000',
    'filename': 'speed.json',
}
turnstile_loc = {
    'url': 'https://data.ny.gov/resource/i55r-43gk.json?$limit=15000000',
    'filename': 'turnstile.json',
}
subway_loc = {
    'url': 'http://web.mta.info/developers/data/nyct/subway/Stations.csv?$limit=10000',
    'filename': 'subway.csv',
}

### Dataset: Downloading handler

In [None]:
# download flags
# downloadflag: master switch -- download_dataset_thread fetches nothing while it is False
downloadflag = True
# redownload: when True, a file that already exists in the target folder is fetched again
redownload = False

# lock held by download_dataset_thread around its whole body
thread_lock = Lock()

# download utils
def download_dataset_thread(loc, folder):
    """Download one dataset into ``folder`` unless it is already there.

    Parameters
    ----------
    loc : dict
        Dataset descriptor; ``loc['url']`` is fetched and stored as
        ``loc['filename']`` inside ``folder``.
    folder : str
        Destination directory; created if it does not exist.

    Notes
    -----
    Nothing is fetched when the module-level ``downloadflag`` is False, or when
    the file already exists and ``redownload`` is False. The whole operation
    runs while holding ``thread_lock``.

    The data is first written to ``<filename>.part`` and only moved onto the
    final name once the transfer completed, so an interrupted download never
    leaves a truncated file that a later run would mistake for a complete one.

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` raises; the partial file is removed
    before the exception propagates.
    """
    target = os.path.join(folder, loc['filename'])
    partial = target + '.part'

    with thread_lock:
        if not downloadflag:
            return
        if os.path.exists(target) and not redownload:
            return

        os.makedirs(folder, exist_ok=True)

        with tqdm(unit="B", unit_scale=True, desc=loc['filename'], miniters=1) as progress_bar:

            def report(block_num, block_size, total_size):
                # urlretrieve calls the hook once with block_num == 0 before any
                # data is read, and the last block is usually short, so advance
                # the bar to the absolute position instead of adding block_size.
                downloaded = block_num * block_size
                if total_size > 0:
                    progress_bar.total = total_size
                    downloaded = min(downloaded, total_size)
                progress_bar.update(downloaded - progress_bar.n)

            try:
                urllib.request.urlretrieve(loc['url'], partial, report)
            except BaseException:
                # also covers KeyboardInterrupt from an interrupted notebook cell
                if os.path.isfile(partial):
                    os.remove(partial)
                raise

        # replaces a previously downloaded file in one step when redownload is set
        os.replace(partial, target)
        
def download_dataset(loc, folder):
    """Run ``download_dataset_thread(loc, folder)`` on a worker thread and wait for it to finish."""
    worker = Thread(target=download_dataset_thread, args=(loc, folder))
    worker.start()
    # block the caller until the worker is done
    worker.join()
        

### Dataset: Downloading

In [None]:
# download datasets (one after another; each call returns once its file is handled)
datasets_to_fetch = (
    historic_arrest_loc,
    historic_complaint_loc,
    historic_court_summons_loc,
    turnstile_loc,
    subway_loc,
)
for dataset in datasets_to_fetch:
    download_dataset(dataset, data_dir)

### Providing Apache Spark backend

In [None]:
# Point findspark at the Spark installation. SPARK_HOME takes precedence so the
# notebook runs on machines other than the one it was written on; the Homebrew
# location below is only the fallback when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/opt/homebrew/Cellar/apache-spark/3.4.0/libexec'))
findspark.find()

### Creating spark session using SparkSession builder

In [None]:
# spark session initialization: local master with 5 worker threads
spark = (
    SparkSession.builder
    .master("local[5]")
    .appName("main")
    .config("spark.sql.debug.maxToStringFields", 50)
    .config("spark.driver.memory", '4g')
    .config("spark.executor.instances", 5)
    .getOrCreate()
)

spark

### Initializing spark dataframes

In [None]:
# dataframes
def _dataset_path(loc):
    """Return the local path of a dataset descriptor inside ``data_dir``."""
    return os.path.join(data_dir, loc['filename'])


# the JSON datasets are read in multi-line mode; every frame is spread over 5 partitions
arrest_df = spark.read.json(_dataset_path(historic_arrest_loc), multiLine=True).repartition(5)
complaint_df = spark.read.json(_dataset_path(historic_complaint_loc), multiLine=True).repartition(5)
summons_df = spark.read.json(_dataset_path(historic_court_summons_loc), multiLine=True).repartition(5)
turnstile_df = spark.read.json(_dataset_path(turnstile_loc), multiLine=True).repartition(5)
subway_df = spark.read.csv(_dataset_path(subway_loc), header=True, inferSchema=True).repartition(5)

### Dataset: Analysis & Cleansing

#### 1. Subway and Turnstile dataset analysis

In [None]:
# normalize udf for station id in subway and turnstile
def normalize(unit):
    """Remove zero padding from the numeric part of a station identifier.

    The first character is kept as-is and the remainder is re-rendered as an
    integer, e.g. ``'R001'`` -> ``'R1'`` and ``'A02'`` -> ``'A2'``.

    Parameters
    ----------
    unit : str or None
        Identifier to normalize.

    Returns
    -------
    str or None
        The normalized identifier; ``None`` for a ``None`` input (null cells
        reach a Spark UDF as ``None``); the identifier unchanged when the part
        after the first character is not an integer (including the empty
        string and one-character ids), instead of raising and failing the job.
    """
    if unit is None:
        return None
    text = str(unit)
    prefix, digits = text[:1], text[1:]
    try:
        return prefix + str(int(digits))
    except ValueError:
        # nothing numeric to normalize -- leave the identifier untouched
        return text

# Spark UDF wrapper around normalize; produces a string column
nUdf = F.udf(f=normalize, returnType=StringType())

In [None]:
# normalize subway GTFS Stop ID (column is overwritten in place)
subway_df = subway_df.withColumn(
    'GTFS Stop ID',
    nUdf(subway_df['GTFS Stop ID']),
)

In [None]:
# normalize turnstile unit (column is overwritten in place)
turnstile_df = turnstile_df.withColumn(
    'unit',
    nUdf(turnstile_df['unit']),
)

In [None]:
# collect the distinct station identifiers of the subway and turnstile datasets
def _distinct_column_values(df, column):
    """Return the distinct values of ``column`` in ``df`` as a flat array on the driver."""
    return df.select(F.col(column)).distinct().toPandas().values.flatten()


s_list = _distinct_column_values(subway_df, 'GTFS Stop ID')
t_list = _distinct_column_values(turnstile_df, 'unit')

In [None]:
# identifiers from the turnstile list that also occur in the subway list
ts_intersect = list(filter(lambda unit: unit in s_list, t_list))
ts_intersect

#### 2. Arrest dataset cleaning

In [None]:
# Keep only the columns used downstream and make the coordinates numeric.
#
# Columns are picked by name. The earlier positional toDF(...) rename listed 24
# names that already include the five selected here, so it only worked while the
# inferred schema had exactly those 24 columns in that order, and it made the
# cell fail when run a second time (5 columns vs. 24 names).
# NOTE(review): assumes the JSON fields are already named as selected below -- confirm.
#
# The cast happens before the null filter so that values which cannot be parsed
# as a number (cast yields null) are dropped together with missing coordinates.
arrest_df = (
    arrest_df
    .select("arrest_boro", "arrest_date", "arrest_key", "latitude", "longitude")
    .withColumn("latitude", F.col("latitude").cast("double"))
    .withColumn("longitude", F.col("longitude").cast("double"))
    .filter(F.col("latitude").isNotNull() & F.col("longitude").isNotNull())
)
arrest_df.show()


### Dataset: Consolidating

1. Combine Subway with Turnstile

In [None]:
# total entries and exits per turnstile unit
gt_df = (
    turnstile_df
    .groupBy(F.col('unit'))
    .agg(
        F.sum('entries').alias('entries'),
        F.sum('exits').alias('exits'),
    )
)
gt_df.show()

In [None]:
# combine the total exits and entries for each station
# (source column in the subway dataset, name used in the combined frame)
station_column_names = (
    ('GTFS Stop ID', 'id'),
    ('Line', 'line'),
    ('Stop Name', 'stop_name'),
    ('Borough', 'borough'),
    ('GTFS Latitude', 'lat'),
    ('GTFS Longitude', 'long'),
    ('North Direction Label', 'n_label'),
    ('South Direction Label', 's_label'),
)
station_columns = [F.col(source).alias(target) for source, target in station_column_names]

st_df = (
    subway_df
    .join(gt_df, F.col('GTFS Stop ID') == F.col('unit'))
    .select(*station_columns, F.col('entries'), F.col('exits'))
)
st_df.show()

2. Combine Subway with Arrest

In [None]:
def withinMeters(slat, slong, dlat, dlong, radius_m=402):
    """Tell whether two coordinates lie less than ``radius_m`` metres apart.

    Parameters
    ----------
    slat, slong : float or None
        Latitude and longitude of the first point.
    dlat, dlong : float or None
        Latitude and longitude of the second point.
    radius_m : float, optional
        Distance threshold in metres. Defaults to 402 (about a quarter of a
        mile), the value that was previously hard-coded.

    Returns
    -------
    bool
        True when the haversine distance is strictly below ``radius_m``.
        False when any coordinate is missing, so a null cell does not raise
        inside the distance computation.
    """
    # No per-call printing here: as a join predicate this runs once per row pair.
    if any(value is None for value in (slat, slong, dlat, dlong)):
        return False
    distance = float(haversine((slat, slong), (dlat, dlong), unit=Unit.METERS))
    return bool(distance < radius_m)
    
# Spark UDF wrapper around withinMeters; produces a boolean column
withinMetersUdf = F.udf(f=withinMeters, returnType=BooleanType())

In [None]:
# pair each station with the arrests the UDF reports as within range,
# then drop the arrest coordinates and any row without a stop name
is_near_station = withinMetersUdf(
    F.col('GTFS Latitude'),
    F.col('GTFS Longitude'),
    F.col('latitude'),
    F.col('longitude'),
)
sa_df = (
    subway_df
    .join(arrest_df, is_near_station, 'cross')
    .drop(F.col('latitude'))
    .drop(F.col('longitude'))
    .na.drop(subset=["Stop Name"])
)

In [None]:
# number of matched arrests per stop name
arrest_count = sa_df.groupBy('Stop Name').count()
arrest_count.printSchema()
arrest_count.show()

3. Combine Subway with Criminal

4. Combine Subway with Summons

### Conclusion