In [1]:
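# pip: uncomment to install the third-party packages (the import cell below also needs pyspark and pandas)
# !pip install tqdm
# !pip install dask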

In [2]:
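# conda: alternative to the pip cell; uncomment to install (the import cell below also needs pyspark and pandas)
# !conda install tqdm
# !conda install dask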

In [3]:
# imports; developed and tested with Python 3.10.9
import os
import urllib
import urllib.request
from threading import Thread, Lock

import pandas
import pyspark
from tqdm import tqdm

In [4]:
# data folder: every dataset below is stored under this directory
data_dir = 'data'

# data urls
# Each *_loc dict pairs the remote 'url' with the local 'filename' used inside data_dir.
historic_arrest_loc = {
    'url': 'https://data.cityofnewyork.us/resource/8h9b-rp9u.json?$limit=10000',
    'filename': 'arrest.json',
}
historic_complaint_loc = {
    'url': 'https://data.cityofnewyork.us/resource/qgea-i56i.json?$limit=10000',
    'filename': 'complaint.json',
}
historic_court_summons_loc = {
    'url': 'https://data.cityofnewyork.us/resource/sv2w-rv3k.json?$limit=10000',
    'filename': 'summons.json',
}
# NOTE(review): traffic_speed_loc is not in the download list of the download cell - confirm whether that is intended.
traffic_speed_loc = {
    'url': 'https://data.cityofnewyork.us/resource/i4gi-tjb9.json?$limit=10000',
    'filename': 'speed.json',
}
turnstile_loc = {
    'url': 'https://data.ny.gov/resource/i55r-43gk.json?$limit=10000',
    'filename': 'turnstile.json',
}
# NOTE(review): this is a plain CSV file URL; the '$limit' query parameter presumably has no effect here - verify.
subway_loc = {
    'url': 'http://web.mta.info/developers/data/nyct/subway/Stations.csv?$limit=10000',
    'filename': 'subway.csv',
}

In [5]:
# download flags
downloadflag = True   # master switch: when False, download_dataset_thread never fetches anything
redownload = False    # when True, fetch again even if the target file already exists

# Held for the whole of each download_dataset_thread call, so downloads never overlap.
thread_lock = Lock()

# download utils
def download_dataset_thread(loc, folder):
    """Download ``loc['url']`` to ``folder/loc['filename']`` with a progress bar.

    The call is a no-op when ``downloadflag`` is False, or when the target
    already exists and ``redownload`` is False.

    The payload is first written to ``<target>.part`` and only moved onto the
    target once ``urlretrieve`` has returned. An interrupted transfer therefore
    never leaves a truncated file that a later run would mistake for a
    finished download, and a failed re-download keeps the previous copy.

    Args:
        loc: mapping with the keys ``'url'`` and ``'filename'``.
        folder: destination directory; created if it does not exist.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises for a failed transfer
        (e.g. ``urllib.error.URLError``), or ``OSError`` from the file system.
    """
    target_path = os.path.join(folder, loc['filename'])
    partial_path = target_path + '.part'

    with thread_lock:
        if not downloadflag:
            return
        if os.path.exists(target_path) and not redownload:
            return

        os.makedirs(folder, exist_ok=True)

        try:
            with tqdm(unit="B", unit_scale=True, desc=loc['filename'], miniters=1) as progress_bar:

                def report_progress(block_num, block_size, total_size):
                    # urlretrieve calls the hook once with block_num == 0 before any
                    # data is read, and passes total_size == -1 when it is unknown.
                    received = block_num * block_size
                    if total_size > 0:
                        progress_bar.total = total_size
                        # The last block is usually short; do not overshoot the total.
                        received = min(received, total_size)
                    progress_bar.update(received - progress_bar.n)

                urllib.request.urlretrieve(loc['url'], partial_path, report_progress)

            # Replace any previous copy only after the transfer has completed.
            os.replace(partial_path, target_path)
        finally:
            # Clean up the leftover of a failed transfer (absent after a successful move).
            if os.path.exists(partial_path):
                os.remove(partial_path)
        
def download_dataset(loc, folder):
    """Run ``download_dataset_thread(loc, folder)`` on a worker thread and wait for it.

    The worker is joined immediately, so this call blocks until the worker
    has finished. Returns ``None``.
    """
    worker = Thread(
        target=download_dataset_thread,
        kwargs={'loc': loc, 'folder': folder},
    )
    worker.start()
    worker.join()
        

In [6]:
# download datasets
# Fetched one after another, in this order, into data_dir.
datasets_to_fetch = (
    historic_arrest_loc,
    historic_complaint_loc,
    historic_court_summons_loc,
    turnstile_loc,
    subway_loc,
)
for dataset in datasets_to_fetch:
    download_dataset(dataset, data_dir)

In [7]:
# Spark setup: master "local[5]" runs Spark inside this machine with 5 worker threads.
conf = pyspark.SparkConf().setMaster("local[5]")

# getOrCreate() reuses an already running session/context, so re-running this
# cell in a live kernel no longer fails because a SparkContext already exists.
spark = pyspark.sql.SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/25 19:18:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [14]:
# dataframes
# The variables keep their `_rdd` suffix because later cells refer to them by these names.
def _data_path(loc):
    """Return the local path of a dataset: data_dir joined with loc['filename']."""
    return os.path.join(data_dir, loc['filename'])

# JSON sources are read with multiLine=True.
arrest_rdd = spark.read.json(_data_path(historic_arrest_loc), multiLine=True)
complaint_rdd = spark.read.json(_data_path(historic_complaint_loc), multiLine=True)
summons_rdd = spark.read.json(_data_path(historic_court_summons_loc), multiLine=True)
turnstile_rdd = spark.read.json(_data_path(turnstile_loc), multiLine=True)

# The CSV source is read with a header row and inferred column types.
subway_rdd = spark.read.csv(_data_path(subway_loc), header=True, inferSchema=True)

In [15]:
# Preview one arrest row; limit(1) already caps the result, so no extra head() is needed.
arrest_rdd.limit(1).toPandas()

Unnamed: 0,:@computed_region_92fq_4b7q,:@computed_region_efsh_h5xi,:@computed_region_f5dn_yrer,:@computed_region_sbqj_enih,:@computed_region_yeji_bk3q,age_group,arrest_boro,arrest_date,arrest_key,arrest_precinct,...,law_code,lon_lat,longitude,ofns_desc,pd_cd,pd_desc,perp_race,perp_sex,x_coord_cd,y_coord_cd
0,36,12423,18,18,4,45-64,M,2021-11-22T00:00:00.000,236791704,28,...,PL 2225001,"([-73.95240854099995, 40.799008797000056], Point)",-73.95240854099995,,581,,BLACK,M,997427,230378


In [16]:
# Preview one complaint row; limit(1) already caps the result, so no extra head() is needed.
complaint_rdd.limit(1).toPandas()

Unnamed: 0,:@computed_region_92fq_4b7q,:@computed_region_efsh_h5xi,:@computed_region_f5dn_yrer,:@computed_region_sbqj_enih,:@computed_region_yeji_bk3q,addr_pct_cd,boro_nm,cmplnt_fr_dt,cmplnt_fr_tm,cmplnt_num,...,station_name,susp_age_group,susp_race,susp_sex,transit_district,vic_age_group,vic_race,vic_sex,x_coord_cd,y_coord_cd
0,36,13095,18,20,4,32,MANHATTAN,2018-03-29T00:00:00.000,20:30:00,506547392,...,,,,,,25-44,WHITE,F,1000565,234704


In [17]:
# Preview one summons row; limit(1) already caps the result, so no extra head() is needed.
summons_rdd.limit(1).toPandas()

Unnamed: 0,:@computed_region_92fq_4b7q,:@computed_region_efsh_h5xi,:@computed_region_f5dn_yrer,:@computed_region_sbqj_enih,:@computed_region_yeji_bk3q,age_group,boro,geocoded_column,jurisdiction_code,latitude,...,longitude,offense_description,precinct_of_occur,race,sex,summons_category_type,summons_date,summons_key,x_coordinate_cd,y_coordinate_cd
0,51,12081,11,10,4,UNKNOWN,NEW YORK,"([-73.97902801199996, 40.76340651400006], Point)",0,40.763406514000046,...,-73.97902801199997,UNINSURED VEHICLE,18,,,VTL,2019-01-24T00:00:00.000,192724412,990060,217404


In [18]:
# Preview five turnstile rows; limit(5) already caps the result at head()'s default of 5 rows.
turnstile_rdd.limit(5).toPandas()

Unnamed: 0,c_a,date,desc,division,entries,exits,linename,scp,station,time,unit
0,A002,2014-12-31T00:00:00.000,REGULAR,BMT,4943320,1674736,NQR456,02-00-00,LEXINGTON AVE,23:00:00,R051
1,A002,2014-12-31T00:00:00.000,REGULAR,BMT,4943145,1674709,NQR456,02-00-00,LEXINGTON AVE,19:00:00,R051
2,A002,2014-12-31T00:00:00.000,REGULAR,BMT,4942439,1674636,NQR456,02-00-00,LEXINGTON AVE,15:00:00,R051
3,A002,2014-12-31T00:00:00.000,REGULAR,BMT,4942012,1674547,NQR456,02-00-00,LEXINGTON AVE,11:00:00,R051
4,A002,2014-12-31T00:00:00.000,REGULAR,BMT,4941987,1674518,NQR456,02-00-00,LEXINGTON AVE,10:21:51,R051


In [19]:
# Preview one subway station row; limit(1) already caps the result, so no extra head() is needed.
subway_rdd.limit(1).toPandas()

Unnamed: 0,Station ID,Complex ID,GTFS Stop ID,Division,Line,Stop Name,Borough,Daytime Routes,Structure,GTFS Latitude,GTFS Longitude,North Direction Label,South Direction Label,ADA,ADA Direction Notes,ADA NB,ADA SB,Capital Outage NB,Capital Outage SB
0,1,1,R01,BMT,Astoria,Astoria-Ditmars Blvd,Q,N W,Elevated,40.775036,-73.912034,,Manhattan,0,,,,,
