In [1]:
# pip
# !pip install tqdm
# !pip install dask

In [2]:
# conda
# !conda install tqdm
# !conda install dask

### Importing necessary libraries

In [91]:
# configuration, worked on using python@3.10.9 
import os
import urllib
import json
from threading import Thread, Lock
from tqdm import tqdm
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import findspark

### Dataset links and filename

In [92]:
# data folder
data_dir = 'data'

# data urls
historic_arrest_loc = { 'url': 'https://data.cityofnewyork.us/resource/8h9b-rp9u.json?$limit=15000000', 'filename': 'arrest.json' }
historic_complaint_loc = { 'url': 'https://data.cityofnewyork.us/resource/qgea-i56i.json?$limit=15000000', 'filename': 'complaint.json' }
historic_court_summons_loc = { 'url': 'https://data.cityofnewyork.us/resource/sv2w-rv3k.json?$limit=15000000', 'filename': 'summons.json' }
traffic_speed_loc = { 'url': 'https://data.cityofnewyork.us/resource/i4gi-tjb9.json?$limit=15000000', 'filename': 'speed.json' }
turnstile_loc = { 'url': 'https://data.ny.gov/resource/i55r-43gk.json?$limit=15000000', 'filename': 'turnstile.json' }
subway_loc = { 'url': 'http://web.mta.info/developers/data/nyct/subway/Stations.csv?$limit=10000', 'filename': 'subway.csv' }

### Dataset: Downloading handler

In [93]:
# download flags
downloadflag = True
redownload = False

thread_lock = Lock()

# download utils
def download_dataset_thread(loc, folder):
    with thread_lock:
        if ((not os.path.exists(os.path.join(folder, loc['filename']))) or redownload) and downloadflag:
            if os.path.isfile(os.path.join(folder, loc['filename'])):
                os.remove(os.path.join(folder, loc['filename']))
            if not os.path.exists(folder):
                os.makedirs(folder) 
            with tqdm(unit="B", unit_scale=True, desc=loc['filename'], miniters=1) as progress_bar:
                urllib.request.urlretrieve(loc['url'], os.path.join(folder, loc['filename']), lambda block_num, block_size, total_size: progress_bar.update(block_size))
            progress_bar.display()
        
def download_dataset(loc, folder):
    thread = Thread(target=download_dataset_thread, args=(loc, folder))
    thread.start()
    thread.join()
        

### Dataset: Downloading

In [94]:
# download datasets
for dataset in [historic_arrest_loc,
                historic_complaint_loc,
                historic_court_summons_loc,
                turnstile_loc,
                subway_loc]:
    download_dataset(dataset, data_dir)

### Providing Apache Spark backend

In [95]:
findspark.init('/opt/homebrew/Cellar/apache-spark/3.4.0/libexec')
findspark.find()

'/opt/homebrew/Cellar/apache-spark/3.4.0/libexec'

### Creating spark session using SparkSession builder

In [96]:
# spark session initialization
spark = SparkSession.builder\
    .master("local[5]")\
        .appName("main")\
            .config("spark.sql.debug.maxToStringFields", 50)\
                .config("spark.driver.memory", '8g')\
                    .getOrCreate()
                    
spark

### Initializing spark dataframes

In [97]:
# dataframes
arrest_rdd = spark.read.json(os.path.join(data_dir, historic_arrest_loc['filename']), multiLine=True)
complaint_rdd = spark.read.json(os.path.join(data_dir, historic_complaint_loc['filename']), multiLine=True)
summons_rdd = spark.read.json(os.path.join(data_dir, historic_court_summons_loc['filename']), multiLine=True)
turnstile_rdd = spark.read.json(os.path.join(data_dir, turnstile_loc['filename']), multiLine=True)
subway_rdd = spark.read.csv(os.path.join(data_dir, subway_loc['filename']), header=True, inferSchema=True)

                                                                                

### Dataset: Analysis

In [98]:
subway_rdd.show()

+----------+----------+------------+--------+-------------------+--------------------+-------+--------------+---------+-------------+--------------+---------------------+---------------------+---+-------------------+------+------+-----------------+-----------------+
|Station ID|Complex ID|GTFS Stop ID|Division|               Line|           Stop Name|Borough|Daytime Routes|Structure|GTFS Latitude|GTFS Longitude|North Direction Label|South Direction Label|ADA|ADA Direction Notes|ADA NB|ADA SB|Capital Outage NB|Capital Outage SB|
+----------+----------+------------+--------+-------------------+--------------------+-------+--------------+---------+-------------+--------------+---------------------+---------------------+---+-------------------+------+------+-----------------+-----------------+
|         1|         1|         R01|     BMT|            Astoria|Astoria-Ditmars Blvd|      Q|           N W| Elevated|    40.775036|    -73.912034|                 null|            Manhattan|  0|   

In [111]:
turnstile_rdd.show()

[Stage 198:>                                                        (0 + 1) / 1]

+----+--------------------+-------+--------+-------+-------+--------+--------+-------------+--------+----+
| c_a|                date|   desc|division|entries|  exits|linename|     scp|      station|    time|unit|
+----+--------------------+-------+--------+-------+-------+--------+--------+-------------+--------+----+
|A002|2014-12-31T00:00:...|REGULAR|     BMT|4943320|1674736|  NQR456|02-00-00|LEXINGTON AVE|23:00:00|R051|
|A002|2014-12-31T00:00:...|REGULAR|     BMT|4943145|1674709|  NQR456|02-00-00|LEXINGTON AVE|19:00:00|R051|
|A002|2014-12-31T00:00:...|REGULAR|     BMT|4942439|1674636|  NQR456|02-00-00|LEXINGTON AVE|15:00:00|R051|
|A002|2014-12-31T00:00:...|REGULAR|     BMT|4942012|1674547|  NQR456|02-00-00|LEXINGTON AVE|11:00:00|R051|
|A002|2014-12-31T00:00:...|REGULAR|     BMT|4941987|1674518|  NQR456|02-00-00|LEXINGTON AVE|10:21:51|R051|
|A002|2014-12-31T00:00:...|REGULAR|     BMT|4941910|1674322|  NQR456|02-00-00|LEXINGTON AVE|07:00:00|R051|
|A002|2014-12-31T00:00:...|REGULAR|  

                                                                                

In [107]:
t_list = turnstile_rdd.select(F.col('station')).distinct().toPandas().values.flatten()
s_list = subway_rdd.select(F.col('Stop Name')).distinct().toPandas().values.flatten()

                                                                                

In [108]:
t_list.sort()
t_list

array(['1 AVE', '103 ST', '103 ST-CORONA', '104 ST', '110 ST',
       '110 ST-CATHEDRL', '110 ST-CPN', '111 ST', '116 ST',
       '116 ST-COLUMBIA', '121 ST', '125 ST', '135 ST', '137 ST-CITY COL',
       '138 ST-3 AVE', '138 ST-GR CONC', '14 ST', '14 ST-6 AVE',
       '14 ST-UNION SQ', '145 ST', '148 ST-LENOX', '149 ST-3 AVE',
       '149 ST-GR CONC', '14TH STREET', '15 ST-PROSPECT', '155 ST',
       '157 ST', '161 ST-YANKEE', '163 ST-AMSTERDM', '167 ST',
       '168 ST-BROADWAY', '169 ST', '170 ST', '174 ST', '174-175 ST',
       '175 ST', '176 ST', '18 AVE', '18 ST', '181 ST', '182-183 ST',
       '183 ST', '190 ST', '191 ST', '2 AVE', '20 AVE', '207 ST', '21 ST',
       '215 ST', '219 ST', '22 AVE-BAY PKY', '225 ST', '23 ST',
       '23 ST-5 AVE', '23 ST-6 AVE', '231 ST', '233 ST', '238 ST',
       '242 ST', '25 AVE', '25 ST', '28 ST', '28 ST-BROADWAY', '3 AVE',
       '33 ST', '33 ST/RAWSON ST', '34 ST-HERALD SQ', '34 ST-PENN STA',
       '36 ST', '4 AVE', '40 ST-LOWERY ST', '42 S

In [109]:
s_list.sort()
s_list

array(['1 Av', '103 St', '103 St-Corona Plaza', '104 St', '110 St',
       '111 St', '116 St', '116 St-Columbia University', '121 St',
       '125 St', '135 St', '137 St-City College',
       '138 St-Grand Concourse', '14 St', '14 St-Union Sq',
       '14 St-Union Sq.', '145 St', '149 St-Grand Concourse',
       '15 St-Prospect Park', '155 St', '157 St', '161 St-Yankee Stadium',
       '163 St-Amsterdam Av', '167 St', '168 St', '168 St-Washington Hts',
       '169 St', '170 St', '174 St', '174-175 Sts', '175 St', '176 St',
       '18 Av', '18 St', '181 St', '182-183 Sts', '183 St', '190 St',
       '191 St', '2 Av', '20 Av', '207 St', '21 St', '21 St-Queensbridge',
       '215 St', '219 St', '225 St', '23 St', '231 St', '233 St',
       '238 St', '25 Av', '25 St', '28 St', '3 Av', '3 Av-138 St',
       '3 Av-149 St', '30 Av', '33 St', '33 St-Rawson St',
       '34 St-Herald Sq', '34 St-Hudson Yards', '34 St-Penn Station',
       '36 Av', '36 St', '39 Av-Dutch Kills', '4 Av-9 St',
     

In [110]:
ts_intersect = [value for value in t_list if value in s_list]
ts_intersect

[]

In [105]:
len(turnstile_rdd.select(F.col('station')).distinct().toPandas().values.flatten())

                                                                                

392

In [103]:
turnstile_rdd.columns

['c_a',
 'date',
 'desc',
 'division',
 'entries',
 'exits',
 'linename',
 'scp',
 'station',
 'time',
 'unit']

### Dataset: Cleansing

### Dataset: Preprocessing

### Dataset: Consolidating

1. Combine Subway to Turnstile

2. Combine Subway to Arrest

3. Combine Subway to Criminal

4. Combine Subway to Summons

### Conclusion