In [1]:
# pip
#!pip install tqdm
#!pip install dask

In [2]:
# conda
# !conda install tqdm
# !conda install dask

In [1]:
# configuration, worked on using python@3.10.9 
import os
import urllib
import json
from threading import Thread, Lock
from tqdm import tqdm
import pyspark
import pandas

In [2]:
conf = pyspark.SparkConf()\
    .setMaster("local[7]")\
    .set("spark.eventLog.enabled", "true")\
    .set("spark.eventLog.dir", "./logs")\
    .set("spark.eventLog.gcMetrics.youngGenerationGarbageCollectors", "true")

sc = pyspark.SparkContext(conf=conf)
sc.setLogLevel("ERROR")


In [3]:
spark = pyspark.sql.SparkSession(sc)
spark

In [4]:
# data folder
data_dir = 'data'

# data urls
historic_arrest_loc = { 'url': 'https://data.cityofnewyork.us/resource/8h9b-rp9u.json?$limit=15000000', 'filename': 'arrest.json' }
historic_complaint_loc = { 'url': 'https://data.cityofnewyork.us/resource/qgea-i56i.json?$limit=15000000', 'filename': 'complaint.json' }
historic_court_summons_loc = { 'url': 'https://data.cityofnewyork.us/resource/sv2w-rv3k.json?$limit=15000000', 'filename': 'summons.json' }
traffic_speed_loc = { 'url': 'https://data.cityofnewyork.us/resource/i4gi-tjb9.json?$limit=15000000', 'filename': 'speed.json' }
turnstile_loc = { 'url': 'https://data.ny.gov/resource/bjcb-yee3.csv?$limit=15000000', 'filename': 'turnstile.csv' }
subway_loc = { 'url': 'http://web.mta.info/developers/data/nyct/subway/Stations.csv?$limit=10000', 'filename': 'subway.csv' }

In [5]:
# download flags
downloadflag = True
redownload = False

thread_lock = Lock()

# download utils
def download_dataset_thread(loc, folder):
    with thread_lock:
         if ((not os.path.exists(os.path.join(folder, loc['filename']))) or redownload) and downloadflag:
            if os.path.isfile(os.path.join(folder, loc['filename'])):
                os.remove(os.path.join(folder, loc['filename']))
            if not os.path.exists(folder):
                os.makedirs(folder) 
            with tqdm(unit="B", unit_scale=True, desc=loc['filename'], miniters=1) as progress_bar:
                urllib.request.urlretrieve(loc['url'], os.path.join(folder, loc['filename']), lambda block_num, block_size, total_size: progress_bar.update(block_size))
            progress_bar.display()
        
def download_dataset(loc, folder):
    thread = Thread(target=download_dataset_thread, args=(loc, folder))
    thread.start()
    thread.join()
        

In [6]:
# download datasets
for dataset in [historic_arrest_loc,
                historic_complaint_loc,
                historic_court_summons_loc,
                turnstile_loc,
                subway_loc]:
    download_dataset(dataset, data_dir)

In [7]:
# dataframes
#arrest_rdd = spark.read.json(os.path.join(data_dir, historic_arrest_loc['filename']), multiLine=True)
#complaint_rdd = spark.read.json(os.path.join(data_dir, historic_complaint_loc['filename']), multiLine=True)
#summons_rdd = spark.read.json(os.path.join(data_dir, historic_court_summons_loc['filename']), multiLine=True)
turnstile_rdd = spark.read.csv(os.path.join(data_dir, turnstile_loc['filename']), multiLine=True, header=True,inferSchema=True)


In [8]:
from pyspark.sql.functions import date_format, col
from pyspark.sql.types import DateType

turnstile_rdd = turnstile_rdd.withColumn("date", col("Date").cast(DateType())).withColumn("time", date_format(col("time"), "HH:mm:ss"))


In [9]:
turnstile_rdd = turnstile_rdd.select(col('unit'), col('station'), col('date'), col('entries'), col('exits'))
#turnstile_rdd.printSchema()
#turnstile_rdd.toPandas() 

In [10]:
subway_rdd = spark.read.csv(os.path.join(data_dir, subway_loc['filename']), header=True, inferSchema=True, ignoreLeadingWhiteSpace=True, ignoreTrailingWhiteSpace=True, nullValue='')
turnstile_rdd.show()

+----+--------------+----------+--------+--------+
|unit|       station|      date| entries|   exits|
+----+--------------+----------+--------+--------+
|R170|14 ST-UNION SQ|2018-12-28| 6069026| 7074661|
|R170|14 ST-UNION SQ|2018-12-28|14197229|13704110|
|R170|14 ST-UNION SQ|2018-12-28| 2056268| 5177036|
|R170|14 ST-UNION SQ|2018-12-28| 2056268| 5177036|
|R170|14 ST-UNION SQ|2018-12-28|70294362|20274025|
|R170|14 ST-UNION SQ|2018-12-28| 1806541| 2182381|
|R170|14 ST-UNION SQ|2018-12-28|14197229|13704110|
|R170|14 ST-UNION SQ|2018-12-28|70294362|20274025|
|R170|14 ST-UNION SQ|2018-12-28| 6069026| 7074661|
|R170|14 ST-UNION SQ|2018-12-28|  694109|  207786|
|R170|14 ST-UNION SQ|2018-12-28| 1806541| 2182381|
|R170|14 ST-UNION SQ|2018-12-28| 4927946| 3748371|
|R170|14 ST-UNION SQ|2018-12-28|  694109|  207786|
|R170|14 ST-UNION SQ|2018-12-28|15598097| 6250334|
|R170|14 ST-UNION SQ|2018-12-28| 4927946| 3748371|
|R170|14 ST-UNION SQ|2018-12-28|15598097| 6250334|
|R170|14 ST-UNION SQ|2018-12-28

In [11]:
subway_rdd = subway_rdd.withColumnRenamed("Complex ID", "complex_id").withColumnRenamed("Stop Name", "station").withColumnRenamed("GTFS Latitude", "latitude").withColumnRenamed("GTFS Longitude", "longitude")
subway_rdd.printSchema()
subway_rdd = subway_rdd.select("complex_id", "latitude", "longitude")
subway_rdd.toPandas()

root
 |-- Station ID: integer (nullable = true)
 |-- complex_id: integer (nullable = true)
 |-- GTFS Stop ID: string (nullable = true)
 |-- Division: string (nullable = true)
 |-- Line: string (nullable = true)
 |-- station: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Daytime Routes: string (nullable = true)
 |-- Structure: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- North Direction Label: string (nullable = true)
 |-- South Direction Label: string (nullable = true)
 |-- ADA: integer (nullable = true)
 |-- ADA Direction Notes: string (nullable = true)
 |-- ADA NB: integer (nullable = true)
 |-- ADA SB: integer (nullable = true)
 |-- Capital Outage NB: string (nullable = true)
 |-- Capital Outage SB: string (nullable = true)



Unnamed: 0,complex_id,latitude,longitude
0,1,40.775036,-73.912034
1,2,40.770258,-73.917843
2,3,40.766779,-73.921479
3,4,40.761820,-73.925508
4,5,40.756804,-73.929575
...,...,...,...
491,517,40.525507,-74.200064
492,518,40.522410,-74.217847
493,519,40.519631,-74.229141
494,522,40.512764,-74.251961


In [12]:
from pyspark.sql.functions import sum

turnstile_rdd = turnstile_rdd.groupBy(['unit','station', 'date']) \
                           .agg(sum('entries').alias('entries'), sum('exits').alias('exits'))



In [13]:
sway_rdd = spark.read.csv("D:\\Bigdata_project\\remote_complex_lookup.csv", header=True, inferSchema=True, ignoreLeadingWhiteSpace=True, ignoreTrailingWhiteSpace=True, nullValue='')
sway_rdd = sway_rdd.select("complex_id", "remote")
turnstile_rdd.show()

+----+---------------+----------+-----------+-----------+
|unit|        station|      date|    entries|      exits|
+----+---------------+----------+-----------+-----------+
|R156|BEDFORD PK BLVD|2018-12-28| 1730762526|   78056832|
|R292|  BAYCHESTER AV|2018-12-28|  196532900|   20297276|
|R550|     CITY / BUS|2018-12-28|   99005486|   81797186|
|R334| CATHEDRAL PKWY|2018-12-26| 1090510336| 1782637664|
|R222|    PARKCHESTER|2018-12-25| 1175301086|  645398588|
|R203|          23 ST|2018-12-25|  310225202|  214048832|
|R278|          25 ST|2018-12-24|  166487130|   57945786|
|R452|          72 ST|2018-12-23|42282868696|55910954994|
|R389|BRONX PARK EAST|2018-12-23|11926356972| 5263177644|
|R157| NORWOOD 205 ST|2018-12-21| 2170967301| 1346719694|
|R428|       BUHRE AV|2018-12-21|   44656706|   29820000|
|R250|       GRAND ST|2018-12-21|  111085500|   94134580|
|R414| HOWARD BCH JFK|2018-12-21|   28867693|    8000473|
|R148|    PARKSIDE AV|2018-12-20|  236141035|  207208197|
|R131|        

In [14]:
df = sway_rdd.join(subway_rdd, on='complex_id', how='inner')


In [15]:
df = df.withColumnRenamed("remote", "unit")
df.toPandas()

Unnamed: 0,complex_id,unit,latitude,longitude
0,635,R001,40.702068,-74.013664
1,635,R001,40.703087,-74.012994
2,635,R001,40.702068,-74.013664
3,635,R001,40.703087,-74.012994
4,635,R001,40.702068,-74.013664
...,...,...,...,...
1006,477,R570,40.768799,-73.958424
1007,476,R571,40.777891,-73.951787
1008,476,R571,40.777891,-73.951787
1009,475,R572,40.784318,-73.947152


In [16]:
df2=turnstile_rdd.join(df, on='unit', how='inner').distinct()
df2.show()

+----+---------------+----------+-----------+-----------+----------+---------+----------+
|unit|        station|      date|    entries|      exits|complex_id| latitude| longitude|
+----+---------------+----------+-----------+-----------+----------+---------+----------+
|R156|BEDFORD PK BLVD|2018-12-28| 1730762526|   78056832|       211|40.873244|-73.887138|
|R292|  BAYCHESTER AV|2018-12-28|  196532900|   20297276|       443|40.878663|-73.838591|
|R334| CATHEDRAL PKWY|2018-12-26| 1090510336| 1782637664|       155|40.800603|-73.958161|
|R222|    PARKCHESTER|2018-12-25| 1175301086|  645398588|       366|40.833226|-73.860816|
|R203|          23 ST|2018-12-25|  310225202|  214048832|       228|40.742878|-73.992821|
|R278|          25 ST|2018-12-24|  166487130|   57945786|        31|40.660397|-73.998091|
|R452|          72 ST|2018-12-23|42282868696|55910954994|       313|40.778453| -73.98197|
|R389|BRONX PARK EAST|2018-12-23|11926356972| 5263177644|       425|40.848828|-73.868457|
|R157| NOR

In [17]:
df3 = df2.dropDuplicates(["latitude", "longitude"])

In [60]:
df3.toPandas()

Unnamed: 0,unit,station,date,entries,exits,complex_id,latitude,longitude
0,R312,W 8 ST-AQUARIUM,2018-12-28,4326237,9156352,57,40.576127,-73.975939
1,R264,OCEAN PKWY,2018-12-28,129405,75681,56,40.576312,-73.968501
2,R151,CONEY IS-STILLW,2018-12-28,3309942,1284341,58,40.577422,-73.981233
3,R172,BRIGHTON BEACH,2018-12-28,584713,283597,55,40.577621,-73.961376
4,R419,ROCKAWAY PARK B,2018-12-28,100664041,0,203,40.580903,-73.835592
...,...,...,...,...,...,...,...,...
469,R431,EASTCHSTER/DYRE,2018-12-28,6431698,8285886,442,40.888300,-73.830834
470,R117,V.CORTLANDT PK,2018-07-22,620922125,620916198,293,40.889248,-73.898583
471,R367,233 ST,2018-12-28,10031280,2355779,418,40.893193,-73.857473
472,R444,NEREID AV,2018-12-28,858204,280996,417,40.898379,-73.854376


In [18]:
from pyspark.sql.functions import first
from pyspark.sql.window import Window
from pyspark.sql import functions as F
# create a window partitioned by station and ordered by date
station_window = Window.partitionBy('station').orderBy('date')

# create a new dataframe with aggregated entries and exits by station and date
station_day_sum = (
    df3.groupBy('station', 'date')
      .agg(sum('entries').alias('entries'), sum('exits').alias('exits'),
           first('latitude').alias('latitude'), first('longitude').alias('longitude'))
      .withColumn('date', df3.date.cast('date'))
      .withColumn('row_num', F.row_number().over(station_window))
)

# filter to only include the first row of each station (i.e. the earliest date for that station)
station_day_sum = station_day_sum.filter(F.col('row_num') == 1).drop('row_num')


In [20]:
station_day_sum.toPandas()

Unnamed: 0,station,date,entries,exits,latitude,longitude
0,1 AV,2018-07-28,18230852265,9759011769,40.730953,-73.981628
1,103 ST,2018-02-05,353438446,192195754,40.799446,-73.968379
2,103 ST-CORONA,2018-09-18,402532124,411560198,40.749865,-73.862700
3,104 ST,2018-05-10,10108650071,5877256005,40.681711,-73.837683
4,110 ST,2018-02-15,201464538,195302500,40.795020,-73.944250
...,...,...,...,...,...,...
342,WOODHAVEN BLVD,2018-09-23,251901586,162469082,40.733106,-73.869229
343,WOODLAWN,2018-11-25,176711552,69992735,40.886037,-73.878751
344,WTC-CORTLANDT,2018-12-18,4394348320,5415208,40.711835,-74.012188
345,YORK ST,2018-12-13,245143240,201479832,40.701397,-73.986751


In [21]:
#pip install folium

In [19]:
import folium

In [28]:
##this is not working

import folium

# initialize the map
fmap = folium.Map(location=[40.738, -73.94], zoom_start=12, tiles="CartoDB dark_matter")

# define a function to create a circle marker for each row of data
def create_circle_marker(row):
    data = row.asDict()
    
    # create popup text
    popup_text = "{}<br> total entries: {}<br> total exits: {}".format(
        data['station'], int(data['entries']), int(data['exits']))
    
    
    # create circle marker
    folium.CircleMarker(
        location=(data['latitude'], data['longitude']),
        radius= 10,
        color="#E37222",
        popup=popup_text,
        fill=True
    ).add_to(fmap)

# create a temporary view of the station_day_sum dataframe
station_day_sum.createOrReplaceTempView('station_day_sum')

# select the rows corresponding to the most recent 5 days
most_recent_5 = spark.sql("SELECT * FROM station_day_sum WHERE date >= DATE_SUB(CURRENT_DATE(), 5)")

most_recent_5.foreach(create_circle_marker)

# display the map
fmap.save("part_1.html")

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 114.0 failed 1 times, most recent failure: Lost task 0.0 in stage 114.0 (TID 76) (Priyank executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:192)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:166)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1589)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:694)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:738)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:690)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:655)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:631)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:588)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:546)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:179)
	... 15 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2328)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1019)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1018)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:76)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:578)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1589)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:192)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:166)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:694)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:738)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:690)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:655)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:631)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:588)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:546)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:179)
	... 15 more
