In [1]:
from sedona.register import SedonaRegistrator  
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
import geopandas as gpd
from shapely.geometry import Point, Polygon
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from haversine import haversine, Unit
import pyspark.sql.types as types
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.dates as mdates
import leafmap

In [2]:
import os
import urllib
import json
from threading import Thread, Lock
from tqdm import tqdm
from kafka import KafkaConsumer, KafkaProducer
import pyspark
from pyspark.sql import SparkSession
import pandas

In [3]:
# spark session initialization
# Settings are applied to the builder in the order listed here.
spark_settings = {
    "spark.sql.debug.maxToStringFields": 50,
    "spark.driver.memory": '8g',
    "spark.executor.instances": 4,
    "spark.executor.cores": 5,
}

session_builder = SparkSession.builder.master("local[5]").appName("main")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

spark

In [4]:
# Echo the executor core count that was passed to the session builder.
# NOTE(review): the master is "local[5]", so this presumably reports only the
# configured value rather than the parallelism actually in use — confirm.
spark.conf.get("spark.executor.cores")

'5'

In [5]:
# data folder
data_dir = 'data'
# Spark CSV output directory holding the intermediate station statistics
# (it is joined with a part-file name when read back later in the notebook).
intr_dir = 'results/ss_df.csv'

# data url
# Each dataset descriptor carries the remote 'url' and the local 'filename'
# it is stored under inside `data_dir`.
subway_loc = dict(
    url='http://web.mta.info/developers/data/nyct/subway/Stations.csv?$limit=15000000',
    filename='subway.csv',
)


In [6]:
# download flags
downloadflag = True   # master switch: when False nothing is ever fetched
redownload = False    # when True an already-present local copy is fetched again

# Serialises downloads so that only one transfer runs at a time.
thread_lock = Lock()

# download utils
def download_dataset_thread(loc, folder):
    """Download ``loc['url']`` to ``folder/loc['filename']`` with a progress bar.

    The transfer is skipped when ``downloadflag`` is False, or when the target
    already exists and ``redownload`` is False.

    The data is first written to ``<target>.part`` and only moved over the
    target once the transfer has finished, so an interrupted download never
    leaves a truncated file that a later run would mistake for a complete one,
    and a failed re-download keeps the previous copy.

    Args:
        loc: mapping with the keys ``'url'`` and ``'filename'``.
        folder: directory the file is stored in; created if missing.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or the filesystem calls raise;
        the partial file is removed before the error propagates.
    """
    # A bare `import urllib` does not guarantee the `request` submodule is loaded.
    import urllib.request

    target = os.path.join(folder, loc['filename'])
    partial = target + '.part'

    with thread_lock:
        if not downloadflag:
            return
        if os.path.exists(target) and not redownload:
            return

        if folder:
            os.makedirs(folder, exist_ok=True)

        try:
            with tqdm(unit="B", unit_scale=True, desc=loc['filename'], miniters=1) as progress_bar:
                def report(block_num, block_size, total_size):
                    # urlretrieve reports the number of blocks transferred so far
                    # (0 on the initial call), so advance the bar by the delta
                    # instead of adding a full block on every call.
                    transferred = block_num * block_size
                    if total_size > 0:
                        progress_bar.total = total_size
                        transferred = min(transferred, total_size)
                    progress_bar.update(transferred - progress_bar.n)

                urllib.request.urlretrieve(loc['url'], partial, report)
            # Publish the finished file in one step, replacing any older copy.
            os.replace(partial, target)
        except BaseException:
            if os.path.isfile(partial):
                os.remove(partial)
            raise


def download_dataset(loc, folder):
    """Run ``download_dataset_thread`` in a worker thread and wait for it.

    Any exception raised by the worker is re-raised here, so a failed download
    stops the notebook instead of being printed and silently ignored.
    """
    failures = []

    def worker():
        try:
            download_dataset_thread(loc, folder)
        except BaseException as exc:
            failures.append(exc)

    thread = Thread(target=worker)
    thread.start()
    thread.join()

    if failures:
        raise failures[0]

In [7]:
# download datasets
# Every descriptor listed here is fetched into `data_dir` (skipped when cached).
datasets_to_fetch = (subway_loc,)
for dataset_loc in datasets_to_fetch:
    download_dataset(dataset_loc, data_dir)

In [8]:
# Load the downloaded stations CSV, treating the first row as the header and
# letting Spark infer the column types.
# Note: despite its name, `subway_rdd` is the object returned by spark.read.csv.
subway_rdd = spark.read.csv(os.path.join(data_dir, subway_loc['filename']), header=True, inferSchema=True)

In [9]:
# Names assigned positionally to every column of the raw stations table.
station_columns = [
    "Station ID", "Complex ID", "GTFS Stop ID", "Division", "Line",
    "Stop Name", "Borough", "Daytime Routes", "Structure",
    "GTFS Latitude", "GTFS Longitude",
    "North Direction Label", "South Direction Label",
    "ADA", "ADA Direction Notes", "ADA NB", "ADA SB",
    "Capital Outage NB", "Capital Outage SB",
]

# Subset of columns kept for the rest of the analysis.
station_columns_kept = [
    'Station ID', 'Complex ID', 'GTFS Stop ID', 'Stop Name', 'Borough',
    'GTFS Latitude', 'GTFS Longitude',
]

# Only stations with both coordinates present are retained.
has_coordinates = F.col('GTFS Latitude').isNotNull() & F.col('GTFS Longitude').isNotNull()

subway_DF = (
    subway_rdd.toDF(*station_columns)
    .select(*station_columns_kept)
    .filter(has_coordinates)
)

In [10]:
subway_DF.show()

+----------+----------+------------+--------------------+-------+-------------+--------------+
|Station ID|Complex ID|GTFS Stop ID|           Stop Name|Borough|GTFS Latitude|GTFS Longitude|
+----------+----------+------------+--------------------+-------+-------------+--------------+
|         1|         1|         R01|Astoria-Ditmars Blvd|      Q|    40.775036|    -73.912034|
|         2|         2|         R03|        Astoria Blvd|      Q|    40.770258|    -73.917843|
|         3|         3|         R04|               30 Av|      Q|    40.766779|    -73.921479|
|         4|         4|         R05|            Broadway|      Q|     40.76182|    -73.925508|
|         5|         5|         R06|               36 Av|      Q|    40.756804|    -73.929575|
|         6|         6|         R08|   39 Av-Dutch Kills|      Q|    40.752882|    -73.932755|
|         7|       613|         R11|  Lexington Av/59 St|      M|     40.76266|    -73.967258|
|         8|         8|         R13|          5 Av

In [11]:
subway_DF.printSchema()

root
 |-- Station ID: integer (nullable = true)
 |-- Complex ID: integer (nullable = true)
 |-- GTFS Stop ID: string (nullable = true)
 |-- Stop Name: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- GTFS Latitude: double (nullable = true)
 |-- GTFS Longitude: double (nullable = true)



In [12]:
subway_DF.count()

496

In [13]:
# Read the intermediate station statistics back from the Spark CSV output
# directory. Pointing the reader at the directory (rather than at one
# `part-00000-<uuid>.csv` file) keeps this cell working after the results are
# regenerated, because Spark gives the part files a new random name on every write.
ss_df = spark.read.csv(intr_dir, header=True, inferSchema=True)
ss_df.show()

+---+-------------------+--------------------+-------+---------+----------+------------------+--------------------+-----------------+-----------------+-------+----------+-------+
| id|               line|           stop_name|borough|      lat|      long|           n_label|             s_label|          entries|            exits|arrests|complaints|summons|
+---+-------------------+--------------------+-------+---------+----------+------------------+--------------------+-----------------+-----------------+-------+----------+-------+
|F12|        Queens Blvd|          5 Av/53 St|      M|40.760167|-73.975224|            Queens| Downtown & Brooklyn| 1.71540057484E11| 1.83989511603E11|   1811|      3603|   1056|
|637|       Lexington Av|         Bleecker St|      M|40.725915|-73.994659|Uptown & The Bronx|            Downtown| 1.96565180833E11| 1.53466659135E11|   3444|      5079|   1046|
|603|             Pelham|       Middletown Rd|     Bx|40.843863|-73.836322|   Pelham Bay Park|           

In [14]:
ss_df.count()

447

In [15]:
# Collect the Spark result into a pandas DataFrame for the GeoPandas steps below.
# The cell rebinds `ss_df`, so guard the conversion: re-running the cell after it
# has already executed would otherwise fail because a pandas frame has no toPandas().
if not isinstance(ss_df, pandas.DataFrame):
    ss_df = ss_df.toPandas()

In [16]:
# Build one shapely Point per station; x is the longitude and y the latitude.
ss_df['geometry'] = [
    Point(lon, lat)
    for lon, lat in zip(ss_df['long'], ss_df['lat'])
]

In [17]:
# Each station's share of all recorded arrests, expressed in percent.
total_arrests = ss_df["arrests"].sum()
ss_df["arrest_pct_total"] = ss_df["arrests"] / total_arrests * 100

In [18]:
ss_df.head()

Unnamed: 0,id,line,stop_name,borough,lat,long,n_label,s_label,entries,exits,arrests,complaints,summons,geometry,arrest_pct_total
0,F12,Queens Blvd,5 Av/53 St,M,40.760167,-73.975224,Queens,Downtown & Brooklyn,171540100000.0,183989500000.0,1811,3603,1056,POINT (-73.975224 40.760167),0.118282
1,637,Lexington Av,Bleecker St,M,40.725915,-73.994659,Uptown & The Bronx,Downtown,196565200000.0,153466700000.0,3444,5079,1046,POINT (-73.994659 40.725915),0.224938
2,603,Pelham,Middletown Rd,Bx,40.843863,-73.836322,Pelham Bay Park,Manhattan,16027570000.0,9955782000.0,442,1094,842,POINT (-73.836322 40.843863),0.028868
3,725,Flushing,Times Sq-42 St,M,40.755477,-73.987691,Queens,34 St - Hudson Yards,494357900000.0,184005900000.0,11199,12406,14342,POINT (-73.987691 40.755477),0.731441
4,606,Pelham,Zerega Av,Bx,40.836488,-73.847036,Pelham Bay Park,Manhattan,9471628000.0,5313407000.0,649,1661,996,POINT (-73.847036 40.836488),0.042388


In [19]:
ss_df["borough"].unique()

array(['M', 'Bx', 'Q', 'Bk', 'SI'], dtype=object)

In [20]:
# Load the 2020 NTA boundaries from the project's data folder and reproject
# them to EPSG:4326 so they line up with the station points.
# The path is built from `data_dir` instead of a user-specific absolute path so
# the notebook runs on any machine that has the shapefile under data/.
nta_shapefile = os.path.join(data_dir, 'nynta2020_23a', 'nynta2020.shp')
nta_map = gpd.read_file(nta_shapefile).to_crs(4326)

In [21]:
# Promote the station table to a GeoDataFrame, using its 'geometry' column
# and tagging the coordinates as EPSG:4326.
ss_geo_df = gpd.GeoDataFrame(ss_df, geometry="geometry", crs=4326)
# Just to be extra sure
ss_geo_df.to_crs(4326, inplace=True)

In [22]:
# Static map with GeoPandas
# fig,ax = plt.subplots(figsize=(7,7))
# nta_map.boundary.plot(ax=ax, edgecolor='k');
# ss_geo_df[ss_geo_df.borough == 'M']\
#             .head(10)\
#             .plot(column='stop_name', ax=ax, legend=True, marker='.',\
#             markersize=ss_geo_df.arrest_pct_total.astype('float') * 10000);

In [23]:
# ss_geo_df = ss_geo_df.head(10)

In [24]:
# Demographic workbook read straight from the URL below; this cell needs network access.
nta_demos_url = 'https://www1.nyc.gov/assets/planning/download/office/planning-level/nyc-population/acs/demo_2019_acs5yr_nta.xlsx'
nta_demos = pandas.read_excel(nta_demos_url)

In [25]:
# Attach the demographic columns to every NTA polygon. The left join keeps
# polygons that have no matching demographics row.
nta_df = nta_map.merge(
    nta_demos,
    left_on='NTA2020',
    right_on='GeoID',
    how='left',
)

In [26]:
# Export the station points as GeoJSON (relative path, so the file lands in the
# current working directory); the map cell below loads this file as a layer.
ss_geo_df.to_file("ss_geo.json", driver="GeoJSON")

In [27]:
# Interactive map: NTA polygons with click-through demographics, plus one
# marker per station with its enforcement statistics in the popup.
m = leafmap.Map(center=(40,-100),zoom=4)
m.add_gdf(nta_df, layer_name='2020 NTA Demographic Information', info_mode='on_click')

# Load the GeoJSON written by ss_geo_df.to_file("ss_geo.json", ...) from the
# working directory instead of a user-specific absolute path.
stations_geojson = os.path.abspath("ss_geo.json")
m.add_point_layer(filename=stations_geojson, popup=['stop_name', 'arrests', 'complaints', 'summons', 'arrest_pct_total'], layer_name="Stations")

In [28]:
m

Map(center=[40.705833382057406, -73.9778002131813], controls=(ZoomControl(options=['position', 'zoom_in_text',…