In [1]:
import pandas as pd
import numpy as np
import rtree
import time
from pyspark import SparkContext
import traceback

In [55]:
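# One-off helper, kept for reference only: decompress yellow.csv.gz into
# yellow.csv. It is not needed to run this notebook -- the job below passes
# 'yellow.csv.gz' straight to sc.textFile().
# import gzip
# import shutil
# with gzip.open('yellow.csv.gz', 'rb') as f_in:
#     with open('yellow.csv', 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)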

In [56]:
# GeoJSON inputs, given as relative paths.
# NOTE(review): boros_file is not referenced by any cell visible here.
boros_file = 'boroughs.geojson'
# Read by processTrips() to build the neighborhood spatial index.
neighborhood_file = 'neighborhoods.geojson'

In [4]:
# Reuse the active SparkContext when this cell is re-run: constructing a
# second SparkContext in the same kernel raises a ValueError.
sc = SparkContext.getOrCreate()

In [57]:
def createIndex(shapefile):
    """Load a zone file and build an R-tree over the zones' bounding boxes.

    Args:
        shapefile: path of a file readable by ``geopandas.read_file``.

    Returns:
        A ``(index, zones)`` tuple. ``zones`` is the GeoDataFrame
        re-projected to EPSG:2263; ``index`` maps each geometry's position
        in ``zones.geometry`` to that geometry's bounding box.
    """
    # Imported inside the function so the modules are loaded wherever the
    # function actually runs.
    import rtree
    import fiona.crs
    import geopandas as gpd

    target_crs = fiona.crs.from_epsg(2263)
    zones = gpd.read_file(shapefile).to_crs(target_crs)

    spatial_index = rtree.Rtree()
    for position, shape in enumerate(zones.geometry):
        # The inserted id is the enumeration position, not an index label.
        spatial_index.insert(position, shape.bounds)

    return spatial_index, zones

def findZone(p, index, zones):
    """Return the id of the first zone that contains point ``p``.

    Args:
        p: point object exposing ``x`` and ``y`` attributes.
        index: spatial index whose ``intersection(bbox)`` yields candidate
            zone ids for a bounding box.
        zones: object whose ``geometry[id]`` has a ``contains(p)`` method.

    Returns:
        The first candidate id whose geometry contains ``p``, or ``None``
        when no candidate does.
    """
    x, y = p.x, p.y
    # Query with a degenerate box (the point itself): the index only narrows
    # the candidates by bounding box, the exact test is done by contains().
    candidates = index.intersection((x, y, x, y))
    return next(
        (hit for hit in candidates if zones.geometry[hit].contains(p)),
        None,
    )

def processTrips(pid, records):
    """Count trips per (pickup borough, drop-off neighborhood) in one partition.

    Intended for ``RDD.mapPartitionsWithIndex``.

    Args:
        pid: index of the partition being processed; partition 0 is assumed
            to start with the CSV header line.
        records: iterator over the partition's raw CSV lines.

    Returns:
        A view of ``((borough, neighborhood), count)`` pairs, where
        ``borough`` comes from the zone containing the pickup point and
        ``neighborhood`` from the zone containing the drop-off point.
        Rows with a 'NULL' or '0' coordinate, rows outside every zone, and
        rows that fail to parse are skipped.
    """
    import csv
    import traceback
    import pyproj
    import shapely.geometry as geom

    # Skip the header. The default keeps an empty first partition from
    # raising StopIteration.
    if pid == 0:
        next(records, None)

    reader = csv.reader(records)
    proj = pyproj.Proj(init="epsg:2263", preserve_units=True)

    # The spatial index is rebuilt once per partition.
    index_n, neighborhoods = createIndex(neighborhood_file)

    counts = {}
    for row in reader:
        try:
            # Columns 2-5 hold the pickup/drop-off coordinates.
            if 'NULL' in row[2:6] or '0' in row[2:6]:
                continue

            # Columns 3 and 5 are passed as x, columns 2 and 4 as y.
            pickup_point = geom.Point(proj(float(row[3]), float(row[2])))
            dropoff_point = geom.Point(proj(float(row[5]), float(row[4])))

            start_idx = findZone(pickup_point, index_n, neighborhoods)
            end_idx = findZone(dropoff_point, index_n, neighborhoods)

            # Compare against None explicitly: zone index 0 is a valid match
            # but is falsy, so a plain truth test would drop those trips.
            if start_idx is not None and end_idx is not None:
                borough = neighborhoods.iloc[start_idx]['borough']
                neighborhood = neighborhoods.iloc[end_idx]['neighborhood']
                key = (borough, neighborhood)
                counts[key] = counts.get(key, 0) + 1

        except Exception:
            # Best effort: report the bad row and keep going. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            # NOTE(review): presumably this prints to the executor's stdout
            # rather than the notebook when run on a cluster -- confirm.
            print("Failed at: ", row)
            print(traceback.format_exc())
    return counts.items()

In [58]:
def run_spark(taxi_file, sc, top_n=3):
    """Find the most frequent drop-off neighborhoods for each pickup borough.

    Args:
        taxi_file: path of the taxi trips CSV, passed to ``sc.textFile``.
        sc: the SparkContext used to run the job.
        top_n: how many neighborhoods to keep per borough (default 3).

    Returns:
        A list with one string per borough, sorted by borough name, of the
        form ``"borough,hood1,count1,hood2,count2,..."``.
    """
    def top_destinations(hood_counts):
        # Highest count first; ties are broken by neighborhood name so the
        # result does not depend on the order the shuffle delivered them in.
        ranked = sorted(hood_counts, key=lambda pair: (-pair[1], str(pair[0])))
        return ranked[:top_n]

    def to_csv_line(entry):
        borough, ranked = entry
        fields = [str(value) for pair in ranked for value in pair]
        return borough + "," + ",".join(fields)

    counts_rdd = (
        sc.textFile(taxi_file)
        .mapPartitionsWithIndex(processTrips)
        # Merge the per-partition counts for each (borough, neighborhood).
        .reduceByKey(lambda x, y: x + y)
        # Re-key by borough, carrying a one-element [(neighborhood, count)].
        .map(lambda x: (x[0][0], [(x[0][1], x[1])]))
        # Concatenate the lists so each borough has all its neighborhoods.
        .reduceByKey(lambda x, y: x + y)
        .mapValues(top_destinations)
        .sortByKey()
        .map(to_csv_line)
    )
    return counts_rdd.collect()

In [60]:
# Run the whole job on the gzipped trips file and report the wall-clock
# time, which includes collecting the results back to the driver.
start = time.time()
print(run_spark('yellow.csv.gz', sc))
print("Execution Time(secs): ", time.time() - start)

['Bronx,Harlem,50,Longwood,49,East Harlem,32', 'Brooklyn,Williamsburg,1879,Bedford-Stuyvesant,983,Greenpoint,730', 'Manhattan,Upper East Side,52859,Midtown,46447,Upper West Side,36753', 'Queens,Midtown,1910,Upper East Side,1231,Astoria,1083', 'Staten Island,Castleton Corners,2,Dongan Hills,1,West Brighton,1']
Execution Time(secs):  1389.8245899677277


In [61]:
import pyproj
import shapely.geometry as geom
import time

# Sanity check: time one index build and one point lookup on the driver.
proj = pyproj.Proj(init="epsg:2263", preserve_units=True)
# Pass the coordinates as floats (longitude first, then latitude) instead of
# relying on pyproj to coerce strings.
point = geom.Point(proj(-73.99164581, 40.7387085))

start = time.time()
# Use the shared path constant rather than repeating the file name.
index_e, zone_e = createIndex(neighborhood_file)
print("Execution Time(secs): ", time.time() - start)

start = time.time()
point_zone = findZone(point, index_e, zone_e)
print(point_zone)
print("Execution Time(secs): ", time.time() - start)

Execution Time(secs):  34.08284401893616
155
Execution Time(secs):  0.014960289001464844
