In [3]:
import pandas as pd
import numpy as np
import rtree
import time
from pyspark import SparkContext
from geopandas import GeoDataFrame
import traceback


In [4]:
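# One-time setup, kept for reference (disabled): decompress yellow.csv.gz
# into yellow.csv next to this notebook.
# import gzip
# import shutil
# with gzip.open('yellow.csv.gz', 'rb') as f_in:
#     with open('yellow.csv', 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)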

In [5]:
# Reuse the already-running SparkContext when this cell is executed again;
# constructing a second context in the same kernel raises a ValueError.
sc = SparkContext.getOrCreate()

In [12]:
def genIndex(shapefile):
    """Stream ``(id, bounds, row)`` entries for every zone in ``shapefile``.

    The file is read with geopandas and reprojected to EPSG:2263. For each
    geometry the generator yields its positional index, its bounding box and
    the matching row of the zones frame, which is the shape consumed by
    ``rtree.index.Index`` when it is built from a stream.
    """
    import fiona.crs
    import geopandas as gpd

    zones_raw = gpd.read_file(shapefile)
    zones = zones_raw.to_crs(fiona.crs.from_epsg(2263))

    position = 0
    for shape in zones.geometry:
        yield (position, shape.bounds, zones.iloc[position])
        position += 1

def getZone(p, index, field):
    """Return ``field`` of the first indexed zone whose geometry contains ``p``.

    ``index`` is queried with the degenerate bounding box of the point and
    asked for the raw stored objects; each candidate is then checked with an
    exact ``contains`` test. Returns ``None`` when no candidate contains ``p``.
    """
    point_box = (p.x, p.y, p.x, p.y)
    candidates = index.intersection(point_box, objects='raw')
    # Bounding boxes only pre-filter; confirm with the real geometry.
    containing = (zone for zone in candidates if zone.geometry.contains(p))
    hit = next(containing, None)
    return None if hit is None else hit[field]

def createIndex(shapefile):
    """Load ``shapefile`` and build an R-tree over the zones' bounding boxes.

    The zones are read with geopandas and reprojected to EPSG:2263. Each
    geometry's bounds are inserted into the tree under the geometry's
    positional index.

    Returns a dict with two entries: ``"index"`` (the R-tree) and ``"zones"``
    (the reprojected zones frame).
    """
    import rtree
    import fiona.crs
    import geopandas as gpd

    zones = gpd.read_file(shapefile).to_crs(fiona.crs.from_epsg(2263))

    spatial_index = rtree.Rtree()
    position = 0
    for bounds in (shape.bounds for shape in zones.geometry):
        spatial_index.insert(position, bounds)
        position += 1

    return dict(index=spatial_index, zones=zones)

def findZone(p, geo_map):
    """Return the positional index of the zone containing ``p``, or ``None``.

    Parameters
    ----------
    p : point-like
        Object exposing ``x`` and ``y``; passed to each geometry's ``contains``.
    geo_map : dict
        Mapping with an ``'index'`` entry (queried via ``intersection``) and a
        ``'zones'`` entry whose ``geometry`` column holds the zone shapes, as
        produced by ``createIndex``.

    Returns
    -------
    int or None
        Position of the first candidate zone whose geometry contains ``p``.
        Note that ``0`` is a valid result, so callers must compare against
        ``None`` rather than rely on truthiness.
    """
    candidates = geo_map['index'].intersection((p.x, p.y, p.x, p.y))
    geometries = geo_map['zones'].geometry
    for idx in candidates:
        # The ids stored in the index are positions (from enumerate), so the
        # lookup must be positional too; label-based `geometry[idx]` breaks as
        # soon as the zones frame has a non-default index.
        if geometries.iloc[idx].contains(p):
            return idx
    return None

def processTrips(pid, records):
    """Map one partition of raw taxi CSV lines to ``((borough, neighborhood), 1)``.

    Written for ``RDD.mapPartitionsWithIndex``.

    Parameters
    ----------
    pid : int
        Partition index. Partition 0 is treated as starting with the CSV
        header line, which is skipped.
    records : iterator of str
        The partition's CSV lines.

    Yields
    ------
    tuple
        ``((boroname, neighborhood), 1)`` for every row whose pickup point lies
        in a borough and whose drop-off point lies in a neighborhood. Rows with
        a ``'NULL'`` coordinate, rows outside every zone, and rows that fail to
        parse are skipped (the latter are reported via ``print``).
    """
    import csv
    import traceback
    import pyproj
    import shapely.geometry as geom

    # Skip the header. The default stops an empty first partition from raising
    # StopIteration, which would surface as a RuntimeError inside a generator.
    if pid == 0:
        next(records, None)

    reader = csv.reader(records)
    proj = pyproj.Proj(init="epsg:2263", preserve_units=True)

    # Both indexes are rebuilt from the GeoJSON files once per partition.
    boros = createIndex('boroughs.geojson')
    neighborhoods = createIndex('neighborhoods.geojson')

    for row in reader:
        try:
            # Columns 2..5 hold the pickup/drop-off coordinates.
            if 'NULL' in row[2:6]:
                continue

            # proj is called as (row[3], row[2]) -- presumably (lon, lat);
            # verify against the CSV schema.
            pickup_point = geom.Point(proj(float(row[3]), float(row[2])))
            dropoff_point = geom.Point(proj(float(row[5]), float(row[4])))

            start_boro = findZone(pickup_point, boros)
            end_hood = findZone(dropoff_point, neighborhoods)

            # findZone returns a positional index, and 0 is a valid zone, so
            # test against None instead of truthiness.
            if start_boro is not None and end_hood is not None:
                boro_name = boros['zones'].iloc[start_boro]['boroname']
                hood_name = neighborhoods['zones'].iloc[end_hood]['neighborhood']
                yield ((boro_name, hood_name), 1)

        except Exception:
            # Deliberately best-effort: one bad row must not fail the job.
            # `Exception` (not a bare except) lets GeneratorExit and
            # KeyboardInterrupt propagate out of the generator.
            # NOTE(review): this runs inside a Spark task, so the output
            # presumably ends up in the worker's stdout, not the notebook.
            print(traceback.format_exc())
            print("Failed at: ", row)


In [13]:
def run_spark(taxi_file, sc):
    """Return, per pickup borough, the three most frequent drop-off neighborhoods.

    ``taxi_file`` is read as text through ``sc`` and mapped with
    ``processTrips``. Trips are counted per (borough, neighborhood), regrouped
    per borough, reduced to the three largest counts and sorted by borough.

    Returns a list of strings of the form
    ``"<borough>,<hood>,<count>,<hood>,<count>,..."``. The partition count is
    printed as a side effect.
    """
    from heapq import nlargest
    from operator import itemgetter

    def regroup_by_boro(pair_total):
        # ((boro, hood), n) -> (boro, [(hood, n)]) so the lists can be concatenated.
        (boro, hood), total = pair_total
        return (boro, [(hood, total)])

    def top_three(hood_counts):
        return nlargest(3, hood_counts, key=itemgetter(1))

    def as_csv_line(boro_ranked):
        boro, ranked = boro_ranked
        return boro + "," + ",".join([str(value) for entry in ranked for value in entry])

    trips = sc.textFile(taxi_file).mapPartitionsWithIndex(processTrips).cache()

    print("Number of Partitions: ", trips.getNumPartitions())

    pair_totals = trips.reduceByKey(lambda left, right: left + right)
    per_boro = pair_totals.map(regroup_by_boro).reduceByKey(lambda left, right: left + right)
    ranked_lines = per_boro.mapValues(top_three).sortByKey().map(as_csv_line)

    return ranked_lines.collect()


In [17]:
# Benchmark: run the pipeline on 'yellow100.csv' and report wall-clock time.
start = time.time()
top_hoods = run_spark('yellow100.csv', sc)
print(top_hoods)
elapsed = time.time() - start
print("Execution Time(secs): ", elapsed)

Number of Partitions:  2
['Brooklyn,Clinton Hill,1,Boerum Hill,1,Prospect Heights,1', 'Manhattan,Upper West Side,14,Chelsea,10,Midtown,9', "Queens,Astoria,3,Hell's Kitchen,1"]
Execution Time(secs):  59.48563313484192


In [18]:
# Benchmark: run the pipeline on 'yellow250.csv' and report wall-clock time.
start = time.time()
top_hoods = run_spark('yellow250.csv', sc)
print(top_hoods)
elapsed = time.time() - start
print("Execution Time(secs): ", elapsed)

Number of Partitions:  2
['Brooklyn,Bushwick,2,Clinton Hill,1,Boerum Hill,1', 'Manhattan,Chelsea,27,Upper West Side,23,Upper East Side,22', "Queens,Astoria,3,LaGuardia Airport,1,Hell's Kitchen,1"]
Execution Time(secs):  60.8266339302063


In [16]:
# Result previously obtained for 'yellow500.csv', kept for comparison:
# ['Brooklyn,Bushwick,3,Williamsburg,3,Crown Heights,2', 'Manhattan,Upper East Side,51,Chelsea,45,Midtown,43',
#  'Queens,Astoria,3,Long Island City,2,LaGuardia Airport,1']
start = time.time()
top_hoods = run_spark('yellow500.csv', sc)
print(top_hoods)
elapsed = time.time() - start
print("Execution Time(secs): ", elapsed)

Number of Partitions:  2
['Brooklyn,Bushwick,3,Williamsburg,3,Crown Heights,2', 'Manhattan,Upper East Side,51,Chelsea,45,Midtown,43', 'Queens,Astoria,3,Long Island City,2,LaGuardia Airport,1']
Execution Time(secs):  62.504777669906616


In [55]:
# Benchmark: run the pipeline on 'yellow1000.csv' and report wall-clock time.
start = time.time()
top_hoods = run_spark('yellow1000.csv', sc)
print(top_hoods)
elapsed = time.time() - start
print("Execution Time(secs): ", elapsed)

['Brooklyn,Williamsburg,8,Bushwick,7,Clinton Hill,4', 'Manhattan,Upper East Side,108,Midtown,92,Chelsea,86', 'Queens,Astoria,5,Upper East Side,2,Upper West Side,2']
Execution Time(secs):  65.14846420288086


In [None]:
# Benchmark: run the pipeline on the complete 'yellow.csv' file and report
# wall-clock time.
start = time.time()
top_hoods = run_spark('yellow.csv', sc)
print(top_hoods)
elapsed = time.time() - start
print("Execution Time(secs): ", elapsed)

In [11]:
import pyproj
import shapely.geometry as geom

# Sample point used by the two lookup benchmarks that follow, projected to
# EPSG:2263. Coordinates are passed as floats (as processTrips does) instead
# of strings, so nothing relies on pyproj coercing text to numbers.
proj = pyproj.Proj(init="epsg:2263", preserve_units=True)
point = geom.Point(proj(-73.99164581, 40.7387085))

In [45]:
# Lookup benchmark, variant 1: build the rtree index from the genIndex stream
# (zone rows stored inside the index) and resolve the sample point with getZone.
start = time.time()
hood_gen = rtree.index.Index(genIndex('neighborhoods.geojson'))
zone = getZone(point, hood_gen, 'neighborhood')
print(zone)
elapsed = time.time() - start
print("Execution Time(secs): ", elapsed)

Chelsea
Execution Time(secs):  29.4283344745636


In [46]:
# Lookup benchmark, variant 2: build the index with createIndex (ids only) and
# resolve the sample point with findZone, which returns the zone's position.
start = time.time()
hood_index = createIndex('neighborhoods.geojson')
point_zone = findZone(point, hood_index)
print(point_zone)
elapsed = time.time() - start
print("Execution Time(secs): ", elapsed)

155
Execution Time(secs):  30.106755018234253
