In [15]:
import pandas as pd
import numpy as np
import rtree
import time
from pyspark import SparkContext
import traceback
# from heapq import nlargest
# from operator import itemgetter

In [2]:
# import gzip
# import shutil
# with gzip.open('yellow.csv.gz', 'rb') as f_in:
#     with open('yellow.csv', 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)

In [3]:
boros_file = 'boroughs.geojson'
neighborhood_file = 'neighborhoods.geojson'

In [4]:
sc = SparkContext()

In [16]:
def createIndex(shapefile):
    import rtree
    import fiona.crs
    import geopandas as gpd
    zones = gpd.read_file(shapefile).to_crs(fiona.crs.from_epsg(2263))
    index = rtree.Rtree()
    for idx,geometry in enumerate(zones.geometry):
        index.insert(idx, geometry.bounds)
    return (index, zones)

def findZone(p, index, zones):
    match = index.intersection((p.x, p.y, p.x, p.y))
    for idx in match:
        if zones.geometry[idx].contains(p):
            return idx
    return None

def processTrips(pid, records):
    import csv
    import pyproj
    import shapely.geometry as geom
    
    reader = csv.reader(records)
    proj = pyproj.Proj(init="epsg:2263", preserve_units=True)    
    
    index_n, neighborhoods = createIndex(neighborhood_file)  
    
    # Skip the header
    if pid == 0:
        next(records)
    
    counts = {}
    for row in reader:
        try: 
            if 'NULL' in row[2:6] or '0' in row[2:6]: 
                continue
            
            pickup_point = geom.Point(proj(float(row[3]), float(row[2])))
            dropoff_point= geom.Point(proj(float(row[5]), float(row[4])))

            start_idx = findZone(pickup_point, index_n, neighborhoods) ##, "boroname")
            end_idx   = findZone(dropoff_point, index_n, neighborhoods) ##, "neighborhood")
            
            if start_idx and end_idx:
                borough = neighborhoods.iloc[start_idx]['borough']
                neighborhood = neighborhoods.iloc[end_idx]['neighborhood']
                counts[(borough,neighborhood)] = counts.get((borough,neighborhood), 0) + 1

        except: 
            print(traceback.format_exc())
            print("Failed at: ", row) ##TODO this won't log anything
    return counts.items()

In [17]:
def run_spark(taxi_file, sc):

    rdd = sc.textFile(taxi_file)
#         sorted(data, key=lambda tup: tup[1])
    counts = rdd.mapPartitionsWithIndex(processTrips) \
                .map(lambda x: ( x[0][0], [(x[0][1], x[1])] ) ) \
                .mapValues(lambda hood_counts: sorted(hood_counts, key=lambda tup:tup[1])[:3]) \
                .sortByKey() \
                .map(lambda x: x[0] + "," + ",".join([str(i) for sub in x[1] for i in sub])) \
   
#     .mapValues(lambda hood_counts: nlargest(3, hood_counts, key=itemgetter(1))) \

    return counts.collect()


In [18]:
start = time.time()
print(run_spark('yellow100.csv', sc))
print("Execution Time(secs): ", time.time() - start)
# ['Brooklyn,Clinton Hill,1,Boerum Hill,1,Prospect Heights,1', 
#'Manhattan,Upper West Side,14,Chelsea,10,Midtown,9', "Queens,Astoria,3,
# Hell's Kitchen,1"]
# Execution Time(secs):  53.41633701324463

['Brooklyn,Clinton Hill,1', 'Brooklyn,Prospect Heights,1', 'Brooklyn,Boerum Hill,1', 'Manhattan,East Village,4', 'Manhattan,Stuyvesant Town,2', 'Manhattan,Chelsea,7', 'Manhattan,Upper West Side,7', 'Manhattan,Midtown,3', 'Manhattan,Gramercy,2', 'Manhattan,Carroll Gardens,1', 'Manhattan,Longwood,1', 'Manhattan,Lower East Side,1', 'Manhattan,Theater District,1', 'Manhattan,Murray Hill,1', "Manhattan,Hell's Kitchen,3", 'Manhattan,Highbridge,1', 'Manhattan,Greenwich Village,1', 'Manhattan,Upper East Side,4', 'Manhattan,DUMBO,1', 'Manhattan,West Village,2', 'Manhattan,Bedford-Stuyvesant,1', 'Manhattan,SoHo,1', 'Manhattan,East Harlem,1', 'Manhattan,East Village,1', 'Manhattan,Chelsea,3', 'Manhattan,Midtown,6', 'Manhattan,Upper East Side,4', 'Manhattan,Upper West Side,7', "Manhattan,Hell's Kitchen,4", 'Manhattan,Flatiron District,1', 'Manhattan,West Village,4', 'Manhattan,Murray Hill,1', 'Manhattan,Theater District,1', 'Manhattan,SoHo,1', 'Manhattan,Lower East Side,3', 'Manhattan,NoHo,1', 'Ma

In [18]:
start = time.time()
print(run_spark('yellow250.csv', sc))
print("Execution Time(secs): ", time.time() - start)

Number of Partitions:  2
['Brooklyn,Bushwick,2,Clinton Hill,1,Boerum Hill,1', 'Manhattan,Chelsea,27,Upper West Side,23,Upper East Side,22', "Queens,Astoria,3,LaGuardia Airport,1,Hell's Kitchen,1"]
Execution Time(secs):  60.8266339302063


In [16]:
# Brooklyn,Bushwick,3,Williamsburg,3,Crown Heights,2', 'Manhattan,Upper East Side,51,Chelsea,45,Midtown,43', 
# 'Queens,Astoria,3,Long Island City,2,LaGuardia Airport,1'
start = time.time()
print(run_spark('yellow500.csv', sc))
print("Execution Time(secs): ", time.time() - start)

Number of Partitions:  2
['Brooklyn,Bushwick,3,Williamsburg,3,Crown Heights,2', 'Manhattan,Upper East Side,51,Chelsea,45,Midtown,43', 'Queens,Astoria,3,Long Island City,2,LaGuardia Airport,1']
Execution Time(secs):  62.504777669906616


In [7]:
start = time.time()
print(run_spark('yellow1000.csv', sc))
print("Execution Time(secs): ", time.time() - start)

['Brooklyn,Williamsburg,5,Bushwick,4,Williamsburg,3', 'Manhattan,Upper East Side,57,Upper East Side,51,Midtown,49', 'Queens,Astoria,3,Long Island City,2,Upper East Side,2']
None
Execution Time(secs):  41.35136604309082


In [8]:
start = time.time()
print(run_spark('yellow.csv', sc))
print("Execution Time(secs): ", time.time() - start)

['Bronx,Longwood,25,Harlem,25,Harlem,25', 'Brooklyn,Williamsburg,1163,Williamsburg,716,Bedford-Stuyvesant,572', 'Manhattan,Upper East Side,29644,Midtown,24159,Upper East Side,23215', 'Queens,Midtown,1290,Upper East Side,820,Midtown,620', 'Staten Island,Dongan Hills,1,West Brighton,1,Midtown,1']
None
Execution Time(secs):  239.68706727027893


In [19]:
import pyproj
import shapely.geometry as geom

proj = pyproj.Proj(init="epsg:2263", preserve_units=True)   
point = geom.Point(proj('-73.99164581','40.7387085'))

In [9]:
start = time.time()
index_e, hood_e = createIndex('neighborhoods.geojson') 
print("Execution Time(secs): ", time.time() - start)

Execution Time(secs):  30.418677806854248


In [21]:
start = time.time()
point_zone = findZone(point, index_e, hood_e)
print("Execution Time(secs): ", time.time() - start)

Execution Time(secs):  0.00494694709777832
