In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import rtree
import time
from pyspark import SparkContext
from geopandas import GeoDataFrame

In [2]:
# import gzip
# import shutil
# with gzip.open('yellow.csv.gz', 'rb') as f_in:
#     with open('yellow.csv', 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)

In [3]:
sc = SparkContext()

In [4]:
def genIndex(shapefile):
    import rtree
    import fiona.crs
    import geopandas as gpd
    zones = gpd.read_file(shapefile).to_crs(fiona.crs.from_epsg(2263))
    index = rtree.Rtree()
    for idx, geometry in enumerate(zones.geometry):
        yield (idx, geometry.bounds, zones.iloc[idx])

def getZone(p, index, field):
    matches = index.intersection((p.x, p.y, p.x, p.y), objects=True)
    for match in matches:
        hood = match.object
        if hood.geometry.contains(p):
            return hood[field]
    return None
        
def createIndex(shapefile):
    import rtree
    import fiona.crs
    import geopandas as gpd
    zones = gpd.read_file(shapefile).to_crs(fiona.crs.from_epsg(2263))
    index = rtree.Rtree()
    for idx, geometry in enumerate(zones.geometry):
        index.insert(idx, geometry.bounds)
    return {"index": index, "zones": zones}

def findZone(p, geo_map):
    match = geo_map['index'].intersection((p.x, p.y, p.x, p.y))
    for idx in match:
        if geo_map['zones'].geometry[idx].contains(p):
            return idx
    return None

def processTrips(pid, records):
    import csv
    import pyproj
    import shapely.geometry as geom
    import rtree

    # Skip the header
    if pid==0:
        next(records)
    
    reader = csv.reader(records)
    
    # Create an R-tree index
    proj = pyproj.Proj(init="epsg:2263", preserve_units=True)    
#     boros = createIndex('boroughs.geojson')    
#     neighborhoods = createIndex('neighborhoods.geojson')  
    boros_index = rtree.index.Index(genIndex('boroughs.geojson'))
    hood_index = rtree.index.Index(genIndex('neighborhoods.geojson'))

    for row in reader:
        # 'tpep_pickup_datetime,tpep_dropoff_datetime,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude',
        try: 
            if 'NULL' in row[2:5]: 
                continue
            pickup_point = geom.Point(proj(float(row[3]), float(row[2])))
#             start_boro = findZone(pickup_point, boros)
            start_boro = getZone(pickup_point, boros_index, 'boroname')

            if start_boro:
                dropoff_point= geom.Point(proj(float(row[5]), float(row[4])))
#                 end_hood = findZone(dropoff_point, neighborhoods)
                end_hood = getZone(dropoff_point, hood_index, 'neighborhood')
                if end_hood:
#                     boro_name = boros['zones'].iloc[start_boro]['boroname']
#                     hood_name = neighborhoods['zones'].iloc[end_hood]['neighborhood']
                    yield ( (start_boro, end_hood), 1 )
        except: 
            print("Failed at: ", row) ##TODO this won't log anything
            

In [17]:
def new_partition(row):
    if row[0][0] == 'Manhattan':
        return 0
    if row[0][0] == 'Brooklyn':
        return 1
    if row[0][0] == 'Queens':
        return 2
    else:
        return 3

def run_spark(taxi_file, sc):
    from heapq import nlargest
    from operator import itemgetter

    start = time.time()
    rdd = sc.textFile(taxi_file).mapPartitionsWithIndex(processTrips)
    
    counts = rdd.reduceByKey(lambda x,y: x+y) \
                .map(lambda x: ( x[0][0], [(x[0][1], x[1])] ) ) \
                .reduceByKey(lambda x,y: x+y) \
                .mapValues(lambda hood_counts: nlargest(3, hood_counts, key=itemgetter(1))) \
                .map(lambda x: str(x[0]) + "," + ",".join([str(i) for sub in x[1] for i in sub])) \
                .collect()
    
    counts.sort()
    return counts


In [18]:
start = time.time()
print(run_spark('yellow100.csv', sc))
print("Execution Time(secs): ", time.time() - start)

Partitions:  2
Partitions:  4
['Brooklyn,Clinton Hill,1,Boerum Hill,1,Prospect Heights,1', 'Manhattan,Upper West Side,14,Chelsea,10,Midtown,9', "Queens,Astoria,3,Hell's Kitchen,1"]
Execution Time(secs):  59.24747133255005


In [19]:
start = time.time()
print(run_spark('yellow250.csv', sc))
print("Execution Time(secs): ", time.time() - start)

Partitions:  2
Partitions:  4
['Brooklyn,Bushwick,2,Clinton Hill,1,Boerum Hill,1', 'Manhattan,Chelsea,27,Upper West Side,23,Upper East Side,22', "Queens,Astoria,3,LaGuardia Airport,1,Hell's Kitchen,1"]
Execution Time(secs):  65.58607077598572


In [20]:
# Brooklyn,Bushwick,3,Williamsburg,3,Crown Heights,2', 'Manhattan,Upper East Side,51,Chelsea,45,Midtown,43', 
# 'Queens,Astoria,3,Long Island City,2,LaGuardia Airport,1'
start = time.time()
print(run_spark('yellow500.csv', sc))
print("Execution Time(secs): ", time.time() - start)

Partitions:  2
Partitions:  4
['Brooklyn,Williamsburg,3,Bushwick,3,Crown Heights,2', 'Manhattan,Upper East Side,51,Chelsea,45,Midtown,43', 'Queens,Astoria,3,Long Island City,2,LaGuardia Airport,1']
Execution Time(secs):  60.23847579956055


In [9]:
start = time.time()
print(run_spark('yellow1000.csv', sc))
print("Execution Time(secs): ", time.time() - start)

['Brooklyn,Williamsburg,8,Bushwick,7,Clinton Hill,4', 'Manhattan,Upper East Side,108,Midtown,92,Chelsea,86', 'Queens,Astoria,5,Upper East Side,2,Upper West Side,2']
Execution Time(secs):  50.558775424957275


In [10]:
start = time.time()
print(run_spark('yellow.csv', sc))
print("Execution Time(secs): ", time.time() - start)

['Bronx,Harlem,50,Longwood,49,East Harlem,32', 'Brooklyn,Williamsburg,1880,Bedford-Stuyvesant,983,Greenpoint,730', 'Manhattan,Upper East Side,52860,Midtown,46448,Upper West Side,36754', 'Queens,Midtown,1910,Upper East Side,1231,Astoria,1084', "Staten Island,Castleton Corners,2,West Brighton,1,Bull's Head,1"]
Execution Time(secs):  1732.6583473682404


In [22]:
"{project}".format(project="hi")

'hi'