In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
from geopandas import GeoDataFrame, read_file
# from geopandas.tools import sjoin
from shapely.geometry import Point, mapping,shape
import time
from shapely import prepared
import json

In [2]:
def extract_zone(df,d,box_points):
    # d is either 'dropoff' or 'pickup'
    # box_points is a 2x2 array-like containing the lon-lat of the NW corner followed by the lon-lat of the SE corner
    # box is inclusive of western and northern edge, exclusive of southern and eastern edge
    x1 = str(box_points[0][0])
    y1 = str(box_points[0][1])
    x2 = str(box_points[1][0])
    y2 = str(box_points[1][1])
    qstr = '%s_longitude >= %s & %s_longitude < %s & %s_latitude <= %s & %s_latitude > %s' % (d,x1,d,x2,d,y1,d,y2)
    return df.query(qstr)

In [3]:
def intersect_zips(df,d,zgdf):
    # d is either 'dropoff' or 'pickup'
    # zip_gdf is a geodataframe with geometry of zip codes
    index_ = df.index
    df.reset_index(drop=True,inplace=True)
    df['zipcode'] = ''
    xc = df['%s_longitude' %d]
    yc = df['%s_latitude' %d]
    pts = gpd.GeoSeries([Point(x, y) for x, y in zip(xc, yc)])
    zip_bins = {}
    for i in zgdf.index:
        zip_bins[zgdf.loc[i].ZIPCODE] = pts.loc[pts.intersects(zgdf.loc[i].geometry)]
    for k, v in zip_bins.iteritems():
        df.loc[v.index,'zipcode'] = k
    zip_centroids = {}
    for i in zgdf.index:
        zip_centroids[zgdf.loc[i].ZIPCODE] = (yc.iloc[zip_bins[zgdf.loc[i].ZIPCODE].index].mean(),
                                              xc.iloc[zip_bins[zgdf.loc[i].ZIPCODE].index].mean())
    df.index = index_
    return df, zip_centroids

In [4]:
zipcodes = gpd.GeoDataFrame.from_file('data/ZIP_CODE_040114/ZIP_CODE_040114.shp')
zipcodes = zipcodes.to_crs(epsg=4269)
zipcodes = zipcodes[['ZIPCODE','geometry']]
zipcodes.ZIPCODE = zipcodes.ZIPCODE.astype(int)

In [6]:
tlc_data = pd.read_csv('data/yellow_tripdata_2015-10.csv')

In [11]:
a = [[-73.887972,40.777185],[-73.883264,40.767971]]
b = [[-73.883264,40.778892],[-73.868364,40.770733]]
cd = [[-73.868364,40.778892],[73.858000,40.765835]]

In [12]:
a_data = extract_zone(tlc_data,'pickup',a)
a_data['terminal'] = 'A'
b_data = extract_zone(tlc_data,'pickup',b)
b_data['terminal'] = 'B'
cd_data = extract_zone(tlc_data,'pickup',cd)
cd_data['terminal'] = 'C_and_D'
lga_pickups = pd.concat([a_data,b_data,cd_data])

a_data = extract_zone(tlc_data,'dropoff',a)
a_data['terminal'] = 'A'
b_data = extract_zone(tlc_data,'dropoff',b)
b_data['terminal'] = 'B'
cd_data = extract_zone(tlc_data,'dropoff',cd)
cd_data['terminal'] = 'C_and_D'
lga_dropoffs = pd.concat([a_data,b_data,cd_data])

del a_data
del b_data
del cd_data
del tlc_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#i

In [13]:
lga_pickups, zip_centroids = intersect_zips(lga_pickups,'dropoff',zipcodes)

In [19]:
lga_pickups.reset_index(drop=True,inplace=True)
lga_pickups['duration'] = np.divide(pd.to_datetime(lga_pickups['tpep_dropoff_datetime']) - pd.to_datetime(lga_pickups['tpep_pickup_datetime']),np.timedelta64(1,'s'))
lga_pickups['total_ex_tip'] = lga_pickups['total_amount'] - lga_pickups['tip_amount']
lga_pickups_summary = lga_pickups.groupby(['terminal','zipcode'])[['passenger_count','trip_distance','duration','total_ex_tip']].mean()
lga_pickups_summary['N'] = lga_pickups.groupby(['terminal','zipcode']).size()

In [21]:
lga_dropoffs, zip_centroids = intersect_zips(lga_dropoffs,'pickup',zipcodes)

In [22]:
lga_dropoffs.reset_index(drop=True,inplace=True)
lga_dropoffs['duration'] = np.divide(pd.to_datetime(lga_dropoffs['tpep_dropoff_datetime']) - pd.to_datetime(lga_dropoffs['tpep_pickup_datetime']),np.timedelta64(1,'s'))
lga_dropoffs['total_ex_tip'] = lga_dropoffs['total_amount'] - lga_dropoffs['tip_amount']
lga_dropoffs_summary = lga_dropoffs.groupby(['terminal','zipcode'])[['passenger_count','trip_distance','duration','total_ex_tip']].mean()
lga_dropoffs_summary['N'] = lga_dropoffs.groupby(['terminal','zipcode']).size()

In [23]:
lga_pickups_summary.to_csv('data-outputs/lga_pickups_summary.csv')
lga_dropoffs_summary.to_csv('data-outputs/lga_dropoffs_summary.csv')