In [1]:
import pandas as pd
import os
import numpy as np
import pandas_profiling

# This line is needed to display plots inline in Jupyter Notebook
%matplotlib inline

# Required for basic python plotting functionality
import matplotlib.pyplot as plt

# Required for formatting dates later in the case
import datetime
import matplotlib.dates as mdates

# Required to display image inline
from IPython.display import Image

# Advanced plotting functionality with seaborn
import seaborn as sns
sns.set(style="whitegrid") # can set style depending on how you'd like it to look

import folium  #needed for interactive map
from folium.plugins import HeatMap

import geopandas
from shapely.geometry import Polygon
from shapely.geometry import Point
from geopandas import GeoDataFrame

In [18]:
# Opens the map as a GeoDataFrame
path = 'Dataset/Original'
nyc_bouroughs = geopandas.read_file(path + '/Neighborhood Tabulation Areas.geojson')

path = 'Dataset/Parquets'
yellow_trips_2 = pd.read_parquet(path + '/yellow_trips_filtered.parquet', engine = 'pyarrow')

In [26]:
yellow_trips_2['Index'] = range(yellow_trips_2.shape[0])
yellow_trips_2.head()
yellow_trips_pu = yellow_trips_2.copy()
yellow_trips_do = yellow_trips_2.copy()

yellow_trips_pu = yellow_trips_pu.drop(['dropoff_longitude','dropoff_latitude'], axis = 1)
yellow_trips_do = yellow_trips_do.drop(['pickup_datetime',
'dropoff_datetime',
'pickup_longitude',
'pickup_latitude',
#'dropoff_longitude',
#'dropoff_latitude',
'passenger_count',
'trip_distance',
'total_amount',
'pickup_month',
'dropoff_month',
'amount_per_distance',
#'Index',
], axis = 1)

In [27]:
geometry = [Point(xy) for xy in zip(yellow_trips_pu.pickup_longitude, yellow_trips_pu.pickup_latitude)]
# df = df.drop(['Lon', 'Lat'], axis=1)
crs = {'init': 'epsg:4326'}
gdf_pu = GeoDataFrame(yellow_trips_pu, crs=crs, geometry=geometry)

In [29]:
geometry = [Point(xy) for xy in zip(yellow_trips_do.dropoff_longitude, yellow_trips_do.dropoff_latitude)]
# df = df.drop(['Lon', 'Lat'], axis=1)
crs = {'init': 'epsg:4326'}
gdf_do = GeoDataFrame(yellow_trips_do, crs=crs, geometry=geometry)

In [30]:
print(gdf_pu.shape)
gdf_pu_boroughs = geopandas.sjoin(gdf_pu, nyc_bouroughs, how="inner", op='intersects')
print(gdf_pu_boroughs.shape)

(7588170, 12)
(7435930, 20)


In [31]:
print(gdf_pu.shape)
gdf_do_boroughs = geopandas.sjoin(gdf_do, nyc_bouroughs, how="inner", op='intersects')
print(gdf_do_boroughs.shape)

(7588170, 12)
(7422675, 12)


In [32]:
gdf = gdf_pu_boroughs.merge(gdf_do_boroughs, on = 'Index', how = 'inner', suffixes = ('_pickup', '_dropoff'))

In [33]:
gdf.dtypes

pickup_datetime        datetime64[ns]
dropoff_datetime       datetime64[ns]
pickup_longitude              float64
pickup_latitude               float64
passenger_count                 int64
trip_distance                 float64
total_amount                  float64
pickup_month                    int64
dropoff_month                   int64
amount_per_distance           float64
Index                           int32
geometry_pickup              geometry
index_right_pickup              int64
ntacode_pickup                 object
shape_area_pickup              object
county_fips_pickup             object
ntaname_pickup                 object
shape_leng_pickup              object
boro_name_pickup               object
boro_code_pickup               object
dropoff_longitude             float64
dropoff_latitude              float64
geometry_dropoff             geometry
index_right_dropoff             int64
ntacode_dropoff                object
shape_area_dropoff             object
county_fips_

In [34]:
gdf_final = gdf.drop(['Index', 'geometry_pickup', 'geometry_dropoff'], axis = 1)

In [35]:
path = 'Dataset/Parquets'
gdf_final.to_parquet(path + '/yellow_trips_nta.parquet', engine = 'pyarrow')