In [1]:
import sys, os
sys.path.insert(0,"../code")

import pandas as pd
import numpy as np
import geopandas as gpd
import pyproj

%load_ext autoreload
%autoreload 2

# Unzip data and load into DataFrame 

In [2]:
import zipfile

def load_data(path_data="../data", link="https://www.kaggle.com/c/6960/download-all"): 
    
    if not os.path.exists(path_data): 
        os.mkdir(path_data)
    
    if not os.path.exists(os.path.join(path_data,"nyc-taxi-trip-duration")):
        
        if not os.path.exists(os.path.join(path_data,"nyc-taxi-trip-duration.zip")):
            print(f"Please download data from {link} into {path_data}.")
        else: 
            zip_obj = zipfile.ZipFile(os.path.join(path_data,"nyc-taxi-trip-duration.zip"), 'r')
            zip_obj.extractall(os.path.join(path_data,"nyc-taxi-trip-duration"))
            zip_obj.close()
            zip_obj = zipfile.ZipFile(os.path.join(path_data,"nyc-taxi-trip-duration",'train.zip'), 'r')
            zip_obj.extractall(os.path.join(path_data,"nyc-taxi-trip-duration"))
            zip_obj.close()

In [3]:
load_data()

In [4]:
PATH_DATA = os.path.join("..\data","nyc-taxi-trip-duration","train","train.csv")

df_rides = pd.read_csv(PATH_DATA,index_col=0,
                       parse_dates=[2,3],
                       dtype={'store_and_fwd_flag':'category','vendor_id':'category','passenger_count':'int8',})

In [5]:
## Split data into preliminary test and train in order to get an estimate without commiting to Kaggel 
df_train_sample = df_rides.sample(frac=0.15)
df_rides = df_rides.loc[set(df_rides.index)-set(df_train_sample.index)]
print(df_train_sample.shape)
print(df_rides.shape)
df_train_sample.to_pickle(os.path.join("../data","df_train_sample.pickle"))

(218797, 10)
(1239847, 10)


##  Basic data exploration 

In [6]:
print(df_rides.info())
df_rides.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1239847 entries, id2485291 to id3116339
Data columns (total 10 columns):
vendor_id             1239847 non-null category
pickup_datetime       1239847 non-null datetime64[ns]
dropoff_datetime      1239847 non-null datetime64[ns]
passenger_count       1239847 non-null int8
pickup_longitude      1239847 non-null float64
pickup_latitude       1239847 non-null float64
dropoff_longitude     1239847 non-null float64
dropoff_latitude      1239847 non-null float64
store_and_fwd_flag    1239847 non-null category
trip_duration         1239847 non-null int64
dtypes: category(2), datetime64[ns](2), float64(4), int64(1), int8(1)
memory usage: 79.2+ MB
None


Unnamed: 0_level_0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
id2485291,1,2016-05-28 23:46:14,2016-05-28 23:53:41,2,-73.991608,40.750118,-73.987122,40.760979,N,447
id1443424,1,2016-04-15 13:32:15,2016-04-15 13:44:41,2,-73.941551,40.838516,-73.950233,40.825512,N,746
id2057397,1,2016-04-22 21:40:31,2016-04-22 22:09:14,2,-73.9729,40.75441,-74.031296,40.622852,N,1723
id3472628,2,2016-03-09 13:07:56,2016-03-09 13:35:34,1,-73.966354,40.761745,-74.008476,40.73428,N,1658
id2661763,2,2016-04-15 21:13:44,2016-04-15 21:23:09,2,-73.969307,40.757099,-73.983253,40.748829,N,565


In [7]:
pd.isna(df_rides).sum()

vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64

In [8]:
df_rides.describe()

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1239847.0,1239847.0,1239847.0,1239847.0,1239847.0,1239847.0
mean,1.664801,-73.9735,40.75091,-73.97342,40.75181,961.993
std,1.314304,0.07398869,0.03132818,0.07345219,0.03499102,5530.958
min,0.0,-121.9333,34.3597,-121.9333,34.3597,1.0
25%,1.0,-73.99186,40.73734,-73.99133,40.73586,397.0
50%,1.0,-73.98174,40.75409,-73.97974,40.75452,662.0
75%,2.0,-73.96732,40.76836,-73.96297,40.76982,1076.0
max,9.0,-65.84839,43.91176,-65.84839,43.91176,3526282.0


In [9]:
assert(df_rides.index.unique().shape[0]==df_rides.shape[0])
assert(np.all(df_rides.dropoff_datetime>df_rides.pickup_datetime))
assert(np.all(df_rides.passenger_count>=0))
#assert(np.allclose((df_rides.dropoff_datetime-df_rides.pickup_datetime).dt.seconds.values,df_rides.trip_duration,))

## Preprocessing of data

### Geographic data

In [10]:
from shapely.geometry import Point

def transform_to_geodf(df_rides,set_geometry_col='pickup'): 
        df_rides['pickup_geom'] = list(zip(df_rides.pickup_longitude,df_rides.pickup_latitude))
        df_rides['pickup_geom'] = df_rides['pickup_geom'].apply(Point)
        df_rides['dropoff_geom'] = list(zip(df_rides.dropoff_longitude,df_rides.dropoff_latitude))
        df_rides['dropoff_geom'] = df_rides['dropoff_geom'].apply(Point)
        df_rides = gpd.GeoDataFrame(df_rides,geometry=f'{set_geometry_col}_geom',crs={"init":"epsg:4326"})      
        return df_rides

In [11]:
df_rides = transform_to_geodf(df_rides)
df_rides.head()

Unnamed: 0_level_0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_geom,dropoff_geom
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
id2485291,1,2016-05-28 23:46:14,2016-05-28 23:53:41,2,-73.991608,40.750118,-73.987122,40.760979,N,447,POINT (-73.99160766601563 40.75011825561523),POINT (-73.98712158203125 40.76097869873047)
id1443424,1,2016-04-15 13:32:15,2016-04-15 13:44:41,2,-73.941551,40.838516,-73.950233,40.825512,N,746,POINT (-73.94155120849609 40.83851623535156),POINT (-73.95023345947266 40.82551193237305)
id2057397,1,2016-04-22 21:40:31,2016-04-22 22:09:14,2,-73.9729,40.75441,-74.031296,40.622852,N,1723,POINT (-73.972900390625 40.75440979003906),POINT (-74.0312957763672 40.62285232543945)
id3472628,2,2016-03-09 13:07:56,2016-03-09 13:35:34,1,-73.966354,40.761745,-74.008476,40.73428,N,1658,POINT (-73.96635437011719 40.76174545288085),POINT (-74.00847625732422 40.73427963256836)
id2661763,2,2016-04-15 21:13:44,2016-04-15 21:23:09,2,-73.969307,40.757099,-73.983253,40.748829,N,565,POINT (-73.96930694580078 40.75709915161133),POINT (-73.98325347900391 40.74882888793945)


In [12]:
# Correct utm-zone: {'init': 'epsg:32618'}

def convert_projection_to_utm(df,col_x_source,col_y_source,
                              col_x_dest = 'x_utm', col_y_dest = 'y_utm',
                              projection_source=pyproj.Proj("+init=EPSG:4326"),
                              projection_dest=pyproj.Proj("+init=EPSG:32618")): 
    x,y = pyproj.transform(projection_source, projection_dest,df[col_x_source].values,df[col_y_source].values)
    return df.assign(**{col_x_dest:x,col_y_dest:y})

df_rides = convert_projection_to_utm(df_rides,col_x_source='pickup_longitude',col_y_source='pickup_latitude',col_x_dest="pickup_x_utm",col_y_dest='pickup_y_utm')
df_rides = convert_projection_to_utm(df_rides,col_x_source='dropoff_longitude',col_y_source='dropoff_latitude',col_x_dest="dropoff_x_utm",col_y_dest='dropoff_y_utm')
df_rides.head()


Unnamed: 0_level_0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_geom,dropoff_geom,pickup_x_utm,pickup_y_utm,dropoff_x_utm,dropoff_y_utm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
id2485291,1,2016-05-28 23:46:14,2016-05-28 23:53:41,2,-73.991608,40.750118,-73.987122,40.760979,N,447,POINT (-73.99160766601563 40.75011825561523),POINT (-73.98712158203125 40.76097869873047),585127.477687,4511507.0,585492.279383,4512717.0
id1443424,1,2016-04-15 13:32:15,2016-04-15 13:44:41,2,-73.941551,40.838516,-73.950233,40.825512,N,746,POINT (-73.94155120849609 40.83851623535156),POINT (-73.95023345947266 40.82551193237305),589234.813681,4521370.0,588520.121357,4519918.0
id2057397,1,2016-04-22 21:40:31,2016-04-22 22:09:14,2,-73.9729,40.75441,-74.031296,40.622852,N,1723,POINT (-73.972900390625 40.75440979003906),POINT (-74.0312957763672 40.62285232543945),586701.177019,4512002.0,581932.724413,4497342.0
id3472628,2,2016-03-09 13:07:56,2016-03-09 13:35:34,1,-73.966354,40.761745,-74.008476,40.73428,N,1658,POINT (-73.96635437011719 40.76174545288085),POINT (-74.00847625732422 40.73427963256836),587244.167059,4512823.0,583723.286375,4509733.0
id2661763,2,2016-04-15 21:13:44,2016-04-15 21:23:09,2,-73.969307,40.757099,-73.983253,40.748829,N,565,POINT (-73.96930694580078 40.75709915161133),POINT (-73.98325347900391 40.74882888793945),587001.011148,4512304.0,585834.399006,4511372.0


In [13]:
def calc_distance(df,col_x1_utm,col_x2_utm,col_y1_utm,col_y2_utm,type_='beeline'): 
    if type_ =='beeline': 
        distance = np.sqrt((df[col_x1_utm]-df[col_x2_utm])**2+(df[col_y1_utm]-df[col_y2_utm])**2) 
    elif type_ =='manhattan': 
        distance = np.abs((df[col_x1_utm]-df[col_x2_utm]))+np.abs((df[col_y1_utm]-df[col_y2_utm]))
    return df.assign(**{f"distance_{type_}":distance})

df_rides = calc_distance(df_rides,
                         col_x1_utm='pickup_x_utm',col_x2_utm='dropoff_x_utm',
                         col_y1_utm='pickup_y_utm',col_y2_utm="dropoff_y_utm",
                         type_='beeline')

df_rides = calc_distance(df_rides,
                         col_x1_utm='pickup_x_utm',col_x2_utm='dropoff_x_utm',
                         col_y1_utm='pickup_y_utm',col_y2_utm="dropoff_y_utm",
                         type_='manhattan')

assert(np.all(df_rides.distance_beeline<=df_rides.distance_manhattan))
df_rides.head()

Unnamed: 0_level_0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_geom,dropoff_geom,pickup_x_utm,pickup_y_utm,dropoff_x_utm,dropoff_y_utm,distance_beeline,distance_manhattan
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
id2485291,1,2016-05-28 23:46:14,2016-05-28 23:53:41,2,-73.991608,40.750118,-73.987122,40.760979,N,447,POINT (-73.99160766601563 40.75011825561523),POINT (-73.98712158203125 40.76097869873047),585127.477687,4511507.0,585492.279383,4512717.0,1263.748262,1574.751857
id1443424,1,2016-04-15 13:32:15,2016-04-15 13:44:41,2,-73.941551,40.838516,-73.950233,40.825512,N,746,POINT (-73.94155120849609 40.83851623535156),POINT (-73.95023345947266 40.82551193237305),589234.813681,4521370.0,588520.121357,4519918.0,1618.720535,2167.094146
id2057397,1,2016-04-22 21:40:31,2016-04-22 22:09:14,2,-73.9729,40.75441,-74.031296,40.622852,N,1723,POINT (-73.972900390625 40.75440979003906),POINT (-74.0312957763672 40.62285232543945),586701.177019,4512002.0,581932.724413,4497342.0,15415.742821,19428.158826
id3472628,2,2016-03-09 13:07:56,2016-03-09 13:35:34,1,-73.966354,40.761745,-74.008476,40.73428,N,1658,POINT (-73.96635437011719 40.76174545288085),POINT (-74.00847625732422 40.73427963256836),587244.167059,4512823.0,583723.286375,4509733.0,4684.471214,6610.811066
id2661763,2,2016-04-15 21:13:44,2016-04-15 21:23:09,2,-73.969307,40.757099,-73.983253,40.748829,N,565,POINT (-73.96930694580078 40.75709915161133),POINT (-73.98325347900391 40.74882888793945),587001.011148,4512304.0,585834.399006,4511372.0,1493.058387,2098.405818


In [14]:
df_rides['avg_speed_kmh'] = (df_rides.distance_manhattan*1e-3)/ (df_rides.trip_duration*1/3600)

In [15]:
from pandas.tseries.holiday import USFederalHolidayCalendar
cal = USFederalHolidayCalendar()

def calc_time_features(df,index_col='pickup_datetime',flg_days_before_after=False): 
    ind = df.set_index(index_col,drop='False').index
    holidays = pd.DataFrame(cal.holidays(start=str(ind.year.min()), end=str(ind.year.max()+1),return_name=True)).rename(columns={0:"holiday_name"})
    if flg_days_before_after: 
        holidays = pd.concat([holidays,
                             "Day Before " + holidays.shift(-1, 'D'),
                             "Day After " + holidays.shift(1, 'D')])
    #holidays.name = 'holiday_name'  # required for join
    df['day_of_week'] = ind.dayofweek 
    df['season'] = (ind.month % 12 + 3) // 3
    df['season'] = ind.quarter
    df['month'] = ind.month
    df['day_of_year'] = ind.dayofyear
    df['is_weekend'] = ind.dayofweek>5
    df['hour'] = ind.hour
    df['date'] = pd.DatetimeIndex(ind.date)
    df['id'] = df.index
    df = df.set_index("date",drop=False).join(holidays,how='left').set_index("id",drop=False)
    df['is_holiday'] = 0 
    df['is_holiday'] = ~(pd.isna(df['holiday_name']))
    return df

df_rides = calc_time_features(df_rides)

In [16]:
df_rides.holiday_name.value_counts()

Presidents Day                6233
Dr. Martin Luther King Jr.    6128
New Years Day                 6101
MemorialDay                   4699
Name: holiday_name, dtype: int64

In [17]:
#To-Do: Compare numba to numpy 
def calc_hours_daylight(df,col_latitude='pickup_latitude',col_day_of_year = "day_of_year"):
    P = np.arcsin(0.39795 * np.cos(0.2163108 + 2 * np.arctan(0.9671396 * np.tan(.00860 * (df[col_day_of_year].astype(int)-186)))))
    hours_daylight = 24 - (24 / np.pi) * np.arccos(
        (np.sin((0.8333 * np.pi / 180) + np.sin(df[col_latitude] * np.pi / 180) * np.sin(P)) / (np.cos(df[col_latitude] * np.pi / 180) * np.cos(P))))
    return df.assign(**{"hours_daylight": hours_daylight})

df_rides = calc_hours_daylight(df_rides)

In [18]:
def calc_cell_id(df,col_x_utm,col_y_utm,col_id='Cell_ID',cell_length=100,keep_coordinates_center=True): 
    df[f"x_sw_utm_{col_id}"] = ((df[col_x_utm].values//cell_length)*cell_length).astype(int)
    df[f"y_sw_utm_{col_id}"] = ((df[col_y_utm].values//cell_length)*cell_length).astype(int)
    df[col_id] = f"{cell_length}mN"+(df[f"x_sw_utm_{col_id}"]//cell_length).astype(str)+"E"+(df[f"y_sw_utm_{col_id}"]//cell_length).astype(str)
    if not keep_coordinates_center: 
        df.drop(columns=[f"x_sw_utm_{col_id}",f"y_sw_utm_{col_id}"],inplace=True)
    return df

df_rides = calc_cell_id(df_rides,col_x_utm="dropoff_x_utm",col_y_utm="dropoff_y_utm",col_id='Cell_ID_dropoff')
df_rides = calc_cell_id(df_rides,col_x_utm="pickup_x_utm",col_y_utm="pickup_y_utm",col_id='Cell_ID_pickup')
df_rides.head()

Unnamed: 0_level_0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,...,id,holiday_name,is_holiday,hours_daylight,x_sw_utm_Cell_ID_dropoff,y_sw_utm_Cell_ID_dropoff,Cell_ID_dropoff,x_sw_utm_Cell_ID_pickup,y_sw_utm_Cell_ID_pickup,Cell_ID_pickup
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id0346385,2,2016-01-01 03:08:39,2016-01-01 03:14:19,5,-73.984833,40.769291,-73.969353,40.785439,N,340,...,id0346385,New Years Day,True,9.324793,586900,4515400,100mN5869E45154,585600,4513600,100mN5856E45136
id0521444,2,2016-01-01 17:00:03,2016-01-01 17:19:00,1,-73.980309,40.742565,-73.914085,40.760952,N,1137,...,id0521444,New Years Day,True,9.327483,591600,4512700,100mN5916E45127,586000,4510600,100mN5860E45106
id1522058,2,2016-01-01 07:34:12,2016-01-01 07:51:40,6,-73.95578,40.771481,-73.867714,40.757339,N,1048,...,id1522058,New Years Day,True,9.324572,595500,4512400,100mN5955E45124,588100,4513900,100mN5881E45139
id3770051,1,2016-01-01 16:22:26,2016-01-01 16:41:28,1,-73.976654,40.744236,-73.981262,40.784199,N,1142,...,id3770051,New Years Day,True,9.327315,585900,4515300,100mN5859E45153,586300,4510800,100mN5863E45108
id1303456,2,2016-01-01 18:09:23,2016-01-01 18:22:58,1,-73.976311,40.752182,-73.982674,40.771667,N,815,...,id1303456,New Years Day,True,9.326515,585800,4513900,100mN5858E45139,586400,4511700,100mN5864E45117


In [19]:
def aggregate_by_ID(df,col_groupby,agg_funcs={'passenger_count':['sum','mean'],'trip_duration':['sum','mean']}):
    agg_funcs = {**agg_funcs,**{f"x_sw_utm_{col_groupby}":'first',f"y_sw_utm_{col_groupby}":'first'}}
    df = df.groupby(col_groupby).agg(agg_funcs)
    df.columns = [f"{agg_func}_{col}" for col,agg_func 
                  in zip(df.columns.get_level_values(level=0),df.columns.get_level_values(level=1))]
    df.rename(columns={c:c.replace("first_","") for c in df.columns},inplace=True)
    return df

In [20]:
df_cells = (aggregate_by_ID(df_rides,
                            col_groupby="Cell_ID_pickup")
            .join(aggregate_by_ID(df_rides,
                                  col_groupby='Cell_ID_dropoff'),
                  how='outer',lsuffix='_pickup',rsuffix='_dropoff')
           )
df_cells.head()

Unnamed: 0,sum_passenger_count_pickup,mean_passenger_count_pickup,sum_trip_duration_pickup,mean_trip_duration_pickup,x_sw_utm_Cell_ID_pickup,y_sw_utm_Cell_ID_pickup,sum_passenger_count_dropoff,mean_passenger_count_dropoff,sum_trip_duration_dropoff,mean_trip_duration_dropoff,x_sw_utm_Cell_ID_dropoff,y_sw_utm_Cell_ID_dropoff
100mN-37336E53461,4.0,2.0,1604.0,802.0,-3733600.0,5346100.0,4.0,2.0,1604.0,802.0,-3733600.0,5346100.0
100mN10602E40150,1.0,1.0,385.0,385.0,1060200.0,4015000.0,1.0,1.0,385.0,385.0,1060200.0,4015000.0
100mN1190E46139,1.0,1.0,548.0,548.0,119000.0,4613900.0,,,,,,
100mN1210E45194,2.0,2.0,445.0,445.0,121000.0,4519400.0,2.0,2.0,445.0,445.0,121000.0,4519400.0
100mN12837E44073,1.0,1.0,329.0,329.0,1283700.0,4407300.0,1.0,1.0,329.0,329.0,1283700.0,4407300.0


In [21]:
#df_rides = pd.read_pickle(os.path.join("../data","df_rides.pickle"))
df_rides.columns

Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'store_and_fwd_flag', 'trip_duration',
       'pickup_geom', 'dropoff_geom', 'pickup_x_utm', 'pickup_y_utm',
       'dropoff_x_utm', 'dropoff_y_utm', 'distance_beeline',
       'distance_manhattan', 'avg_speed_kmh', 'day_of_week', 'season', 'month',
       'day_of_year', 'is_weekend', 'hour', 'date', 'id', 'holiday_name',
       'is_holiday', 'hours_daylight', 'x_sw_utm_Cell_ID_dropoff',
       'y_sw_utm_Cell_ID_dropoff', 'Cell_ID_dropoff',
       'x_sw_utm_Cell_ID_pickup', 'y_sw_utm_Cell_ID_pickup', 'Cell_ID_pickup'],
      dtype='object')

In [23]:
def distance_to_point(df_rides,col='drop_off', x_center=587506.016, y_center=4515490.102): 
    return df_rides[f"{col}_x_utm"]

In [25]:
def filter_values(df_rides): 
    df_rides = df_rides.query("avg_speed_kmh<=50&avg_speed_kmh>0")
    df_rides = df_rides.query("trip_duration<60*60*24&trip_duration>0")
    return df_rides 

In [26]:
df_rides = filter_values(df_rides)

In [27]:
df_rides.head()

Unnamed: 0_level_0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,...,id,holiday_name,is_holiday,hours_daylight,x_sw_utm_Cell_ID_dropoff,y_sw_utm_Cell_ID_dropoff,Cell_ID_dropoff,x_sw_utm_Cell_ID_pickup,y_sw_utm_Cell_ID_pickup,Cell_ID_pickup
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id0346385,2,2016-01-01 03:08:39,2016-01-01 03:14:19,5,-73.984833,40.769291,-73.969353,40.785439,N,340,...,id0346385,New Years Day,True,9.324793,586900,4515400,100mN5869E45154,585600,4513600,100mN5856E45136
id0521444,2,2016-01-01 17:00:03,2016-01-01 17:19:00,1,-73.980309,40.742565,-73.914085,40.760952,N,1137,...,id0521444,New Years Day,True,9.327483,591600,4512700,100mN5916E45127,586000,4510600,100mN5860E45106
id1522058,2,2016-01-01 07:34:12,2016-01-01 07:51:40,6,-73.95578,40.771481,-73.867714,40.757339,N,1048,...,id1522058,New Years Day,True,9.324572,595500,4512400,100mN5955E45124,588100,4513900,100mN5881E45139
id3770051,1,2016-01-01 16:22:26,2016-01-01 16:41:28,1,-73.976654,40.744236,-73.981262,40.784199,N,1142,...,id3770051,New Years Day,True,9.327315,585900,4515300,100mN5859E45153,586300,4510800,100mN5863E45108
id1303456,2,2016-01-01 18:09:23,2016-01-01 18:22:58,1,-73.976311,40.752182,-73.982674,40.771667,N,815,...,id1303456,New Years Day,True,9.326515,585800,4513900,100mN5858E45139,586400,4511700,100mN5864E45117


In [28]:
df_cells.to_pickle(os.path.join("../data","df_cells.pickle"))

In [29]:
df_rides.to_pickle(os.path.join("../data","df_rides.pickle"))