In [1]:
import vaex
import h3

# Load the prepared trips and preview the first rows.
df_taxi_trips = vaex.open('./data/trips_prepared.hdf5')
df_taxi_trips.head()

# Derive one column per calendar component of the trip start timestamp
# (trip_start_day, trip_start_month, trip_start_hour, trip_start_minute).
for _part in ('day', 'month', 'hour', 'minute'):
    df_taxi_trips[f'trip_start_{_part}'] = getattr(df_taxi_trips.trip_start_timestamp.dt, _part)

In [2]:
# Default H3 resolution used for all hexagon features in this notebook.
RESOLUTION = 10


def geo_to_h3(row1, row2, resolution=None):
    """Return the H3 cell id for a latitude/longitude pair.

    Parameters
    ----------
    row1 : latitude value passed through to ``h3.geo_to_h3``.
    row2 : longitude value passed through to ``h3.geo_to_h3``.
    resolution : optional H3 resolution. When omitted (``None``) the
        module-level ``RESOLUTION`` is used, which keeps existing
        two-argument callers working unchanged.

    Returns
    -------
    Whatever ``h3.geo_to_h3`` returns for the given point and resolution.
    """
    # Compare against None (not truthiness) so that resolution 0 is honoured.
    if resolution is None:
        resolution = RESOLUTION
    return h3.geo_to_h3(row1, row2, resolution)

# Step 1: map every pickup and drop-off centroid to its hexagon via geo_to_h3.
# Produces the columns 'pickup_hex' and 'dropoff_hex'.
for _endpoint in ('pickup', 'dropoff'):
    df_taxi_trips[f'{_endpoint}_hex'] = df_taxi_trips.apply(
        geo_to_h3,
        [
            df_taxi_trips[f'{_endpoint}_centroid_latitude'],
            df_taxi_trips[f'{_endpoint}_centroid_longitude'],
        ],
    )

In [3]:
# Count trips per (hour, month, day, pickup hexagon).
# The count column is named 'demand' because the cells that consume this frame
# read a 'demand' column; the previous name 'rides' was never referenced.
grouped_df = df_taxi_trips.groupby(
    ['trip_start_hour', 'trip_start_month', 'trip_start_day', 'pickup_hex']
).agg({'demand': 'count'})

In [4]:
# Bind the grouped result to the name used by the demand cells.
# This is a plain assignment: both names refer to the same object, no copy is made.
df_demand = grouped_df

In [None]:
def get_mean_demand(df, hour_shift, hour, month, day):
    """Return the mean demand for one hour of day within the season of ``month``.

    Parameters
    ----------
    df : demand frame with 'trip_start_hour', 'trip_start_month' and 'demand'
        columns. It must support ``df[col]``, ``df.filter(mask)`` and
        ``expr.mean()``.
    hour_shift : number of hours to go back from ``hour``.
        NOTE(review): the original expression was incomplete, so the intended
        use of this argument is an assumption - confirm. Pass 0 to use
        ``hour`` as is.
    hour : hour of day (0-23).
    month : month number. 12/1/2 are winter, 3-5 spring, 6-8 summer; any other
        value falls back to autumn (9-11), as before.
    day : currently unused; kept so existing callers keep working.

    Returns
    -------
    The mean of the 'demand' column over the matching rows.
    """
    seasons = (
        (12, 1, 2),   # winter
        (3, 4, 5),    # spring
        (6, 7, 8),    # summer
        (9, 10, 11),  # autumn
    )
    # Unmatched months fall back to autumn, mirroring the original else-branch.
    months = next((season for season in seasons if month in season), seasons[-1])

    # Wrap around midnight so that e.g. hour=1, hour_shift=2 gives 23.
    target_hour = (hour - hour_shift) % 24

    # Build "month is one of the season's months" as an OR of equality masks.
    month_col = df['trip_start_month']
    in_season = month_col == months[0]
    for season_month in months[1:]:
        in_season = in_season | (month_col == season_month)

    mask = (df['trip_start_hour'] == target_hour) & in_season
    return df.filter(mask)['demand'].mean()


In [43]:
# Mean demand in month 5 for one pickup hexagon.
# Note: despite its name, `filtered` holds the result of .mean() here, not a frame.
filtered = df_demand.filter(
    (df_demand['trip_start_month'] == 5)
    & (df_demand['pickup_hex'] == '8a2664cf496ffff')
)['demand'].mean()

In [26]:
# Demand history of a single pickup hexagon in chronological order.
filtered = df_demand.filter(df_demand['pickup_hex'] == '8a2664cf496ffff')
filtered = filtered.sort(['trip_start_month', 'trip_start_day', 'trip_start_hour'])
# `DataFrame.filtered` is a boolean property, not a method, so calling it fails.
# extract() returns a frame containing only the rows that pass the filter.
filtered = filtered.extract()
# Last expression of the cell so the frame is rendered.
filtered

#,trip_start_hour,trip_start_month,trip_start_day,pickup_hex,demand,demand_test
0,7,1,4,8a2664cf496ffff,2,174946.0
1,8,1,4,8a2664cf496ffff,2,300153.0
2,8,1,5,8a2664cf496ffff,1,57051.0
3,8,1,6,8a2664cf496ffff,2,76748.0
4,8,1,9,8a2664cf496ffff,4,280383.0
...,...,...,...,...,...,...
307,12,12,15,8a2664cf496ffff,2,202261.0
308,22,12,15,8a2664cf496ffff,1,31492.0
309,20,12,16,8a2664cf496ffff,3,287736.0
310,20,12,22,8a2664cf496ffff,1,333817.0


In [15]:
hexagons = df_demand.pickup_hex.unique()

# Demand rows per pickup hexagon, keyed by hexagon id.
# The filter compares the 'pickup_hex' column with the loop variable; the
# previous version used the hexagon id as a column name and compared against
# one hard-coded id.
demand_by_hexagon = {}
for hexagon in hexagons:
    filtered = df_demand.filter(df_demand['pickup_hex'] == hexagon)
    demand_by_hexagon[hexagon] = filtered


['8a2664cf496ffff',
 '8a2664ca8897fff',
 '8a2664ca6127fff',
 '8a2664caeceffff',
 '8a2664ca936ffff',
 '8a2664ca0517fff',
 '8a2664c1e0dffff',
 '8a2664ca596ffff',
 '8a275936bc4ffff',
 '8a2664d8a0effff',
 '8a2664cf68affff',
 '8a2664c13b4ffff',
 '8a2664d83b57fff',
 '8a2664d8c14ffff',
 '8a2664cf2447fff',
 '8a2664c1eab7fff',
 '8a2664cac89ffff',
 '8a2664c16c6ffff',
 '8a2664ccc297fff',
 '8a2664cab547fff',
 '8a2664ca12dffff',
 '8a2664ca0777fff',
 '8a2664cc201ffff',
 '8a2664ce266ffff',
 '8a2664c1aa5ffff',
 '8a2664c168e7fff',
 '8a2664c114d7fff',
 '8a2664c16a97fff',
 '8a2664c1618ffff',
 '8a2664d8d4cffff',
 '8a275936bb2ffff',
 '8a27593697affff',
 '8a2664ca1207fff',
 '8a2664d8d59ffff',
 '8a2664d9d76ffff',
 '8a2664ca0a47fff',
 '8a2664c1632ffff',
 '8a2664c144dffff',
 '8a2664ca6ba7fff',
 '8a2664c10197fff',
 '8a2664ca9baffff',
 '8a2664cab2affff',
 '8a2664cc57a7fff',
 '8a2664c1e4effff',
 '8a2664cf5d2ffff',
 '8a2664c16607fff',
 '8a2664d99357fff',
 '8a2664d81317fff',
 '8a2664cc526ffff',
 '8a2664cc950ffff',


In [None]:
# Lagged demand features: the 'demand' value 1, 2, 12 and 24 steps back.
# Fixes: the source column was the placeholder 'column_to_shift', and
# 'demand_ph24' shifted by 1 instead of 24.
# NOTE(review): presumably shift() moves values by row position; that equals
# "hours ago" only if df_demand is ordered chronologically per hexagon with no
# missing hours - confirm before relying on these features.
df_demand['demand_ph1'] = df_demand['demand'].shift(n=1, fill_value=None)
df_demand['demand_ph2'] = df_demand['demand'].shift(n=2, fill_value=None)
df_demand['demand_ph12'] = df_demand['demand'].shift(n=12, fill_value=None)
df_demand['demand_ph24'] = df_demand['demand'].shift(n=24, fill_value=None)


# Feature Engineering

### Previous demand as input

Because we are working with time-series data, a common approach is to use the demand of previous hours (or days, etc.) as an input for the prediction. The assumption we make here is that the factors that influence the demand have not changed dramatically within the time frames used. We have decided to construct the following features from previous demand:

* 2 hours: The assumption is that the demand should not change dramatically within a span of two hours.
* 24 hours: The assumption is that the current demand should be comparable to the demand exactly one day ago, as factors such as the season and the time of day are the same.
* Average demand of the past week at the same time of day: This feature is the average of all 7 demand observations of the past week at the same time of day.

In [None]:
# Previous-demand features: demand shifted by 2 and by 24 periods, stored as
# 'demand_h-2' and 'demand_h-24'.
# NOTE(review): assumes df_taxi_trips has a 'demand' column at this point; no
# visible cell adds one (demand is computed on df_demand) - confirm.
for _lag in (2, 24):
    df_taxi_trips[f'demand_h-{_lag}'] = df_taxi_trips.demand.shift(periods=_lag)