# Feature Engineering

### Previous demand as input

As we have given time series data, it is a common approach to use the demand of previous hours (or days etc.) as an input for the prediction. The assumption we hereby make is that the factors that influence the demand have not changed dramatically within the used time frames. We have decided to construct the following features from previous demand:

* 2 hour: The asssumption is that the demand should not change dramatically between three hours.
* 24 hours: The asssumption is that the current demand should be comparable to the demand exactly one day ago, as factors such as season, time of the day are the same.
* Average demand of the past week at the same day time: This feature is the average of all 7 demand observations of the past week at same time of the day. 

In [13]:
import vaex
import h3
import pandas as pd
import numpy as np

In [3]:
def geo_to_h3(row1, row2):
    return h3.geo_to_h3(row1,row2, RESOLUTION)


In [4]:
# This functions loads the dataset either with hexagons or census tract
def load_dataset_to_pandas(resolution=10, location_type_hexagon = True):
    df = vaex.open('./data/trips_prepared.hdf5')
    df.head()

    df["trip_start_day"] = df.trip_start_timestamp.dt.day
    df["trip_start_month"] = df.trip_start_timestamp.dt.month
    df["trip_start_hour"] = df.trip_start_timestamp.dt.hour
    df["trip_start_minute"] = df.trip_start_timestamp.dt.minute

    if location_type_hexagon:
        RESOLUTION = resolution
        # Step 1: For each pickup and drop-off calculate the correct hexagon in the resolution
        df['pickup_loc'] = df.apply(geo_to_h3, [df['pickup_centroid_latitude'], df['pickup_centroid_longitude']])
    else:
        df.rename('pickup_census_tract', 'pickup_loc')

    ### Group by hour
    df_demand = df.groupby(['trip_start_hour', 'trip_start_month', 'trip_start_day', 'pickup_loc']).agg({'demand': 'count'})
    # Add timestamp as preparation for resampling
    df_demand['timestamp'] = pd.to_datetime({'year': 2017, 'month': df_demand['trip_start_month'].to_numpy(), 'day': df_demand['trip_start_day'].to_numpy(), 'hour': df_demand['trip_start_hour'].to_numpy()}).to_numpy()

    # convert to pandas df
    df_demand = df_demand.to_pandas_df()
    return df_demand


In [6]:
def resample_to_hourly(df):
    ### Creation of dummy df which contains hourly data dummy data over an entire year per hexagon
    # Create a DateTimeIndex with hourly intervals for the year 2017
    start_date = '2017-01-01 00:00:00'
    end_date = '2017-12-31 23:00:00'
    hourly_range = pd.date_range(start=start_date, end=end_date, freq='H')
    num_entries_per_year = len(hourly_range)
    hour_range = np.tile(hourly_range.hour,len(np.unique(df.pickup_loc)))
    month_range = np.tile(hourly_range.month,len(np.unique(df.pickup_loc)))
    day_range = np.tile(hourly_range.day,len(np.unique(df.pickup_loc)))
    hourly_range = np.tile(hourly_range,len(np.unique(df.pickup_loc)))

    # -1 values will indacte that these rows were artificially generated later on
    data = {
        'trip_start_hour': hour_range,
        'trip_start_month': month_range,
        'trip_start_day': day_range,
        'pickup_loc': np.repeat(np.unique(df.pickup_loc), num_entries_per_year),
        'demand': 0,
    }

    df_hourly = pd.DataFrame(data, index=hourly_range)
    df_hourly= df_hourly.set_index([df_hourly.index, 'pickup_loc'])

    # introduce multiindex for filling up the df with hourly index later on
    df=df.set_index(['timestamp', 'pickup_loc'])

    # insert df 
    df_hourly.update(df)

    # clear up multi-index
    df_hourly=df_hourly.reset_index()
    df_hourly.columns = ['timestamp','pickup_loc','trip_start_hour','trip_start_month','trip_start_day','demand']
    return df_hourly
    

In [7]:
def get_past_demand(df):
    # insert features 1, 2 and 24 hours previous demand
    df['demand_h-1'] = df.sort_values('timestamp').groupby('pickup_loc')['demand'].shift(1)
    df['demand_h-2'] = df.sort_values('timestamp').groupby('pickup_loc')['demand'].shift(2)
    df['demand_h-24'] = df.sort_values('timestamp').groupby('pickup_loc')['demand'].shift(24)
    return df
    

In [8]:
def get_prepared_data(location_type_hexagon=True, resolution=10):
    df = load_dataset_to_pandas(location_type_hexagon=location_type_hexagon, resolution=resolution)
    df = resample_to_hourly(df)
    df = get_past_demand(df)
    return df

In [9]:
df_census = get_prepared_data(False)
df_census

Unnamed: 0,timestamp,pickup_loc,trip_start_hour,trip_start_month,trip_start_day,demand,demand_h-1,demand_h-2,demand_h-24
0,2017-01-01 00:00:00,1.703101e+10,0,1,1,0,,,
1,2017-01-01 01:00:00,1.703101e+10,1,1,1,0,0.0,,
2,2017-01-01 02:00:00,1.703101e+10,2,1,1,0,0.0,0.0,
3,2017-01-01 03:00:00,1.703101e+10,3,1,1,0,0.0,0.0,
4,2017-01-01 04:00:00,1.703101e+10,4,1,1,0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...
3127315,2017-12-31 19:00:00,1.703198e+10,19,12,31,4,15.0,14.0,17.0
3127316,2017-12-31 20:00:00,1.703198e+10,20,12,31,9,4.0,15.0,16.0
3127317,2017-12-31 21:00:00,1.703198e+10,21,12,31,10,9.0,4.0,20.0
3127318,2017-12-31 22:00:00,1.703198e+10,22,12,31,13,10.0,9.0,11.0


In [8]:
df_hex= load_dataset_to_pandas(resolution=9, location_type_hexagon=True)
df_hex = resample_to_hourly(df_hex)
df_hex = get_past_demand(df_hex)
df_hex

In [18]:
df_hex = resample_to_hourly(df_hex)
df_hex = get_past_demand(df_hex)
df_hex

Unnamed: 0,timestamp,pickup_loc,trip_start_hour,trip_start_month,trip_start_day,demand,demand_h-1,demand_h-2,demand_h-24
0,2017-01-01 00:00:00,8a266452180ffff,0,1,1,1,,,
1,2017-01-01 01:00:00,8a266452180ffff,1,1,1,1,1.0,,
2,2017-01-01 02:00:00,8a266452180ffff,-1,-1,-1,0,1.0,1.0,
3,2017-01-01 03:00:00,8a266452180ffff,-1,-1,-1,0,0.0,1.0,
4,2017-01-01 04:00:00,8a266452180ffff,-1,-1,-1,0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...
3127315,2017-12-31 19:00:00,8a275936bc4ffff,-1,-1,-1,0,0.0,0.0,0.0
3127316,2017-12-31 20:00:00,8a275936bc4ffff,-1,-1,-1,0,0.0,0.0,0.0
3127317,2017-12-31 21:00:00,8a275936bc4ffff,-1,-1,-1,0,0.0,0.0,0.0
3127318,2017-12-31 22:00:00,8a275936bc4ffff,-1,-1,-1,0,0.0,0.0,0.0


# Weather features
In the descriptive analysis, particularly the analysis of temporal demand patterns, we found that the temperature and demand curves follow similar directions. Therefore, we construct features based on temperature to enable models that capture this relationship.

### Include weather data
First, we have to include the weather data into the dataframe. For this we just need to merge the two datasets, as both are already in hourly frequency. The weather data propose data for minute 53 of an hour. Therefore, we round up to the nearest hour for each row. We suppose that the weather changes in seven minutes can be disregarded.

In [9]:
import numpy as np

df_weather = pd.read_csv('data/weather_data_final.csv')
df_weather['date_time'] = pd.to_datetime(df_weather['date_time'])
df_weather['date_time'] = df_weather['date_time'].dt.ceil('H')
df_weather.rename(columns={'date_time': 'timestamp'}, inplace=True)

In [10]:
df_weather.head(1)

Unnamed: 0,date,time,temp,dew_point,humidity,wind_speed,wind_gust,pressure,precip,condition,timestamp
0,2017-01-01,00:53,33 °F,24 °F,70 °%,8 °mph,0 °mph,29.45 °in,0.0 °in,Partly Cloudy,2017-01-01 01:00:00


In [11]:
df_demand_merged = df_demand_hourly.merge(df_weather, on='timestamp', how='left')

### Temperature features
In addition to the current temperature, we are add the temperature from 1, 2, and 3 hours prior to the time of taxi demand. We suggest that past temperature conditions could potentially impact the decision to hire a taxi.

In [12]:
df_demand_merged['temp_h-1'] = df_demand_merged.sort_values('timestamp').groupby('pickup_hex')['temp'].shift(1)
df_demand_merged['temp_h-2'] = df_demand_merged.sort_values('timestamp').groupby('pickup_hex')['temp'].shift(2)
df_demand_merged['temp_h-3'] = df_demand_merged.sort_values('timestamp').groupby('pickup_hex')['temp'].shift(3)

### Precipitation
We hypothesize that precipitation has a significant impact on demand. Therefore, we construct features that describe whether it has rained in the last 1-3 hours.

In [13]:
df_demand_merged['precip_h-1'] = df_demand_merged.sort_values('timestamp').groupby('pickup_hex')['precip'].shift(1)
df_demand_merged['precip_h-2'] = df_demand_merged.sort_values('timestamp').groupby('pickup_hex')['precip'].shift(2)
df_demand_merged['precip_h-3'] = df_demand_merged.sort_values('timestamp').groupby('pickup_hex')['precip'].shift(3)

# Prediction Models

## Split the dataset

In [23]:
import sklearn
def train_validation_test_split(X, y, train_ratio=0.6, validation_ratio=0.2, test_ratio=0.2, random_state=None):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=random_state)

    # Calculate the remaining ratio after train-test split
    remaining_ratio = 1.0 - test_ratio

    # Calculate the proportional validation ratio based on the remaining ratio
    validation_ratio_prop = validation_ratio / remaining_ratio

    X_train, X_validation, y_train, y_validation = train_test_split(
        X_train, y_train, test_size=validation_ratio_prop, random_state=random_state
    )

    return X_train, X_validation, X_test, y_train, y_validation, y_test

ImportError: cannot import name 'ARRAY_FUNCTIONS' from 'numpy.core.overrides' (/home/bsauter/anaconda3/envs/aaa_dudes/lib/python3.10/site-packages/numpy/core/overrides.py)