In [33]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as sk
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

## Feature Preparation

In [34]:
# Load the processed trip records (the file name indicates they already carry
# land-use labels) and preview the first two rows.
trips_path = "../00_data/processed/trips_land_use.parquet"
trips = pd.read_parquet(trips_path)
trips.head(2)

Unnamed: 0_level_0,p_spot_start,p_booked_bikes_start,p_place_type_start,datetime_start,b_number,trip_start,p_uid_start,p_bikes_start,lat_start,b_electric_lock,...,in_charged_flexzone_end,type,min_distance,duration,min_avg_speed,geometry_end,land_use_end,geometry_start,index_right,land_use_start
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1200,False,0,12,2019-01-20 20:58:00,23221,start,12118482,1,51.332653,True,...,True,trip,2.492277,17.0,8.796272,b'\x01\x01\x00\x00\x00K\x01\xc3\x90`\xabI@\xdb...,"Industrial, commercial, public, military and p...",b'\x01\x01\x00\x00\x00\n\xc9ib\x94\xaaI@\x91&8...,10,Other roads and associated land
2279,False,0,12,2019-01-21 08:42:00,23221,start,12120802,1,51.338741,True,...,False,trip,1.7736,13.0,8.185845,b'\x01\x01\x00\x00\x00\xb7\x9cKqU\xa9I@\xc6\xa...,"Industrial, commercial, public, military and p...",b'\x01\x01\x00\x00\x00\xd7\\{\xdb[\xabI@\xcb\x...,966,Continuous urban fabric (S.L. : > 80%)


#### Land use data

In [35]:
#change land use to categorical
# The JSON file holds the mapping from land-use label to the code that replaces
# it in the two land-use columns. The encoding is passed explicitly so the file
# is read the same way regardless of the platform's default encoding.
with open('../00_data/processed/land_use_dict.json', encoding='utf-8') as json_file:
    land_use_dict = json.load(json_file)

# Series.map with a dict yields NaN for every value that is not a key of the
# dict, so labels missing from the mapping end up as NaN.
# NOTE(review): this cell is not idempotent - running it a second time maps the
# already-encoded values again (most likely turning them into NaN). Reload
# `trips` before re-running.
for land_use_col in ("land_use_start", "land_use_end"):
    trips[land_use_col] = trips[land_use_col].map(land_use_dict)
trips.head(2)

Unnamed: 0_level_0,p_spot_start,p_booked_bikes_start,p_place_type_start,datetime_start,b_number,trip_start,p_uid_start,p_bikes_start,lat_start,b_electric_lock,...,in_charged_flexzone_end,type,min_distance,duration,min_avg_speed,geometry_end,land_use_end,geometry_start,index_right,land_use_start
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1200,False,0,12,2019-01-20 20:58:00,23221,start,12118482,1,51.332653,True,...,True,trip,2.492277,17.0,8.796272,b'\x01\x01\x00\x00\x00K\x01\xc3\x90`\xabI@\xdb...,2,b'\x01\x01\x00\x00\x00\n\xc9ib\x94\xaaI@\x91&8...,10,20
2279,False,0,12,2019-01-21 08:42:00,23221,start,12120802,1,51.338741,True,...,False,trip,1.7736,13.0,8.185845,b'\x01\x01\x00\x00\x00\xb7\x9cKqU\xa9I@\xc6\xa...,2,b'\x01\x01\x00\x00\x00\xd7\\{\xdb[\xabI@\xcb\x...,966,3


In [36]:
#round start time to hourly to merge weather data
# Use the lowercase "h" alias: the uppercase "H" frequency alias is deprecated
# since pandas 2.2 and emits a FutureWarning, while "h" is also understood by
# older pandas versions.
trips["start_time_floored"] = trips["datetime_start"].dt.floor("h")
trips.head(2)

Unnamed: 0_level_0,p_spot_start,p_booked_bikes_start,p_place_type_start,datetime_start,b_number,trip_start,p_uid_start,p_bikes_start,lat_start,b_electric_lock,...,type,min_distance,duration,min_avg_speed,geometry_end,land_use_end,geometry_start,index_right,land_use_start,start_time_floored
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1200,False,0,12,2019-01-20 20:58:00,23221,start,12118482,1,51.332653,True,...,trip,2.492277,17.0,8.796272,b'\x01\x01\x00\x00\x00K\x01\xc3\x90`\xabI@\xdb...,2,b'\x01\x01\x00\x00\x00\n\xc9ib\x94\xaaI@\x91&8...,10,20,2019-01-20 20:00:00
2279,False,0,12,2019-01-21 08:42:00,23221,start,12120802,1,51.338741,True,...,trip,1.7736,13.0,8.185845,b'\x01\x01\x00\x00\x00\xb7\x9cKqU\xa9I@\xc6\xa...,2,b'\x01\x01\x00\x00\x00\xd7\\{\xdb[\xabI@\xcb\x...,966,3,2019-01-21 08:00:00


#### Weather data

In [37]:
# Read the aggregated weather observations and keep only the rows whose
# `time_intervall_length` equals 1 (presumably the hourly aggregation - confirm
# against the data description).
weather = (
    pd.read_parquet("../00_data/repo_data/weather_aggr.parquet")
    .loc[lambda frame: frame["time_intervall_length"] == 1]
)
weather

Unnamed: 0_level_0,min_temperature,max_temperature,mean_temperature,mean_mean_wind_speed,mean_total_cloud_cover,sum_precipitation,time_intervall_length
MESS_DATUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-01 00:00:00,7.4,7.4,7.4,2.8,8.0,0.0,1
2019-01-01 01:00:00,7.7,7.7,7.7,2.9,8.0,0.0,1
2019-01-01 02:00:00,7.7,7.7,7.7,3.7,8.0,0.0,1
2019-01-01 03:00:00,7.7,7.7,7.7,3.2,8.0,0.0,1
2019-01-01 04:00:00,7.6,7.6,7.6,3.3,8.0,0.0,1
...,...,...,...,...,...,...,...
2019-12-31 19:00:00,2.8,2.8,2.8,2.3,1.0,0.0,1
2019-12-31 20:00:00,2.4,2.4,2.4,2.0,5.0,0.0,1
2019-12-31 21:00:00,2.7,2.7,2.7,2.3,7.0,0.0,1
2019-12-31 22:00:00,2.8,2.8,2.8,2.3,7.0,0.0,1


In [38]:
# Attach the weather observation that matches each trip's floored start time.
# `MESS_DATUM` is the name of the weather frame's index; `merge` accepts index
# level names in `right_on`.
# how="left" keeps every trip; trips without a matching weather row get NaN in
# the weather columns.
# validate="many_to_one" makes pandas raise a MergeError if the weather frame
# contains duplicate timestamps, which would otherwise silently duplicate trips.
# NOTE(review): re-running this cell merges the weather columns a second time
# (pandas then adds _x/_y suffixes); rerun from the loading cell instead.
trips = trips.merge(
    weather,
    how="left",
    left_on="start_time_floored",
    right_on="MESS_DATUM",
    validate="many_to_one",
)
trips.head(2)

Unnamed: 0,p_spot_start,p_booked_bikes_start,p_place_type_start,datetime_start,b_number,trip_start,p_uid_start,p_bikes_start,lat_start,b_electric_lock,...,index_right,land_use_start,start_time_floored,min_temperature,max_temperature,mean_temperature,mean_mean_wind_speed,mean_total_cloud_cover,sum_precipitation,time_intervall_length
0,False,0,12,2019-01-20 20:58:00,23221,start,12118482,1,51.332653,True,...,10,20,2019-01-20 20:00:00,-6.4,-6.4,-6.4,0.6,1.0,0.0,1
1,False,0,12,2019-01-21 08:42:00,23221,start,12120802,1,51.338741,True,...,966,3,2019-01-21 08:00:00,-7.0,-7.0,-7.0,0.6,8.0,0.0,1


#### Temporal features

In [40]:
# Derive the calendar features from the floored start timestamp. Each feature
# name is also the name of the `.dt` accessor attribute it is read from
# (weekday: Monday=0 ... Sunday=6).
for time_feature in ("hour", "weekday", "month"):
    trips[time_feature] = getattr(trips["start_time_floored"].dt, time_feature)
trips.head(2)

Unnamed: 0,p_spot_start,p_booked_bikes_start,p_place_type_start,datetime_start,b_number,trip_start,p_uid_start,p_bikes_start,lat_start,b_electric_lock,...,min_temperature,max_temperature,mean_temperature,mean_mean_wind_speed,mean_total_cloud_cover,sum_precipitation,time_intervall_length,hour,weekday,month
0,False,0,12,2019-01-20 20:58:00,23221,start,12118482,1,51.332653,True,...,-6.4,-6.4,-6.4,0.6,1.0,0.0,1,20,6,1
1,False,0,12,2019-01-21 08:42:00,23221,start,12120802,1,51.338741,True,...,-7.0,-7.0,-7.0,0.6,8.0,0.0,1,8,0,1


#### POIs

**TODO:** Merge the points of interest (POIs) into the trip data.

## Feature Selection

Now that we have merged the land use and weather data sets (the POIs are still to be added), we can take a look at which columns we can drop.

In [41]:
# Show every column name available after the merges so we can decide which
# ones to keep as features.
available_columns = trips.columns
print(available_columns)

Index(['p_spot_start', 'p_booked_bikes_start', 'p_place_type_start',
       'datetime_start', 'b_number', 'trip_start', 'p_uid_start',
       'p_bikes_start', 'lat_start', 'b_electric_lock', 'b_bike_type',
       'p_name_start', 'p_address_start', 'b_lock_types', 'p_number_start',
       'b_pedelec_battery_start', 'lng_start', 'b_boardcomputer',
       'p_terminal_type_start', 'p_bike_start', 'p_bike_types_start',
       'b_battery_pack_start', 'in_free_flexzone_start',
       'in_charged_flexzone_start', 'p_spot_end', 'p_booked_bikes_end',
       'p_place_type_end', 'datetime_end', 'trip_end', 'p_uid_end',
       'p_bikes_end', 'lat_end', 'p_name_end', 'p_address_end', 'p_number_end',
       'b_pedelec_battery_end', 'lng_end', 'p_terminal_type_end', 'p_bike_end',
       'p_bike_types_end', 'b_battery_pack_end', 'in_free_flexzone_end',
       'in_charged_flexzone_end', 'type', 'min_distance', 'duration',
       'min_avg_speed', 'geometry_end', 'land_use_end', 'geometry_start',
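       'index_right', 'land_use_start', 'start_time_floored',
       'min_temperature', 'max_temperature', 'mean_temperature',
       'mean_mean_wind_speed', 'mean_total_cloud_cover', 'sum_precipitation',
       'time_intervall_length', 'hour', 'weekday', 'month'],
      dtype='object')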

In [44]:
# Keep only the columns used as features: temporal, weather, land use and
# trip metrics (column order is preserved from the original selection).
feature_columns = [
    # temporal
    "hour",
    "weekday",
    "month",
    # weather
    "mean_temperature",
    "mean_mean_wind_speed",
    "mean_total_cloud_cover",
    "sum_precipitation",
    # land use
    "land_use_start",
    "land_use_end",
    # trip metrics
    "duration",
    "min_distance",
]
trips_small = trips[feature_columns]
trips_small.head(2)

Unnamed: 0,hour,weekday,month,mean_temperature,mean_mean_wind_speed,mean_total_cloud_cover,sum_precipitation,land_use_start,land_use_end,duration,min_distance
0,20,6,1,-6.4,0.6,1.0,0.0,20,2,17.0,2.492277
1,8,0,1,-7.0,0.6,8.0,0.0,3,2,13.0,1.7736
