# Data preparation

In [89]:
import vaex
# Open the taxi trips dataset from its HDF5 file; all later cells derive from this frame.
trips_hdf5_path = './data/trips.hdf5'
df_taxi_trips_all = vaex.open(trips_hdf5_path)

### Replace spaces and uppercase letters in column names

In [112]:
# Normalise every column name to snake_case: spaces become underscores and
# all letters are lower-cased.
column_names = df_taxi_trips_all.column_names
column_names_refactored = [
    original.replace(' ', '_').lower() for original in column_names
]

# Rename the columns on the frame itself, pairing each old name with its
# normalised counterpart.
for old_name, new_name in zip(column_names, column_names_refactored):
    df_taxi_trips_all.rename(old_name, new_name)

In [116]:
# Report how many distinct values each spatial identifier column holds.
# NOTE(review): unique() is called with its defaults here, so a missing-value
# entry may be included in each count - confirm whether that is intended.
area_count_labels = [
    ('Amount of pickup community areas: ', 'pickup_community_area'),
    ('Amount of dropoff community areas: ', 'dropoff_community_area'),
    ('Amount of pickup census tracts: ', 'pickup_census_tract'),
    ('Amount of dropoff census tracts: ', 'dropoff_census_tract'),
]
for label, area_column in area_count_labels:
    distinct_values = df_taxi_trips_all[area_column].unique()
    print(label + str(len(distinct_values)))

Amount of pickup community areas: 78
Amount of dropoff community areas: 78
Amount of pickup census tracts: 953
Amount of dropoff census tracts: 1075


### Reduce the number of columns and drop rows with null values in important columns

In [95]:
# The combined centroid location columns are not needed because the analysis
# works with the separate latitude and longitude columns instead.
centroid_location_columns = ['dropoff_centroid__location', 'pickup_centroid_location']
df_taxi_trips_all_dropped = df_taxi_trips_all.drop(centroid_location_columns)

In [117]:
# Row cleaning pipeline. Each step filters the result of the previous step:
#   1. drop trips whose trip_seconds equals 0
#   2. drop trips whose trip_miles equals 0
#   3. drop trips with NaN in any pickup/dropoff centroid coordinate
#   4. drop trips with a missing taxi_id, trip_seconds or trip_miles
# We do not drop rows with null values in census tracts and community areas,
# to cover trips which start/end outside of the marked area.
df_taxi_trips_cleaned_sec = df_taxi_trips_all_dropped[df_taxi_trips_all_dropped['trip_seconds'] != 0.0]
df_taxi_trips_cleaned_sec_trips = df_taxi_trips_cleaned_sec[df_taxi_trips_cleaned_sec['trip_miles'] != 0.0]
df_taxi_trips_cleaned_sec_trips_location = df_taxi_trips_cleaned_sec_trips.dropnan(
    column_names=[
        "pickup_centroid_latitude",
        "pickup_centroid_longitude",
        "dropoff_centroid_latitude",
        "dropoff_centroid_longitude",
    ],
    how='any',
)
# Fix: continue from the coordinate-filtered frame. This step previously started
# from df_taxi_trips_cleaned_sec_trips, which silently discarded the coordinate
# filter above and left rows without centroid coordinates in the final frame.
df_taxi_trips_cleaned_sec_trips_location_ids = df_taxi_trips_cleaned_sec_trips_location.dropna(
    column_names=["taxi_id", "trip_seconds", "trip_miles"],
    how='any',
)

In [111]:
# Temporal discretization: add the hour of day of each trip's start and end,
# plus a coarser 4-hour period index (hour // 4) for both timestamps.
# The columns are added to the cleaned frame itself via a shorter alias.
trips_cleaned = df_taxi_trips_cleaned_sec_trips_location_ids
trip_start_hour_expr = trips_cleaned["trip_start_timestamp"].dt.hour
trip_end_hour_expr = trips_cleaned["trip_end_timestamp"].dt.hour

trips_cleaned["trip_start_hour"] = trip_start_hour_expr
trips_cleaned["trip_end_hour"] = trip_end_hour_expr
trips_cleaned["trip_start_4h_period"] = trip_start_hour_expr // 4
trips_cleaned["trip_end_4h_period"] = trip_end_hour_expr // 4