# Data preparation

In [1]:
import vaex
from datetime import datetime
import numpy as np

# Load the raw taxi trip records.
df_taxi_trips_all = vaex.open('./data/trips.hdf5')

# Normalise the column names: every space becomes an underscore and all
# letters are lower-cased, so columns can be accessed as attributes.
column_names = df_taxi_trips_all.column_names
column_names_refactored = [name.lower().replace(' ', '_') for name in column_names]

# Rename each column to its normalised counterpart (same position in both lists).
for original_name, refactored_name in zip(column_names, column_names_refactored):
    df_taxi_trips_all.rename(original_name, refactored_name)

# cast timestamp columns to datetime
# Timestamp layout of the raw data: month/day/year, 12-hour clock with AM/PM marker.
date_format = "%m/%d/%Y %I:%M:%S %p"
def column_to_datetime(datetime_str):
    """Parse one timestamp string into a ``numpy.datetime64`` value.

    Parameters
    ----------
    datetime_str : str or None
        Timestamp text laid out as ``date_format``,
        e.g. ``"01/31/2020 11:45:00 PM"``. ``None`` marks a missing value.

    Returns
    -------
    numpy.datetime64
        The parsed timestamp (microsecond resolution), or ``NaT`` when the
        input is ``None``.

    Raises
    ------
    ValueError
        If a non-missing string does not match ``date_format``.
    """
    # A missing timestamp would make strptime raise TypeError and abort the
    # whole column conversion; map it to NaT instead. The explicit "us" unit
    # matches the resolution numpy uses when converting a datetime object.
    if datetime_str is None:
        return np.datetime64("NaT", "us")
    return np.datetime64(datetime.strptime(datetime_str, date_format))

# Replace both raw timestamp string columns by their parsed datetime version.
for timestamp_column in ('trip_start_timestamp', 'trip_end_timestamp'):
    df_taxi_trips_all[timestamp_column] = df_taxi_trips_all[timestamp_column].apply(column_to_datetime)

In [2]:
# Load the external census tract data and preview its first rows.
census_tracts_csv = './data/chicago_census_tracts.csv'
df_census_tracts = vaex.open(census_tracts_csv)
df_census_tracts.head()

#,the_geom,STATEFP10,COUNTYFP10,TRACTCE10,GEOID10,NAME10,NAMELSAD10,COMMAREA,COMMAREA_N,NOTES
0,'MULTIPOLYGON (((-87.62404799998049 41.730216999...,17,31,842400,17031842400,8424.0,Census Tract 8424,44,44,
1,'MULTIPOLYGON (((-87.6860799999848 41.8229560000...,17,31,840300,17031840300,8403.0,Census Tract 8403,59,59,
2,'MULTIPOLYGON (((-87.62934700001182 41.852797000...,17,31,841100,17031841100,8411.0,Census Tract 8411,34,34,
3,'MULTIPOLYGON (((-87.68813499997718 41.855690999...,17,31,841200,17031841200,8412.0,Census Tract 8412,31,31,
4,'MULTIPOLYGON (((-87.63312200003458 41.874488000...,17,31,839000,17031839000,8390.0,Census Tract 8390,32,32,
5,'MULTIPOLYGON (((-87.6678199999753 41.8741839999...,17,31,838200,17031838200,8382.0,Census Tract 8382,28,28,
6,'MULTIPOLYGON (((-87.73706400002477 41.771203999...,17,31,650301,17031650301,6503.01,Census Tract 6503.01,65,65,
7,'MULTIPOLYGON (((-87.64386399998179 41.663210000...,17,31,530503,17031530503,5305.03,Census Tract 5305.03,53,53,
8,'MULTIPOLYGON (((-87.83844200004106 41.970199999...,17,31,760803,17031760803,7608.03,Census Tract 7608.03,76,76,
9,'MULTIPOLYGON (((-87.6543830000042 41.9902020000...,17,31,30601,17031030601,306.01,Census Tract 306.01,77,77,


In [3]:
# Load the external community area data and preview its first rows.
community_areas_csv = './data/community_areas.csv'
df_community_areas = vaex.open(community_areas_csv)
df_community_areas.head()

#,the_geom,PERIMETER,AREA,COMAREA_,COMAREA_ID,AREA_NUMBE,COMMUNITY,AREA_NUM_1,SHAPE_AREA,SHAPE_LEN
0,'MULTIPOLYGON (((-87.60914087617894 41.844692502...,0,0,0,0,35,DOUGLAS,35,46004600.0,31027.1
1,'MULTIPOLYGON (((-87.59215283879394 41.816929346...,0,0,0,0,36,OAKLAND,36,16914000.0,19565.5
2,'MULTIPOLYGON (((-87.62879823733725 41.801893033...,0,0,0,0,37,FULLER PARK,37,19916700.0,25339.1
3,'MULTIPOLYGON (((-87.6067081256125 41.8168137705...,0,0,0,0,38,GRAND BOULEVARD,38,48492500.0,28196.8
4,'MULTIPOLYGON (((-87.59215283879394 41.816929346...,0,0,0,0,39,KENWOOD,39,29071700.0,23325.2
5,'MULTIPOLYGON (((-87.6744075678037 41.9761034044...,0,0,0,0,4,LINCOLN SQUARE,4,71352300.0,36624.6
6,'MULTIPOLYGON (((-87.60603749217005 41.785874064...,0,0,0,0,40,WASHINGTON PARK,40,42373900.0,28175.3
7,'MULTIPOLYGON (((-87.58037662085418 41.802525022...,0,0,0,0,41,HYDE PARK,41,45105400.0,29746.7
8,'MULTIPOLYGON (((-87.57714456891335 41.786146410...,0,0,0,0,42,WOODLAWN,42,57815200.0,46937.0
9,'MULTIPOLYGON (((-87.65455590025104 41.998166149...,0,0,0,0,1,ROGERS PARK,1,51259900.0,34052.4


### Reduce the number of columns and drop rows with null values in important columns

In [4]:
# The centroid location columns are not needed because the analysis works
# with the latitude and longitude values instead, so they are dropped.
centroid_location_columns = ['dropoff_centroid__location', 'pickup_centroid_location']
df_taxi_trips_filtered = df_taxi_trips_all.drop(centroid_location_columns)

In [5]:
### NOT MANDATORY ####
# check which columns contain NA or NaN values
# column_names = df_taxi_trips_filtered.get_column_names()

# for column in column_names:
#     if df_taxi_trips_filtered[column].isna().sum() > 0:
#         print(f"Column '{column}' contains NA or NaN values with a number of " + str(df_taxi_trips_filtered[column].isna().sum()) + " rows.")

In [6]:
### NOT MANDATORY ####
# We decide to drop all rows with NA or NaN values for consistent analysis across different tasks
# print("Total number of rows: " + str(df_taxi_trips_all.count()))

# df_taxi_trips_filtered = df_taxi_trips_filtered.dropnan()
# print("Number of rows without NaN-values: " + str(df_taxi_trips_filtered.count()))

# df_taxi_trips_filtered = df_taxi_trips_filtered.dropna()
# print("Number of rows without NA-values: " + str(df_taxi_trips_filtered.count()))

In [7]:
# Keep only trips whose recorded duration is not zero and report the remaining row count.
has_duration = df_taxi_trips_filtered['trip_seconds'] != 0.0
df_taxi_trips_filtered = df_taxi_trips_filtered[has_duration]
rows_with_duration = df_taxi_trips_filtered.count()
print("Number of rows without 'trip_seconds' = 0: " + str(rows_with_duration))

# Keep only trips whose recorded distance is not zero and report the remaining row count.
has_distance = df_taxi_trips_filtered['trip_miles'] != 0.0
df_taxi_trips_filtered = df_taxi_trips_filtered[has_distance]
rows_with_distance = df_taxi_trips_filtered.count()
print("Number of rows without 'trip_miles' = 0: " + str(rows_with_distance))

Number of rows without 'trip_seconds' = 0: 24470423
Number of rows without 'trip_miles' = 0: 21938478


### Check for consistency

In [8]:
### NOT MANDATORY ###
### !!! warning of long loading time !!! ###
# check if trip ids are unique
# print("Trip IDs are unique?: " + str(len(df_taxi_trips_filtered) == df_taxi_trips_filtered['trip_id'].nunique()))


In [9]:
# check for consistency in community areas
# print("Number of community areas: " + str(df_community_areas.count()))
# print("Number of community areas in taxi trip data without NaN-values: " + str(df_taxi_trips_filtered.pickup_community_area.nunique(dropnan=True)))

In [10]:
# check for consistency in community areas
# import pyarrow as pa
# community_areas = df_community_areas.AREA_NUMBE.values.unique()
# community_areas_int = set([area.as_py() for area in community_areas])

# community_areas_pickup = df_taxi_trips_filtered.pickup_community_area.unique(dropnan=True)
# community_areas_pickup_int = set([int(area) for area in community_areas_pickup])

# community_areas_dropoff = df_taxi_trips_filtered.dropoff_community_area.unique(dropnan=True)
# community_areas_dropoff_int = set([int(area) for area in community_areas_dropoff])

# print("Do the pickup community area IDs in the taxi trip data match the community area dataset? ",community_areas_pickup_int.issubset(community_areas_int))
# print("Do the dropoff community area IDs in the taxi trip data match the community area dataset? ",community_areas_dropoff_int.issubset(community_areas_int))

In [11]:
# check for consistency in census tracts
# print("Number of census tracts: " + str(df_census_tracts.count()))
# print("Number of pickup census tracts in filtered taxi trip data: " + str(df_taxi_trips_filtered.pickup_census_tract.nunique()))
# print("Number of dropoff census tracts in filtered taxi trip data: " + str(df_taxi_trips_filtered.dropoff_census_tract.nunique()))

In [12]:
# check if the census tracts in the taxi data match the census tracts dataset
# df_census_tracts.GEOID10.values
# census_tracts = set([id.as_py() for id in df_census_tracts.GEOID10.values])
# census_tracts_taxi_pickups = set([int(id) for id in df_taxi_trips_filtered.pickup_census_tract.unique(dropnan=True)])
# census_tracts_taxi_dropoffs = set([int(id) for id in df_taxi_trips_filtered.dropoff_census_tract.unique(dropnan=True)])

# print("Do the pickup census tract IDs in the taxi trip data match the census tract dataset? ",census_tracts_taxi_pickups.issubset(census_tracts))
# print("Do the dropoff census tract IDs in the taxi trip data match the census tract dataset? ",census_tracts_taxi_dropoffs.issubset(census_tracts))

In [13]:
# Temporal discretization of trip start and end:
#   *_hour       -> hour of the day taken from the timestamp
#   *_4h_period  -> index of the 4-hour block that hour falls into (hour // 4)
start_hour = df_taxi_trips_filtered['trip_start_timestamp'].dt.hour
end_hour = df_taxi_trips_filtered['trip_end_timestamp'].dt.hour

# Columns are added in the same order as before: hourly first, then 4-hourly.
df_taxi_trips_filtered["trip_start_hour"] = start_hour
df_taxi_trips_filtered["trip_end_hour"] = end_hour
df_taxi_trips_filtered["trip_start_4h_period"] = start_hour // 4
df_taxi_trips_filtered["trip_end_4h_period"] = end_hour // 4

In [14]:
# Write the prepared dataframe to an HDF5 file; progress=True reports
# the export progress while it runs.
prepared_trips_path = './data/trips_prepared.hdf5'
df_taxi_trips_filtered.export_hdf5(prepared_trips_path, progress=True)

export(hdf5) [#####################################---] 92.92% estimated time:   102.70s =  1.7m =  0.0h     