In [1]:
import pandas as pd
from shapely.geometry import Point, MultiPolygon
import vaex
from datetime import datetime
import numpy as np
import pyarrow as pa
import shapely
import shapely.wkt
import geopandas as gpd
import math

# Data preparation

In [2]:
df_taxi_trips_all = vaex.open('./data/big_trips.hdf5')

### Normalise column names: spaces become underscores, everything lower-case
column_names = df_taxi_trips_all.column_names
column_names_refactored = [name.lower().replace(' ', '_') for name in column_names]

# Walk old and new names side by side instead of indexing into the second list
for old_name, new_name in zip(column_names, column_names_refactored):
    df_taxi_trips_all.rename(old_name, new_name)

# The normalisation above leaves a double underscore in this one column name
# (presumably from a double space in the raw header - TODO confirm); collapse it.
df_taxi_trips_all.rename("dropoff_centroid__location", "dropoff_centroid_location")

# cast timestamp columns to datetime
date_format = "%m/%d/%Y %I:%M:%S %p"
def column_to_datetime(datetime_str):
    """Parse one timestamp string in ``date_format`` into a ``numpy.datetime64``.

    Parameters
    ----------
    datetime_str : str or None
        Timestamp text such as ``"01/02/2020 03:04:05 PM"``.

    Returns
    -------
    numpy.datetime64
        The parsed timestamp (microsecond unit), or ``NaT`` when the input is
        missing (``None`` / NaN) or blank.

    Raises
    ------
    ValueError
        If the text is present but does not match ``date_format``.
    """
    # A missing value would otherwise reach strptime and raise; map it to NaT
    # so one empty timestamp does not abort the evaluation of the whole column.
    if pd.isna(datetime_str) or not str(datetime_str).strip():
        # Same unit as np.datetime64(<datetime>) so all results share one dtype
        return np.datetime64("NaT", "us")
    return np.datetime64(datetime.strptime(datetime_str, date_format))

# Replace both raw timestamp string columns with their parsed datetime values
for timestamp_column in ('trip_start_timestamp', 'trip_end_timestamp'):
    df_taxi_trips_all[timestamp_column] = df_taxi_trips_all[timestamp_column].apply(column_to_datetime)

In [3]:
# open external data: census tracts
# (later cells read the `the_geom` and `GEOID10` columns of this table)
df_census_tracts = vaex.open('./data/chicago_census_tracts.csv')
# Rename intentionally left disabled: a later cell reads the `the_geom` column by that name.
#df_census_tracts.rename("the_geom", "geometry")

# community areas
# (a later cell reads the `AREA_NUMBE` column of this table)
df_community_areas = vaex.open('./data/community_areas.csv')

In [4]:
# Row count of the raw trip table, taken before any filtering is applied
total_trips = len(df_taxi_trips_all)
print(f"Total amount of trips: {total_trips:,}")

Total amount of trips: 24,988,003


In [5]:
# Row mask: the recorded distance is exactly zero
is_zero_miles = df_taxi_trips_all["trip_miles"] == 0
# Row mask: pickup and dropoff centroid are the same location
is_same_location = df_taxi_trips_all["pickup_centroid_location"] == df_taxi_trips_all["dropoff_centroid_location"]

# Number of trips with trip_miles = 0
zero_trip_miles = len(df_taxi_trips_all[is_zero_miles])
print(f"Number of trips with trip_miles = 0: {zero_trip_miles:,}")

# Number of trips with trip_miles = 0 and no difference in pickup and dropoff location
zero_trip_miles_same_loc = len(df_taxi_trips_all[is_zero_miles & is_same_location])
print(f"Number of trips without trip miles and same location: {zero_trip_miles_same_loc:,}")

Number of trips with trip_miles = 0: 3,003,697
Number of trips without trip miles and same location: 1,387,241


### Removing Trips with Zero Trip Miles and Same Pickup/Dropoff Locations

In [6]:
# Drop rows that have zero trip miles AND identical pickup/dropoff locations.
# Equivalently, a row is kept when it has a non-zero distance OR its two locations differ,
# so zero-mile trips between different locations are still retained.
has_trip_miles = df_taxi_trips_all["trip_miles"] != 0
has_different_locations = df_taxi_trips_all["pickup_centroid_location"] != df_taxi_trips_all["dropoff_centroid_location"]
df_non_zero_trip_miles = df_taxi_trips_all[has_trip_miles | has_different_locations]
print(f"Total Trips with Non-Zero Trip Miles and Different Pickup/Dropoff Locations: {len(df_non_zero_trip_miles):,}")

Total Trips with Non-Zero Trip Miles and Different Pickup/Dropoff Locations: 23,600,762


### Insert Missing Census Tract IDs

In [7]:
# Share of rows (in percent) whose pickup / dropoff census tract ID is NaN
total_rows = len(df_non_zero_trip_miles)
missing_pickup_rows = len(df_non_zero_trip_miles[df_non_zero_trip_miles["pickup_census_tract"].isnan()])
missing_dropoff_rows = len(df_non_zero_trip_miles[df_non_zero_trip_miles["dropoff_census_tract"].isnan()])

percentage_missing_pickup_census_tract = (missing_pickup_rows / total_rows) * 100
percentage_missing_dropoff_census_tract = (missing_dropoff_rows / total_rows) * 100

# Two decimal places are enough for reporting
percentage_rounded_pickup = round(percentage_missing_pickup_census_tract, 2)
percentage_rounded_dropoff = round(percentage_missing_dropoff_census_tract, 2)

# Report both shares
print(f"Percentage of missing pickup_census_tract IDs: {percentage_rounded_pickup}%")
print(f"Percentage of missing dropoff_census_tract IDs: {percentage_rounded_dropoff}%")

Percentage of missing pickup_census_tract IDs: 30.76%
Percentage of missing dropoff_census_tract IDs: 31.13%


In [8]:
## Count rows that have both centroid locations but neither census tract
trips = df_non_zero_trip_miles
has_both_locations = trips.pickup_centroid_location.notna() & trips.dropoff_centroid_location.notna()
lacks_both_tracts = trips.pickup_census_tract.isna() & trips.dropoff_census_tract.isna()

filtered_df = trips[has_both_locations & lacks_both_tracts]
len(filtered_df)

5000517

In [9]:
# One row per distinct centroid location, with the number of trips using it
pickup_centroids = filtered_df.groupby(by='pickup_centroid_location', agg=vaex.agg.count())
dropoff_centroids = filtered_df.groupby(by='dropoff_centroid_location', agg=vaex.agg.count())

# Number of distinct pickup locations, then distinct dropoff locations
for centroid_counts in (pickup_centroids, dropoff_centroids):
    print(len(centroid_counts))

77
77


In [10]:
# Distinct centroid locations seen on either side of a trip
list_pickups = list(pickup_centroids.pickup_centroid_location.evaluate())
list_dropoffs = list(dropoff_centroids.dropoff_centroid_location.evaluate())

combined_unique_values = list(set(list_pickups + list_dropoffs))

print(len(combined_unique_values))
# Observation from the counts: the union is no larger than either side
print("Both missing dropoff and pickup locations are the same")

# Parse every location string (WKT) into a shapely geometry
points = [shapely.wkt.loads(str(location)) for location in combined_unique_values]

gdf = gpd.GeoDataFrame(geometry=points, crs='EPSG:4326')

# Interactive map of the centroids (last expression, so the notebook renders it)
gdf.explore()

77
Both missing dropoff and pickup locations are the same


In [11]:
# We create a dictionary containing the pickup and dropoff location as a key and the corresponding census tract as a value
# So we can efficiently remap the missing values without having to check all 5 million rows

df_pandas_census_tracts = df_census_tracts.to_pandas_df()
df_pandas_census_tracts['geometry'] = df_pandas_census_tracts.apply(lambda x: shapely.wkt.loads(x['the_geom']), axis=1)

# Pair every tract polygon with its GEOID10 once, so the search below does not
# have to index the frame row by row for every point.
tract_shapes = list(zip(df_pandas_census_tracts['geometry'], df_pandas_census_tracts['GEOID10']))
census_mapping = {}

for point in points:
    # Iterating the pairs directly cannot step past the last tract. The previous
    # `while i <= len(...)` loop indexed one row too far and raised IndexError
    # whenever a point was not contained in any tract.
    for tract_geometry, tract_id in tract_shapes:
        if tract_geometry.contains(point):
            # Keyed by the point's WKT text so it can be looked up from the location column
            census_mapping[point.wkt] = tract_id
            break  # first containing tract wins, as before
    # A point that lies inside no tract simply gets no entry in the mapping.

print(census_mapping)

    

{'POINT (-87.6339734222 41.8420761168)': 17031340400, 'POINT (-87.7583535876 41.9939301285)': 17031120200, 'POINT (-87.7269298425 41.7697780588)': 17031650500, 'POINT (-87.8137810343 42.0076125931)': 17031090200, 'POINT (-87.551428197 41.7412427285)': 17031460200, 'POINT (-87.771166703 41.9788295262)': 17031110200, 'POINT (-87.5964755956 41.728182061)': 17031470100, 'POINT (-87.6690544032 41.6897299145)': 17031750500, 'POINT (-87.6558787862 41.96581197)': 17031830700, 'POINT (-87.7234523905 41.9535821253)': 17031160900, 'POINT (-87.714003807 41.8390869059)': 17031301600, 'POINT (-87.5409355129 41.6636706517)': 17031550100, 'POINT (-87.5727819867 41.7615779081)': 17031430800, 'POINT (-87.5349029012 41.707311449)': 17031520400, 'POINT (-87.5961833442 41.8089162826)': 17031390600, 'POINT (-87.6179313803 41.7923572233)': 17031400400, 'POINT (-87.6950125892 42.001571027)': 17031020602, 'POINT (-87.768510849 41.7795828877)': 17031640300, 'POINT (-87.7302324284 41.8785943576)': 17031260700, '

In [12]:
def add_census_tract(centroid_location, census_tract, mapping=None):
    """Return the census tract for a row, filling it in from the centroid location if missing.

    Parameters
    ----------
    centroid_location : str or None
        Location text of the centroid; used as the lookup key.
    census_tract : float or None
        Tract ID already present on the row; ``None`` or NaN counts as missing.
    mapping : dict, optional
        Location -> tract ID lookup table. Defaults to the notebook-level
        ``census_mapping`` dictionary.

    Returns
    -------
    The existing ``census_tract`` when it is present or when there is no
    location to look up; otherwise the mapped tract ID. If the location has no
    entry in the mapping, the (missing) ``census_tract`` is returned unchanged
    instead of raising ``KeyError``.
    """
    if mapping is None:
        mapping = census_mapping

    # Without a location there is nothing to look up
    if centroid_location is None:
        return census_tract

    # An existing tract ID is never overwritten
    if census_tract is not None and not math.isnan(census_tract):
        return census_tract

    # .get keeps the value missing for locations that were not matched to any tract
    return mapping.get(centroid_location, census_tract)

    

In [13]:
# Fill missing census tracts from the centroid location, pickup side first, then dropoff
for side in ("pickup", "dropoff"):
    tract_column = f"{side}_census_tract"
    location_column = f"{side}_centroid_location"
    df_non_zero_trip_miles[tract_column] = df_non_zero_trip_miles.apply(
        add_census_tract,
        [df_non_zero_trip_miles[location_column], df_non_zero_trip_miles[tract_column]],
    )

In [28]:
## Re-count rows that have both centroid locations but neither census tract
known_location_mask = (
    df_non_zero_trip_miles["pickup_centroid_location"].notna()
    & df_non_zero_trip_miles["dropoff_centroid_location"].notna()
)
missing_tract_mask = (
    df_non_zero_trip_miles["pickup_census_tract"].isna()
    & df_non_zero_trip_miles["dropoff_census_tract"].isna()
)
filtered_df = df_non_zero_trip_miles[known_location_mask & missing_tract_mask]
len(filtered_df)

0

In [14]:
# Row count of the filtered trip frame, displayed as the cell output
len(df_non_zero_trip_miles)

23600762

In [8]:
# Remove every row in which, for the pickup side or for the dropoff side,
# BOTH "X_census_tract" and "X_centroid_location" are NA.
# Rows WITH "X_centroid_location" but WITHOUT "X_census_tract" are kept so the tract can be derived.
df_cleaned_census_and_location = (
    df_non_zero_trip_miles
    .dropna(column_names=["pickup_census_tract", "pickup_centroid_location"], how="all")
    .dropna(column_names=["dropoff_census_tract", "dropoff_centroid_location"], how="all")
)
print(f"Total Trips without Rows with NA Values in pickup/dropoff_centroid_location AND pickup/dropoff_census_tract: {len(df_cleaned_census_and_location):,}")

Total Trips without Rows with NA Values in pickup/dropoff_centroid_location AND pickup/dropoff_census_tract: 21,170,643


In [9]:
# Rows where at least one of the two census tract columns is NA
pickup_tract_missing = df_cleaned_census_and_location["pickup_census_tract"].isna()
dropoff_tract_missing = df_cleaned_census_and_location["dropoff_census_tract"].isna()
df_no_census_tract_both = df_cleaned_census_and_location[pickup_tract_missing | dropoff_tract_missing]
print(f"Number of Rows where pickup/dropoff_census_tract is NA: {len(df_no_census_tract_both):,}")

Number of Rows where pickup/dropoff_census_tract is NA: 5,000,517


In [10]:
# # Assuming you have one dataframe containing null values for both pickup_census_tract and dropoff_census_tract
# df_no_census_tract_both = df_no_census_tract_both.to_pandas_df()

# # Convert the pickup_centroid_location in the dataframe to Point geometries
# df_no_census_tract_both['pickup_centroid_location'] = df_no_census_tract_both.apply(
#     lambda row: Point(row['pickup_centroid_longitude'], row['pickup_centroid_latitude']), axis=1
# )

# # Prepare a function to find the census tract for a given point
# def find_census_tract(point, census_tract_df):
#     for index, row in census_tract_df.iterrows():
#         if point.within(row['geometry']):
#             return row['GEOID10']
#     return None

# # Create dictionaries to store the census tract IDs for pickup and dropoff points
# pickup_census_tract_ids = {}
# dropoff_census_tract_ids = {}

# # Iterate through each row of the dataframe and find the corresponding census tract IDs for both pickup and dropoff
# for index, row in df_no_census_tract_both.iterrows():
#     pickup_location = row['pickup_centroid_location']
#     dropoff_location = row['pickup_centroid_location']

#     if pickup_location not in pickup_census_tract_ids:
#         pickup_census_tract_ids[pickup_location] = find_census_tract(pickup_location, df_census_tracts)

#     if dropoff_location not in dropoff_census_tract_ids:
#         dropoff_census_tract_ids[dropoff_location] = find_census_tract(dropoff_location, df_census_tracts)

# # Update the "pickup_census_tract" column
# df_no_census_tract_both['pickup_census_tract'] = df_no_census_tract_both['pickup_centroid_location'].map(pickup_census_tract_ids)

# # Update the "dropoff_census_tract" column
# df_no_census_tract_both['dropoff_census_tract'] = df_no_census_tract_both['pickup_centroid_location'].map(dropoff_census_tract_ids)

In [11]:
# df_census_not_nan = df_cleaned_census_and_location.dropna(column_names=["pickup_census_tract", "dropoff_census_tract"])
# df_census_not_nan = vaex.from_pandas(df_census_not_nan)

In [12]:
#df_inserted_census_tracts = df_census_not_nan.concat(df_no_census_tract_both)

In [13]:
### COUNT NA VALUES - NOT MANDATORY ###
# check which values contain NA and NaN values
# column_names = df_cleaned_census_and_location.get_column_names()
# column_names.remove('trip_start_timestamp')
# column_names.remove('trip_end_timestamp')

# for column in column_names:
#     df_na = df_cleaned_census_and_location[df_cleaned_census_and_location[column].isna()]
#     print(f"Column '{column}' contains NA values with a number of " + str(len(df_na)) + " rows.")

### Check for Consistency

In [20]:
### CONSISTENCY CHECK - NOT MANDATORY ###
# # check if trip ids are unique
# print("Trip IDs are unique?: " + str(len(df_cleaned_census_and_location) == len(df_cleaned_census_and_location['trip_id'].unique())))

In [21]:
# check for consistency in community areas
# Reference set: every community area number present in the community areas dataset
community_areas = df_community_areas.AREA_NUMBE.values.unique()
community_areas_int = {area.as_py() for area in community_areas}

# Community area IDs used as pickup areas in the trip data (NaN excluded)
community_areas_pickup = df_cleaned_census_and_location.pickup_community_area.unique(dropnan=True)
community_areas_pickup_int = {int(area) for area in community_areas_pickup}

# Community area IDs used as dropoff areas in the trip data (NaN excluded).
# This must read the dropoff column; it previously re-read pickup_community_area,
# so the dropoff check below only repeated the pickup check.
community_areas_dropoff = df_cleaned_census_and_location.dropoff_community_area.unique(dropnan=True)
community_areas_dropoff_int = {int(area) for area in community_areas_dropoff}

print("Do the pickup community area IDs in the taxi trip data match the community area dataset? ",community_areas_pickup_int.issubset(community_areas_int))
print("Do the dropoff community area IDs in the taxi trip data match the community area dataset? ",community_areas_dropoff_int.issubset(community_areas_int))

Do the pickup community area IDs in the taxi trip data match the community area dataset?  True
Do the dropoff community area IDs in the taxi trip data match the community area dataset?  True


In [22]:
# check if the census tracts in the taxi data match the census tracts dataset
# (the stray `df_census_tracts.GEOID10.values` expression was removed: its result was
# discarded, so it only evaluated the column a second time for nothing)
# Reference set: every GEOID10 present in the census tracts dataset
census_tracts = {geoid.as_py() for geoid in df_census_tracts.GEOID10.values}
# Tract IDs that occur in the trip data (NaN excluded), cast to int for comparison
census_tracts_taxi_pickups = {int(tract) for tract in df_cleaned_census_and_location.pickup_census_tract.unique(dropnan=True)}
census_tracts_taxi_dropoffs = {int(tract) for tract in df_cleaned_census_and_location.dropoff_census_tract.unique(dropnan=True)}

print("Do the pickup census tract IDs in the taxi trip data match the census tract dataset? ",census_tracts_taxi_pickups.issubset(census_tracts))
print("Do the dropoff census tract IDs in the taxi trip data match the census tract dataset? ",census_tracts_taxi_dropoffs.issubset(census_tracts))

Do the pickup census tract IDs in the taxi trip data match the census tract dataset?  False
Do the dropoff census tract IDs in the taxi trip data match the census tract dataset?  False


In [23]:
# create columns for hourly discretization
df_cleaned_census_and_location["trip_start_hour"] = df_cleaned_census_and_location.trip_start_timestamp.dt.hour
df_cleaned_census_and_location["trip_end_hour"] = df_cleaned_census_and_location.trip_end_timestamp.dt.hour

In [24]:
# Row count of the cleaned frame after the hour columns were added
print(len(df_cleaned_census_and_location))

21170643


In [25]:
# export prepared dataframe
# Export the prepared dataframe exactly once.
# The call was previously chained (`.export_hdf5(...).export_hdf5(...)`): the second call ran on
# the return value of the first, which is not a DataFrame, so the cell failed after the
# first export had finished writing the file.
df_cleaned_census_and_location.export_hdf5('./data/trips_prepared.hdf5', progress=True)

export(hdf5) [----------------------------------------]  0.00% estimated time: unknown                 

export(hdf5) [##--------------------------------------]  5.60% estimated time:   216.71s =  3.6m =  0.1h   